/**
 * Reads the next (file, category) pair from the underlying iterator and converts
 * it into a {@link DocumentSample}.
 *
 * <p>The file content is decoded as UTF-8 explicitly, so that the resulting tokens
 * do not depend on the platform default charset of the machine running the code.
 *
 * @return a sample whose category is the entry's value and whose text is the
 *         tokenized file content, or {@code null} once the iterator is exhausted.
 * @throws IOException if the file cannot be read.
 */
@Override
public DocumentSample read() throws IOException {
  if (!catFileTupleIterator.hasNext()) {
    return null;
  }

  Map.Entry<Path, String> catFileTuple = catFileTupleIterator.next();

  // Name the charset: new String(byte[]) would silently use the platform default.
  String text = new String(Files.readAllBytes(catFileTuple.getKey()),
      java.nio.charset.StandardCharsets.UTF_8);

  return new DocumentSample(catFileTuple.getValue(), tokenizer.tokenize(text));
}
/**
 * Categorizes the text of the given reference sample and records whether the
 * predicted category matches the reference category.
 *
 * <p>A value of 1 is added to the accuracy score on a match, 0 otherwise.
 *
 * @param sample the reference {@link DocumentSample}.
 * @return a new {@link DocumentSample} that carries the predicted category
 *         together with the text of the reference sample.
 */
public DocumentSample processSample(DocumentSample sample) {
  String[] tokens = sample.getText();

  double[] outcomes = categorizer.categorize(tokens);
  String predicted = categorizer.getBestCategory(outcomes);

  boolean correct = sample.getCategory().equals(predicted);
  accuracy.add(correct ? 1 : 0);

  return new DocumentSample(predicted, sample.getText());
}
/**
 * Clears the {@code isVirgin} flag and builds an {@link Event} for the current sample.
 *
 * @return an event whose outcome is the sample's category and whose context is
 *         produced by the context generator from the sample's text and extra
 *         information.
 */
public Event next() {
  isVirgin = false;

  String outcome = sample.getCategory();
  String[] context =
      mContextGenerator.getContext(sample.getText(), sample.getExtraInformation());

  return new Event(outcome, context);
}
/**
 * Compares this sample with another object.
 *
 * @param obj the object to compare with.
 * @return {@code true} if {@code obj} is this instance, or is a
 *         {@link DocumentSample} with an equal category and element-wise equal
 *         text; {@code false} otherwise.
 */
@Override
public boolean equals(Object obj) {
  if (this == obj) {
    return true;
  }

  if (!(obj instanceof DocumentSample)) {
    return false;
  }

  DocumentSample other = (DocumentSample) obj;
  if (!getCategory().equals(other.getCategory())) {
    return false;
  }
  return Arrays.equals(getText(), other.getText());
}
}
// Ask the categorizer which category it considers best for the outcomes in prob.
String category = documentCategorizerME.getBestCategory(prob);

// Wrap the category and the tokens in a sample and print its string form.
DocumentSample sample = new DocumentSample(category, tokens);
System.out.println(sample);
/**
 * Builds document samples from the given CASes.
 *
 * <p>For every sentence in a CAS, the covered tokens form the sample text. One
 * sample is created per annotation of the configured layer covered by that
 * sentence; its category is the annotation's feature value, or
 * {@code NO_CATEGORY} when that value is {@code null}.
 *
 * @param aCasses the CASes to extract samples from.
 * @return the extracted samples, in iteration order.
 */
private List<DocumentSample> extractSamples(List<CAS> aCasses) {
  List<DocumentSample> result = new ArrayList<>();

  for (CAS cas : aCasses) {
    Type sentenceType = getType(cas, Sentence.class);
    Type tokenType = getType(cas, Token.class);

    Map<AnnotationFS, Collection<AnnotationFS>> tokensBySentence =
        indexCovered(cas, sentenceType, tokenType);

    for (Entry<AnnotationFS, Collection<AnnotationFS>> entry : tokensBySentence.entrySet()) {
      AnnotationFS sentence = entry.getKey();
      String[] tokenTexts = entry.getValue().stream()
          .map(AnnotationFS::getCoveredText)
          .toArray(String[]::new);

      // Looked up per sentence, exactly where the lookup happened before.
      Type annotationType = getType(cas, layerName);
      Feature feature = annotationType.getFeatureByBaseName(featureName);

      for (AnnotationFS annotation : selectCovered(annotationType, sentence)) {
        String label = annotation.getFeatureValueAsString(feature);
        String category = (label == null) ? NO_CATEGORY : label;

        DocumentSample candidate = new DocumentSample(category, tokenTexts);
        if (candidate.getCategory() == null) {
          continue;
        }
        result.add(candidate);
      }
    }
  }

  return result;
}
/**
 * Computes a hash code from the category and the content of the text array.
 *
 * @return the combined hash of the category and the text's array hash.
 */
@Override
public int hashCode() {
  // Hash the array by content; the array's own hashCode is identity based.
  int textHash = Arrays.hashCode(getText());
  return Objects.hash(getCategory(), textHash);
}
// Ask the categorizer which category it considers best for the outcomes in prob.
String category = documentCategorizerME.getBestCategory(prob);

// Wrap the category and the tokens in a sample and print its string form.
DocumentSample sample = new DocumentSample(category, tokens);
System.out.println(sample);
/**
 * Creates the fixed sample used as the gold reference.
 *
 * @return a sample with category {@code "aCategory"} and the text
 *         {@code ["a", "small", "text"]}.
 */
public static DocumentSample createGoldSample() {
  String[] text = {"a", "small", "text"};
  return new DocumentSample("aCategory", text);
}
/**
 * Categorizes the text of the given reference sample and records whether the
 * predicted category matches the reference category.
 *
 * <p>A value of 1 is added to the accuracy score on a match, 0 otherwise.
 *
 * @param sample the reference {@link DocumentSample}.
 * @return a new {@link DocumentSample} that carries the predicted category
 *         together with the text of the reference sample.
 */
public DocumentSample processSample(DocumentSample sample) {
  String[] tokens = sample.getText();

  double[] outcomes = categorizer.categorize(tokens);
  String predicted = categorizer.getBestCategory(outcomes);

  boolean correct = sample.getCategory().equals(predicted);
  accuracy.add(correct ? 1 : 0);

  return new DocumentSample(predicted, sample.getText());
}
/**
 * Records one reference/prediction pair in the statistics.
 *
 * @param reference  the reference sample; supplies the text and the expected category.
 * @param prediction the predicted sample; supplies the predicted category.
 */
private void statsAdd(DocumentSample reference, DocumentSample prediction) {
  String expected = reference.getCategory();
  String predicted = prediction.getCategory();

  getStats().add(reference.getText(), expected, predicted);
}
/**
 * Clears the {@code isVirgin} flag and builds an {@link Event} for the current sample.
 *
 * @return an event whose outcome is the sample's category and whose context is
 *         produced by the context generator from the sample's text and extra
 *         information.
 */
public Event next() {
  isVirgin = false;

  String outcome = sample.getCategory();
  String[] context =
      mContextGenerator.getContext(sample.getText(), sample.getExtraInformation());

  return new Event(outcome, context);
}
// Ask the categorizer which category it considers best for the outcomes in prob.
String category = documentCategorizerME.getBestCategory(prob);

// Wrap the category and the tokens in a sample and print its string form.
DocumentSample sample = new DocumentSample(category, tokens);
System.out.println(sample);
/**
 * Creates the fixed sample used as the prediction.
 *
 * @return a sample with category {@code "anotherCategory"} and the text
 *         {@code ["a", "small", "text"]}.
 */
public static DocumentSample createPredSample() {
  String[] text = {"a", "small", "text"};
  return new DocumentSample("anotherCategory", text);
}
/**
 * Categorizes the text of the given reference sample and records whether the
 * predicted category matches the reference category.
 *
 * <p>A value of 1 is added to the accuracy score on a match, 0 otherwise.
 *
 * @param sample the reference {@link DocumentSample}.
 * @return a new {@link DocumentSample} that carries the predicted category
 *         together with the text of the reference sample.
 */
public DocumentSample processSample(DocumentSample sample) {
  String[] tokens = sample.getText();

  double[] outcomes = categorizer.categorize(tokens);
  String predicted = categorizer.getBestCategory(outcomes);

  boolean correct = sample.getCategory().equals(predicted);
  accuracy.add(correct ? 1 : 0);

  return new DocumentSample(predicted, sample.getText());
}
@Test public void testDocumentSampleSerDe() throws IOException { DocumentSample documentSample = createGoldSample(); ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream(); ObjectOutput out = new ObjectOutputStream(byteArrayOutputStream); out.writeObject(documentSample); out.flush(); byte[] bytes = byteArrayOutputStream.toByteArray(); ByteArrayInputStream byteArrayInputStream = new ByteArrayInputStream(bytes); ObjectInput objectInput = new ObjectInputStream(byteArrayInputStream); DocumentSample deSerializedDocumentSample = null; try { deSerializedDocumentSample = (DocumentSample) objectInput.readObject(); } catch (ClassNotFoundException e) { // do nothing } Assert.assertNotNull(deSerializedDocumentSample); Assert.assertEquals(documentSample.getCategory(), deSerializedDocumentSample.getCategory()); Assert.assertArrayEquals(documentSample.getText(), deSerializedDocumentSample.getText()); }
/**
 * Clears the {@code isVirgin} flag and builds an {@link Event} for the current sample.
 *
 * @return an event whose outcome is the sample's category and whose context is
 *         produced by the context generator from the sample's text and extra
 *         information.
 */
public Event next() {
  isVirgin = false;

  String outcome = sample.getCategory();
  String[] context =
      mContextGenerator.getContext(sample.getText(), sample.getExtraInformation());

  return new Event(outcome, context);
}
public DocumentSample read() throws IOException { String sampleString = samples.read(); if (sampleString != null) { // Whitespace tokenize entire string String[] tokens = WhitespaceTokenizer.INSTANCE.tokenize(sampleString); DocumentSample sample; if (tokens.length > 1) { String category = tokens[0]; String[] docTokens = new String[tokens.length - 1]; System.arraycopy(tokens, 1, docTokens, 0, tokens.length - 1); sample = new DocumentSample(category, docTokens); } else { throw new IOException("Empty lines, or lines with only a category string are not allowed!"); } return sample; } return null; } }