/**
 * Reads the next (file, category) tuple, loads the file's full text, tokenizes it,
 * and returns it as a {@link DocumentSample} labelled with the tuple's category.
 *
 * @return the next sample, or {@code null} when all tuples have been consumed
 * @throws IOException if the document file cannot be read
 */
@Override
public DocumentSample read() throws IOException {
  if (!catFileTupleIterator.hasNext()) {
    return null;
  }
  Map.Entry<Path, String> catFileTuple = catFileTupleIterator.next();
  // Decode explicitly as UTF-8; the original used the platform default charset,
  // which makes the parsed text machine-dependent.
  String text = new String(Files.readAllBytes(catFileTuple.getKey()),
      java.nio.charset.StandardCharsets.UTF_8);
  return new DocumentSample(catFileTuple.getValue(), tokenizer.tokenize(text));
}
/** Builds the reference (gold) sample: category "aCategory" over the tokens "a small text". */
public static DocumentSample createGoldSample() {
  String[] tokens = {"a", "small", "text"};
  return new DocumentSample("aCategory", tokens);
}
/** Builds the predicted sample: category "anotherCategory" over the tokens "a small text". */
public static DocumentSample createPredSample() {
  String[] tokens = {"a", "small", "text"};
  return new DocumentSample("anotherCategory", tokens);
}
public DocumentSample read() throws IOException { String sampleString = samples.read(); if (sampleString != null) { // Whitespace tokenize entire string String[] tokens = WhitespaceTokenizer.INSTANCE.tokenize(sampleString); DocumentSample sample; if (tokens.length > 1) { String category = tokens[0]; String[] docTokens = new String[tokens.length - 1]; System.arraycopy(tokens, 1, docTokens, 0, tokens.length - 1); sample = new DocumentSample(category, docTokens); } else { throw new IOException("Empty lines, or lines with only a category string are not allowed!"); } return sample; } return null; } }
/**
 * Evaluates the given reference {@link DocumentSample} object.
 *
 * This is done by categorizing the document from the provided
 * {@link DocumentSample}. The detected category is then used
 * to calculate and update the score.
 *
 * @param sample the reference {@link DocumentSample}.
 * @return a new sample carrying the predicted category and the original text
 */
public DocumentSample processSample(DocumentSample sample) {
  String[] document = sample.getText();
  double[] probs = categorizer.categorize(document);
  String cat = categorizer.getBestCategory(probs);
  // Score 1 for a correct prediction, 0 otherwise.
  accuracy.add(sample.getCategory().equals(cat) ? 1 : 0);
  return new DocumentSample(cat, sample.getText());
}
@Test public void testSimpleTraining() throws IOException { ObjectStream<DocumentSample> samples = ObjectStreamUtils.createObjectStream( new DocumentSample("1", new String[]{"a", "b", "c"}), new DocumentSample("1", new String[]{"a", "b", "c", "1", "2"}), new DocumentSample("1", new String[]{"a", "b", "c", "3", "4"}), new DocumentSample("0", new String[]{"x", "y", "z"}), new DocumentSample("0", new String[]{"x", "y", "z", "5", "6"}), new DocumentSample("0", new String[]{"x", "y", "z", "7", "8"})); TrainingParameters params = new TrainingParameters(); params.put(TrainingParameters.ITERATIONS_PARAM, 100); params.put(TrainingParameters.CUTOFF_PARAM, 0); DoccatModel model = DocumentCategorizerME.train("x-unspecified", samples, params, new DoccatFactory()); DocumentCategorizer doccat = new DocumentCategorizerME(model); double[] aProbs = doccat.categorize(new String[]{"a"}); Assert.assertEquals("1", doccat.getBestCategory(aProbs)); double[] bProbs = doccat.categorize(new String[]{"x"}); Assert.assertEquals("0", doccat.getBestCategory(bProbs)); //test to make sure sorted map's last key is cat 1 because it has the highest score. SortedMap<Double, Set<String>> sortedScoreMap = doccat.sortedScoreMap(new String[]{"a"}); Set<String> cat = sortedScoreMap.get(sortedScoreMap.lastKey()); Assert.assertEquals(1, cat.size()); }
// Six training documents: three labelled "1" (a/b/c vocabulary) and three labelled "0"
// (x/y/z vocabulary). NOTE(review): this is the tail of an argument list whose enclosing
// call (presumably ObjectStreamUtils.createObjectStream) starts before this excerpt.
new DocumentSample("1", new String[]{"a", "b", "c"}), new DocumentSample("1", new String[]{"a", "b", "c", "1", "2"}), new DocumentSample("1", new String[]{"a", "b", "c", "3", "4"}), new DocumentSample("0", new String[]{"x", "y", "z"}), new DocumentSample("0", new String[]{"x", "y", "z", "5", "6"}), new DocumentSample("0", new String[]{"x", "y", "z", "7", "8"}));
// Resolve the best-scoring category for the outcome distribution 'prob', wrap the
// tokenized document in a DocumentSample, and print it.
// NOTE(review): System.out suggests demo/CLI code — prefer a logger in library code.
String category = documentCategorizerME.getBestCategory(prob); DocumentSample sample = new DocumentSample(category, tokens); System.out.println(sample.toString());
/**
 * Verifies that training on a single sample (only one outcome class) is rejected
 * with an {@link InsufficientTrainingDataException}.
 */
@Test(expected = InsufficientTrainingDataException.class)
public void insufficientTestData() throws IOException {
  ObjectStream<DocumentSample> stream = ObjectStreamUtils.createObjectStream(
      new DocumentSample("1", new String[]{"a", "b", "c"}));

  TrainingParameters trainParams = new TrainingParameters();
  trainParams.put(TrainingParameters.ITERATIONS_PARAM, 100);
  trainParams.put(TrainingParameters.CUTOFF_PARAM, 0);

  DocumentCategorizerME.train("x-unspecified", stream, trainParams, new DoccatFactory());
}
/**
 * Reads the next (file, category) tuple, loads the file's full text, tokenizes it,
 * and returns it as a {@link DocumentSample} labelled with the tuple's category.
 *
 * @return the next sample, or {@code null} when all tuples have been consumed
 * @throws IOException if the document file cannot be read
 */
@Override
public DocumentSample read() throws IOException {
  if (!catFileTupleIterator.hasNext()) {
    return null;
  }
  Map.Entry<Path, String> catFileTuple = catFileTupleIterator.next();
  // Decode explicitly as UTF-8; the original used the platform default charset,
  // which makes the parsed text machine-dependent.
  String text = new String(Files.readAllBytes(catFileTuple.getKey()),
      java.nio.charset.StandardCharsets.UTF_8);
  return new DocumentSample(catFileTuple.getValue(), tokenizer.tokenize(text));
}
/**
 * Returns the next training sample parsed from the current line, or {@code null}
 * when no further input is available.
 *
 * Expected line format: {@code <category>\t<document text>}.
 * NOTE(review): the null-check ordering below relies on hasNext() having the side
 * effect of loading the next line into the {@code line} field — confirm against the
 * enclosing class before restructuring.
 */
public DocumentSample read() {
  if (line == null && !hasNext()) { //<co id="mee.train.read"/>
    return null;
  }
  // Split on the first tab: left side is the category, right side the document.
  int split = line.indexOf('\t'); //<co id="mee.train.cat"/>
  if (split < 0)
    throw new RuntimeException("Invalid line in " + inputFiles[inputFilesIndex]);
  String category = line.substring(0,split);
  String document = line.substring(split+1);
  line = null; // mark line as consumed
  String[] tokens = tokenizer.tokenize(document); //<co id="mee.train.tok"/>
  return new DocumentSample(category, tokens); //<co id="mee.train.sample"/>
} /*<calloutlist>
public DocumentSample read() throws IOException { int count = 0; List<String> tokensList = new ArrayList<>(); String line; while (count < sentencesPerDocument && (line = samples.read()) != null) { String[] tokens = tokenizer.tokenize(line); if (tokens.length == 0) { throw new IOException("Empty lines are not allowed!"); } // Always skip first token, that is the sentence number! tokensList.addAll(Arrays.asList(tokens).subList(1, tokens.length)); count++; } if (tokensList.size() > 0) { return new DocumentSample(language, tokensList.toArray(new String[tokensList.size()])); } return null; } }
// Emit the accumulated text as a single sample tagged with the document language.
// NOTE(review): sampleText is presumably a StringBuilder filled by surrounding code not
// shown in this excerpt; this DocumentSample constructor takes raw text, not tokens.
return new DocumentSample(language, sampleText.toString());
public DocumentSample read() throws IOException { String sampleString = samples.read(); if (sampleString != null) { // Whitespace tokenize entire string String[] tokens = WhitespaceTokenizer.INSTANCE.tokenize(sampleString); DocumentSample sample; if (tokens.length > 1) { String category = tokens[0]; String[] docTokens = new String[tokens.length - 1]; System.arraycopy(tokens, 1, docTokens, 0, tokens.length - 1); sample = new DocumentSample(category, docTokens); } else { throw new IOException("Empty lines, or lines with only a category string are not allowed!"); } return sample; } return null; } }
public DocumentSample read() throws IOException { String sampleString = stream.read(); if (sampleString != null) { // Whitespace tokenize entire string String[] tokens = WhitespaceTokenizer.INSTANCE.tokenize(sampleString); //remove entities Vector<String> vector = new Vector<String>(tokens.length); boolean skip = false; for (String token : tokens) { if (!token.startsWith("<")) { vector.add(token); } } tokens = new String[vector.size()]; vector.copyInto(tokens); DocumentSample sample; if (tokens.length > 0) { sample = new DocumentSample(category, tokens); } else { throw new IOException("Empty lines are not allowed!"); } return sample; } else { return null; } }
public DocumentSample read() throws IOException { String sampleString = samples.read(); if (sampleString != null) { // Whitespace tokenize entire string String[] tokens = WhitespaceTokenizer.INSTANCE.tokenize(sampleString); DocumentSample sample; if (tokens.length > 1) { String category = tokens[0]; String[] docTokens = new String[tokens.length - 1]; System.arraycopy(tokens, 1, docTokens, 0, tokens.length - 1); sample = new DocumentSample(category, docTokens); } else { throw new IOException("Empty lines, or lines with only a category string are not allowed!"); } return sample; } return null; } }
/**
 * Converts the given CASes into doccat training samples: one {@link DocumentSample}
 * per annotation of {@code layerName} covered by a sentence, labelled with the
 * annotation's {@code featureName} value (or {@code NO_CATEGORY} when unset) and
 * carrying the covering sentence's token texts.
 *
 * @param aCasses the CASes to extract samples from
 * @return the extracted samples (possibly empty, never {@code null})
 */
private List<DocumentSample> extractSamples(List<CAS> aCasses) {
  List<DocumentSample> samples = new ArrayList<>();
  for (CAS cas : aCasses) {
    Type sentenceType = getType(cas, Sentence.class);
    Type tokenType = getType(cas, Token.class);
    // Hoisted out of the sentence loop: both lookups depend only on the current CAS.
    Type annotationType = getType(cas, layerName);
    Feature feature = annotationType.getFeatureByBaseName(featureName);
    Map<AnnotationFS, Collection<AnnotationFS>> sentences =
        indexCovered(cas, sentenceType, tokenType);
    for (Entry<AnnotationFS, Collection<AnnotationFS>> e : sentences.entrySet()) {
      AnnotationFS sentence = e.getKey();
      Collection<AnnotationFS> tokens = e.getValue();
      String[] tokenTexts = tokens.stream()
          .map(AnnotationFS::getCoveredText)
          .toArray(String[]::new);
      for (AnnotationFS annotation : selectCovered(annotationType, sentence)) {
        String label = annotation.getFeatureValueAsString(feature);
        DocumentSample nameSample = new DocumentSample(
            label != null ? label : NO_CATEGORY, tokenTexts);
        // NOTE(review): getCategory() can only be null here if NO_CATEGORY itself is
        // null — confirm whether this guard is still needed.
        if (nameSample.getCategory() != null) {
          samples.add(nameSample);
        }
      }
    }
  }
  return samples;
}
public void processCas(CAS cas) throws ResourceProcessException { FSIndex categoryIndex = cas.getAnnotationIndex(mCategoryType); if (categoryIndex.size() > 0) { AnnotationFS categoryAnnotation = (AnnotationFS) categoryIndex.iterator().next(); // add to event collection DocumentSample sample = new DocumentSample( categoryAnnotation.getStringValue(mCategoryFeature), cas.getDocumentText()); documentSamples.add(sample); } }
/**
 * Evaluates the given reference {@link DocumentSample} object.
 *
 * This is done by categorizing the document from the provided
 * {@link DocumentSample}. The detected category is then used
 * to calculate and update the score.
 *
 * @param sample the reference {@link DocumentSample}.
 * @return a new sample carrying the predicted category and the original text
 */
public DocumentSample processSample(DocumentSample sample) {
  String[] document = sample.getText();
  double[] probs = categorizer.categorize(document);
  String cat = categorizer.getBestCategory(probs);
  // Score 1 for a correct prediction, 0 otherwise.
  accuracy.add(sample.getCategory().equals(cat) ? 1 : 0);
  return new DocumentSample(cat, sample.getText());
}
/**
 * Evaluates the given reference {@link DocumentSample} object.
 *
 * This is done by categorizing the document from the provided
 * {@link DocumentSample}. The detected category is then used
 * to calculate and update the score.
 *
 * @param sample the reference {@link DocumentSample}.
 * @return a new sample carrying the predicted category and the original text
 */
public DocumentSample processSample(DocumentSample sample) {
  String[] document = sample.getText();
  double[] probs = categorizer.categorize(document);
  String cat = categorizer.getBestCategory(probs);
  // Score 1 for a correct prediction, 0 otherwise.
  accuracy.add(sample.getCategory().equals(cat) ? 1 : 0);
  return new DocumentSample(cat, sample.getText());
}