// Categorize a whitespace-tokenized document and pick the highest-scoring category.
DocumentCategorizerME documentCategorizerME = new DocumentCategorizerME(model);
String[] tokens = WhitespaceTokenizer.INSTANCE.tokenize(document);
// Probability distribution over all categories known to the model.
double[] prob = documentCategorizerME.categorize(tokens);
String category = documentCategorizerME.getBestCategory(prob);
/**
 * Starts the evaluation.
 *
 * @param samples the data to train and test
 * @param nFolds number of folds
 *
 * @throws IOException if reading the sample stream fails
 */
public void evaluate(ObjectStream<DocumentSample> samples, int nFolds) throws IOException {
  CrossValidationPartitioner<DocumentSample> partitioner =
      new CrossValidationPartitioner<>(samples, nFolds);

  while (partitioner.hasNext()) {
    CrossValidationPartitioner.TrainingSampleStream<DocumentSample> trainingStream =
        partitioner.next();

    // Train a model on this fold's training portion.
    DoccatModel model =
        DocumentCategorizerME.train(languageCode, trainingStream, params, factory);

    // Score the held-out portion of the fold and accumulate the accuracy.
    DocumentCategorizerEvaluator evaluator =
        new DocumentCategorizerEvaluator(new DocumentCategorizerME(model), listeners);
    evaluator.evaluate(trainingStream.getTestSampleStream());

    documentAccuracy.add(evaluator.getAccuracy(), evaluator.getDocumentCount());
  }
}
/**
 * Returns a map in which the key is the category name and the value is the score.
 *
 * @param text the input text to classify
 * @return the score map
 */
@Override
public Map<String, Double> scoreMap(String[] text) {
  double[] scores = categorize(text);
  int numCategories = getNumberOfCategories();

  Map<String, Double> probDist = new HashMap<>();
  for (int i = 0; i < numCategories; i++) {
    String cat = getCategory(i);
    probDist.put(cat, scores[getIndex(cat)]);
  }
  return probDist;
}
// Disable the feature cutoff so every observed token contributes, even in a small corpus.
trainingParams.put(TrainingParameters.CUTOFF_PARAM, 0);
DoccatModel doccatModel = DocumentCategorizerME.train("en", combinedDocumentSampleStream, trainingParams, new DoccatFactory());
combinedDocumentSampleStream.close();
DocumentCategorizerME categorizer = new DocumentCategorizerME(doccatModel);
// One name finder per loaded model; presumably populated inside the loop — the rest of
// the loop body is not visible in this fragment, so confirm against the full source.
NameFinderME[] nameFinderMEs = new NameFinderME[tokenNameFinderModels.size()];
for (int i = 0; i < tokenNameFinderModels.size(); i++) {
// Classify the input sentence and print its best category as an "action".
double[] outcome = categorizer.categorize(tokenizer.tokenize(s));
System.out.print("{ action: '" + categorizer.getBestCategory(outcome) + "', args: { ");
/**
 * Trains a categorizer model from the default sample stream using the given factory.
 *
 * @param factory the factory configuring feature generation
 * @return the trained model
 * @throws IOException if the sample stream cannot be read
 */
private static DoccatModel train(DoccatFactory factory) throws IOException {
  TrainingParameters defaults = TrainingParameters.defaultParams();
  return DocumentCategorizerME.train("x-unspecified", createSampleStream(), defaults, factory);
}
/**
 * Initializes this component and loads the categorizer model from the
 * UIMA resource manager.
 *
 * @param context the UIMA context providing the model resource
 * @throws ResourceInitializationException if the model resource cannot be accessed
 */
public void initialize(UimaContext context) throws ResourceInitializationException {
  super.initialize(context);
  this.context = context;

  Logger logger = context.getLogger();
  if (logger.isLoggable(Level.INFO)) {
    logger.log(Level.INFO, "Initializing the OpenNLP Categorizer.");
  }

  DoccatModel model;
  try {
    DoccatModelResource resource =
        (DoccatModelResource) context.getResourceObject(UimaUtil.MODEL_PARAMETER);
    model = resource.getModel();
  } catch (ResourceAccessException e) {
    // Surface resource lookup failures as initialization failures, preserving the cause.
    throw new ResourceInitializationException(e);
  }

  mCategorizer = new DocumentCategorizerME(model);
}
/**
 * Categorizes the given text without any additional context information.
 *
 * @param text the tokens to categorize
 * @return the probability for each category
 */
@Override
public double[] categorize(String[] text) {
  // Delegate to the context-aware overload with an empty context map.
  return categorize(text, Collections.emptyMap());
}
/**
 * Trains a categorizer model from the default sample stream with default
 * training parameters and a default factory.
 *
 * @return the trained model
 * @throws IOException if the sample stream cannot be read
 */
private static DoccatModel train() throws IOException {
  TrainingParameters defaultParams = TrainingParameters.defaultParams();
  return DocumentCategorizerME.train(
      "x-unspecified", createSampleStream(), defaultParams, new DoccatFactory());
}
// Continuation of a constructor/method call (opening is outside this fragment):
// wraps the model in a categorizer and passes the registered listeners as an
// evaluation-monitor array.
new DocumentCategorizerME(model), listeners.toArray(new DoccatEvaluationMonitor[listeners.size()]));
/**
 * Categorizes the given text, passing no extra context information.
 *
 * @param text the tokens to categorize
 * @return the probability for each category
 */
@Override
public double[] categorize(String[] text) {
  // No extra information available — forward an empty context map.
  return categorize(text, Collections.emptyMap());
}
/**
 * Returns a map with the score as a key, in ascending order.
 * The value is a Set of categories with that score; several categories
 * can share the same score, hence the Set as value.
 *
 * @param text the input text to classify
 * @return the sorted score map
 */
@Override
public SortedMap<Double, Set<String>> sortedScoreMap(String[] text) {
  double[] scores = categorize(text);
  int numCategories = getNumberOfCategories();

  // TreeMap iterates keys in natural (ascending) order.
  SortedMap<Double, Set<String>> sortedMap = new TreeMap<>();
  for (int i = 0; i < numCategories; i++) {
    String category = getCategory(i);
    double score = scores[getIndex(category)];
    // Group all categories that obtained the same score under one key.
    sortedMap.computeIfAbsent(score, s -> new HashSet<>()).add(category);
  }
  return sortedMap;
}
/**
 * Predicts a category for every sentence in the CAS and attaches a
 * prediction annotation carrying the best label and its score.
 *
 * @param aContext the recommender context holding the trained model
 * @param aCas the CAS to annotate
 * @throws RecommendationException if no model is present in the context
 */
@Override
public void predict(RecommenderContext aContext, CAS aCas) throws RecommendationException {
  DoccatModel model = aContext.get(KEY_MODEL).orElseThrow(
      () -> new RecommendationException("Key [" + KEY_MODEL + "] not found in context"));
  DocumentCategorizerME categorizer = new DocumentCategorizerME(model);

  Type sentenceType = getType(aCas, Sentence.class);
  Type tokenType = getType(aCas, Token.class);
  Type predictionType = getAnnotationType(aCas, PredictedSpan.class);
  Feature confidenceFeature = predictionType.getFeatureByBaseName("score");
  Feature labelFeature = predictionType.getFeatureByBaseName("label");

  for (AnnotationFS sentence : select(aCas, sentenceType)) {
    // Feed the sentence's covered tokens to the categorizer.
    List<AnnotationFS> tokenAnnotations = selectCovered(tokenType, sentence);
    String[] tokens = tokenAnnotations.stream()
        .map(AnnotationFS::getCoveredText)
        .toArray(String[]::new);

    double[] outcome = categorizer.categorize(tokens);
    String label = categorizer.getBestCategory(outcome);

    // One prediction annotation per sentence, scored with the best outcome.
    AnnotationFS prediction =
        aCas.createAnnotation(predictionType, sentence.getBegin(), sentence.getEnd());
    prediction.setDoubleValue(confidenceFeature, NumberUtils.max(outcome));
    prediction.setStringValue(labelFeature, label);
    aCas.addFsToIndexes(prediction);
  }
}
@Test public void testSimpleTraining() throws IOException { ObjectStream<DocumentSample> samples = ObjectStreamUtils.createObjectStream( new DocumentSample("1", new String[]{"a", "b", "c"}), new DocumentSample("1", new String[]{"a", "b", "c", "1", "2"}), new DocumentSample("1", new String[]{"a", "b", "c", "3", "4"}), new DocumentSample("0", new String[]{"x", "y", "z"}), new DocumentSample("0", new String[]{"x", "y", "z", "5", "6"}), new DocumentSample("0", new String[]{"x", "y", "z", "7", "8"})); TrainingParameters params = new TrainingParameters(); params.put(TrainingParameters.ITERATIONS_PARAM, 100); params.put(TrainingParameters.CUTOFF_PARAM, 0); DoccatModel model = DocumentCategorizerME.train("x-unspecified", samples, params, new DoccatFactory()); DocumentCategorizer doccat = new DocumentCategorizerME(model); double[] aProbs = doccat.categorize(new String[]{"a"}); Assert.assertEquals("1", doccat.getBestCategory(aProbs)); double[] bProbs = doccat.categorize(new String[]{"x"}); Assert.assertEquals("0", doccat.getBestCategory(bProbs)); //test to make sure sorted map's last key is cat 1 because it has the highest score. SortedMap<Double, Set<String>> sortedScoreMap = doccat.sortedScoreMap(new String[]{"a"}); Set<String> cat = sortedScoreMap.get(sortedScoreMap.lastKey()); Assert.assertEquals(1, cat.size()); }
/**
 * A single training sample is not enough data; training must fail with
 * {@link InsufficientTrainingDataException}.
 */
@Test(expected = InsufficientTrainingDataException.class)
public void insufficientTestData() throws IOException {
  ObjectStream<DocumentSample> samples = ObjectStreamUtils.createObjectStream(
      new DocumentSample("1", new String[]{"a", "b", "c"}));

  TrainingParameters params = new TrainingParameters();
  params.put(TrainingParameters.ITERATIONS_PARAM, 100);
  params.put(TrainingParameters.CUTOFF_PARAM, 0);

  DocumentCategorizerME.train("x-unspecified", samples, params, new DoccatFactory());
}
// Build a maximum-entropy document categorizer from the trained model.
DocumentCategorizerME doccat = new DocumentCategorizerME(model);
/**
 * Categorizes the given text; equivalent to calling the overload that
 * accepts extra context with an empty map.
 *
 * @param text the tokens to categorize
 * @return the probability for each category
 */
@Override
public double[] categorize(String[] text) {
  return categorize(text, Collections.emptyMap());
}
/**
 * Returns a map in which the key is the category name and the value is the score.
 *
 * @param text the input text to classify
 * @return the score map
 */
@Override
public Map<String, Double> scoreMap(String[] text) {
  double[] scores = categorize(text);

  Map<String, Double> probDist = new HashMap<>();
  for (int i = 0, n = getNumberOfCategories(); i < n; i++) {
    String category = getCategory(i);
    probDist.put(category, scores[getIndex(category)]);
  }
  return probDist;
}
// Tokenize the document on whitespace, score it against every category of the
// model, and keep the best-scoring category name.
DocumentCategorizerME documentCategorizerME = new DocumentCategorizerME(model);
String[] tokens = WhitespaceTokenizer.INSTANCE.tokenize(document);
double[] prob = documentCategorizerME.categorize(tokens);
String category = documentCategorizerME.getBestCategory(prob);
// Select the Naive Bayes trainer instead of the default algorithm.
params.put(AbstractTrainer.ALGORITHM_PARAM, NaiveBayesTrainer.NAIVE_BAYES_VALUE);
DoccatModel model = DocumentCategorizerME.train("x-unspecified", samples, params, new DoccatFactory());
DocumentCategorizer doccat = new DocumentCategorizerME(model);
@Override public void run(String format, String[] args) { super.run(format, args); mlParams = CmdLineUtil.loadTrainingParameters(params.getParams(), false); if (mlParams == null) { mlParams = ModelUtil.createDefaultTrainingParameters(); } File modelOutFile = params.getModel(); CmdLineUtil.checkOutputFile("document categorizer model", modelOutFile); FeatureGenerator[] featureGenerators = createFeatureGenerators(params .getFeatureGenerators()); DoccatModel model; try { DoccatFactory factory = DoccatFactory.create(params.getFactory(), featureGenerators); model = DocumentCategorizerME.train(params.getLang(), sampleStream, mlParams, factory); } catch (IOException e) { throw createTerminationIOException(e); } finally { try { sampleStream.close(); } catch (IOException e) { // sorry that this can fail } } CmdLineUtil.writeModel("document categorizer", modelOutFile, model); }