// Categorize a whitespace-tokenized document and pick the highest-scoring category.
DocumentCategorizerME documentCategorizerME = new DocumentCategorizerME(model);
String[] tokens = WhitespaceTokenizer.INSTANCE.tokenize(document);
// Probability distribution over all categories known to the model.
double[] prob = documentCategorizerME.categorize(tokens);
String category = documentCategorizerME.getBestCategory(prob);
/**
 * Starts the evaluation.
 *
 * @param samples the data to train and test
 * @param nFolds number of folds
 *
 * @throws IOException if reading the sample stream fails
 */
public void evaluate(ObjectStream<DocumentSample> samples, int nFolds) throws IOException {
  CrossValidationPartitioner<DocumentSample> partitioner =
      new CrossValidationPartitioner<>(samples, nFolds);

  while (partitioner.hasNext()) {
    CrossValidationPartitioner.TrainingSampleStream<DocumentSample> trainingStream =
        partitioner.next();

    // Train a model on this fold's training portion.
    DoccatModel model =
        DocumentCategorizerME.train(languageCode, trainingStream, params, factory);

    // Score the held-out portion of the fold and accumulate the accuracy.
    DocumentCategorizerEvaluator evaluator =
        new DocumentCategorizerEvaluator(new DocumentCategorizerME(model), listeners);
    evaluator.evaluate(trainingStream.getTestSampleStream());

    documentAccuracy.add(evaluator.getAccuracy(), evaluator.getDocumentCount());
  }
}
/**
 * Returns a map in which the key is the category name and the value is the score.
 *
 * @param text the input text to classify
 * @return the score map
 */
@Override
public Map<String, Double> scoreMap(String[] text) {
  double[] scores = categorize(text);
  int numCategories = getNumberOfCategories();

  Map<String, Double> probDist = new HashMap<>();
  for (int i = 0; i < numCategories; i++) {
    String cat = getCategory(i);
    probDist.put(cat, scores[getIndex(cat)]);
  }
  return probDist;
}
// Disable the feature cutoff so every observed token contributes, even in a small corpus.
trainingParams.put(TrainingParameters.CUTOFF_PARAM, 0);
DoccatModel doccatModel = DocumentCategorizerME.train("en", combinedDocumentSampleStream, trainingParams, new DoccatFactory());
combinedDocumentSampleStream.close();
DocumentCategorizerME categorizer = new DocumentCategorizerME(doccatModel);
// One name finder per loaded model; presumably populated inside the loop — the rest of
// the loop body is not visible in this fragment, so confirm against the full source.
NameFinderME[] nameFinderMEs = new NameFinderME[tokenNameFinderModels.size()];
for (int i = 0; i < tokenNameFinderModels.size(); i++) {
// Classify the input sentence and print its best category as an "action".
double[] outcome = categorizer.categorize(tokenizer.tokenize(s));
System.out.print("{ action: '" + categorizer.getBestCategory(outcome) + "', args: { ");
/**
 * Trains a categorizer model from the default sample stream using the given factory.
 *
 * @param factory the factory configuring feature generation
 * @return the trained model
 * @throws IOException if the sample stream cannot be read
 */
private static DoccatModel train(DoccatFactory factory) throws IOException {
  TrainingParameters defaults = TrainingParameters.defaultParams();
  return DocumentCategorizerME.train("x-unspecified", createSampleStream(), defaults, factory);
}
/**
 * Initializes this component and loads the categorizer model from the
 * UIMA resource manager.
 *
 * @param context the UIMA context providing the model resource
 * @throws ResourceInitializationException if the model resource cannot be accessed
 */
public void initialize(UimaContext context) throws ResourceInitializationException {
  super.initialize(context);
  this.context = context;

  Logger logger = context.getLogger();
  if (logger.isLoggable(Level.INFO)) {
    logger.log(Level.INFO, "Initializing the OpenNLP Categorizer.");
  }

  DoccatModel model;
  try {
    DoccatModelResource resource =
        (DoccatModelResource) context.getResourceObject(UimaUtil.MODEL_PARAMETER);
    model = resource.getModel();
  } catch (ResourceAccessException e) {
    // Surface resource lookup failures as initialization failures, preserving the cause.
    throw new ResourceInitializationException(e);
  }

  mCategorizer = new DocumentCategorizerME(model);
}
/**
 * Categorizes the given text without any additional context information.
 *
 * @param text the tokens to categorize
 * @return the probability for each category
 */
@Override
public double[] categorize(String[] text) {
  // Delegate to the context-aware overload with an empty context map.
  return categorize(text, Collections.emptyMap());
}
/**
 * Trains a categorizer model from the default sample stream with default
 * training parameters and a default factory.
 *
 * @return the trained model
 * @throws IOException if the sample stream cannot be read
 */
private static DoccatModel train() throws IOException {
  TrainingParameters defaultParams = TrainingParameters.defaultParams();
  return DocumentCategorizerME.train(
      "x-unspecified", createSampleStream(), defaultParams, new DoccatFactory());
}
// Continuation of a constructor/method call (opening is outside this fragment):
// wraps the model in a categorizer and passes the registered listeners as an
// evaluation-monitor array.
new DocumentCategorizerME(model), listeners.toArray(new DoccatEvaluationMonitor[listeners.size()]));
/**
 * Categorizes the given text, passing no extra context information.
 *
 * @param text the tokens to categorize
 * @return the probability for each category
 */
@Override
public double[] categorize(String[] text) {
  // No extra information available — forward an empty context map.
  return categorize(text, Collections.emptyMap());
}
/**
 * Returns a map with the score as a key, in ascending order.
 * The value is a Set of categories with that score; several categories
 * can share the same score, hence the Set as value.
 *
 * @param text the input text to classify
 * @return the sorted score map
 */
@Override
public SortedMap<Double, Set<String>> sortedScoreMap(String[] text) {
  double[] scores = categorize(text);
  int numCategories = getNumberOfCategories();

  // TreeMap iterates keys in natural (ascending) order.
  SortedMap<Double, Set<String>> sortedMap = new TreeMap<>();
  for (int i = 0; i < numCategories; i++) {
    String category = getCategory(i);
    double score = scores[getIndex(category)];
    // Group all categories that obtained the same score under one key.
    sortedMap.computeIfAbsent(score, s -> new HashSet<>()).add(category);
  }
  return sortedMap;
}
/**
 * Predicts a category for every sentence in the CAS and attaches a
 * prediction annotation carrying the best label and its score.
 *
 * @param aContext the recommender context holding the trained model
 * @param aCas the CAS to annotate
 * @throws RecommendationException if no model is present in the context
 */
@Override
public void predict(RecommenderContext aContext, CAS aCas) throws RecommendationException {
  DoccatModel model = aContext.get(KEY_MODEL).orElseThrow(
      () -> new RecommendationException("Key [" + KEY_MODEL + "] not found in context"));
  DocumentCategorizerME categorizer = new DocumentCategorizerME(model);

  Type sentenceType = getType(aCas, Sentence.class);
  Type tokenType = getType(aCas, Token.class);
  Type predictionType = getAnnotationType(aCas, PredictedSpan.class);
  Feature confidenceFeature = predictionType.getFeatureByBaseName("score");
  Feature labelFeature = predictionType.getFeatureByBaseName("label");

  for (AnnotationFS sentence : select(aCas, sentenceType)) {
    // Feed the sentence's covered tokens to the categorizer.
    List<AnnotationFS> tokenAnnotations = selectCovered(tokenType, sentence);
    String[] tokens = tokenAnnotations.stream()
        .map(AnnotationFS::getCoveredText)
        .toArray(String[]::new);

    double[] outcome = categorizer.categorize(tokens);
    String label = categorizer.getBestCategory(outcome);

    // One prediction annotation per sentence, scored with the best outcome.
    AnnotationFS prediction =
        aCas.createAnnotation(predictionType, sentence.getBegin(), sentence.getEnd());
    prediction.setDoubleValue(confidenceFeature, NumberUtils.max(outcome));
    prediction.setStringValue(labelFeature, label);
    aCas.addFsToIndexes(prediction);
  }
}
@Test public void testSimpleTraining() throws IOException { ObjectStream<DocumentSample> samples = ObjectStreamUtils.createObjectStream( new DocumentSample("1", new String[]{"a", "b", "c"}), new DocumentSample("1", new String[]{"a", "b", "c", "1", "2"}), new DocumentSample("1", new String[]{"a", "b", "c", "3", "4"}), new DocumentSample("0", new String[]{"x", "y", "z"}), new DocumentSample("0", new String[]{"x", "y", "z", "5", "6"}), new DocumentSample("0", new String[]{"x", "y", "z", "7", "8"})); TrainingParameters params = new TrainingParameters(); params.put(TrainingParameters.ITERATIONS_PARAM, 100); params.put(TrainingParameters.CUTOFF_PARAM, 0); DoccatModel model = DocumentCategorizerME.train("x-unspecified", samples, params, new DoccatFactory()); DocumentCategorizer doccat = new DocumentCategorizerME(model); double[] aProbs = doccat.categorize(new String[]{"a"}); Assert.assertEquals("1", doccat.getBestCategory(aProbs)); double[] bProbs = doccat.categorize(new String[]{"x"}); Assert.assertEquals("0", doccat.getBestCategory(bProbs)); //test to make sure sorted map's last key is cat 1 because it has the highest score. SortedMap<Double, Set<String>> sortedScoreMap = doccat.sortedScoreMap(new String[]{"a"}); Set<String> cat = sortedScoreMap.get(sortedScoreMap.lastKey()); Assert.assertEquals(1, cat.size()); }
/**
 * A single training sample is not enough data; training must fail with
 * {@link InsufficientTrainingDataException}.
 */
@Test(expected = InsufficientTrainingDataException.class)
public void insufficientTestData() throws IOException {
  ObjectStream<DocumentSample> samples = ObjectStreamUtils.createObjectStream(
      new DocumentSample("1", new String[]{"a", "b", "c"}));

  TrainingParameters params = new TrainingParameters();
  params.put(TrainingParameters.ITERATIONS_PARAM, 100);
  params.put(TrainingParameters.CUTOFF_PARAM, 0);

  DocumentCategorizerME.train("x-unspecified", samples, params, new DoccatFactory());
}
// Build a maximum-entropy document categorizer from the trained model.
DocumentCategorizerME doccat = new DocumentCategorizerME(model);
/**
 * Categorizes the given text; equivalent to calling the overload that
 * accepts extra context with an empty map.
 *
 * @param text the tokens to categorize
 * @return the probability for each category
 */
@Override
public double[] categorize(String[] text) {
  return categorize(text, Collections.emptyMap());
}
/**
 * Returns a map in which the key is the category name and the value is the score.
 *
 * @param text the input text to classify
 * @return the score map
 */
@Override
public Map<String, Double> scoreMap(String[] text) {
  double[] scores = categorize(text);

  Map<String, Double> probDist = new HashMap<>();
  for (int i = 0, n = getNumberOfCategories(); i < n; i++) {
    String category = getCategory(i);
    probDist.put(category, scores[getIndex(category)]);
  }
  return probDist;
}
// Tokenize the document on whitespace, score it against every category of the
// model, and keep the best-scoring category name.
DocumentCategorizerME documentCategorizerME = new DocumentCategorizerME(model);
String[] tokens = WhitespaceTokenizer.INSTANCE.tokenize(document);
double[] prob = documentCategorizerME.categorize(tokens);
String category = documentCategorizerME.getBestCategory(prob);
// Select the Naive Bayes trainer instead of the default algorithm.
params.put(AbstractTrainer.ALGORITHM_PARAM, NaiveBayesTrainer.NAIVE_BAYES_VALUE);
DoccatModel model = DocumentCategorizerME.train("x-unspecified", samples, params, new DoccatFactory());
DocumentCategorizer doccat = new DocumentCategorizerME(model);
@Override public void run(String format, String[] args) { super.run(format, args); mlParams = CmdLineUtil.loadTrainingParameters(params.getParams(), false); if (mlParams == null) { mlParams = ModelUtil.createDefaultTrainingParameters(); } File modelOutFile = params.getModel(); CmdLineUtil.checkOutputFile("document categorizer model", modelOutFile); FeatureGenerator[] featureGenerators = createFeatureGenerators(params .getFeatureGenerators()); DoccatModel model; try { DoccatFactory factory = DoccatFactory.create(params.getFactory(), featureGenerators); model = DocumentCategorizerME.train(params.getLang(), sampleStream, mlParams, factory); } catch (IOException e) { throw createTerminationIOException(e); } finally { try { sampleStream.close(); } catch (IOException e) { // sorry that this can fail } } CmdLineUtil.writeModel("document categorizer", modelOutFile, model); }