opennlp.tools.doccat.DocumentCategorizer Java code examples

 /**
  * Categorizes the document held by the given CAS.
  *
  * The covered text of every annotation in the {@code mTokenType} index is
  * collected, the resulting tokens are passed to the categorizer, and the
  * best category is stored via {@code setBestCategory}.
  *
  * @param cas the CAS whose token annotations are categorized
  */
 public void process(CAS cas) {
  List<String> tokenTexts = new ArrayList<>();

  for (FSIterator<AnnotationFS> it = cas.getAnnotationIndex(mTokenType).iterator();
    it.hasNext();) {
   tokenTexts.add(it.next().getCoveredText());
  }

  String[] tokens = tokenTexts.toArray(new String[0]);
  double[] outcomes = mCategorizer.categorize(tokens);

  setBestCategory(cas, mCategorizer.getBestCategory(outcomes));
 }
}

/**
 * Trains a small two-category model and verifies its predictions.
 *
 * Category "1" is trained on documents containing a/b/c, category "0" on
 * documents containing x/y/z. The test then checks that a single-token
 * document is assigned to the category whose samples contained that token,
 * and that the highest-scoring entry of the sorted score map is exactly
 * category "1" for the token "a".
 *
 * @throws IOException declared because training may propagate it
 */
@Test
public void testSimpleTraining() throws IOException {
 ObjectStream<DocumentSample> samples = ObjectStreamUtils.createObjectStream(
   new DocumentSample("1", new String[]{"a", "b", "c"}),
   new DocumentSample("1", new String[]{"a", "b", "c", "1", "2"}),
   new DocumentSample("1", new String[]{"a", "b", "c", "3", "4"}),
   new DocumentSample("0", new String[]{"x", "y", "z"}),
   new DocumentSample("0", new String[]{"x", "y", "z", "5", "6"}),
   new DocumentSample("0", new String[]{"x", "y", "z", "7", "8"}));

 // Fixed training setup: 100 iterations, cutoff 0.
 TrainingParameters params = new TrainingParameters();
 params.put(TrainingParameters.ITERATIONS_PARAM, 100);
 params.put(TrainingParameters.CUTOFF_PARAM, 0);

 DoccatModel model = DocumentCategorizerME.train("x-unspecified", samples,
     params, new DoccatFactory());
 DocumentCategorizer doccat = new DocumentCategorizerME(model);

 double[] aProbs = doccat.categorize(new String[]{"a"});
 Assert.assertEquals("1", doccat.getBestCategory(aProbs));

 double[] bProbs = doccat.categorize(new String[]{"x"});
 Assert.assertEquals("0", doccat.getBestCategory(bProbs));

 //test to make sure sorted map's last key is cat 1 because it has the highest score.
 SortedMap<Double, Set<String>> sortedScoreMap = doccat.sortedScoreMap(new String[]{"a"});
 Set<String> cat = sortedScoreMap.get(sortedScoreMap.lastKey());
 Assert.assertEquals(1, cat.size());
 // The size check alone would also pass for the wrong category, so pin the
 // category itself.
 Assert.assertTrue(cat.contains("1"));
}

// Tokenize the document text, obtain the outcome probabilities for the
// tokens, and pick the best category from them.
String[] tokens  = tokenizer.tokenize(docText); 
double[] probs   = categorizer.categorize(tokens); //<co id="tmt.categorize"/>
String label     = categorizer.getBestCategory(probs);
// Look up the best category's index so that its entry in probs can be read.
int    bestIndex = categorizer.getIndex(label);
double score     = probs[bestIndex];

Tokenizer tokenizer = SimpleTokenizer.INSTANCE;
// Collect every category the categorizer reports, in index order.
int catCount = categorizer.getNumberOfCategories();
Collection<String> categories 
 = new ArrayList<String>(catCount);
for (int i=0; i < catCount; i++) {
 categories.add(categorizer.getCategory(i));
 // NOTE(review): the loop's closing brace is not part of this excerpt.

/**
 * Evaluates the given reference {@link DocumentSample} object.
 *
 * This is done by categorizing the document from the provided
 * {@link DocumentSample}. The detected category is then used
 * to calculate and update the score.
 *
 * @param sample the reference {@link DocumentSample}.
 *
 * @return a new {@link DocumentSample} holding the detected category
 *     and the text of the reference sample.
 */
public DocumentSample processSample(DocumentSample sample) {
 String[] document = sample.getText();
 double[] probs = categorizer.categorize(document);
 String cat = categorizer.getBestCategory(probs);

 // Record 1 for a correct prediction and 0 for an incorrect one.
 boolean correct = sample.getCategory().equals(cat);
 accuracy.add(correct ? 1 : 0);

 return new DocumentSample(cat, sample.getText());
}

// A document consisting of the token "a" must be categorized as "1".
double[] aProbs = doccat.categorize(new String[]{"a"});
Assert.assertEquals("1", doccat.getBestCategory(aProbs));
// A document consisting of the token "x" must be categorized as "0".
double[] bProbs = doccat.categorize(new String[]{"x"});
Assert.assertEquals("0", doccat.getBestCategory(bProbs));
// The entry under the map's last key must hold exactly one category.
SortedMap<Double, Set<String>> sortedScoreMap = doccat.sortedScoreMap(new String[]{"a"});
Set<String> cat = sortedScoreMap.get(sortedScoreMap.lastKey());
Assert.assertEquals(1, cat.size());

 /**
  * Categorizes the document held by the given CAS.
  *
  * The covered text of every annotation in the {@code mTokenType} index is
  * collected, the resulting tokens are passed to the categorizer, and the
  * best category is stored via {@code setBestCategory}.
  *
  * @param cas the CAS whose token annotations are categorized
  */
 public void process(CAS cas) {
  List<String> tokenTexts = new ArrayList<>();

  for (FSIterator<AnnotationFS> it = cas.getAnnotationIndex(mTokenType).iterator();
    it.hasNext();) {
   tokenTexts.add(it.next().getCoveredText());
  }

  String[] tokens = tokenTexts.toArray(new String[0]);
  double[] outcomes = mCategorizer.categorize(tokens);

  setBestCategory(cas, mCategorizer.getBestCategory(outcomes));
 }
}

/**
 * Evaluates the given reference {@link DocumentSample} object.
 *
 * This is done by categorizing the document from the provided
 * {@link DocumentSample}. The detected category is then used
 * to calculate and update the score.
 *
 * @param sample the reference {@link DocumentSample}.
 *
 * @return a new {@link DocumentSample} holding the detected category
 *     and the text of the reference sample.
 */
public DocumentSample processSample(DocumentSample sample) {
 String[] document = sample.getText();
 double[] probs = categorizer.categorize(document);
 String cat = categorizer.getBestCategory(probs);

 // Record 1 for a correct prediction and 0 for an incorrect one.
 boolean correct = sample.getCategory().equals(cat);
 accuracy.add(correct ? 1 : 0);

 return new DocumentSample(cat, sample.getText());
}

/**
 * Evaluates the given reference {@link DocumentSample} object.
 *
 * This is done by categorizing the document from the provided
 * {@link DocumentSample}. The detected category is then used
 * to calculate and update the score.
 *
 * @param sample the reference {@link DocumentSample}.
 *
 * @return a new {@link DocumentSample} holding the detected category
 *     and the text of the reference sample.
 */
public DocumentSample processSample(DocumentSample sample) {
 String[] document = sample.getText();
 double[] probs = categorizer.categorize(document);
 String cat = categorizer.getBestCategory(probs);

 // Record 1 for a correct prediction and 0 for an incorrect one.
 boolean correct = sample.getCategory().equals(cat);
 accuracy.add(correct ? 1 : 0);

 return new DocumentSample(cat, sample.getText());
}

Javadoc

Interface for classes which categorize documents.

Most used methods

categorize
Categorize the given text provided as tokens along with the provided extra information
getBestCategory
get the best category from previously generated outcome probabilities
getCategory
get the category at a given index
getIndex
get the index of a certain category
getNumberOfCategories
get the number of categories
sortedScoreMap
Get a map of the scores sorted in ascending order together with their associated categories. Many c

Popular in Java

Making http requests using okhttp
addToBackStack (FragmentTransaction)
setRequestProperty (URLConnection)
requestLocationUpdates (LocationManager)
Runnable (java.lang)
Represents a command that can be executed. Often used to run code in a different Thread.
Stack (java.util)
Stack is a Last-In/First-Out(LIFO) data structure which represents a stack of objects. It enables u
BlockingQueue (java.util.concurrent)
A java.util.Queue that additionally supports operations that wait for the queue to become non-empty
Base64 (org.apache.commons.codec.binary)
Provides Base64 encoding and decoding as defined by RFC 2045. This class implements section 6.8. Base
Reference (javax.naming)
Response (javax.ws.rs.core)
Defines the contract between a returned instance and the runtime when an application needs to provid
From CI to AI: The AI layer in your organization

How to use DocumentCategorizer in opennlp.tools.doccat

Best Java code snippets using opennlp.tools.doccat.DocumentCategorizer (Showing top 9 results out of 315)

How to use
DocumentCategorizer
in
opennlp.tools.doccat