opennlp.tools.doccat.DocumentCategorizerME.train Java code examples

/**
 * Trains a document categorizer model on the stream returned by
 * {@code createSampleStream()} using the default training parameters and the
 * supplied factory.
 *
 * @param factory the factory handed to the trainer
 * @return the trained model
 * @throws IOException if creating the sample stream or training the model
 *           fails with an I/O error
 */
private static DoccatModel train(DoccatFactory factory) throws IOException {
 final DoccatModel trained = DocumentCategorizerME.train(
   "x-unspecified",
   createSampleStream(),
   TrainingParameters.defaultParams(),
   factory);
 return trained;
}

/**
 * Trains a document categorizer model on the stream returned by
 * {@code createSampleStream()} using the default training parameters and a
 * newly constructed {@link DoccatFactory}.
 *
 * @return the trained model
 * @throws IOException if creating the sample stream or training the model
 *           fails with an I/O error
 */
private static DoccatModel train() throws IOException {
 final DoccatModel trained = DocumentCategorizerME.train(
   "x-unspecified",
   createSampleStream(),
   TrainingParameters.defaultParams(),
   new DoccatFactory());
 return trained;
}

/**
 * Training on a stream that holds only a single sample is expected to end
 * with an {@link InsufficientTrainingDataException}.
 */
@Test(expected = InsufficientTrainingDataException.class)
public void insufficientTestData() throws IOException {
 TrainingParameters trainingSettings = new TrainingParameters();
 trainingSettings.put(TrainingParameters.ITERATIONS_PARAM, 100);
 trainingSettings.put(TrainingParameters.CUTOFF_PARAM, 0);

 // One lone sample for category "1".
 DocumentSample onlySample = new DocumentSample("1", new String[]{"a", "b", "c"});
 ObjectStream<DocumentSample> singleSampleStream =
   ObjectStreamUtils.createObjectStream(onlySample);

 DocumentCategorizerME.train("x-unspecified", singleSampleStream,
   trainingSettings, new DoccatFactory());
}

/**
 * Runs a cross validation over the given samples.
 * <p>
 * For every fold delivered by the partitioner a model is trained on the
 * fold's training stream and evaluated against the fold's test stream. The
 * accuracy of each fold is added to {@code documentAccuracy} together with
 * the number of evaluated documents.
 *
 * @param samples
 *          the data to train and test
 * @param nFolds
 *          number of folds
 *
 * @throws IOException
 *           if partitioning, training or evaluating fails with an I/O error
 */
public void evaluate(ObjectStream<DocumentSample> samples, int nFolds)
  throws IOException {
 final CrossValidationPartitioner<DocumentSample> folds =
   new CrossValidationPartitioner<>(samples, nFolds);

 while (folds.hasNext()) {
  final CrossValidationPartitioner.TrainingSampleStream<DocumentSample> trainingFold =
    folds.next();

  // Train on the training part of the current fold.
  final DoccatModel foldModel = DocumentCategorizerME.train(
    languageCode, trainingFold, params, factory);

  final DocumentCategorizerME categorizer = new DocumentCategorizerME(foldModel);
  final DocumentCategorizerEvaluator foldEvaluator =
    new DocumentCategorizerEvaluator(categorizer, listeners);

  // Evaluate on the held-out part of the same fold.
  foldEvaluator.evaluate(trainingFold.getTestSampleStream());

  documentAccuracy.add(foldEvaluator.getAccuracy(),
    foldEvaluator.getDocumentCount());
 }
}

/**
 * Trains a document categorizer model and writes it to the model file taken
 * from the tool parameters.
 * <p>
 * Training parameters are loaded via {@code CmdLineUtil}; when none are
 * returned the defaults from {@code ModelUtil} are used. The sample stream is
 * closed after training regardless of the outcome.
 *
 * @param format passed through to {@code super.run}
 * @param args passed through to {@code super.run}
 */
@Override
public void run(String format, String[] args) {
 super.run(format, args);

 mlParams = CmdLineUtil.loadTrainingParameters(params.getParams(), false);
 if (null == mlParams) {
  mlParams = ModelUtil.createDefaultTrainingParameters();
 }

 final File targetFile = params.getModel();
 CmdLineUtil.checkOutputFile("document categorizer model", targetFile);

 final FeatureGenerator[] generators =
   createFeatureGenerators(params.getFeatureGenerators());

 DoccatModel trainedModel;
 try {
  final DoccatFactory doccatFactory =
    DoccatFactory.create(params.getFactory(), generators);
  trainedModel = DocumentCategorizerME.train(
    params.getLang(), sampleStream, mlParams, doccatFactory);
 } catch (IOException trainingFailure) {
  throw createTerminationIOException(trainingFailure);
 } finally {
  try {
   sampleStream.close();
  } catch (IOException ignored) {
   // Best effort: a failure while closing the stream is deliberately ignored.
  }
 }

 CmdLineUtil.writeModel("document categorizer", targetFile, trainedModel);
}

/**
 * Trains on three samples for each of the categories "1" and "0" and checks
 * that the tokens "a" and "x" are assigned to the category whose samples
 * contain them.
 */
@Test
public void testSimpleTraining() throws IOException {
 ObjectStream<DocumentSample> trainingData = ObjectStreamUtils.createObjectStream(
   new DocumentSample("1", new String[]{"a", "b", "c"}),
   new DocumentSample("1", new String[]{"a", "b", "c", "1", "2"}),
   new DocumentSample("1", new String[]{"a", "b", "c", "3", "4"}),
   new DocumentSample("0", new String[]{"x", "y", "z"}),
   new DocumentSample("0", new String[]{"x", "y", "z", "5", "6"}),
   new DocumentSample("0", new String[]{"x", "y", "z", "7", "8"}));

 TrainingParameters trainingSettings = new TrainingParameters();
 trainingSettings.put(TrainingParameters.ITERATIONS_PARAM, 100);
 trainingSettings.put(TrainingParameters.CUTOFF_PARAM, 0);

 DoccatModel trainedModel = DocumentCategorizerME.train(
   "x-unspecified", trainingData, trainingSettings, new DoccatFactory());
 DocumentCategorizer categorizer = new DocumentCategorizerME(trainedModel);

 // "a" only occurs in category "1" samples, "x" only in category "0" samples.
 Assert.assertEquals("1",
   categorizer.getBestCategory(categorizer.categorize(new String[]{"a"})));
 Assert.assertEquals("0",
   categorizer.getBestCategory(categorizer.categorize(new String[]{"x"})));

 // The entry under the last key of the sorted score map must hold exactly one category.
 SortedMap<Double, Set<String>> scoresAscending =
   categorizer.sortedScoreMap(new String[]{"a"});
 Set<String> topCategories = scoresAscending.get(scoresAscending.lastKey());
 Assert.assertEquals(1, topCategories.size());
}

// Store the Naive Bayes value under the algorithm key of the training parameters.
params.put(AbstractTrainer.ALGORITHM_PARAM, NaiveBayesTrainer.NAIVE_BAYES_VALUE);
// Train a model from the samples with those parameters and a newly constructed factory.
DoccatModel model = DocumentCategorizerME.train("x-unspecified", samples,
  params, new DoccatFactory());

  /**
   * Trains an OpenNLP document categorizer model from the given samples.
   *
   * @param aSamples
   *          the samples, wrapped into a {@link DocumentSampleStream} for training
   * @param aParameters
   *          the training parameters handed to the trainer
   * @return the trained model
   * @throws RecommendationException
   *           if an {@link IOException} occurs during training; the original
   *           exception is kept as the cause
   */
  private DoccatModel train(List<DocumentSample> aSamples, TrainingParameters aParameters)
    throws RecommendationException
  {
    // The stream is closed automatically once training has finished or failed.
    try (DocumentSampleStream sampleStream = new DocumentSampleStream(aSamples)) {
      return DocumentCategorizerME.train("unknown", sampleStream, aParameters,
          new DoccatFactory());
    }
    catch (IOException cause) {
      throw new RecommendationException(
          "Exception during training the OpenNLP Document Categorizer model.", cause);
    }
  }
}

 /**
  * Trains a document categorizer from the files collected for {@code source}
  * and serializes the resulting model to {@code destination}.
  *
  * @param source
  *          path passed to {@code FileUtil.buildFileList} to collect the input files
  * @param destination
  *          path of the file the trained model is written to
  * @throws IOException
  *           if training or writing the model file fails with an I/O error
  */
 public void train(String source, String destination) throws IOException {
  //<start id="maxent.examples.train.setup"/>
  File[] inputFiles = FileUtil.buildFileList(new File(source));
  File modelFile = new File(destination);

  Tokenizer tokenizer = SimpleTokenizer.INSTANCE; //<co id="tm.tok"/>
  CategoryDataStream ds = new CategoryDataStream(inputFiles, tokenizer);

  int cutoff = 5;
  int iterations = 100;
  NameFinderFeatureGenerator nffg //<co id="tm.fg"/>
   = new NameFinderFeatureGenerator();
  BagOfWordsFeatureGenerator bowfg
   = new BagOfWordsFeatureGenerator();

  DoccatModel model = DocumentCategorizerME.train("en",
    ds, cutoff, iterations, nffg, bowfg); //<co id="tm.train"/>
  // Close the file stream in every case; the original left it open, leaking
  // the file handle and risking unflushed output.
  try (FileOutputStream modelOut = new FileOutputStream(modelFile)) {
   model.serialize(modelOut);
  }

/*<calloutlist>
<callout arearefs="tm.tok">Create data stream</callout>
<callout arearefs="tm.fg">Set up features generators</callout>
<callout arearefs="tm.train">Train categorizer</callout>
</calloutlist>*/
//<end id="maxent.examples.train.setup"/>
 }

/**
 * Trains a document categorizer model from {@code documentSamples} and
 * serializes it to a file named {@code mModelName} below the resource
 * manager's data path.
 *
 * @param trace the process trace; not read by this method
 * @throws ResourceProcessException declared by the signature
 * @throws IOException if training or serializing the model fails with an I/O error
 */
public void collectionProcessComplete(ProcessTrace trace)
  throws ResourceProcessException, IOException {

 // Presumably silences the GIS trainer's progress output - confirm against the GIS class.
 GIS.PRINT_MESSAGES = false;

 DoccatModel trainedModel = DocumentCategorizerME.train(
   language, ObjectStreamUtils.createObjectStream(documentSamples));

 String dataPath = getUimaContextAdmin().getResourceManager().getDataPath();
 File targetFile = new File(dataPath + File.separatorChar + mModelName);

 OpennlpUtil.serialize(trainedModel, targetFile);
}

// Set the cutoff parameter to 0 before training.
trainingParams.put(TrainingParameters.CUTOFF_PARAM, 0);
// Train an "en" model from the combined sample stream with a newly constructed factory.
DoccatModel doccatModel = DocumentCategorizerME.train("en", combinedDocumentSampleStream, trainingParams, new DoccatFactory());
// NOTE(review): this close() is skipped when train(...) throws; consider try-with-resources or finally.
combinedDocumentSampleStream.close();

/**
 * Runs a cross validation over the given samples.
 * <p>
 * For every fold delivered by the partitioner a model is trained on the
 * fold's training stream and evaluated against the fold's test stream. The
 * accuracy of each fold is added to {@code documentAccuracy} together with
 * the number of evaluated documents.
 *
 * @param samples
 *          the data to train and test
 * @param nFolds
 *          number of folds
 *
 * @throws IOException
 *           if partitioning, training or evaluating fails with an I/O error
 */
public void evaluate(ObjectStream<DocumentSample> samples, int nFolds)
  throws IOException {
 final CrossValidationPartitioner<DocumentSample> folds =
   new CrossValidationPartitioner<>(samples, nFolds);

 while (folds.hasNext()) {
  final CrossValidationPartitioner.TrainingSampleStream<DocumentSample> trainingFold =
    folds.next();

  // Train on the training part of the current fold.
  final DoccatModel foldModel = DocumentCategorizerME.train(
    languageCode, trainingFold, params, factory);

  final DocumentCategorizerME categorizer = new DocumentCategorizerME(foldModel);
  final DocumentCategorizerEvaluator foldEvaluator =
    new DocumentCategorizerEvaluator(categorizer, listeners);

  // Evaluate on the held-out part of the same fold.
  foldEvaluator.evaluate(trainingFold.getTestSampleStream());

  documentAccuracy.add(foldEvaluator.getAccuracy(),
    foldEvaluator.getDocumentCount());
 }
}

/**
 * Runs a cross validation over the given samples.
 * <p>
 * For every fold delivered by the partitioner a model is trained on the
 * fold's training stream and evaluated against the fold's test stream. The
 * accuracy of each fold is added to {@code documentAccuracy} together with
 * the number of evaluated documents.
 *
 * @param samples
 *          the data to train and test
 * @param nFolds
 *          number of folds
 *
 * @throws IOException
 *           if partitioning, training or evaluating fails with an I/O error
 */
public void evaluate(ObjectStream<DocumentSample> samples, int nFolds)
  throws IOException {
 final CrossValidationPartitioner<DocumentSample> folds =
   new CrossValidationPartitioner<>(samples, nFolds);

 while (folds.hasNext()) {
  final CrossValidationPartitioner.TrainingSampleStream<DocumentSample> trainingFold =
    folds.next();

  // Train on the training part of the current fold.
  final DoccatModel foldModel = DocumentCategorizerME.train(
    languageCode, trainingFold, params, factory);

  final DocumentCategorizerME categorizer = new DocumentCategorizerME(foldModel);
  final DocumentCategorizerEvaluator foldEvaluator =
    new DocumentCategorizerEvaluator(categorizer, listeners);

  // Evaluate on the held-out part of the same fold.
  foldEvaluator.evaluate(trainingFold.getTestSampleStream());

  documentAccuracy.add(foldEvaluator.getAccuracy(),
    foldEvaluator.getDocumentCount());
 }
}

/**
 * Trains a document categorizer model and writes it to the model file taken
 * from the tool parameters.
 * <p>
 * Training parameters are loaded via {@code CmdLineUtil}; when none are
 * returned the defaults from {@code ModelUtil} are used. The sample stream is
 * closed after training regardless of the outcome.
 *
 * @param format passed through to {@code super.run}
 * @param args passed through to {@code super.run}
 */
@Override
public void run(String format, String[] args) {
 super.run(format, args);

 mlParams = CmdLineUtil.loadTrainingParameters(params.getParams(), false);
 if (null == mlParams) {
  mlParams = ModelUtil.createDefaultTrainingParameters();
 }

 final File targetFile = params.getModel();
 CmdLineUtil.checkOutputFile("document categorizer model", targetFile);

 final FeatureGenerator[] generators =
   createFeatureGenerators(params.getFeatureGenerators());

 DoccatModel trainedModel;
 try {
  final DoccatFactory doccatFactory =
    DoccatFactory.create(params.getFactory(), generators);
  trainedModel = DocumentCategorizerME.train(
    params.getLang(), sampleStream, mlParams, doccatFactory);
 } catch (IOException trainingFailure) {
  throw createTerminationIOException(trainingFailure);
 } finally {
  try {
   sampleStream.close();
  } catch (IOException ignored) {
   // Best effort: a failure while closing the stream is deliberately ignored.
  }
 }

 CmdLineUtil.writeModel("document categorizer", targetFile, trainedModel);
}

/**
 * Trains a document categorizer model and writes it to the model file taken
 * from the tool parameters.
 * <p>
 * Training parameters are loaded via {@code CmdLineUtil}; when none are
 * returned the defaults from {@code ModelUtil} are used. The sample stream is
 * closed after training regardless of the outcome.
 *
 * @param format passed through to {@code super.run}
 * @param args passed through to {@code super.run}
 */
@Override
public void run(String format, String[] args) {
 super.run(format, args);

 mlParams = CmdLineUtil.loadTrainingParameters(params.getParams(), false);
 if (null == mlParams) {
  mlParams = ModelUtil.createDefaultTrainingParameters();
 }

 final File targetFile = params.getModel();
 CmdLineUtil.checkOutputFile("document categorizer model", targetFile);

 final FeatureGenerator[] generators =
   createFeatureGenerators(params.getFeatureGenerators());

 DoccatModel trainedModel;
 try {
  final DoccatFactory doccatFactory =
    DoccatFactory.create(params.getFactory(), generators);
  trainedModel = DocumentCategorizerME.train(
    params.getLang(), sampleStream, mlParams, doccatFactory);
 } catch (IOException trainingFailure) {
  throw createTerminationIOException(trainingFailure);
 } finally {
  try {
   sampleStream.close();
  } catch (IOException ignored) {
   // Best effort: a failure while closing the stream is deliberately ignored.
  }
 }

 CmdLineUtil.writeModel("document categorizer", targetFile, trainedModel);
}

Popular in Java

Making http requests using okhttp
onRequestPermissionsResult (Fragment)
findViewById (Activity)
compareTo (BigDecimal)
InputStreamReader (java.io)
A class for turning a byte stream into a character stream. Data read from the source input stream is
SecureRandom (java.security)
This class generates cryptographically secure pseudo-random numbers. It is best to invoke SecureRand
Connection (java.sql)
A connection represents a link from a Java application to a database. All SQL statements and results
Time (java.sql)
Java representation of an SQL TIME value. Provides utilities to format and parse the time's represen
Arrays (java.util)
This class contains various methods for manipulating arrays (such as sorting and searching). This cl
Scanner (java.util)
A parser that parses a text string of primitive types and strings with the help of regular expressio
Top PhpStorm plugins

How to use the train method in opennlp.tools.doccat.DocumentCategorizerME

Best Java code snippets using opennlp.tools.doccat.DocumentCategorizerME.train (Showing top 15 results out of 315)

How to use the train method in opennlp.tools.doccat.DocumentCategorizerME