/**
 * Trains a document categorizer model on the stream returned by
 * {@code createSampleStream()}, using the default training parameters and
 * the supplied factory.
 *
 * @param factory the factory handed to the trainer
 * @return the trained model
 * @throws IOException propagated from creating the sample stream or from training
 */
private static DoccatModel train(DoccatFactory factory) throws IOException {
  final String languageCode = "x-unspecified";
  return DocumentCategorizerME.train(
      languageCode,
      createSampleStream(),
      TrainingParameters.defaultParams(),
      factory);
}
/**
 * Trains a document categorizer model on the stream returned by
 * {@code createSampleStream()}, using the default training parameters and a
 * freshly created {@code DoccatFactory}.
 *
 * @return the trained model
 * @throws IOException propagated from creating the sample stream or from training
 */
private static DoccatModel train() throws IOException {
  final String languageCode = "x-unspecified";
  return DocumentCategorizerME.train(
      languageCode,
      createSampleStream(),
      TrainingParameters.defaultParams(),
      new DoccatFactory());
}
/**
 * Training on a stream holding a single sample is expected to fail with an
 * {@code InsufficientTrainingDataException}.
 */
@Test(expected = InsufficientTrainingDataException.class)
public void insufficientTestData() throws IOException {
  TrainingParameters trainingParameters = new TrainingParameters();
  trainingParameters.put(TrainingParameters.ITERATIONS_PARAM, 100);
  trainingParameters.put(TrainingParameters.CUTOFF_PARAM, 0);

  // One lone sample for category "1" is all the trainer gets to see.
  ObjectStream<DocumentSample> singleSampleStream =
      ObjectStreamUtils.createObjectStream(
          new DocumentSample("1", new String[]{"a", "b", "c"}));

  DocumentCategorizerME.train(
      "x-unspecified", singleSampleStream, trainingParameters, new DoccatFactory());
}
/**
 * Runs a cross validation over the given samples.
 *
 * <p>For every partition handed out by the partitioner a model is trained on
 * the training stream and evaluated against that partition's test stream. The
 * evaluator's accuracy and document count are then added to
 * {@code documentAccuracy}.
 *
 * @param samples
 *          the data to train and test
 * @param nFolds
 *          number of folds
 *
 * @throws IOException propagated from training or evaluation
 */
public void evaluate(ObjectStream<DocumentSample> samples, int nFolds) throws IOException {
  CrossValidationPartitioner<DocumentSample> folds =
      new CrossValidationPartitioner<>(samples, nFolds);

  while (folds.hasNext()) {
    CrossValidationPartitioner.TrainingSampleStream<DocumentSample> fold = folds.next();

    // Train on this fold's training portion.
    DoccatModel foldModel = DocumentCategorizerME.train(languageCode, fold, params, factory);
    DocumentCategorizerME categorizer = new DocumentCategorizerME(foldModel);

    // Evaluate against the held-out portion of the same fold.
    DocumentCategorizerEvaluator foldEvaluator =
        new DocumentCategorizerEvaluator(categorizer, listeners);
    foldEvaluator.evaluate(fold.getTestSampleStream());

    documentAccuracy.add(foldEvaluator.getAccuracy(), foldEvaluator.getDocumentCount());
  }
}
@Override public void run(String format, String[] args) { super.run(format, args); mlParams = CmdLineUtil.loadTrainingParameters(params.getParams(), false); if (mlParams == null) { mlParams = ModelUtil.createDefaultTrainingParameters(); } File modelOutFile = params.getModel(); CmdLineUtil.checkOutputFile("document categorizer model", modelOutFile); FeatureGenerator[] featureGenerators = createFeatureGenerators(params .getFeatureGenerators()); DoccatModel model; try { DoccatFactory factory = DoccatFactory.create(params.getFactory(), featureGenerators); model = DocumentCategorizerME.train(params.getLang(), sampleStream, mlParams, factory); } catch (IOException e) { throw createTerminationIOException(e); } finally { try { sampleStream.close(); } catch (IOException e) { // sorry that this can fail } } CmdLineUtil.writeModel("document categorizer", modelOutFile, model); }
@Test public void testSimpleTraining() throws IOException { ObjectStream<DocumentSample> samples = ObjectStreamUtils.createObjectStream( new DocumentSample("1", new String[]{"a", "b", "c"}), new DocumentSample("1", new String[]{"a", "b", "c", "1", "2"}), new DocumentSample("1", new String[]{"a", "b", "c", "3", "4"}), new DocumentSample("0", new String[]{"x", "y", "z"}), new DocumentSample("0", new String[]{"x", "y", "z", "5", "6"}), new DocumentSample("0", new String[]{"x", "y", "z", "7", "8"})); TrainingParameters params = new TrainingParameters(); params.put(TrainingParameters.ITERATIONS_PARAM, 100); params.put(TrainingParameters.CUTOFF_PARAM, 0); DoccatModel model = DocumentCategorizerME.train("x-unspecified", samples, params, new DoccatFactory()); DocumentCategorizer doccat = new DocumentCategorizerME(model); double[] aProbs = doccat.categorize(new String[]{"a"}); Assert.assertEquals("1", doccat.getBestCategory(aProbs)); double[] bProbs = doccat.categorize(new String[]{"x"}); Assert.assertEquals("0", doccat.getBestCategory(bProbs)); //test to make sure sorted map's last key is cat 1 because it has the highest score. SortedMap<Double, Set<String>> sortedScoreMap = doccat.sortedScoreMap(new String[]{"a"}); Set<String> cat = sortedScoreMap.get(sortedScoreMap.lastKey()); Assert.assertEquals(1, cat.size()); }
// Set the algorithm parameter to the Naive Bayes value, then train a model
// from the samples using a newly created DoccatFactory.
params.put(AbstractTrainer.ALGORITHM_PARAM, NaiveBayesTrainer.NAIVE_BAYES_VALUE); DoccatModel model = DocumentCategorizerME.train("x-unspecified", samples, params, new DoccatFactory());
/**
 * Trains an OpenNLP document categorizer model from the given samples.
 *
 * <p>The samples are wrapped in a {@code DocumentSampleStream} that is closed
 * again when training ends.
 *
 * @param aSamples the samples to train on
 * @param aParameters the training parameters handed to the trainer
 * @return the trained model
 * @throws RecommendationException if an {@code IOException} occurs during training
 */
private DoccatModel train(List<DocumentSample> aSamples, TrainingParameters aParameters)
    throws RecommendationException
{
    try (DocumentSampleStream sampleStream = new DocumentSampleStream(aSamples)) {
        return DocumentCategorizerME.train(
                "unknown", sampleStream, aParameters, new DoccatFactory());
    }
    catch (IOException trainingFailure) {
        throw new RecommendationException(
                "Exception during training the OpenNLP Document Categorizer model.",
                trainingFailure);
    }
}
}
public void train(String source, String destination) throws IOException { //<start id="maxent.examples.train.setup"/> File[] inputFiles = FileUtil.buildFileList(new File(source)); File modelFile = new File(destination); Tokenizer tokenizer = SimpleTokenizer.INSTANCE; //<co id="tm.tok"/> CategoryDataStream ds = new CategoryDataStream(inputFiles, tokenizer); int cutoff = 5; int iterations = 100; NameFinderFeatureGenerator nffg //<co id="tm.fg"/> = new NameFinderFeatureGenerator(); BagOfWordsFeatureGenerator bowfg = new BagOfWordsFeatureGenerator(); DoccatModel model = DocumentCategorizerME.train("en", ds, cutoff, iterations, nffg, bowfg); //<co id="tm.train"/> model.serialize(new FileOutputStream(modelFile)); /*<calloutlist> <callout arearefs="tm.tok">Create data stream</callout> <callout arearefs="tm.fg">Set up features generators</callout> <callout arearefs="tm.train">Train categorizer</callout> </calloutlist>*/ //<end id="maxent.examples.train.setup"/> }
/**
 * Trains a document categorizer model from {@code documentSamples} and
 * serializes it to a file named {@code mModelName} below the resource
 * manager's data path.
 *
 * @param trace the process trace; not read by this method
 * @throws ResourceProcessException declared by the signature; not thrown directly here
 * @throws IOException propagated from training or from serializing the model
 */
public void collectionProcessComplete(ProcessTrace trace)
    throws ResourceProcessException, IOException {

  GIS.PRINT_MESSAGES = false;

  DoccatModel categoryModel = DocumentCategorizerME.train(
      language, ObjectStreamUtils.createObjectStream(documentSamples));

  // Target file: <data path> + separator + model name.
  String dataPath = getUimaContextAdmin().getResourceManager().getDataPath();
  File modelFile = new File(dataPath + File.separatorChar + mModelName);

  OpennlpUtil.serialize(categoryModel, modelFile);
}
trainingParams.put(TrainingParameters.CUTOFF_PARAM, 0);
// Train inside try/finally so the sample stream is closed even when training
// throws; previously close() was skipped on failure.
DoccatModel doccatModel;
try {
  doccatModel = DocumentCategorizerME.train("en", combinedDocumentSampleStream,
      trainingParams, new DoccatFactory());
} finally {
  combinedDocumentSampleStream.close();
}
/**
 * Runs a cross validation over the given samples.
 *
 * <p>For every partition handed out by the partitioner a model is trained on
 * the training stream and evaluated against that partition's test stream. The
 * evaluator's accuracy and document count are then added to
 * {@code documentAccuracy}.
 *
 * @param samples
 *          the data to train and test
 * @param nFolds
 *          number of folds
 *
 * @throws IOException propagated from training or evaluation
 */
public void evaluate(ObjectStream<DocumentSample> samples, int nFolds) throws IOException {
  CrossValidationPartitioner<DocumentSample> folds =
      new CrossValidationPartitioner<>(samples, nFolds);

  while (folds.hasNext()) {
    CrossValidationPartitioner.TrainingSampleStream<DocumentSample> fold = folds.next();

    // Train on this fold's training portion.
    DoccatModel foldModel = DocumentCategorizerME.train(languageCode, fold, params, factory);
    DocumentCategorizerME categorizer = new DocumentCategorizerME(foldModel);

    // Evaluate against the held-out portion of the same fold.
    DocumentCategorizerEvaluator foldEvaluator =
        new DocumentCategorizerEvaluator(categorizer, listeners);
    foldEvaluator.evaluate(fold.getTestSampleStream());

    documentAccuracy.add(foldEvaluator.getAccuracy(), foldEvaluator.getDocumentCount());
  }
}
/**
 * Runs a cross validation over the given samples.
 *
 * <p>For every partition handed out by the partitioner a model is trained on
 * the training stream and evaluated against that partition's test stream. The
 * evaluator's accuracy and document count are then added to
 * {@code documentAccuracy}.
 *
 * @param samples
 *          the data to train and test
 * @param nFolds
 *          number of folds
 *
 * @throws IOException propagated from training or evaluation
 */
public void evaluate(ObjectStream<DocumentSample> samples, int nFolds) throws IOException {
  CrossValidationPartitioner<DocumentSample> folds =
      new CrossValidationPartitioner<>(samples, nFolds);

  while (folds.hasNext()) {
    CrossValidationPartitioner.TrainingSampleStream<DocumentSample> fold = folds.next();

    // Train on this fold's training portion.
    DoccatModel foldModel = DocumentCategorizerME.train(languageCode, fold, params, factory);
    DocumentCategorizerME categorizer = new DocumentCategorizerME(foldModel);

    // Evaluate against the held-out portion of the same fold.
    DocumentCategorizerEvaluator foldEvaluator =
        new DocumentCategorizerEvaluator(categorizer, listeners);
    foldEvaluator.evaluate(fold.getTestSampleStream());

    documentAccuracy.add(foldEvaluator.getAccuracy(), foldEvaluator.getDocumentCount());
  }
}
@Override public void run(String format, String[] args) { super.run(format, args); mlParams = CmdLineUtil.loadTrainingParameters(params.getParams(), false); if (mlParams == null) { mlParams = ModelUtil.createDefaultTrainingParameters(); } File modelOutFile = params.getModel(); CmdLineUtil.checkOutputFile("document categorizer model", modelOutFile); FeatureGenerator[] featureGenerators = createFeatureGenerators(params .getFeatureGenerators()); DoccatModel model; try { DoccatFactory factory = DoccatFactory.create(params.getFactory(), featureGenerators); model = DocumentCategorizerME.train(params.getLang(), sampleStream, mlParams, factory); } catch (IOException e) { throw createTerminationIOException(e); } finally { try { sampleStream.close(); } catch (IOException e) { // sorry that this can fail } } CmdLineUtil.writeModel("document categorizer", modelOutFile, model); }
@Override public void run(String format, String[] args) { super.run(format, args); mlParams = CmdLineUtil.loadTrainingParameters(params.getParams(), false); if (mlParams == null) { mlParams = ModelUtil.createDefaultTrainingParameters(); } File modelOutFile = params.getModel(); CmdLineUtil.checkOutputFile("document categorizer model", modelOutFile); FeatureGenerator[] featureGenerators = createFeatureGenerators(params .getFeatureGenerators()); DoccatModel model; try { DoccatFactory factory = DoccatFactory.create(params.getFactory(), featureGenerators); model = DocumentCategorizerME.train(params.getLang(), sampleStream, mlParams, factory); } catch (IOException e) { throw createTerminationIOException(e); } finally { try { sampleStream.close(); } catch (IOException e) { // sorry that this can fail } } CmdLineUtil.writeModel("document categorizer", modelOutFile, model); }