public static DoccatFactory create(String subclassName, FeatureGenerator[] featureGenerators) throws InvalidFormatException { if (subclassName == null) { // will create the default factory return new DoccatFactory(featureGenerators); } try { DoccatFactory theFactory = ExtensionLoader.instantiateExtension( DoccatFactory.class, subclassName); theFactory.init(featureGenerators); return theFactory; } catch (Exception e) { String msg = "Could not instantiate the " + subclassName + ". The initialization throw an exception."; System.err.println(msg); e.printStackTrace(); throw new InvalidFormatException(msg, e); } }
/**
 * Trains a document categorizer model on the stream supplied by
 * {@code createSampleStream()}, using the default training parameters and a
 * default {@link DoccatFactory}.
 *
 * @return the model produced by {@code DocumentCategorizerME.train}
 * @throws IOException propagated from sample stream creation or training
 */
private static DoccatModel train() throws IOException {
  TrainingParameters defaultSettings = TrainingParameters.defaultParams();
  DoccatFactory defaultFactory = new DoccatFactory();
  return DocumentCategorizerME.train(
      "x-unspecified", createSampleStream(), defaultSettings, defaultFactory);
}
/**
 * Attempts to train on a stream holding one single sample (100 iterations, cutoff 0)
 * and expects the run to end with an {@link InsufficientTrainingDataException}.
 */
@Test(expected = InsufficientTrainingDataException.class)
public void insufficientTestData() throws IOException {
  DocumentSample onlySample = new DocumentSample("1", new String[]{"a", "b", "c"});
  ObjectStream<DocumentSample> singleSampleStream =
      ObjectStreamUtils.createObjectStream(onlySample);

  TrainingParameters trainingSettings = new TrainingParameters();
  trainingSettings.put(TrainingParameters.ITERATIONS_PARAM, 100);
  trainingSettings.put(TrainingParameters.CUTOFF_PARAM, 0);

  DoccatFactory defaultFactory = new DoccatFactory();
  DocumentCategorizerME.train(
      "x-unspecified", singleSampleStream, trainingSettings, defaultFactory);
}
/**
 * Trains a model with a factory configured with three custom feature generators,
 * round-trips the model through serialization, and checks that the factory of the
 * reloaded model reports the same three generator classes in the same order.
 */
@Test
public void testCustom() throws IOException {
  FeatureGenerator[] configuredGenerators = new FeatureGenerator[] {
      new BagOfWordsFeatureGenerator(),
      new NGramFeatureGenerator(),
      new NGramFeatureGenerator(2,3)
  };

  DoccatModel trainedModel = train(new DoccatFactory(configuredGenerators));
  Assert.assertNotNull(trainedModel);

  // Serialize into memory and read the model straight back.
  ByteArrayOutputStream buffer = new ByteArrayOutputStream();
  trainedModel.serialize(buffer);
  byte[] serializedBytes = buffer.toByteArray();
  DoccatModel reloadedModel = new DoccatModel(new ByteArrayInputStream(serializedBytes));

  DoccatFactory restoredFactory = reloadedModel.getFactory();
  Assert.assertNotNull(restoredFactory);

  Assert.assertEquals(3, restoredFactory.getFeatureGenerators().length);
  Assert.assertEquals(BagOfWordsFeatureGenerator.class,
      restoredFactory.getFeatureGenerators()[0].getClass());
  Assert.assertEquals(NGramFeatureGenerator.class,
      restoredFactory.getFeatureGenerators()[1].getClass());
  Assert.assertEquals(NGramFeatureGenerator.class,
      restoredFactory.getFeatureGenerators()[2].getClass());
}
@Test public void testSimpleTraining() throws IOException { ObjectStream<DocumentSample> samples = ObjectStreamUtils.createObjectStream( new DocumentSample("1", new String[]{"a", "b", "c"}), new DocumentSample("1", new String[]{"a", "b", "c", "1", "2"}), new DocumentSample("1", new String[]{"a", "b", "c", "3", "4"}), new DocumentSample("0", new String[]{"x", "y", "z"}), new DocumentSample("0", new String[]{"x", "y", "z", "5", "6"}), new DocumentSample("0", new String[]{"x", "y", "z", "7", "8"})); TrainingParameters params = new TrainingParameters(); params.put(TrainingParameters.ITERATIONS_PARAM, 100); params.put(TrainingParameters.CUTOFF_PARAM, 0); DoccatModel model = DocumentCategorizerME.train("x-unspecified", samples, params, new DoccatFactory()); DocumentCategorizer doccat = new DocumentCategorizerME(model); double[] aProbs = doccat.categorize(new String[]{"a"}); Assert.assertEquals("1", doccat.getBestCategory(aProbs)); double[] bProbs = doccat.categorize(new String[]{"x"}); Assert.assertEquals("0", doccat.getBestCategory(bProbs)); //test to make sure sorted map's last key is cat 1 because it has the highest score. SortedMap<Double, Set<String>> sortedScoreMap = doccat.sortedScoreMap(new String[]{"a"}); Set<String> cat = sortedScoreMap.get(sortedScoreMap.lastKey()); Assert.assertEquals(1, cat.size()); }
params, new DoccatFactory());
public static DoccatFactory create(String subclassName, FeatureGenerator[] featureGenerators) throws InvalidFormatException { if (subclassName == null) { // will create the default factory return new DoccatFactory(featureGenerators); } try { DoccatFactory theFactory = ExtensionLoader.instantiateExtension( DoccatFactory.class, subclassName); theFactory.init(featureGenerators); return theFactory; } catch (Exception e) { String msg = "Could not instantiate the " + subclassName + ". The initialization throw an exception."; System.err.println(msg); e.printStackTrace(); throw new InvalidFormatException(msg, e); } }
public static DoccatFactory create(String subclassName, FeatureGenerator[] featureGenerators) throws InvalidFormatException { if (subclassName == null) { // will create the default factory return new DoccatFactory(featureGenerators); } try { DoccatFactory theFactory = ExtensionLoader.instantiateExtension( DoccatFactory.class, subclassName); theFactory.init(featureGenerators); return theFactory; } catch (Exception e) { String msg = "Could not instantiate the " + subclassName + ". The initialization throw an exception."; System.err.println(msg); e.printStackTrace(); throw new InvalidFormatException(msg, e); } }
/**
 * Trains an OpenNLP document categorizer model from the given samples.
 *
 * @param aSamples the samples wrapped into a {@code DocumentSampleStream} for training
 * @param aParameters the training parameters handed to {@code DocumentCategorizerME.train}
 * @return the trained model
 * @throws RecommendationException if training (or closing the sample stream) raises an
 *     {@link IOException}; the I/O exception is attached as the cause
 */
private DoccatModel train(List<DocumentSample> aSamples, TrainingParameters aParameters)
    throws RecommendationException
{
    // The sample stream is closed automatically once training has finished or failed.
    try (DocumentSampleStream sampleStream = new DocumentSampleStream(aSamples)) {
        return DocumentCategorizerME.train("unknown", sampleStream, aParameters,
                new DoccatFactory());
    }
    catch (IOException ioFailure) {
        throw new RecommendationException(
                "Exception during training the OpenNLP Document Categorizer model.",
                ioFailure);
    }
}
}
trainingParams.put(TrainingParameters.CUTOFF_PARAM, 0);

// Train on the combined sample stream. The model variable is declared outside the
// try block so that it stays visible to the code that follows.
DoccatModel doccatModel;
try {
  doccatModel = DocumentCategorizerME.train("en", combinedDocumentSampleStream,
      trainingParams, new DoccatFactory());
} finally {
  // Close the stream even when training throws; previously a failed training run
  // skipped the close() call and left the stream open.
  combinedDocumentSampleStream.close();
}