/**
 * Round-trips a model produced by the no-argument {@code train()} helper
 * through serialization and checks that the factory of the restored model
 * reports exactly one feature generator, of class
 * {@link BagOfWordsFeatureGenerator}.
 *
 * @throws IOException if training, serializing or re-reading the model fails
 */
@Test
public void testDefault() throws IOException {
  DoccatModel trained = train();
  Assert.assertNotNull(trained);

  // Write the model into memory and keep the raw bytes for re-reading.
  ByteArrayOutputStream buffer = new ByteArrayOutputStream();
  trained.serialize(buffer);
  byte[] serialized = buffer.toByteArray();

  // Rebuild the model from those bytes only.
  DoccatModel restored = new DoccatModel(new ByteArrayInputStream(serialized));

  DoccatFactory restoredFactory = restored.getFactory();
  Assert.assertNotNull(restoredFactory);

  Assert.assertEquals(1, restoredFactory.getFeatureGenerators().length);
  Assert.assertEquals(BagOfWordsFeatureGenerator.class,
      restoredFactory.getFeatureGenerators()[0].getClass());
}
/**
 * Trains a model with a factory configured with three feature generators
 * (one {@link BagOfWordsFeatureGenerator}, one default
 * {@link NGramFeatureGenerator} and one {@code NGramFeatureGenerator(2, 3)}),
 * round-trips it through serialization, and checks that the restored factory
 * reports three generators with the same classes in the same order.
 *
 * @throws IOException if training, serializing or re-reading the model fails
 */
@Test
public void testCustom() throws IOException {
  FeatureGenerator[] configured = new FeatureGenerator[] {
      new BagOfWordsFeatureGenerator(),
      new NGramFeatureGenerator(),
      new NGramFeatureGenerator(2, 3)
  };

  DoccatModel trained = train(new DoccatFactory(configured));
  Assert.assertNotNull(trained);

  // Write the model into memory and keep the raw bytes for re-reading.
  ByteArrayOutputStream buffer = new ByteArrayOutputStream();
  trained.serialize(buffer);
  byte[] serialized = buffer.toByteArray();

  // Rebuild the model from those bytes only.
  DoccatModel restored = new DoccatModel(new ByteArrayInputStream(serialized));

  DoccatFactory restoredFactory = restored.getFactory();
  Assert.assertNotNull(restoredFactory);

  // Only the generator classes are compared, not their configuration.
  FeatureGenerator[] restoredGenerators = restoredFactory.getFeatureGenerators();
  Assert.assertEquals(3, restoredGenerators.length);
  Assert.assertEquals(BagOfWordsFeatureGenerator.class, restoredGenerators[0].getClass());
  Assert.assertEquals(NGramFeatureGenerator.class, restoredGenerators[1].getClass());
  Assert.assertEquals(NGramFeatureGenerator.class, restoredGenerators[2].getClass());
}
public void train(String source, String destination) throws IOException { //<start id="maxent.examples.train.setup"/> File[] inputFiles = FileUtil.buildFileList(new File(source)); File modelFile = new File(destination); Tokenizer tokenizer = SimpleTokenizer.INSTANCE; //<co id="tm.tok"/> CategoryDataStream ds = new CategoryDataStream(inputFiles, tokenizer); int cutoff = 5; int iterations = 100; NameFinderFeatureGenerator nffg //<co id="tm.fg"/> = new NameFinderFeatureGenerator(); BagOfWordsFeatureGenerator bowfg = new BagOfWordsFeatureGenerator(); DoccatModel model = DocumentCategorizerME.train("en", ds, cutoff, iterations, nffg, bowfg); //<co id="tm.train"/> model.serialize(new FileOutputStream(modelFile)); /*<calloutlist> <callout arearefs="tm.tok">Create data stream</callout> <callout arearefs="tm.fg">Set up features generators</callout> <callout arearefs="tm.train">Train categorizer</callout> </calloutlist>*/ //<end id="maxent.examples.train.setup"/> }
actg, parser); GISModel model = GIS.trainModel(100, new TwoPassDataIndexer(es, 3));//<co id="atc.train.do"/> new DoccatModel("en", model).serialize(new FileOutputStream(outFile));