/**
 * Creates an event stream over the given document samples, extracting
 * features with a bag-of-words context generator.
 *
 * @param samples {@link ObjectStream} of {@link DocumentSample}s to draw events from
 */
public DocumentCategorizerEventStream(ObjectStream<DocumentSample> samples) {
  super(samples);
  // Default feature extraction: one "bow=<token>" feature per token.
  mContextGenerator =
      new DocumentCategorizerContextGenerator(new BagOfWordsFeatureGenerator());
}
/**
 * Instantiates the feature generators named in a comma-separated list of
 * class names. When {@code featureGeneratorsNames} is {@code null}, a single
 * {@link BagOfWordsFeatureGenerator} is returned as the default.
 *
 * @param featureGeneratorsNames comma-separated fully qualified class names, or {@code null}
 * @return the instantiated generators; never {@code null} or empty
 */
static FeatureGenerator[] createFeatureGenerators(String featureGeneratorsNames) {
  if (featureGeneratorsNames == null) {
    // Nothing configured: fall back to bag-of-words.
    return new FeatureGenerator[]{new BagOfWordsFeatureGenerator()};
  }

  String[] names = featureGeneratorsNames.split(",");
  FeatureGenerator[] generators = new FeatureGenerator[names.length];
  for (int idx = 0; idx < names.length; idx++) {
    generators[idx] =
        ExtensionLoader.instantiateExtension(FeatureGenerator.class, names[idx]);
  }
  return generators;
}
}
public FeatureGenerator[] getFeatureGenerators() { if (featureGenerators == null) { if (artifactProvider != null) { String classNames = artifactProvider .getManifestProperty(FEATURE_GENERATORS); if (classNames != null) { this.featureGenerators = loadFeatureGenerators(classNames); } } if (featureGenerators == null) { // could not load using artifact provider // load bag of words as default this.featureGenerators = new FeatureGenerator[]{new BagOfWordsFeatureGenerator()}; } } return featureGenerators; }
/**
 * An empty token array must yield no features.
 */
@Test
public void testEmpty() {
  BagOfWordsFeatureGenerator generator = new BagOfWordsFeatureGenerator();
  Assert.assertEquals(0,
      generator.extractFeatures(new String[0], Collections.emptyMap()).size());
}
/**
 * By default every token is turned into a "bow=" feature, including
 * non-letter tokens such as numbers.
 */
@Test
public void testUseAllTokens() {
  String[] tokens = {"it", "is", "12.345", "feet", "long"};
  String[] expected = {"bow=it", "bow=is", "bow=12.345", "bow=feet", "bow=long"};

  BagOfWordsFeatureGenerator generator = new BagOfWordsFeatureGenerator();
  Assert.assertArrayEquals(expected,
      generator.extractFeatures(tokens, Collections.emptyMap()).toArray());
}
/**
 * With the boolean constructor flag set, tokens containing non-letter
 * characters (e.g. "12.345") are skipped.
 */
@Test
public void testOnlyLetterTokens() {
  String[] tokens = {"it", "is", "12.345", "feet", "long"};
  String[] expected = {"bow=it", "bow=is", "bow=feet", "bow=long"};

  BagOfWordsFeatureGenerator generator = new BagOfWordsFeatureGenerator(true);
  Assert.assertArrayEquals(expected,
      generator.extractFeatures(tokens, Collections.emptyMap()).toArray());
}
}
/**
 * Passing a {@code null} token array is a programming error and must
 * fail fast with a {@link NullPointerException}.
 */
@Test
public void testNull() {
  BagOfWordsFeatureGenerator generator = new BagOfWordsFeatureGenerator();
  try {
    generator.extractFeatures(null, Collections.emptyMap());
    Assert.fail("NullPointerException must be thrown");
  } catch (NullPointerException expected) {
    // expected: null token arrays are rejected
  }
}
/**
 * A factory configured with custom feature generators must survive a
 * serialization round trip, restoring the generators in order and by type.
 */
@Test
public void testCustom() throws IOException {
  FeatureGenerator[] featureGenerators = {
      new BagOfWordsFeatureGenerator(),
      new NGramFeatureGenerator(),
      new NGramFeatureGenerator(2, 3)
  };
  DoccatModel model = train(new DoccatFactory(featureGenerators));
  Assert.assertNotNull(model);

  // Round-trip the model through an in-memory serialization.
  ByteArrayOutputStream out = new ByteArrayOutputStream();
  model.serialize(out);
  DoccatModel fromSerialized = new DoccatModel(new ByteArrayInputStream(out.toByteArray()));

  DoccatFactory factory = fromSerialized.getFactory();
  Assert.assertNotNull(factory);

  FeatureGenerator[] restored = factory.getFeatureGenerators();
  Assert.assertEquals(3, restored.length);
  Assert.assertEquals(BagOfWordsFeatureGenerator.class, restored[0].getClass());
  Assert.assertEquals(NGramFeatureGenerator.class, restored[1].getClass());
  Assert.assertEquals(NGramFeatureGenerator.class, restored[2].getClass());
}
/** * Initializes the current instance. * * @param samples {@link ObjectStream} of {@link DocumentSample}s */ public DocumentCategorizerEventStream(ObjectStream<DocumentSample> samples) { super(samples); mContextGenerator = new DocumentCategorizerContextGenerator(new BagOfWordsFeatureGenerator()); }
/**
 * Constructs an event stream over {@code samples}, wiring in a context
 * generator backed by a {@link BagOfWordsFeatureGenerator}.
 *
 * @param samples {@link ObjectStream} of {@link DocumentSample}s
 */
public DocumentCategorizerEventStream(ObjectStream<DocumentSample> samples) {
  super(samples);
  // Bag-of-words is the default feature representation for doccat events.
  mContextGenerator = new DocumentCategorizerContextGenerator(
      new BagOfWordsFeatureGenerator());
}
/**
 * Builds the feature generator array from a comma-separated list of class
 * names. A {@code null} argument selects the bag-of-words default.
 *
 * @param featureGeneratorsNames comma-separated class names, or {@code null}
 * @return the instantiated generators; never {@code null} or empty
 */
static FeatureGenerator[] createFeatureGenerators(String featureGeneratorsNames) {
  if (featureGeneratorsNames == null) {
    return new FeatureGenerator[]{new BagOfWordsFeatureGenerator()};
  }

  String[] generatorClassNames = featureGeneratorsNames.split(",");
  FeatureGenerator[] result = new FeatureGenerator[generatorClassNames.length];
  int i = 0;
  for (String className : generatorClassNames) {
    // Each entry is resolved and instantiated via the extension mechanism.
    result[i++] = ExtensionLoader.instantiateExtension(FeatureGenerator.class, className);
  }
  return result;
}
}
/**
 * Creates feature generator instances from a comma-separated list of
 * class names; {@code null} yields the single bag-of-words default.
 *
 * @param featureGeneratorsNames comma-separated class names, or {@code null}
 * @return the instantiated generators; never {@code null} or empty
 */
static FeatureGenerator[] createFeatureGenerators(String featureGeneratorsNames) {
  // Default configuration when no generator names were supplied.
  if (featureGeneratorsNames == null) {
    return new FeatureGenerator[]{new BagOfWordsFeatureGenerator()};
  }

  String[] classNames = featureGeneratorsNames.split(",");
  FeatureGenerator[] loaded = new FeatureGenerator[classNames.length];
  for (int pos = 0; pos < loaded.length; pos++) {
    loaded[pos] = ExtensionLoader.instantiateExtension(
        FeatureGenerator.class, classNames[pos]);
  }
  return loaded;
}
}
public FeatureGenerator[] getFeatureGenerators() { if (featureGenerators == null) { if (artifactProvider != null) { String classNames = artifactProvider .getManifestProperty(FEATURE_GENERATORS); if (classNames != null) { this.featureGenerators = loadFeatureGenerators(classNames); } } if (featureGenerators == null) { // could not load using artifact provider // load bag of words as default this.featureGenerators = new FeatureGenerator[]{new BagOfWordsFeatureGenerator()}; } } return featureGenerators; }
public FeatureGenerator[] getFeatureGenerators() { if (featureGenerators == null) { if (artifactProvider != null) { String classNames = artifactProvider .getManifestProperty(FEATURE_GENERATORS); if (classNames != null) { this.featureGenerators = loadFeatureGenerators(classNames); } } if (featureGenerators == null) { // could not load using artifact provider // load bag of words as default this.featureGenerators = new FeatureGenerator[]{new BagOfWordsFeatureGenerator()}; } } return featureGenerators; }
public void train(String source, String destination) throws IOException { //<start id="maxent.examples.train.setup"/> File[] inputFiles = FileUtil.buildFileList(new File(source)); File modelFile = new File(destination); Tokenizer tokenizer = SimpleTokenizer.INSTANCE; //<co id="tm.tok"/> CategoryDataStream ds = new CategoryDataStream(inputFiles, tokenizer); int cutoff = 5; int iterations = 100; NameFinderFeatureGenerator nffg //<co id="tm.fg"/> = new NameFinderFeatureGenerator(); BagOfWordsFeatureGenerator bowfg = new BagOfWordsFeatureGenerator(); DoccatModel model = DocumentCategorizerME.train("en", ds, cutoff, iterations, nffg, bowfg); //<co id="tm.train"/> model.serialize(new FileOutputStream(modelFile)); /*<calloutlist> <callout arearefs="tm.tok">Create data stream</callout> <callout arearefs="tm.fg">Set up features generators</callout> <callout arearefs="tm.train">Train categorizer</callout> </calloutlist>*/ //<end id="maxent.examples.train.setup"/> }