/**
 * Opens a test resource as an Evalita name sample stream.
 *
 * @param lang the language handed to the {@link EvalitaNameSampleStream} constructor
 * @param name the resource file name, appended to {@code /opennlp/tools/formats/}
 * @return a stream created with the {@code GENERATE_PERSON_ENTITIES} flag
 * @throws IOException if constructing the stream fails
 */
private static ObjectStream<NameSample> openData(LANGUAGE lang, String name) throws IOException {
  final String resourcePath = "/opennlp/tools/formats/" + name;
  return new EvalitaNameSampleStream(
      lang,
      new ResourceAsStreamFactory(EvalitaNameSampleStreamTest.class, resourcePath),
      EvalitaNameSampleStream.GENERATE_PERSON_ENTITIES);
}
/**
 * Opens a test resource as a CoNLL-03 name sample stream.
 *
 * @param lang the language handed to the {@link Conll03NameSampleStream} constructor
 * @param name the resource file name, appended to {@code /opennlp/tools/formats/}
 * @return a stream created with {@code Conll02NameSampleStream.GENERATE_PERSON_ENTITIES}
 * @throws IOException if constructing the stream fails
 */
private static ObjectStream<NameSample> openData(LANGUAGE lang, String name) throws IOException {
  final String resourcePath = "/opennlp/tools/formats/" + name;
  InputStreamFactory resource =
      new ResourceAsStreamFactory(Conll03NameSampleStreamTest.class, resourcePath);
  // The entity-type flag is taken from the CoNLL-02 stream class.
  final int entityTypes = Conll02NameSampleStream.GENERATE_PERSON_ENTITIES;
  return new Conll03NameSampleStream(lang, resource, entityTypes);
}
/**
 * Opens a test resource as a CoNLL-02 name sample stream.
 *
 * @param lang the language handed to the {@link Conll02NameSampleStream} constructor
 * @param name the resource file name, appended to {@code /opennlp/tools/formats/}
 * @return a stream created with the {@code GENERATE_PERSON_ENTITIES} flag
 * @throws IOException if constructing the stream fails
 */
private static ObjectStream<NameSample> openData(LANGUAGE lang, String name) throws IOException {
  InputStreamFactory resource = new ResourceAsStreamFactory(
      Conll02NameSampleStreamTest.class,
      "/opennlp/tools/formats/" + name);
  ObjectStream<NameSample> samples = new Conll02NameSampleStream(
      lang, resource, Conll02NameSampleStream.GENERATE_PERSON_ENTITIES);
  return samples;
}
/**
 * Opens a test resource as a Census 90 name stream decoded as UTF-8.
 *
 * @param name the resource file name, appended to {@code /opennlp/tools/formats/}
 * @return a {@link NameFinderCensus90NameStream} over the resource
 * @throws IOException if constructing the stream fails
 */
private static ObjectStream<StringList> openData(String name) throws IOException {
  final String resourcePath = "/opennlp/tools/formats/" + name;
  InputStreamFactory resource =
      new ResourceAsStreamFactory(NameFinderCensus90NameStreamTest.class, resourcePath);
  return new NameFinderCensus90NameStream(resource, StandardCharsets.UTF_8);
}
/**
 * Creates a language detector sample stream over the doccat sample resource.
 *
 * @return a {@link LanguageDetectorSampleStream} reading
 *     {@code /opennlp/tools/doccat/DoccatSample.txt} line by line as UTF-8
 * @throws IOException if constructing the line stream fails
 */
public static LanguageDetectorSampleStream createSampleStream() throws IOException {
  return new LanguageDetectorSampleStream(
      new PlainTextByLineStream(
          new ResourceAsStreamFactory(
              LanguageDetectorMETest.class, "/opennlp/tools/doccat/DoccatSample.txt"),
          "UTF-8"));
}
}
/**
 * Creates a sentence sample stream over the sentence detector test resource.
 *
 * @return a {@link SentenceSampleStream} reading
 *     {@code /opennlp/tools/sentdetect/Sentences.txt} line by line as UTF-8
 * @throws IOException if constructing the line stream fails
 */
private static ObjectStream<SentenceSample> createSampleStream() throws IOException {
  InputStreamFactory resource = new ResourceAsStreamFactory(
      SentenceDetectorFactoryTest.class, "/opennlp/tools/sentdetect/Sentences.txt");
  PlainTextByLineStream lines = new PlainTextByLineStream(resource, StandardCharsets.UTF_8);
  return new SentenceSampleStream(lines);
}
/**
 * Creates a POS sample stream over the annotated sentences test resource.
 *
 * @return a {@link WordTagSampleStream} reading
 *     {@code /opennlp/tools/postag/AnnotatedSentences.txt} line by line as UTF-8
 * @throws IOException if constructing the line stream fails
 */
private static ObjectStream<POSSample> createSampleStream() throws IOException {
  return new WordTagSampleStream(
      new PlainTextByLineStream(
          new ResourceAsStreamFactory(
              POSTaggerFactoryTest.class, "/opennlp/tools/postag/AnnotatedSentences.txt"),
          StandardCharsets.UTF_8));
}
/**
 * Creates a parse sample stream over the parser test resource.
 *
 * @return a {@link ParseSampleStream} reading {@code /opennlp/tools/parser/test.parse}
 *     line by line as UTF-8
 * @throws IOException if constructing the line stream fails
 */
private static ObjectStream<Parse> createParseSampleStream() throws IOException {
  InputStreamFactory parseResource =
      new ResourceAsStreamFactory(ParseSampleStreamTest.class, "/opennlp/tools/parser/test.parse");
  PlainTextByLineStream parseLines =
      new PlainTextByLineStream(parseResource, StandardCharsets.UTF_8);
  return new ParseSampleStream(parseLines);
}
/**
 * Loads the Brown cluster test resource and assigns a fresh
 * {@link BrownBigramFeatureGenerator} built from it to {@code generator}.
 *
 * @throws IOException if the resource cannot be opened, read, or closed
 */
@Before
public void setup() throws IOException {
  ResourceAsStreamFactory stream = new ResourceAsStreamFactory(
      getClass(), "/opennlp/tools/formats/brown-cluster.txt");

  // Release the resource stream once the cluster has been constructed; it was
  // previously left open for every test run. The type is fully qualified so
  // no additional import is required.
  // NOTE(review): assumes BrownCluster consumes the stream inside its
  // constructor and does not read from it later - confirm.
  try (java.io.InputStream clusterIn = stream.createInputStream()) {
    BrownCluster brownCluster = new BrownCluster(clusterIn);
    generator = new BrownBigramFeatureGenerator(brownCluster);
  }
}
/**
 * Creates a chunk sample stream over the chunker test resource.
 *
 * @return a {@link ChunkSampleStream} reading {@code /opennlp/tools/chunker/test.txt}
 *     line by line as UTF-8
 * @throws IOException if constructing the line stream fails
 */
private static ObjectStream<ChunkSample> createSampleStream() throws IOException {
  ResourceAsStreamFactory chunkResource =
      new ResourceAsStreamFactory(ChunkerFactoryTest.class, "/opennlp/tools/chunker/test.txt");
  PlainTextByLineStream chunkLines =
      new PlainTextByLineStream(chunkResource, StandardCharsets.UTF_8);
  return new ChunkSampleStream(chunkLines);
}
/**
 * Opens the AD sample resource as a sentence stream.
 *
 * @return an {@link ADSentenceStream} reading {@code /opennlp/tools/formats/ad.sample}
 *     line by line as UTF-8
 * @throws IOException if constructing the line stream fails
 */
private static ADSentenceStream openData() throws IOException {
  InputStreamFactory sampleResource =
      new ResourceAsStreamFactory(ADParagraphStreamTest.class, "/opennlp/tools/formats/ad.sample");
  PlainTextByLineStream sampleLines = new PlainTextByLineStream(sampleResource, "UTF-8");
  return new ADSentenceStream(sampleLines);
}
}
/**
 * Creates a POS sample stream over the annotated sentences test resource.
 *
 * @return a {@link WordTagSampleStream} reading
 *     {@code /opennlp/tools/postag/AnnotatedSentences.txt} line by line as UTF-8
 * @throws IOException if constructing the line stream fails
 */
private static ObjectStream<POSSample> createSampleStream() throws IOException {
  InputStreamFactory taggedResource = new ResourceAsStreamFactory(
      POSTaggerMETest.class,
      "/opennlp/tools/postag/AnnotatedSentences.txt");
  PlainTextByLineStream taggedLines =
      new PlainTextByLineStream(taggedResource, StandardCharsets.UTF_8);
  ObjectStream<POSSample> samples = new WordTagSampleStream(taggedLines);
  return samples;
}
/**
 * Creates a token sample stream over the tokenizer training resource.
 *
 * @return a {@link TokenSampleStream} reading {@code /opennlp/tools/tokenize/token.train}
 *     line by line as UTF-8
 * @throws IOException if constructing the line stream fails
 */
private static ObjectStream<TokenSample> createSampleStream() throws IOException {
  return new TokenSampleStream(
      new PlainTextByLineStream(
          new ResourceAsStreamFactory(
              TokenizerFactoryTest.class, "/opennlp/tools/tokenize/token.train"),
          StandardCharsets.UTF_8));
}
/**
 * Creates a document sample stream over the doccat sample resource.
 *
 * @return a {@link DocumentSampleStream} reading
 *     {@code /opennlp/tools/doccat/DoccatSample.txt} line by line as UTF-8
 * @throws IOException if constructing the line stream fails
 */
private static ObjectStream<DocumentSample> createSampleStream() throws IOException {
  InputStreamFactory doccatResource =
      new ResourceAsStreamFactory(DoccatFactoryTest.class, "/opennlp/tools/doccat/DoccatSample.txt");
  PlainTextByLineStream doccatLines = new PlainTextByLineStream(doccatResource, "UTF-8");
  return new DocumentSampleStream(doccatLines);
}
/**
 * Reads every name sample from the AD sample resource into {@code samples}.
 *
 * @throws IOException if the resource cannot be opened, read, or closed
 */
@Before
public void setup() throws IOException {
  InputStreamFactory sampleResource =
      new ResourceAsStreamFactory(ADParagraphStreamTest.class, "/opennlp/tools/formats/ad.sample");

  try (ADNameSampleStream nameStream = new ADNameSampleStream(
      new PlainTextByLineStream(sampleResource, StandardCharsets.UTF_8), true)) {
    // read() returning null marks the end of the stream.
    for (NameSample next = nameStream.read(); next != null; next = nameStream.read()) {
      samples.add(next);
    }
  }
}
/**
 * Reads every chunk sample from the AD sample resource into {@code samples}.
 *
 * @throws IOException if the resource cannot be opened, read, or closed
 */
@Before
public void setup() throws IOException {
  InputStreamFactory sampleResource =
      new ResourceAsStreamFactory(ADParagraphStreamTest.class, "/opennlp/tools/formats/ad.sample");

  try (ADChunkSampleStream chunkStream =
      new ADChunkSampleStream(new PlainTextByLineStream(sampleResource, "UTF-8"))) {
    // read() returning null marks the end of the stream.
    for (ChunkSample next = chunkStream.read(); next != null; next = chunkStream.read()) {
      samples.add(next);
    }
  }
}
/**
 * Trains a chunker on the {@code test-insufficient.txt} resource and expects
 * {@link InsufficientTrainingDataException} to be thrown.
 *
 * @throws IOException the expected training failure, or any failure opening or
 *     closing the sample stream
 */
@Test(expected = InsufficientTrainingDataException.class)
public void testInsufficientData() throws IOException {
  ResourceAsStreamFactory in = new ResourceAsStreamFactory(getClass(),
      "/opennlp/tools/chunker/test-insufficient.txt");

  TrainingParameters params = new TrainingParameters();
  params.put(TrainingParameters.ITERATIONS_PARAM, 70);
  params.put(TrainingParameters.CUTOFF_PARAM, 1);

  // Close the sample stream even though training is expected to throw; it was
  // previously left open. An exception from train() still propagates first.
  try (ObjectStream<ChunkSample> sampleStream = new ChunkSampleStream(
      new PlainTextByLineStream(in, StandardCharsets.UTF_8))) {
    ChunkerME.train("eng", sampleStream, params, new ChunkerFactory());
  }
}
/**
 * Trains a tokenizer on the {@code token-insufficient.train} resource and expects
 * {@link InsufficientTrainingDataException} to be thrown.
 *
 * @throws IOException the expected training failure, or any failure opening or
 *     closing the sample stream
 */
@Test(expected = InsufficientTrainingDataException.class)
public void testInsufficientData() throws IOException {
  InputStreamFactory trainDataIn = new ResourceAsStreamFactory(
      TokenizerModel.class, "/opennlp/tools/tokenize/token-insufficient.train");

  TrainingParameters mlParams = new TrainingParameters();
  mlParams.put(TrainingParameters.ITERATIONS_PARAM, 100);
  mlParams.put(TrainingParameters.CUTOFF_PARAM, 5);

  // Close the sample stream even though training is expected to throw; it was
  // previously left open. An exception from train() still propagates first.
  try (ObjectStream<TokenSample> samples = new TokenSampleStream(
      new PlainTextByLineStream(trainDataIn, StandardCharsets.UTF_8))) {
    TokenizerME.train(samples, TokenizerFactory.create(null, "eng", null, true, null), mlParams);
  }
}
/**
 * Trains the shared language detector {@code model} once for the test class,
 * using the doccat sample resource and the {@code NAIVEBAYES} algorithm.
 *
 * @throws Exception if the resource cannot be read or training fails
 */
@BeforeClass
public static void train() throws Exception {
  ResourceAsStreamFactory streamFactory = new ResourceAsStreamFactory(
      LanguageDetectorMETest.class, "/opennlp/tools/doccat/DoccatSample.txt");

  TrainingParameters params = new TrainingParameters();
  params.put(TrainingParameters.ITERATIONS_PARAM, "100");
  params.put(TrainingParameters.CUTOFF_PARAM, "5");
  params.put(TrainingParameters.ALGORITHM_PARAM, "NAIVEBAYES");

  // Close the sample stream once training has finished; it was previously
  // left open for the lifetime of the test run.
  try (LanguageDetectorSampleStream sampleStream = new LanguageDetectorSampleStream(
      new PlainTextByLineStream(streamFactory, "UTF-8"))) {
    model = LanguageDetectorME.train(sampleStream, params, new DummyFactory());
  }
}
/**
 * Trains a MAXENT POS tagger on the {@code AnnotatedSentencesInsufficient.txt}
 * resource and expects {@link InsufficientTrainingDataException} to be thrown.
 *
 * @throws IOException the expected training failure, or any failure opening or
 *     closing the sample stream
 */
@Test(expected = InsufficientTrainingDataException.class)
public void insufficientTestData() throws IOException {
  InputStreamFactory in = new ResourceAsStreamFactory(POSTaggerMETest.class,
      "/opennlp/tools/postag/AnnotatedSentencesInsufficient.txt");

  TrainingParameters params = new TrainingParameters();
  params.put(TrainingParameters.ALGORITHM_PARAM, ModelType.MAXENT.name());
  params.put(TrainingParameters.ITERATIONS_PARAM, 100);
  params.put(TrainingParameters.CUTOFF_PARAM, 5);

  // Close the sample stream even though training is expected to throw; it was
  // previously left open. An exception from train() still propagates first.
  try (ObjectStream<POSSample> stream = new WordTagSampleStream(
      new PlainTextByLineStream(in, StandardCharsets.UTF_8))) {
    POSTaggerME.train("eng", stream, params, new POSTaggerFactory());
  }
}