Tabnine Logo
MarkableFileInputStreamFactory
Code Index — Add Tabnine to your IDE (free)

How to use
MarkableFileInputStreamFactory
in
opennlp.tools.util

Best Java code snippets using opennlp.tools.util.MarkableFileInputStreamFactory (Showing top 20 results out of 315)

origin: apache/opennlp

/**
 * Wraps the given file in an {@link InputStreamFactory} supporting mark/reset.
 *
 * @param file the file to read from
 * @return a markable input stream factory for {@code file}
 * @throws TerminateToolException if {@code file} does not exist
 */
public static InputStreamFactory createInputStreamFactory(File file) {
  final InputStreamFactory factory;
  try {
    factory = new MarkableFileInputStreamFactory(file);
  } catch (FileNotFoundException e) {
    throw new TerminateToolException(-1, "File '" + file + "' cannot be found", e);
  }
  return factory;
}
origin: apache/opennlp

new MarkableFileInputStreamFactory(sentencesFile), StandardCharsets.UTF_8)) {
origin: apache/opennlp

/**
 * Opens a corpus file from the OpenNLP data directory as a line-by-line stream.
 *
 * @param corpus path of the corpus file, relative to the data directory
 * @return an {@link ObjectStream} yielding one ISO-8859-1 line per read
 * @throws IOException if the file cannot be opened
 */
private static ObjectStream<String> getLineSample(String corpus)
    throws IOException {
  File corpusFile = new File(getOpennlpDataDir(), corpus);
  MarkableFileInputStreamFactory factory = new MarkableFileInputStreamFactory(corpusFile);
  return new PlainTextByLineStream(factory, StandardCharsets.ISO_8859_1);
}

origin: apache/opennlp

/**
 * Builds a Leipzig sample stream over the English news corpus, one sentence
 * per sample, tokenized with the simple tokenizer.
 *
 * @return a stream of {@link LeipzigTestSample}s
 * @throws IOException if the corpus file cannot be opened
 */
private ObjectStream<LeipzigTestSample> createLineWiseStream() throws IOException {
  File sentencesFile = new File(getOpennlpDataDir(),
      "leipzig/eng_news_2010_300K-sentences.txt");
  return new LeipzigTestSampleStream(1, SimpleTokenizer.INSTANCE,
      new MarkableFileInputStreamFactory(sentencesFile));
}
origin: apache/opennlp

/**
 * Trains a POS model from a CoNLL-X formatted training file.
 *
 * @param trainFile UTF-8 encoded CoNLL-X training data
 * @param lang      language code for the resulting model
 * @param params    training parameters
 * @return the trained {@link POSModel}
 * @throws IOException if the training data cannot be read
 */
private POSModel train(File trainFile, String lang,
               TrainingParameters params) throws IOException {
  // try-with-resources: the original never closed the sample stream,
  // leaking the underlying file handle after training.
  try (ObjectStream<POSSample> samples = new ConllXPOSSampleStream(
      new MarkableFileInputStreamFactory(trainFile), StandardCharsets.UTF_8)) {
    return POSTaggerME.train(lang, samples, params, new POSTaggerFactory());
  }
}
origin: apache/opennlp

/**
 * Verifies, once per test class, that the Leipzig corpus on disk matches the
 * expected checksum before any test runs.
 */
@BeforeClass
public static void verifyTrainingData() throws Exception {
  File sentences = new File(getOpennlpDataDir(),
      "leipzig/eng_news_2010_300K-sentences.txt");
  verifyTrainingData(
      new LeipzigTestSampleStream(25, SimpleTokenizer.INSTANCE,
          new MarkableFileInputStreamFactory(sentences)),
      new BigInteger("172812413483919324675263268750583851712"));
}
origin: apache/opennlp

/**
 * Locates the CoNLL-00 data files and verifies the test corpus checksum
 * before any test runs.
 */
@BeforeClass
public static void verifyTrainingData() throws Exception {

  TEST_DATA_FILE = new File(getOpennlpDataDir(), "conll00/test.txt");
  TRAIN_DATA_FILE = new File(getOpennlpDataDir(), "conll00/train.txt");

  // Bug in the original: TEST_DATA_FILE was verified twice with the
  // identical expected checksum; the redundant second call is removed.
  // TODO(review): TRAIN_DATA_FILE is assigned but never checksum-verified
  // here - it likely deserves its own verifyTrainingData(...) call with the
  // training corpus checksum (value not recoverable from this snippet).
  verifyTrainingData(new ChunkSampleStream(
      new PlainTextByLineStream(new MarkableFileInputStreamFactory(TEST_DATA_FILE),
          StandardCharsets.UTF_8)),
    new BigInteger("84610235226433393380477662908529306002"));
}
origin: apache/opennlp

/**
 * Trains an English chunker model from a CoNLL-00 formatted file.
 *
 * @param trainFile UTF-8 encoded chunking training data
 * @param params    training parameters
 * @return the trained {@link ChunkerModel}
 * @throws IOException if the training data cannot be read
 */
private static ChunkerModel train(File trainFile, TrainingParameters params)
    throws IOException {
  // try-with-resources: the original leaked the sample stream's file handle.
  try (ObjectStream<ChunkSample> samples = new ChunkSampleStream(
      new PlainTextByLineStream(
        new MarkableFileInputStreamFactory(trainFile), StandardCharsets.UTF_8))) {
    return ChunkerME.train("eng", samples, params, new ChunkerFactory());
  }
}
origin: apache/opennlp

/**
 * Trains a name-finder model from a CoNLL-02 formatted file.
 *
 * @param trainFile the CoNLL-02 training data
 * @param lang      corpus language; its lower-cased name is used as model language
 * @param params    training parameters
 * @param types     entity-type flags understood by {@code Conll02NameSampleStream}
 * @return the trained {@link TokenNameFinderModel}
 * @throws IOException if the training data cannot be read
 */
private TokenNameFinderModel train(File trainFile, LANGUAGE lang,
    TrainingParameters params, int types) throws IOException {
  // try-with-resources: the original leaked the sample stream's file handle.
  // Also fixed sloppy formatting ("lang,new", doubled space after return).
  try (ObjectStream<NameSample> samples = new Conll02NameSampleStream(
      lang, new MarkableFileInputStreamFactory(trainFile), types)) {
    // NOTE(review): toLowerCase() uses the default locale - presumably fine
    // for these language constants, but Locale.ROOT would be safer; confirm.
    return NameFinderME.train(lang.toString().toLowerCase(), null, samples,
        params, new TokenNameFinderFactory());
  }
}
origin: apache/opennlp

/**
 * Trains a lemmatizer on CoNLL-U data and evaluates it on a held-out file.
 *
 * @param trainFile CoNLL-U training data
 * @param evalFile  CoNLL-U evaluation data
 * @param lang      language code for the model
 * @param params    training parameters
 * @return word accuracy of the trained lemmatizer on {@code evalFile}
 * @throws IOException if either file cannot be read
 */
private double trainAndEval(String lang, File trainFile, TrainingParameters params,
                  File evalFile) throws IOException {
  ConlluTagset tagset = ConlluTagset.X;

  // try-with-resources: the original never closed either sample stream,
  // leaking both file handles.
  LemmatizerModel model;
  try (ObjectStream<LemmaSample> trainSamples = new ConlluLemmaSampleStream(new ConlluStream(
      new MarkableFileInputStreamFactory(trainFile)), tagset)) {
    model = LemmatizerME.train(lang, trainSamples, params, new LemmatizerFactory());
  }

  LemmatizerEvaluator evaluator = new LemmatizerEvaluator(new LemmatizerME(model));
  try (ObjectStream<LemmaSample> evalSamples = new ConlluLemmaSampleStream(new ConlluStream(
      new MarkableFileInputStreamFactory(evalFile)), tagset)) {
    evaluator.evaluate(evalSamples);
  }
  return evaluator.getWordAccuracy();
}
origin: apache/opennlp

new MarkableFileInputStreamFactory(new File(getOpennlpDataDir(),
 "conllx/data/danish/ddt/train/danish_ddt_train.conll")), StandardCharsets.UTF_8), 
new BigInteger("30795670444498617202001550516753630016"));
new MarkableFileInputStreamFactory(new File(getOpennlpDataDir(),
 "conllx/data/danish/ddt/test/danish_ddt_test.conll")), StandardCharsets.UTF_8), 
  new BigInteger("314104267846430512372780024568104131337"));
new MarkableFileInputStreamFactory(new File(getOpennlpDataDir(),
 "conllx/data/dutch/alpino/train/dutch_alpino_train.conll")), StandardCharsets.UTF_8), 
  new BigInteger("109328245573060521952850454797286933887"));
new MarkableFileInputStreamFactory(new File(getOpennlpDataDir(),
 "conllx/data/dutch/alpino/test/dutch_alpino_test.conll")), StandardCharsets.UTF_8), 
  new BigInteger("132343141132816640849897155456916243039"));
new MarkableFileInputStreamFactory(new File(getOpennlpDataDir(),
 "conllx/data/portuguese/bosque/treebank/portuguese_bosque_train.conll")), StandardCharsets.UTF_8), 
  new BigInteger("9504382474772307801979515927230835901"));
new MarkableFileInputStreamFactory(new File(getOpennlpDataDir(),
 "conllx/data/swedish/talbanken05/train/swedish_talbanken05_train.conll")), StandardCharsets.UTF_8), 
  new BigInteger("175256039869578311901318972681191182910"));
new MarkableFileInputStreamFactory(new File(getOpennlpDataDir(),
 "conllx/data/swedish/talbanken05/test/swedish_talbanken05_test.conll")), StandardCharsets.UTF_8), 
  new BigInteger("128378790384268106811747599235147991544"));
origin: apache/opennlp

  LANGUAGE.NLD, new MarkableFileInputStreamFactory(dutchTrainingFile),
   Conll02NameSampleStream.GENERATE_PERSON_ENTITIES),
    new BigInteger("109687424525847313767541246922170457976"));
verifyTrainingData(new Conll02NameSampleStream(
  LANGUAGE.NLD, new MarkableFileInputStreamFactory(dutchTestAFile),
   Conll02NameSampleStream.GENERATE_PERSON_ENTITIES),
    new BigInteger("12942966701628852910737840182656846323"));
verifyTrainingData(new Conll02NameSampleStream(
  LANGUAGE.NLD, new MarkableFileInputStreamFactory(dutchTestBFile),
   Conll02NameSampleStream.GENERATE_PERSON_ENTITIES),
    new BigInteger("223206987942490952427646331013509976957"));
  LANGUAGE.SPA, new MarkableFileInputStreamFactory(spanishTrainingFile),
   Conll02NameSampleStream.GENERATE_PERSON_ENTITIES),
    new BigInteger("226089384066775461905386060946810714487"));  
verifyTrainingData(new Conll02NameSampleStream(
  LANGUAGE.SPA, new MarkableFileInputStreamFactory(spanishTestAFile),
   Conll02NameSampleStream.GENERATE_PERSON_ENTITIES),
    new BigInteger("313879596837181728494732341737647284762"));
verifyTrainingData(new Conll02NameSampleStream(
  LANGUAGE.SPA, new MarkableFileInputStreamFactory(spanishTestBFile),
   Conll02NameSampleStream.GENERATE_PERSON_ENTITIES),
    new BigInteger("24037715705115461166858183817622459974"));
origin: apache/opennlp

/**
 * Runs the shipped English sentence model over the Leipzig corpus and checks
 * that the digest of all detected sentences matches the expected value.
 */
@Test
public void evalSentenceModel() throws Exception {
  SentenceModel model = new SentenceModel(
      new File(getOpennlpDataDir(), "models-sf/en-sent.bin"));
  MessageDigest digest = MessageDigest.getInstance(HASH_ALGORITHM);
  SentenceDetector sentenceDetector = new SentenceDetectorME(model);

  // Concatenate every Leipzig sample (25 lines per batch) into one buffer.
  StringBuilder text = new StringBuilder();
  File corpus = new File(getOpennlpDataDir(),
      "leipzig/eng_news_2010_300K-sentences.txt");
  try (ObjectStream<LeipzigTestSample> lineBatches = new LeipzigTestSampleStream(25,
      SimpleTokenizer.INSTANCE, new MarkableFileInputStreamFactory(corpus))) {
    for (LeipzigTestSample batch = lineBatches.read(); batch != null;
        batch = lineBatches.read()) {
      text.append(String.join(" ", batch.getText())).append(" ");
    }
  }

  // Hash each detected sentence, then compare against the known digest.
  for (String sentence : sentenceDetector.sentDetect(text.toString())) {
    digest.update(sentence.getBytes(StandardCharsets.UTF_8));
  }
  Assert.assertEquals(new BigInteger("228544068397077998410949364710969159291"),
      new BigInteger(1, digest.digest()));
}
origin: apache/opennlp

/**
 * Runs the shipped English tokenizer model over the Leipzig corpus and checks
 * that the digest of all produced tokens matches the expected value.
 */
@Test
public void evalTokenModel() throws Exception {
  // the input stream is currently tokenized, we should detokenize it again,
  //    (or extend to pass in tokenizer, then whitespace tokenizer can be passed)
  // and then tokenize it here
  TokenizerModel model = new TokenizerModel(
      new File(getOpennlpDataDir(), "models-sf/en-token.bin"));
  MessageDigest digest = MessageDigest.getInstance(HASH_ALGORITHM);
  Tokenizer tokenizer = new TokenizerME(model);

  File corpus = new File(getOpennlpDataDir(),
      "leipzig/eng_news_2010_300K-sentences.txt");
  try (ObjectStream<LeipzigTestSample> lines = new LeipzigTestSampleStream(1,
      WhitespaceTokenizer.INSTANCE, new MarkableFileInputStreamFactory(corpus))) {
    for (LeipzigTestSample sample = lines.read(); sample != null;
        sample = lines.read()) {
      for (String token : tokenizer.tokenize(String.join(" ", sample.getText()))) {
        digest.update(token.getBytes(StandardCharsets.UTF_8));
      }
    }
  }
  Assert.assertEquals(new BigInteger("180602607571756839321060482558626151930"),
      new BigInteger(1, digest.digest()));
}
origin: apache/opennlp

/**
 * Evaluates a chunker model against CoNLL-00 test data and asserts the
 * resulting F-measure.
 *
 * @param model            the chunker model under test
 * @param testData         UTF-8 encoded CoNLL-00 test data
 * @param expectedFMeasure expected F-measure (asserted to 1e-4)
 * @throws IOException if the test data cannot be read
 */
private static void eval(ChunkerModel model, File testData,
             double expectedFMeasure) throws IOException {
  ChunkerEvaluator evaluator = new ChunkerEvaluator(new ChunkerME(model));
  // try-with-resources: the original leaked the sample stream's file handle.
  try (ObjectStream<ChunkSample> samples = new ChunkSampleStream(
      new PlainTextByLineStream(new MarkableFileInputStreamFactory(testData),
          StandardCharsets.UTF_8))) {
    evaluator.evaluate(samples);
  }
  Assert.assertEquals(expectedFMeasure,
    evaluator.getFMeasure().getFMeasure(), 0.0001);
}

origin: apache/opennlp

/**
 * Evaluates a POS model against CoNLL-X test data and asserts the word
 * accuracy.
 *
 * @param model            the POS model under test
 * @param testData         UTF-8 encoded CoNLL-X test data
 * @param expectedAccuracy expected word accuracy (asserted to 1e-4)
 * @throws IOException if the test data cannot be read
 */
private void eval(POSModel model, File testData,
             double expectedAccuracy) throws IOException {
  POSEvaluator evaluator = new POSEvaluator(new POSTaggerME(model));
  // try-with-resources: the original leaked the sample stream's file handle.
  try (ObjectStream<POSSample> samples = new ConllXPOSSampleStream(
      new MarkableFileInputStreamFactory(testData), StandardCharsets.UTF_8)) {
    evaluator.evaluate(samples);
  }
  Assert.assertEquals(expectedAccuracy, evaluator.getWordAccuracy(), 0.0001);
}
origin: apache/opennlp

/**
 * Evaluates a name-finder model against CoNLL-02 test data and asserts the
 * F-measure.
 *
 * @param model            the name-finder model under test
 * @param testData         CoNLL-02 test data
 * @param lang             corpus language
 * @param types            entity-type flags for {@code Conll02NameSampleStream}
 * @param expectedFMeasure expected F-measure (asserted to 1e-4)
 * @throws IOException if the test data cannot be read
 */
private void eval(TokenNameFinderModel model, File testData, LANGUAGE lang,
    int types, double expectedFMeasure) throws IOException {
  TokenNameFinderEvaluator evaluator = new TokenNameFinderEvaluator(new NameFinderME(model));
  // try-with-resources: the original leaked the sample stream's file handle.
  try (ObjectStream<NameSample> samples = new Conll02NameSampleStream(
      lang, new MarkableFileInputStreamFactory(testData), types)) {
    evaluator.evaluate(samples);
  }
  Assert.assertEquals(expectedFMeasure, evaluator.getFMeasure().getFMeasure(), 0.0001);
}

origin: stackoverflow.com

inputStreamFactory = new MarkableFileInputStreamFactory(
   new File("en-sent.train"));
origin: org.apache.opennlp/opennlp-tools

/**
 * Creates a mark/reset-capable {@link InputStreamFactory} for the given file.
 *
 * @param file the file to read from
 * @return a markable input stream factory for {@code file}
 * @throws TerminateToolException if {@code file} does not exist
 */
public static InputStreamFactory createInputStreamFactory(File file) {
  try {
    return new MarkableFileInputStreamFactory(file);
  } catch (FileNotFoundException missing) {
    // Translate the checked exception into the CLI tool's termination signal.
    throw new TerminateToolException(-1, "File '" + file + "' cannot be found", missing);
  }
}
origin: ai.idylnlp/idylnlp-opennlp-tools-1.8.3

/**
 * Builds an {@link InputStreamFactory} over {@code file} whose streams
 * support mark/reset.
 *
 * @param file the file to read from
 * @return a markable input stream factory for {@code file}
 * @throws TerminateToolException if {@code file} does not exist
 */
public static InputStreamFactory createInputStreamFactory(File file) {
  InputStreamFactory result;
  try {
    result = new MarkableFileInputStreamFactory(file);
  } catch (FileNotFoundException cause) {
    result = null;
    throw new TerminateToolException(-1, "File '" + file + "' cannot be found", cause);
  }
  return result;
}
opennlp.tools.util.MarkableFileInputStreamFactory

Javadoc

A factory that creates MarkableFileInputStream from a File

Most used methods

  • <init>

Popular in Java

  • Running tasks concurrently on multiple threads
  • setContentView (Activity)
  • addToBackStack (FragmentTransaction)
  • onCreateOptionsMenu (Activity)
  • FileNotFoundException (java.io)
    Thrown when a file specified by a program cannot be found.
  • StringTokenizer (java.util)
    Breaks a string into tokens; new code should probably use String#split.
  • ReentrantLock (java.util.concurrent.locks)
    A reentrant mutual exclusion Lock with the same basic behavior and semantics as the implicit monitor
  • JFrame (javax.swing)
  • Loader (org.hibernate.loader)
    Abstract superclass of object loading (and querying) strategies. This class implements useful common
  • LoggerFactory (org.slf4j)
    The LoggerFactory is a utility class producing Loggers for various logging APIs, most notably for lo
  • Top PhpStorm plugins
Tabnine Logo
  • Products

    Search for Java codeSearch for JavaScript code
  • IDE Plugins

    IntelliJ IDEAWebStormVisual StudioAndroid StudioEclipseVisual Studio CodePyCharmSublime TextPhpStormVimGoLandRubyMineEmacsJupyter NotebookJupyter LabRiderDataGripAppCode
  • Company

    About UsContact UsCareers
  • Resources

    FAQBlogTabnine AcademyTerms of usePrivacy policyJava Code IndexJavascript Code Index
Get Tabnine for your IDE now