/**
 * Runs sentence detection over {@code text} with a {@link SentenceDetectorME}
 * built from the model stored in the {@code RESOURCES_EN_SENT_BIN} class resource.
 * The resource stream is opened in a try-with-resources statement, so it is
 * closed before this method returns.
 *
 * @param text the text handed to {@code sentDetect}
 * @return the array returned by {@code sentDetect} for {@code text}
 * @throws Exception if opening the resource, loading the model or detection fails
 */
private String[] testOpenNLP(String text) throws Exception {
  try (InputStream sentenceModelStream =
           getClass().getResourceAsStream(RESOURCES_EN_SENT_BIN)) {
    final SentenceDetectorME detector =
        new SentenceDetectorME(new SentenceModel(sentenceModelStream));
    final String[] sentences = detector.sentDetect(text);
    return sentences;
  }
}
/**
 * Runs position-based sentence detection over {@code text} with a
 * {@link SentenceDetectorME} built from the model stored in the
 * {@code RESOURCES_EN_SENT_BIN} class resource. The resource stream is opened
 * in a try-with-resources statement, so it is closed before this method returns.
 *
 * @param text the text handed to {@code sentPosDetect}
 * @return the spans returned by {@code sentPosDetect} for {@code text}
 * @throws Exception if opening the resource, loading the model or detection fails
 */
private Span[] testOpenNLPPosition(String text) throws Exception {
  try (InputStream sentenceModelStream =
           getClass().getResourceAsStream(RESOURCES_EN_SENT_BIN)) {
    final SentenceDetectorME detector =
        new SentenceDetectorME(new SentenceModel(sentenceModelStream));
    final Span[] sentenceSpans = detector.sentPosDetect(text);
    return sentenceSpans;
  }
}
/**
 * Creates a {@link SentenceModel} from the supplied stream.
 *
 * @param in the stream passed to the {@link SentenceModel} constructor
 * @return the newly constructed model
 * @throws IOException if the model constructor reports an I/O failure
 */
@Override
protected SentenceModel loadModel(InputStream in) throws IOException {
  final SentenceModel sentenceModel = new SentenceModel(in);
  return sentenceModel;
}
}
/**
 * Creates a {@link SentenceModel} from the supplied stream.
 *
 * @param modelIn the stream passed to the {@link SentenceModel} constructor
 * @return the newly constructed model
 * @throws IOException if the model constructor reports an I/O failure
 * @throws InvalidFormatException declared for callers; raised by the model
 *         constructor when applicable
 */
@Override
protected SentenceModel loadModel(InputStream modelIn)
    throws IOException, InvalidFormatException {
  final SentenceModel sentenceModel = new SentenceModel(modelIn);
  return sentenceModel;
}
if (sentenceModelIndex > 0 && sentenceModelIndex < args.length) { sentenceDetector = new SentenceDetectorME( new SentenceModel(new File(args[sentenceModelIndex])));
public static SentenceModel train(String languageCode, ObjectStream<SentenceSample> samples, SentenceDetectorFactory sdFactory, TrainingParameters mlParams) throws IOException { Map<String, String> manifestInfoEntries = new HashMap<>(); // TODO: Fix the EventStream to throw exceptions when training goes wrong ObjectStream<Event> eventStream = new SDEventStream(samples, sdFactory.getSDContextGenerator(), sdFactory.getEndOfSentenceScanner()); EventTrainer trainer = TrainerFactory.getEventTrainer(mlParams, manifestInfoEntries); MaxentModel sentModel = trainer.train(eventStream); return new SentenceModel(languageCode, sentModel, manifestInfoEntries, sdFactory); }
/**
 * Evaluates the {@code models-sf/en-sent.bin} sentence model against the
 * Leipzig English news sentences file.
 *
 * <p>All sample batches are concatenated into a single text, the text is split
 * into sentences, and a digest over the UTF-8 bytes of every detected sentence
 * is compared with a fixed expected value.
 *
 * @throws Exception if the model or corpus cannot be read or the digest
 *         algorithm is unavailable
 */
@Test
public void evalSentenceModel() throws Exception {
  final File modelFile = new File(getOpennlpDataDir(), "models-sf/en-sent.bin");
  final SentenceModel model = new SentenceModel(modelFile);

  final MessageDigest sentenceDigest = MessageDigest.getInstance(HASH_ALGORITHM);
  final SentenceDetector detector = new SentenceDetectorME(model);

  // Concatenate every batch, each followed by a single trailing space.
  final StringBuilder corpus = new StringBuilder();
  final File corpusFile =
      new File(getOpennlpDataDir(), "leipzig/eng_news_2010_300K-sentences.txt");
  try (ObjectStream<LeipzigTestSample> batches = new LeipzigTestSampleStream(
      25, SimpleTokenizer.INSTANCE, new MarkableFileInputStreamFactory(corpusFile))) {
    for (LeipzigTestSample batch = batches.read(); batch != null; batch = batches.read()) {
      corpus.append(String.join(" ", batch.getText()));
      corpus.append(" ");
    }
  }

  // Feed the detected sentences into the digest in detection order.
  for (String sentence : detector.sentDetect(corpus.toString())) {
    sentenceDigest.update(sentence.getBytes(StandardCharsets.UTF_8));
  }

  final BigInteger expected = new BigInteger("228544068397077998410949364710969159291");
  final BigInteger actual = new BigInteger(1, sentenceDigest.digest());
  Assert.assertEquals(expected, actual);
}
/**
 * Creates a {@link SentenceModel} from the supplied stream.
 *
 * @param in the stream passed to the {@link SentenceModel} constructor
 * @return the newly constructed model
 * @throws IOException if the model constructor reports an I/O failure
 */
@Override
protected SentenceModel loadModel(InputStream in) throws IOException {
  final SentenceModel sentenceModel = new SentenceModel(in);
  return sentenceModel;
}
}
/**
 * Creates a {@link SentenceModel} from the supplied stream.
 *
 * @param modelIn the stream passed to the {@link SentenceModel} constructor
 * @return the newly constructed model
 * @throws IOException if the model constructor reports an I/O failure
 * @throws InvalidFormatException declared for callers; raised by the model
 *         constructor when applicable
 */
@Override
protected SentenceModel loadModel(InputStream modelIn)
    throws IOException, InvalidFormatException {
  final SentenceModel sentenceModel = new SentenceModel(modelIn);
  return sentenceModel;
}
sentDetector = new SentenceDetectorME(new SentenceModel(params.getSentenceDetectorModel())); } catch (IOException e) { throw new TerminateToolException(-1, "Failed to load sentence detector model!", e);
/**
 * Trains a model with a {@link SentenceDetectorFactory} configured with an
 * abbreviation dictionary and custom end-of-sentence characters, then checks
 * that the factory exposes the default context generator, the default
 * end-of-sentence scanner and the configured characters, both on the freshly
 * trained model and after a serialize/deserialize round trip.
 *
 * @throws IOException if training or (de)serialization fails
 */
@Test
public void testDefault() throws IOException {
  Dictionary abbreviations = loadAbbDictionary();
  char[] eosChars = {'.', '?'};

  SentenceModel trained =
      train(new SentenceDetectorFactory("eng", true, abbreviations, eosChars));

  // Factory as held by the freshly trained model.
  SentenceDetectorFactory trainedFactory = trained.getFactory();
  Assert.assertTrue(
      trainedFactory.getSDContextGenerator() instanceof DefaultSDContextGenerator);
  Assert.assertTrue(
      trainedFactory.getEndOfSentenceScanner() instanceof DefaultEndOfSentenceScanner);
  Assert.assertTrue(Arrays.equals(eosChars, trainedFactory.getEOSCharacters()));

  // Round trip through the serialized form.
  ByteArrayOutputStream serialized = new ByteArrayOutputStream();
  trained.serialize(serialized);
  SentenceModel reloaded =
      new SentenceModel(new ByteArrayInputStream(serialized.toByteArray()));

  SentenceDetectorFactory reloadedFactory = reloaded.getFactory();
  Assert.assertTrue(
      reloadedFactory.getSDContextGenerator() instanceof DefaultSDContextGenerator);
  Assert.assertTrue(
      reloadedFactory.getEndOfSentenceScanner() instanceof DefaultEndOfSentenceScanner);
  Assert.assertTrue(Arrays.equals(eosChars, reloadedFactory.getEOSCharacters()));
}
/**
 * Creates a {@link SentenceDetectorME} from the model resource named by
 * {@code sentDetectorModelFile}.
 *
 * <p>The model stream is opened in a try-with-resources statement so that it
 * is closed once the model has been constructed, including when construction
 * fails. A {@code null} stream is passed through to the model constructor
 * unchanged, as try-with-resources skips closing a {@code null} resource.
 *
 * @return a sentence detector backed by the loaded model
 * @throws IOException if the model cannot be read
 */
public static SentenceDetector getDefaultSentenceDetector() throws IOException {
  // Fully qualified so the block does not depend on an import that may be absent.
  try (java.io.InputStream modelStream = getResourceAsStream(sentDetectorModelFile)) {
    return new SentenceDetectorME(new SentenceModel(modelStream));
  }
}
/**
 * Trains a model with a {@link SentenceDetectorFactory} that has no
 * abbreviation dictionary but custom end-of-sentence characters, then checks
 * that the factory reports a {@code null} dictionary, the default context
 * generator, the default end-of-sentence scanner and the configured
 * characters, both on the freshly trained model and after a
 * serialize/deserialize round trip.
 *
 * @throws IOException if training or (de)serialization fails
 */
@Test
public void testNullDict() throws IOException {
  Dictionary noAbbreviations = null;
  char[] eosChars = {'.', '?'};

  SentenceModel trained =
      train(new SentenceDetectorFactory("eng", true, noAbbreviations, eosChars));

  // Factory as held by the freshly trained model.
  SentenceDetectorFactory trainedFactory = trained.getFactory();
  Assert.assertNull(trainedFactory.getAbbreviationDictionary());
  Assert.assertTrue(
      trainedFactory.getSDContextGenerator() instanceof DefaultSDContextGenerator);
  Assert.assertTrue(
      trainedFactory.getEndOfSentenceScanner() instanceof DefaultEndOfSentenceScanner);
  Assert.assertTrue(Arrays.equals(eosChars, trainedFactory.getEOSCharacters()));

  // Round trip through the serialized form.
  ByteArrayOutputStream serialized = new ByteArrayOutputStream();
  trained.serialize(serialized);
  SentenceModel reloaded =
      new SentenceModel(new ByteArrayInputStream(serialized.toByteArray()));

  SentenceDetectorFactory reloadedFactory = reloaded.getFactory();
  Assert.assertNull(reloadedFactory.getAbbreviationDictionary());
  Assert.assertTrue(
      reloadedFactory.getSDContextGenerator() instanceof DefaultSDContextGenerator);
  Assert.assertTrue(
      reloadedFactory.getEndOfSentenceScanner() instanceof DefaultEndOfSentenceScanner);
  Assert.assertTrue(Arrays.equals(eosChars, reloadedFactory.getEOSCharacters()));
}
/**
 * Creates a {@link SentenceDetectorME} from the model resource named by
 * {@code sentDetectorModelFile}.
 *
 * <p>The model stream is opened in a try-with-resources statement so that it
 * is closed once the model has been constructed, including when construction
 * fails. A {@code null} stream is passed through to the model constructor
 * unchanged, as try-with-resources skips closing a {@code null} resource.
 *
 * @return a sentence detector backed by the loaded model
 * @throws IOException if the model cannot be read
 */
public static SentenceDetector getDefaultSentenceDetector() throws IOException {
  // Fully qualified so the block does not depend on an import that may be absent.
  try (java.io.InputStream modelStream = getResourceAsStream(sentDetectorModelFile)) {
    return new SentenceDetectorME(new SentenceModel(modelStream));
  }
}
/**
 * Replaces the current sentence detector with a {@link SentenceDetectorME}
 * backed by the model stored in {@code modelFile}.
 *
 * <p>The file stream is opened in a try-with-resources statement so the file
 * handle is released as soon as the model has been read, including when
 * loading fails. The {@code sentenceDetector} field is only reassigned after
 * the model has been constructed successfully.
 *
 * @param modelFile the file containing the serialized sentence model
 * @throws IOException if the file cannot be opened or the model cannot be read
 */
public void setSentenceDetector(File modelFile) throws IOException {
  try (InputStream modelStream = new FileInputStream(modelFile)) {
    SentenceModel model = new SentenceModel(modelStream);
    sentenceDetector = new SentenceDetectorME(model);
  }
}
/**
 * Trains a model with a {@link SentenceDetectorFactory} that has neither an
 * abbreviation dictionary nor explicit end-of-sentence characters, then checks
 * that the factory reports a {@code null} dictionary, the default context
 * generator, the default end-of-sentence scanner and
 * {@code Factory.defaultEosCharacters}, both on the freshly trained model and
 * after a serialize/deserialize round trip.
 *
 * @throws IOException if training or (de)serialization fails
 */
@Test
public void testDefaultEOS() throws IOException {
  Dictionary noAbbreviations = null;
  char[] noEosChars = null;

  SentenceModel trained =
      train(new SentenceDetectorFactory("eng", true, noAbbreviations, noEosChars));

  // Factory as held by the freshly trained model.
  SentenceDetectorFactory trainedFactory = trained.getFactory();
  Assert.assertNull(trainedFactory.getAbbreviationDictionary());
  Assert.assertTrue(
      trainedFactory.getSDContextGenerator() instanceof DefaultSDContextGenerator);
  Assert.assertTrue(
      trainedFactory.getEndOfSentenceScanner() instanceof DefaultEndOfSentenceScanner);
  Assert.assertTrue(
      Arrays.equals(Factory.defaultEosCharacters, trainedFactory.getEOSCharacters()));

  // Round trip through the serialized form.
  ByteArrayOutputStream serialized = new ByteArrayOutputStream();
  trained.serialize(serialized);
  SentenceModel reloaded =
      new SentenceModel(new ByteArrayInputStream(serialized.toByteArray()));

  SentenceDetectorFactory reloadedFactory = reloaded.getFactory();
  Assert.assertNull(reloadedFactory.getAbbreviationDictionary());
  Assert.assertTrue(
      reloadedFactory.getSDContextGenerator() instanceof DefaultSDContextGenerator);
  Assert.assertTrue(
      reloadedFactory.getEndOfSentenceScanner() instanceof DefaultEndOfSentenceScanner);
  Assert.assertTrue(
      Arrays.equals(Factory.defaultEosCharacters, reloadedFactory.getEOSCharacters()));
}
/**
 * Trains a model with a {@code DummySentenceDetectorFactory} and checks that
 * the factory exposes the dummy dictionary, dummy context generator, dummy
 * end-of-sentence scanner and the configured characters, both on the freshly
 * trained model and after a serialize/deserialize round trip. Finally the
 * reloaded factory's dictionary and end-of-sentence characters are compared
 * with the values reported by the originally trained model.
 *
 * @throws IOException if training or (de)serialization fails
 */
@Test
public void testDummyFactory() throws IOException {
  Dictionary abbreviations = loadAbbDictionary();
  char[] eosChars = {'.', '?'};

  SentenceModel trained =
      train(new DummySentenceDetectorFactory("eng", true, abbreviations, eosChars));

  // Factory as held by the freshly trained model.
  SentenceDetectorFactory trainedFactory = trained.getFactory();
  Assert.assertTrue(trainedFactory.getAbbreviationDictionary() instanceof DummyDictionary);
  Assert.assertTrue(trainedFactory.getSDContextGenerator() instanceof DummySDContextGenerator);
  Assert.assertTrue(trainedFactory.getEndOfSentenceScanner() instanceof DummyEOSScanner);
  Assert.assertTrue(Arrays.equals(eosChars, trainedFactory.getEOSCharacters()));

  // Round trip through the serialized form.
  ByteArrayOutputStream serialized = new ByteArrayOutputStream();
  trained.serialize(serialized);
  SentenceModel reloaded =
      new SentenceModel(new ByteArrayInputStream(serialized.toByteArray()));

  SentenceDetectorFactory reloadedFactory = reloaded.getFactory();
  Assert.assertTrue(reloadedFactory.getAbbreviationDictionary() instanceof DummyDictionary);
  Assert.assertTrue(reloadedFactory.getSDContextGenerator() instanceof DummySDContextGenerator);
  Assert.assertTrue(reloadedFactory.getEndOfSentenceScanner() instanceof DummyEOSScanner);
  Assert.assertTrue(Arrays.equals(eosChars, reloadedFactory.getEOSCharacters()));

  // The reloaded factory must agree with the accessors of the trained model.
  Assert.assertEquals(reloadedFactory.getAbbreviationDictionary(), trained.getAbbreviations());
  Assert.assertTrue(
      Arrays.equals(reloadedFactory.getEOSCharacters(), trained.getEosCharacters()));
}
/**
 * Builds an {@link ApacheExtractor} by instantiating the OpenNLP
 * name finder, tokenizer and sentence detector from their classpath
 * model resources.
 *
 * <p>Each model stream is opened in its own try-with-resources statement
 * so it is closed as soon as the corresponding model has been read,
 * including when loading fails. The models are loaded in the order
 * name finder, tokenizer, sentence detector.
 *
 * @throws IOException if one of the models cannot be read
 */
public ApacheExtractor() throws IOException {
  // java.io.InputStream is fully qualified so the block does not depend on an
  // import that may be absent from the file.
  try (java.io.InputStream nerModelStream =
           ApacheExtractor.class.getResourceAsStream(pathToNERModel)) {
    nameFinder = new NameFinderME(new TokenNameFinderModel(nerModelStream));
  }
  try (java.io.InputStream tokenizerModelStream =
           ApacheExtractor.class.getResourceAsStream(pathToTokenizerModel)) {
    tokenizer = new TokenizerME(new TokenizerModel(tokenizerModelStream));
  }
  try (java.io.InputStream sentenceModelStream =
           ApacheExtractor.class.getResourceAsStream(pathToSentenceDetectorModel)) {
    sentenceDetector = new SentenceDetectorME(new SentenceModel(sentenceModelStream));
  }
}
/**
 * Builds an {@link ApacheExtractor} by instantiating the OpenNLP
 * name finder, tokenizer and sentence detector from their classpath
 * model resources.
 *
 * <p>Each model stream is opened in its own try-with-resources statement
 * so it is closed as soon as the corresponding model has been read,
 * including when loading fails. The models are loaded in the order
 * name finder, tokenizer, sentence detector.
 *
 * @throws IOException if one of the models cannot be read
 */
public ApacheExtractor() throws IOException {
  // java.io.InputStream is fully qualified so the block does not depend on an
  // import that may be absent from the file.
  try (java.io.InputStream nerModelStream =
           ApacheExtractor.class.getResourceAsStream(pathToNERModel)) {
    nameFinder = new NameFinderME(new TokenNameFinderModel(nerModelStream));
  }
  try (java.io.InputStream tokenizerModelStream =
           ApacheExtractor.class.getResourceAsStream(pathToTokenizerModel)) {
    tokenizer = new TokenizerME(new TokenizerModel(tokenizerModelStream));
  }
  try (java.io.InputStream sentenceModelStream =
           ApacheExtractor.class.getResourceAsStream(pathToSentenceDetectorModel)) {
    sentenceDetector = new SentenceDetectorME(new SentenceModel(sentenceModelStream));
  }
}
public static SentenceModel train(String languageCode, ObjectStream<SentenceSample> samples, SentenceDetectorFactory sdFactory, TrainingParameters mlParams) throws IOException { Map<String, String> manifestInfoEntries = new HashMap<>(); // TODO: Fix the EventStream to throw exceptions when training goes wrong ObjectStream<Event> eventStream = new SDEventStream(samples, sdFactory.getSDContextGenerator(), sdFactory.getEndOfSentenceScanner()); EventTrainer trainer = TrainerFactory.getEventTrainer(mlParams, manifestInfoEntries); MaxentModel sentModel = trainer.train(eventStream); return new SentenceModel(languageCode, sentModel, manifestInfoEntries, sdFactory); }