/**
 * Splits {@code text} into sentences using a {@link SentenceDetectorME} built
 * from the model found at the {@code RESOURCES_EN_SENT_BIN} class-path resource.
 *
 * @param text the text to segment
 * @return the sentences reported by {@code sentDetect}
 * @throws Exception if loading the model or running the detector fails
 */
private String[] testOpenNLP(String text) throws Exception {
  try (InputStream modelStream = getClass().getResourceAsStream(RESOURCES_EN_SENT_BIN)) {
    SentenceDetectorME detector = new SentenceDetectorME(new SentenceModel(modelStream));
    String[] sentences = detector.sentDetect(text);
    return sentences;
  }
}
/** * @deprecated Use a {@link SentenceDetectorFactory} to extend * SentenceDetector functionality. */ public SentenceDetectorME(SentenceModel model, Factory factory) { this.model = model.getMaxentModel(); // if the model has custom EOS characters set, use this to get the context // generator and the EOS scanner; otherwise use language-specific defaults char[] customEOSCharacters = model.getEosCharacters(); if (customEOSCharacters == null) { cgen = factory.createSentenceContextGenerator(model.getLanguage(), getAbbreviations(model.getAbbreviations())); scanner = factory.createEndOfSentenceScanner(model.getLanguage()); } else { cgen = factory.createSentenceContextGenerator( getAbbreviations(model.getAbbreviations()), customEOSCharacters); scanner = factory.createEndOfSentenceScanner(customEOSCharacters); } useTokenEnd = model.useTokenEnd(); }
/** * Initializes the current instance. * * @param model the {@link SentenceModel} */ public SentenceDetectorME(SentenceModel model) { SentenceDetectorFactory sdFactory = model.getFactory(); this.model = model.getMaxentModel(); cgen = sdFactory.getSDContextGenerator(); scanner = sdFactory.getEndOfSentenceScanner(); useTokenEnd = sdFactory.isUseTokenEnd(); }
/**
 * Loads the sentence detector model from {@code sdModelPath}, builds the
 * {@link SentenceDetectorCtakes} instance and fills the set of segments to
 * skip from {@code skipSegmentsArray} (left empty when the array is null).
 *
 * @param aContext the UIMA context, passed on to the superclass
 * @throws ResourceInitializationException if the model stream cannot be
 *         opened, read or closed; the {@link IOException} is kept as cause
 */
@Override
public void initialize(UimaContext aContext) throws ResourceInitializationException {
  super.initialize(aContext);
  try (InputStream is = FileLocator.getAsStream(sdModelPath)) {
    logger.info("Sentence detector model file: " + sdModelPath);
    sdmodel = new SentenceModel(is);

    EndOfSentenceScannerImpl eoss = new EndOfSentenceScannerImpl();
    DefaultSDContextGenerator cg =
        new DefaultSDContextGenerator(eoss.getEndOfSentenceCharacters());
    sentenceDetector = new SentenceDetectorCtakes(sdmodel.getMaxentModel(), cg, eoss);

    skipSegmentsSet = new HashSet<>();
    if (skipSegmentsArray != null) {
      Collections.addAll(skipSegmentsSet, skipSegmentsArray);
    }
  } catch (IOException e) {
    // Report through the logger instead of printing the stack trace to stderr;
    // the exception is still propagated with its cause.
    logger.error("Unable to load sentence detector model file: " + sdModelPath, e);
    throw new ResourceInitializationException(e);
  }
}
/**
 * Trains a model with a plain {@link SentenceDetectorFactory} (abbreviation
 * dictionary and custom EOS characters) and checks that the factory exposes
 * the default context generator, the default EOS scanner and the configured
 * EOS characters, both directly after training and after a
 * serialize/deserialize round trip.
 */
@Test
public void testDefault() throws IOException {
  Dictionary abbreviations = loadAbbDictionary();
  char[] expectedEos = {'.', '?'};

  SentenceModel trained =
      train(new SentenceDetectorFactory("eng", true, abbreviations, expectedEos));

  // Factory of the freshly trained model.
  SentenceDetectorFactory trainedFactory = trained.getFactory();
  Assert.assertTrue(trainedFactory.getSDContextGenerator() instanceof DefaultSDContextGenerator);
  Assert.assertTrue(trainedFactory.getEndOfSentenceScanner() instanceof DefaultEndOfSentenceScanner);
  Assert.assertTrue(Arrays.equals(expectedEos, trainedFactory.getEOSCharacters()));

  // Round trip through the serialized form.
  ByteArrayOutputStream buffer = new ByteArrayOutputStream();
  trained.serialize(buffer);
  byte[] serialized = buffer.toByteArray();
  SentenceModel restored = new SentenceModel(new ByteArrayInputStream(serialized));

  // Factory of the deserialized model must look the same.
  SentenceDetectorFactory restoredFactory = restored.getFactory();
  Assert.assertTrue(restoredFactory.getSDContextGenerator() instanceof DefaultSDContextGenerator);
  Assert.assertTrue(restoredFactory.getEndOfSentenceScanner() instanceof DefaultEndOfSentenceScanner);
  Assert.assertTrue(Arrays.equals(expectedEos, restoredFactory.getEOSCharacters()));
}
/**
 * Trains a model with a {@code DummySentenceDetectorFactory} and checks that
 * the dummy dictionary, context generator and EOS scanner are reported by the
 * model's factory, both directly after training and after a
 * serialize/deserialize round trip. Finally compares the deserialized
 * factory's dictionary and EOS characters with those of the trained model.
 */
@Test
public void testDummyFactory() throws IOException {
  Dictionary abbreviations = loadAbbDictionary();
  char[] expectedEos = {'.', '?'};

  SentenceModel trained =
      train(new DummySentenceDetectorFactory("eng", true, abbreviations, expectedEos));

  // Factory of the freshly trained model.
  SentenceDetectorFactory trainedFactory = trained.getFactory();
  Assert.assertTrue(trainedFactory.getAbbreviationDictionary() instanceof DummyDictionary);
  Assert.assertTrue(trainedFactory.getSDContextGenerator() instanceof DummySDContextGenerator);
  Assert.assertTrue(trainedFactory.getEndOfSentenceScanner() instanceof DummyEOSScanner);
  Assert.assertTrue(Arrays.equals(expectedEos, trainedFactory.getEOSCharacters()));

  // Round trip through the serialized form.
  ByteArrayOutputStream buffer = new ByteArrayOutputStream();
  trained.serialize(buffer);
  byte[] serialized = buffer.toByteArray();
  SentenceModel restored = new SentenceModel(new ByteArrayInputStream(serialized));

  // Factory of the deserialized model must look the same.
  SentenceDetectorFactory restoredFactory = restored.getFactory();
  Assert.assertTrue(restoredFactory.getAbbreviationDictionary() instanceof DummyDictionary);
  Assert.assertTrue(restoredFactory.getSDContextGenerator() instanceof DummySDContextGenerator);
  Assert.assertTrue(restoredFactory.getEndOfSentenceScanner() instanceof DummyEOSScanner);
  Assert.assertTrue(Arrays.equals(expectedEos, restoredFactory.getEOSCharacters()));

  // Deserialized factory versus the originally trained model.
  Assert.assertEquals(restoredFactory.getAbbreviationDictionary(), trained.getAbbreviations());
  Assert.assertTrue(Arrays.equals(restoredFactory.getEOSCharacters(), trained.getEosCharacters()));
}
/**
 * Finishes training and writes the resulting model to {@code sentence.bin}
 * below {@code outputPath}.
 * <p>
 * Adds {@code POISON} to {@code samplesQueue} (presumably the end-of-input
 * marker for the training consumer; confirm against the consumer), waits on
 * {@code modelTrained}, and then serializes {@code sentenceModel}. The output
 * stream is closed even when serialization fails.
 *
 * @throws RuntimeException if {@code ioException} was recorded, if no model
 *         was produced, if the wait is interrupted (the interrupt flag is
 *         restored), or if the model file cannot be written
 */
@Override
public void done() {
  samplesQueue.add(POISON);
  try {
    modelTrained.await();

    if (ioException != null) {
      throw new RuntimeException(ioException);
    }
    if (sentenceModel == null) {
      throw new RuntimeException("Error training sentence model.");
    }

    // try-with-resources guarantees the file is flushed and closed.
    try (OutputStream outputStream = Files.newOutputStream(
        outputPath.resolve("sentence.bin"), CREATE, TRUNCATE_EXISTING)) {
      sentenceModel.serialize(outputStream);
    }
  } catch (InterruptedException e) {
    // Restore the interrupt flag so callers up the stack can observe it.
    Thread.currentThread().interrupt();
    throw new RuntimeException("Interrupted before model could be saved.", e);
  } catch (IOException e) {
    throw new RuntimeException("Failed to write out model.", e);
  }
}
/**
 * Creates a sentence detector for the given language.
 * <p>
 * Best effort: when no model is available, or looking up the model or
 * creating the detector fails, {@code null} is returned instead of throwing.
 *
 * @param language the language used to look up the sentence model
 * @return a new {@link SentenceDetectorME}, or {@code null} if none could be
 *         created
 */
private SentenceDetector getSentenceDetector(String language) {
    try {
        SentenceModel model = openNLP.getSentenceModel(language);
        if (model != null) {
            log.debug("Sentence Detection Model {} for lanugage '{}' version: {}",
                new Object[]{model.getClass().getSimpleName(),
                    model.getLanguage(),
                    model.getVersion() != null ? model.getVersion() : "undefined"});
            return new SentenceDetectorME(model);
        }
    } catch (Exception e) {
        // Keep the best-effort contract (fall through and return null), but do
        // not hide the reason why the model could not be used.
        log.warn("Unable to load Sentence Detection Model for language '" + language + "'", e);
    }
    log.debug("Sentence Detection Model for Language '{}' not available.", language);
    return null;
}

private POSTagger getPOSTagger(String language) {
/**
 * Validates the artifact map after the superclass checks: the entry stored
 * under {@code MAXENT_MODEL_ENTRY_NAME} must be a {@link MaxentModel}, and
 * {@code ModelUtil.validateOutcomes} must accept that model for the
 * {@code SPLIT} / {@code NO_SPLIT} outcomes of {@link SentenceDetectorME}.
 *
 * @throws InvalidFormatException if either check fails
 */
@Override
protected void validateArtifactMap() throws InvalidFormatException {
  super.validateArtifactMap();

  Object maxentArtifact = artifactMap.get(MAXENT_MODEL_ENTRY_NAME);
  boolean hasMaxentModel = maxentArtifact instanceof MaxentModel;
  if (!hasMaxentModel) {
    throw new InvalidFormatException("Unable to find " + MAXENT_MODEL_ENTRY_NAME +
        " maxent model!");
  }

  boolean outcomesCompatible = ModelUtil.validateOutcomes(
      getMaxentModel(), SentenceDetectorME.SPLIT, SentenceDetectorME.NO_SPLIT);
  if (!outcomesCompatible) {
    throw new InvalidFormatException("The maxent model is not compatible " +
        "with the sentence detector!");
  }
}
StandardCharsets.UTF_8)), factory, mlParams); Assert.assertEquals("eng", sentdetectModel.getLanguage());
/**
 * Initializes the annotator: reads the sentence model from
 * {@code sdModelPath}, creates the {@link SentenceDetectorCtakes} from it and
 * copies {@code skipSegmentsArray} (when not null) into a fresh
 * {@code skipSegmentsSet}.
 *
 * @param aContext the UIMA context handed to {@code super.initialize}
 * @throws ResourceInitializationException wrapping any {@link IOException}
 *         raised while opening, reading or closing the model stream
 */
@Override
public void initialize(UimaContext aContext) throws ResourceInitializationException {
  super.initialize(aContext);

  try (InputStream is = FileLocator.getAsStream(sdModelPath)) {
    logger.info("Sentence detector model file: " + sdModelPath);
    sdmodel = new SentenceModel(is);

    EndOfSentenceScannerImpl eoss = new EndOfSentenceScannerImpl();
    DefaultSDContextGenerator cg =
        new DefaultSDContextGenerator(eoss.getEndOfSentenceCharacters());
    sentenceDetector = new SentenceDetectorCtakes(sdmodel.getMaxentModel(), cg, eoss);

    skipSegmentsSet = new HashSet<>();
    if (skipSegmentsArray != null) {
      Collections.addAll(skipSegmentsSet, skipSegmentsArray);
    }
  } catch (IOException e) {
    // Use the logger rather than printStackTrace(); the cause travels with
    // the rethrown exception.
    logger.error("Unable to load sentence detector model file: " + sdModelPath, e);
    throw new ResourceInitializationException(e);
  }
}
/**
 * Trains a model with a {@link SentenceDetectorFactory} that has no
 * abbreviation dictionary and checks that the factory reports a null
 * dictionary, the default context generator, the default EOS scanner and the
 * configured EOS characters, both directly after training and after a
 * serialize/deserialize round trip.
 */
@Test
public void testNullDict() throws IOException {
  Dictionary noAbbreviations = null;
  char[] expectedEos = {'.', '?'};

  SentenceModel trained =
      train(new SentenceDetectorFactory("eng", true, noAbbreviations, expectedEos));

  // Factory of the freshly trained model.
  SentenceDetectorFactory trainedFactory = trained.getFactory();
  Assert.assertNull(trainedFactory.getAbbreviationDictionary());
  Assert.assertTrue(trainedFactory.getSDContextGenerator() instanceof DefaultSDContextGenerator);
  Assert.assertTrue(trainedFactory.getEndOfSentenceScanner() instanceof DefaultEndOfSentenceScanner);
  Assert.assertTrue(Arrays.equals(expectedEos, trainedFactory.getEOSCharacters()));

  // Round trip through the serialized form.
  ByteArrayOutputStream buffer = new ByteArrayOutputStream();
  trained.serialize(buffer);
  byte[] serialized = buffer.toByteArray();
  SentenceModel restored = new SentenceModel(new ByteArrayInputStream(serialized));

  // Factory of the deserialized model must look the same.
  SentenceDetectorFactory restoredFactory = restored.getFactory();
  Assert.assertNull(restoredFactory.getAbbreviationDictionary());
  Assert.assertTrue(restoredFactory.getSDContextGenerator() instanceof DefaultSDContextGenerator);
  Assert.assertTrue(restoredFactory.getEndOfSentenceScanner() instanceof DefaultEndOfSentenceScanner);
  Assert.assertTrue(Arrays.equals(expectedEos, restoredFactory.getEOSCharacters()));
}
/**
 * Completes processing: closes {@code stream}, waits for the model produced
 * by {@code future} and serializes it to {@code targetLocation}.
 *
 * @throws AnalysisEngineProcessException if closing the stream fails, if
 *         waiting for the model is interrupted (the interrupt flag is
 *         restored) or the computation failed, or if the model cannot be
 *         written
 */
@Override
public void collectionProcessComplete() throws AnalysisEngineProcessException {
  try {
    stream.close();
  } catch (IOException e) {
    throw new AnalysisEngineProcessException(e);
  }

  SentenceModel model;
  try {
    model = future.get();
  } catch (InterruptedException e) {
    // Restore the interrupt flag so the caller's thread can observe it.
    Thread.currentThread().interrupt();
    throw new AnalysisEngineProcessException(e);
  } catch (ExecutionException e) {
    throw new AnalysisEngineProcessException(e);
  }

  try (OutputStream out = new FileOutputStream(targetLocation)) {
    model.serialize(out);
  } catch (IOException e) {
    throw new AnalysisEngineProcessException(e);
  }
}
}
/** * Initializes the current instance. * * @param model the {@link SentenceModel} */ public SentenceDetectorME(SentenceModel model) { SentenceDetectorFactory sdFactory = model.getFactory(); this.model = model.getMaxentModel(); cgen = sdFactory.getSDContextGenerator(); scanner = sdFactory.getEndOfSentenceScanner(); useTokenEnd = sdFactory.isUseTokenEnd(); }
log.debug("Sentence Detection Model {} for lanugage '{}' version: {}", new Object[]{model.getClass().getSimpleName(), model.getLanguage(), model.getVersion() != null ? model.getVersion() : "undefined"}); return new SentenceDetectorME(model);
/**
 * Runs the superclass validation and then verifies the maxent artifact: the
 * entry under {@code MAXENT_MODEL_ENTRY_NAME} has to be a {@link MaxentModel}
 * whose outcomes pass {@code ModelUtil.validateOutcomes} for
 * {@code SentenceDetectorME.SPLIT} and {@code SentenceDetectorME.NO_SPLIT}.
 *
 * @throws InvalidFormatException if the entry is missing or of another type,
 *         or if the outcome validation fails
 */
@Override
protected void validateArtifactMap() throws InvalidFormatException {
  super.validateArtifactMap();

  Object candidate = artifactMap.get(MAXENT_MODEL_ENTRY_NAME);
  if (candidate instanceof MaxentModel) {
    boolean compatible = ModelUtil.validateOutcomes(
        getMaxentModel(), SentenceDetectorME.SPLIT, SentenceDetectorME.NO_SPLIT);
    if (!compatible) {
      throw new InvalidFormatException("The maxent model is not compatible " +
          "with the sentence detector!");
    }
    return;
  }

  throw new InvalidFormatException("Unable to find " + MAXENT_MODEL_ENTRY_NAME +
      " maxent model!");
}
/**
 * Detects sentence positions in {@code text} using a {@link SentenceDetectorME}
 * built from the model found at the {@code RESOURCES_EN_SENT_BIN} class-path
 * resource.
 *
 * @param text the text to segment
 * @return the spans reported by {@code sentPosDetect}
 * @throws Exception if loading the model or running the detector fails
 */
private Span[] testOpenNLPPosition(String text) throws Exception {
  try (InputStream modelStream = getClass().getResourceAsStream(RESOURCES_EN_SENT_BIN)) {
    SentenceDetectorME detector = new SentenceDetectorME(new SentenceModel(modelStream));
    Span[] sentenceSpans = detector.sentPosDetect(text);
    return sentenceSpans;
  }
}
/** * Reads configuration parameters. * * @throws ResourceAccessException * @throws IOException * @throws InvalidFormatException */ private void configInit() throws ResourceAccessException, InvalidFormatException, IOException { String sdModelPath = (String) context .getConfigParameterValue(SD_MODEL_FILE_PARAM); InputStream is = FileLocator.getAsStream(sdModelPath); logger.info("Sentence detector model file: " + sdModelPath); sdmodel = new SentenceModel(is); is.close(); EndOfSentenceScannerImpl eoss = new EndOfSentenceScannerImpl(); char[] eosc = eoss.getEndOfSentenceCharacters(); // SentenceDContextGenerator cg = new SentenceDContextGenerator(); DefaultSDContextGenerator cg = new DefaultSDContextGenerator(eosc); sentenceDetector = new SentenceDetectorCtakes(sdmodel.getMaxentModel(), cg, eoss); skipSegmentsSet = ParamUtil.getStringParameterValuesSet( PARAM_SEGMENTS_TO_SKIP, context); // vng change begin paragraphPattern = compilePatternCheck("paragraphPattern", PARAGRAPH_PATTERN); splitPattern = compilePatternCheck("splitPattern", SPLIT_PATTERN); periodPattern = compilePatternCheck("periodPattern", PERIOD_PATTERN); acronymPattern = compilePatternCheck("acronymPattern", ACRONYM_PATTERN); // vng change end } /**
/**
 * Trains a model with a {@link SentenceDetectorFactory} that has neither an
 * abbreviation dictionary nor custom EOS characters and checks that the
 * factory reports a null dictionary, the default context generator, the
 * default EOS scanner and {@code Factory.defaultEosCharacters}, both directly
 * after training and after a serialize/deserialize round trip.
 */
@Test
public void testDefaultEOS() throws IOException {
  Dictionary noAbbreviations = null;
  char[] noCustomEos = null;

  SentenceModel trained =
      train(new SentenceDetectorFactory("eng", true, noAbbreviations, noCustomEos));

  // Factory of the freshly trained model.
  SentenceDetectorFactory trainedFactory = trained.getFactory();
  Assert.assertNull(trainedFactory.getAbbreviationDictionary());
  Assert.assertTrue(trainedFactory.getSDContextGenerator() instanceof DefaultSDContextGenerator);
  Assert.assertTrue(trainedFactory.getEndOfSentenceScanner() instanceof DefaultEndOfSentenceScanner);
  Assert.assertTrue(Arrays.equals(Factory.defaultEosCharacters,
      trainedFactory.getEOSCharacters()));

  // Round trip through the serialized form.
  ByteArrayOutputStream buffer = new ByteArrayOutputStream();
  trained.serialize(buffer);
  byte[] serialized = buffer.toByteArray();
  SentenceModel restored = new SentenceModel(new ByteArrayInputStream(serialized));

  // Factory of the deserialized model must look the same.
  SentenceDetectorFactory restoredFactory = restored.getFactory();
  Assert.assertNull(restoredFactory.getAbbreviationDictionary());
  Assert.assertTrue(restoredFactory.getSDContextGenerator() instanceof DefaultSDContextGenerator);
  Assert.assertTrue(restoredFactory.getEndOfSentenceScanner() instanceof DefaultEndOfSentenceScanner);
  Assert.assertTrue(Arrays.equals(Factory.defaultEosCharacters,
      restoredFactory.getEOSCharacters()));
}
/** * @deprecated Use a {@link SentenceDetectorFactory} to extend * SentenceDetector functionality. */ public SentenceDetectorME(SentenceModel model, Factory factory) { this.model = model.getMaxentModel(); // if the model has custom EOS characters set, use this to get the context // generator and the EOS scanner; otherwise use language-specific defaults char[] customEOSCharacters = model.getEosCharacters(); if (customEOSCharacters == null) { cgen = factory.createSentenceContextGenerator(model.getLanguage(), getAbbreviations(model.getAbbreviations())); scanner = factory.createEndOfSentenceScanner(model.getLanguage()); } else { cgen = factory.createSentenceContextGenerator( getAbbreviations(model.getAbbreviations()), customEOSCharacters); scanner = factory.createEndOfSentenceScanner(customEOSCharacters); } useTokenEnd = model.useTokenEnd(); }