/**
 * Builds a sentence context generator that uses caller-supplied end-of-sentence characters.
 *
 * @param abbreviations abbreviation set handed unchanged to the generator's constructor
 * @param customEOSCharacters end-of-sentence characters handed unchanged to the generator's
 *     constructor
 * @return a newly constructed {@link DefaultSDContextGenerator}
 */
public SDContextGenerator createSentenceContextGenerator(
    Set<String> abbreviations, char[] customEOSCharacters) {
  final SDContextGenerator generator =
      new DefaultSDContextGenerator(abbreviations, customEOSCharacters);
  return generator;
}
if (position < lastIndex && StringUtil.isWhitespace(sb.charAt(position + 1))) collectFeats.add("sn"); collectFeats.add("eos=" + escapeChar(sb.charAt(position))); int prefixStart = previousSpaceIndex(sb, position); int prevStart = previousSpaceIndex(sb, prefixStart); previous = String.valueOf(sb.subSequence(prevStart, prefixStart)).trim(); int suffixEnd = nextSpaceIndex(sb, position, lastIndex); int nextEnd = nextSpaceIndex(sb, suffixEnd + 1, lastIndex + 1); if (position == lastIndex) { suffix = ""; collectFeatures(prefix,suffix,previous,next, sb.charAt(position));
if (!prefix.equals("")) { collectFeats.add(Integer.toString(prefix.length())); if (isFirstUpper(prefix)) { collectFeats.add("xcap"); buf.setLength(0); if (!previous.equals("")) { if (isFirstUpper(previous)) { collectFeats.add("vcap"); buf.setLength(0); if (!suffix.equals("")) { if (isFirstUpper(suffix)) { collectFeats.add("scap"); buf.setLength(0); if (!next.equals("")) { if (isFirstUpper(next)) { collectFeats.add("ncap");
/**
 * Determines some of the features for the sentence detector and adds them to list features.
 *
 * <p>This overload simply delegates to the five-argument variant, passing {@code null} as the
 * end-of-sentence character.
 *
 * @param prefix String preceding the eos character in the eos token.
 * @param suffix String following the eos character in the eos token.
 * @param previous Space delimited token preceding token containing eos character.
 * @param next Space delimited token following token containing eos character.
 *
 * @deprecated use {@link #collectFeatures(String, String, String, String, Character)} instead.
 */
@Deprecated
protected void collectFeatures(String prefix, String suffix, String previous, String next) {
  collectFeatures(prefix, suffix, previous, next, null);
}
/**
 * Determines some of the features for the sentence detector and adds them to list features.
 *
 * <p>This overload simply delegates to the five-argument variant, passing {@code null} as the
 * end-of-sentence character.
 *
 * @param prefix String preceding the eos character in the eos token.
 * @param suffix String following the eos character in the eos token.
 * @param previous Space delimited token preceding token containing eos character.
 * @param next Space delimited token following token containing eos character.
 *
 * @deprecated use {@link #collectFeatures(String, String, String, String, Character)} instead.
 */
@Deprecated
protected void collectFeatures(String prefix, String suffix, String previous, String next) {
  collectFeatures(prefix, suffix, previous, next, null);
}
/**
 * Selects a sentence context generator based on a language code.
 *
 * <p>The codes {@code "th"} and {@code "tha"} yield a {@link SentenceContextGenerator}; the
 * codes {@code "pt"} and {@code "por"} yield a {@link DefaultSDContextGenerator} built with
 * {@code ptEosCharacters}; every other value (including {@code null}) yields a
 * {@link DefaultSDContextGenerator} built with {@code defaultEosCharacters}.
 *
 * @param languageCode language code used to pick the generator; may be {@code null}
 * @param abbreviations abbreviation set handed to {@link DefaultSDContextGenerator}; not used
 *     for the {@code "th"}/{@code "tha"} case
 * @return the context generator chosen for {@code languageCode}
 */
public SDContextGenerator createSentenceContextGenerator(String languageCode,
    Set<String> abbreviations) {
  // Literal-first equals() keeps a null language code on the default path.
  final boolean usesDedicatedGenerator =
      "th".equals(languageCode) || "tha".equals(languageCode);
  if (usesDedicatedGenerator) {
    return new SentenceContextGenerator();
  }

  final boolean usesPtEosCharacters =
      "pt".equals(languageCode) || "por".equals(languageCode);
  if (usesPtEosCharacters) {
    return new DefaultSDContextGenerator(abbreviations, ptEosCharacters);
  }

  return new DefaultSDContextGenerator(abbreviations, defaultEosCharacters);
}
if (position < lastIndex && StringUtil.isWhitespace(sb.charAt(position + 1))) collectFeats.add("sn"); collectFeats.add("eos=" + escapeChar(sb.charAt(position))); int prefixStart = previousSpaceIndex(sb, position); int prevStart = previousSpaceIndex(sb, prefixStart); previous = String.valueOf(sb.subSequence(prevStart, prefixStart)).trim(); int suffixEnd = nextSpaceIndex(sb, position, lastIndex); int nextEnd = nextSpaceIndex(sb, suffixEnd + 1, lastIndex + 1); if (position == lastIndex) { suffix = ""; collectFeatures(prefix,suffix,previous,next, sb.charAt(position));
/**
 * Determines some of the features for the sentence detector and adds them to list features.
 *
 * <p>This overload simply delegates to the five-argument variant, passing {@code null} as the
 * end-of-sentence character.
 *
 * @param prefix String preceding the eos character in the eos token.
 * @param suffix String following the eos character in the eos token.
 * @param previous Space delimited token preceding token containing eos character.
 * @param next Space delimited token following token containing eos character.
 *
 * @deprecated use {@link #collectFeatures(String, String, String, String, Character)} instead.
 */
@Deprecated
protected void collectFeatures(String prefix, String suffix, String previous, String next) {
  collectFeatures(prefix, suffix, previous, next, null);
}
if (!prefix.equals("")) { collectFeats.add(Integer.toString(prefix.length())); if (isFirstUpper(prefix)) { collectFeats.add("xcap"); buf.setLength(0); if (!previous.equals("")) { if (isFirstUpper(previous)) { collectFeats.add("vcap"); buf.setLength(0); if (!suffix.equals("")) { if (isFirstUpper(suffix)) { collectFeats.add("scap"); buf.setLength(0); if (!next.equals("")) { if (isFirstUpper(next)) { collectFeats.add("ncap");
/**
 * Checks the context features produced when "Mr." and "Inc." are registered as abbreviations:
 * both probed positions are expected to carry the additional {@code xabbrev} feature.
 */
@Test
public void testGetContextWithAbbreviations() throws Exception {
  final String sentence = "Mr. Smith joined RONDHUIT Inc. as a manager of sales department.";
  final HashSet<String> abbreviations = new HashSet<>(Arrays.asList("Mr./Inc.".split("/")));
  final SDContextGenerator generator =
      new DefaultSDContextGenerator(abbreviations, Factory.defaultEosCharacters);

  // Index 2 is the period that ends "Mr.".
  final String[] expectedAfterMr =
      "sn/eos=./x=Mr/2/xcap/xabbrev/v=/s=/n=Smith/ncap".split("/");
  final String[] actualAfterMr = generator.getContext(sentence, 2);
  Assert.assertArrayEquals(expectedAfterMr, actualAfterMr);

  // Index 29 is the period that ends "Inc.".
  final String[] expectedAfterInc =
      "sn/eos=./x=Inc/3/xcap/xabbrev/v=RONDHUIT/vcap/s=/n=as".split("/");
  final String[] actualAfterInc = generator.getContext(sentence, 29);
  Assert.assertArrayEquals(expectedAfterInc, actualAfterInc);
}
}
if (position < lastIndex && StringUtil.isWhitespace(sb.charAt(position + 1))) collectFeats.add("sn"); collectFeats.add("eos=" + escapeChar(sb.charAt(position))); int prefixStart = previousSpaceIndex(sb, position); int prevStart = previousSpaceIndex(sb, prefixStart); previous = String.valueOf(sb.subSequence(prevStart, prefixStart)).trim(); int suffixEnd = nextSpaceIndex(sb, position, lastIndex); int nextEnd = nextSpaceIndex(sb, suffixEnd + 1, lastIndex + 1); if (position == lastIndex) { suffix = ""; collectFeatures(prefix,suffix,previous,next, sb.charAt(position));
if (!prefix.equals("")) { collectFeats.add(Integer.toString(prefix.length())); if (isFirstUpper(prefix)) { collectFeats.add("xcap"); buf.setLength(0); if (!previous.equals("")) { if (isFirstUpper(previous)) { collectFeats.add("vcap"); buf.setLength(0); if (!suffix.equals("")) { if (isFirstUpper(suffix)) { collectFeats.add("scap"); buf.setLength(0); if (!next.equals("")) { if (isFirstUpper(next)) { collectFeats.add("ncap");
/**
 * Checks the context features produced for two period positions when no abbreviations are
 * registered with the generator.
 */
@Test
public void testGetContext() throws Exception {
  final String sentence = "Mr. Smith joined RONDHUIT Inc. as a manager of sales department.";
  final SDContextGenerator generator =
      new DefaultSDContextGenerator(Collections.<String>emptySet(), Factory.defaultEosCharacters);

  // Index 2 is the period that ends "Mr.".
  final String[] expectedAfterMr = "sn/eos=./x=Mr/2/xcap/v=/s=/n=Smith/ncap".split("/");
  final String[] actualAfterMr = generator.getContext(sentence, 2);
  Assert.assertArrayEquals(expectedAfterMr, actualAfterMr);

  // Index 29 is the period that ends "Inc.".
  final String[] expectedAfterInc =
      "sn/eos=./x=Inc/3/xcap/v=RONDHUIT/vcap/s=/n=as".split("/");
  final String[] actualAfterInc = generator.getContext(sentence, 29);
  Assert.assertArrayEquals(expectedAfterInc, actualAfterInc);
}
/**
 * Builds a sentence context generator that uses caller-supplied end-of-sentence characters.
 *
 * @param abbreviations abbreviation set handed unchanged to the generator's constructor
 * @param customEOSCharacters end-of-sentence characters handed unchanged to the generator's
 *     constructor
 * @return a newly constructed {@link DefaultSDContextGenerator}
 */
public SDContextGenerator createSentenceContextGenerator(
    Set<String> abbreviations, char[] customEOSCharacters) {
  final SDContextGenerator generator =
      new DefaultSDContextGenerator(abbreviations, customEOSCharacters);
  return generator;
}
/**
 * Builds a sentence context generator that uses caller-supplied end-of-sentence characters.
 *
 * @param abbreviations abbreviation set handed unchanged to the generator's constructor
 * @param customEOSCharacters end-of-sentence characters handed unchanged to the generator's
 *     constructor
 * @return a newly constructed {@link DefaultSDContextGenerator}
 */
public SDContextGenerator createSentenceContextGenerator(
    Set<String> abbreviations, char[] customEOSCharacters) {
  final SDContextGenerator generator =
      new DefaultSDContextGenerator(abbreviations, customEOSCharacters);
  return generator;
}
/**
 * Selects a sentence context generator based on a language code.
 *
 * <p>The codes {@code "th"} and {@code "tha"} yield a {@link SentenceContextGenerator}; the
 * codes {@code "pt"} and {@code "por"} yield a {@link DefaultSDContextGenerator} built with
 * {@code ptEosCharacters}; every other value (including {@code null}) yields a
 * {@link DefaultSDContextGenerator} built with {@code defaultEosCharacters}.
 *
 * @param languageCode language code used to pick the generator; may be {@code null}
 * @param abbreviations abbreviation set handed to {@link DefaultSDContextGenerator}; not used
 *     for the {@code "th"}/{@code "tha"} case
 * @return the context generator chosen for {@code languageCode}
 */
public SDContextGenerator createSentenceContextGenerator(String languageCode,
    Set<String> abbreviations) {
  // Literal-first equals() keeps a null language code on the default path.
  final boolean usesDedicatedGenerator =
      "th".equals(languageCode) || "tha".equals(languageCode);
  if (usesDedicatedGenerator) {
    return new SentenceContextGenerator();
  }

  final boolean usesPtEosCharacters =
      "pt".equals(languageCode) || "por".equals(languageCode);
  if (usesPtEosCharacters) {
    return new DefaultSDContextGenerator(abbreviations, ptEosCharacters);
  }

  return new DefaultSDContextGenerator(abbreviations, defaultEosCharacters);
}
/**
 * Selects a sentence context generator based on a language code.
 *
 * <p>The codes {@code "th"} and {@code "tha"} yield a {@link SentenceContextGenerator}; the
 * codes {@code "pt"} and {@code "por"} yield a {@link DefaultSDContextGenerator} built with
 * {@code ptEosCharacters}; every other value (including {@code null}) yields a
 * {@link DefaultSDContextGenerator} built with {@code defaultEosCharacters}.
 *
 * @param languageCode language code used to pick the generator; may be {@code null}
 * @param abbreviations abbreviation set handed to {@link DefaultSDContextGenerator}; not used
 *     for the {@code "th"}/{@code "tha"} case
 * @return the context generator chosen for {@code languageCode}
 */
public SDContextGenerator createSentenceContextGenerator(String languageCode,
    Set<String> abbreviations) {
  // Literal-first equals() keeps a null language code on the default path.
  final boolean usesDedicatedGenerator =
      "th".equals(languageCode) || "tha".equals(languageCode);
  if (usesDedicatedGenerator) {
    return new SentenceContextGenerator();
  }

  final boolean usesPtEosCharacters =
      "pt".equals(languageCode) || "por".equals(languageCode);
  if (usesPtEosCharacters) {
    return new DefaultSDContextGenerator(abbreviations, ptEosCharacters);
  }

  return new DefaultSDContextGenerator(abbreviations, defaultEosCharacters);
}
/**
 * Initializes the annotator: loads the sentence detector model from {@code sdModelPath},
 * builds the sentence detector, and populates the set of segments to skip.
 *
 * <p>The model stream is opened in a try-with-resources statement, so it is closed whether or
 * not loading succeeds.
 *
 * @param aContext context forwarded to {@code super.initialize}
 * @throws ResourceInitializationException if an {@link IOException} occurs while opening or
 *     reading the model; the original exception is preserved as the cause
 */
@Override
public void initialize(UimaContext aContext) throws ResourceInitializationException {
  super.initialize(aContext);

  try (InputStream is = FileLocator.getAsStream(sdModelPath)) {
    logger.info("Sentence detector model file: " + sdModelPath);
    sdmodel = new SentenceModel(is);

    EndOfSentenceScannerImpl eoss = new EndOfSentenceScannerImpl();
    DefaultSDContextGenerator cg =
        new DefaultSDContextGenerator(eoss.getEndOfSentenceCharacters());
    sentenceDetector = new SentenceDetectorCtakes(sdmodel.getMaxentModel(), cg, eoss);

    skipSegmentsSet = new HashSet<>();
    if (skipSegmentsArray != null) {
      Collections.addAll(skipSegmentsSet, skipSegmentsArray);
    }
  } catch (IOException e) {
    // The cause travels with the rethrown exception, so the stack trace is not also
    // printed to stderr here.
    throw new ResourceInitializationException(e);
  }
}
/**
 * Initializes the annotator: loads the sentence detector model from {@code sdModelPath},
 * builds the sentence detector, and populates the set of segments to skip.
 *
 * <p>The model stream is opened in a try-with-resources statement, so it is closed whether or
 * not loading succeeds.
 *
 * @param aContext context forwarded to {@code super.initialize}
 * @throws ResourceInitializationException if an {@link IOException} occurs while opening or
 *     reading the model; the original exception is preserved as the cause
 */
@Override
public void initialize(UimaContext aContext) throws ResourceInitializationException {
  super.initialize(aContext);

  try (InputStream is = FileLocator.getAsStream(sdModelPath)) {
    logger.info("Sentence detector model file: " + sdModelPath);
    sdmodel = new SentenceModel(is);

    EndOfSentenceScannerImpl eoss = new EndOfSentenceScannerImpl();
    DefaultSDContextGenerator cg =
        new DefaultSDContextGenerator(eoss.getEndOfSentenceCharacters());
    sentenceDetector = new SentenceDetectorCtakes(sdmodel.getMaxentModel(), cg, eoss);

    skipSegmentsSet = new HashSet<>();
    if (skipSegmentsArray != null) {
      Collections.addAll(skipSegmentsSet, skipSegmentsArray);
    }
  } catch (IOException e) {
    // The cause travels with the rethrown exception, so the stack trace is not also
    // printed to stderr here.
    throw new ResourceInitializationException(e);
  }
}
/** * Reads configuration parameters. * * @throws ResourceAccessException * @throws IOException * @throws InvalidFormatException */ private void configInit() throws ResourceAccessException, InvalidFormatException, IOException { String sdModelPath = (String) context .getConfigParameterValue(SD_MODEL_FILE_PARAM); InputStream is = FileLocator.getAsStream(sdModelPath); logger.info("Sentence detector model file: " + sdModelPath); sdmodel = new SentenceModel(is); is.close(); EndOfSentenceScannerImpl eoss = new EndOfSentenceScannerImpl(); char[] eosc = eoss.getEndOfSentenceCharacters(); // SentenceDContextGenerator cg = new SentenceDContextGenerator(); DefaultSDContextGenerator cg = new DefaultSDContextGenerator(eosc); sentenceDetector = new SentenceDetectorCtakes(sdmodel.getMaxentModel(), cg, eoss); skipSegmentsSet = ParamUtil.getStringParameterValuesSet( PARAM_SEGMENTS_TO_SKIP, context); // vng change begin paragraphPattern = compilePatternCheck("paragraphPattern", PARAGRAPH_PATTERN); splitPattern = compilePatternCheck("splitPattern", SPLIT_PATTERN); periodPattern = compilePatternCheck("periodPattern", PERIOD_PATTERN); acronymPattern = compilePatternCheck("acronymPattern", ACRONYM_PATTERN); // vng change end } /**