/**
 * Builds the sentence-splitting annotator from the supplied configuration.
 * The returned annotator does more than plain sentence splitting, so check the
 * {@code WordsToSentencesAnnotator} implementation before relying on details.
 *
 * @param properties configuration passed unchanged to the
 *                   {@code WordsToSentencesAnnotator} constructor
 * @return a newly constructed {@code WordsToSentencesAnnotator}
 */
public Annotator wordToSentences(Properties properties) {
  final WordsToSentencesAnnotator sentenceSplitter = new WordsToSentencesAnnotator(properties);
  return sentenceSplitter;
}
"false"))) { WordsToSentencesAnnotator wts = WordsToSentencesAnnotator.newlineSplitter(false); return wts; } else { WordsToSentencesAnnotator wts = new WordsToSentencesAnnotator(false); wts.setSentenceBoundaryToDiscard(new HashSet<String> (Arrays.asList(toks))); if (bounds != null){ String [] toks = bounds.split(","); wts.addHtmlSentenceBoundaryToDiscard(new HashSet<String> (Arrays.asList(toks))); wts.setOneSentence(Boolean.parseBoolean(isOneSentence));
if (whitespaceTokenization) { if (System.getProperty("line.separator").equals("\n")) { return WordsToSentencesAnnotator.newlineSplitter(false, "\n"); } else { return WordsToSentencesAnnotator.newlineSplitter(false, System.getProperty("line.separator"), "\n"); return WordsToSentencesAnnotator.newlineSplitter(false, PTBTokenizer.getNewlineToken()); return WordsToSentencesAnnotator.nonSplitter(false); return new WordsToSentencesAnnotator(false, boundaryTokenRegex, boundariesToDiscard, htmlElementsToDiscard, nlsb, boundaryMultiTokenRegex, tokenRegexesToDiscard);
/**
 * Creates a WordsToSentencesAnnotator that leaves the token stream unsplit,
 * so that all tokens end up in one single sentence.
 *
 * @return A WordsToSentencesAnnotator that performs no splitting.
 */
public static WordsToSentencesAnnotator nonSplitter() {
  // The processor is created through its one-argument boolean constructor with the value true.
  return new WordsToSentencesAnnotator(
      false,
      false,
      new WordToSentenceProcessor<CoreLabel>(true));
}
/** Return a WordsToSentencesAnnotator that splits on newlines (only), which are then deleted. * This constructor counts the lines by putting in empty token lists for empty lines. * It tells the underlying splitter to return empty lists of tokens * and then treats those empty lists as empty lines. We don't * actually include empty sentences in the annotation, though. But they * are used in numbering the sentence. Only this constructor leads to * empty sentences. * * @param nlToken Zero or more new line tokens, which might be a {@literal \n} or the fake * newline tokens returned from the tokenizer. * @return A WordsToSentenceAnnotator. */ public static WordsToSentencesAnnotator newlineSplitter(String... nlToken) { // this constructor will keep empty lines as empty sentences WordToSentenceProcessor<CoreLabel> wts = new WordToSentenceProcessor<>(ArrayUtils.asImmutableSet(nlToken)); return new WordsToSentencesAnnotator(false, true, wts); }
/**
 * Assembles a fresh pipeline made of four annotators, added in this order:
 * a {@code TokenizerAnnotator} for "en", a {@code WordsToSentencesAnnotator},
 * a {@code POSTaggerAnnotator} and a {@code TimeAnnotator}.
 *
 * @return the newly built pipeline
 */
private static AnnotationPipeline makeNumericPipeline() {
  final AnnotationPipeline numericPipeline = new AnnotationPipeline();

  // Tokens first, then sentences, then part-of-speech tags.
  numericPipeline.addAnnotator(new TokenizerAnnotator(false, "en"));
  numericPipeline.addAnnotator(new WordsToSentencesAnnotator(false));
  numericPipeline.addAnnotator(new POSTaggerAnnotator(false));

  // The time annotator goes last.
  numericPipeline.addAnnotator(new TimeAnnotator(true));

  return numericPipeline;
}
// Register the "en" tokenizer annotator followed by the words-to-sentences annotator on ap; both receive the same (disabled) verbose flag.
final boolean verbose = false; ap.addAnnotator(new TokenizerAnnotator(verbose, "en")); ap.addAnnotator(new WordsToSentencesAnnotator(verbose));
public static AnnotationPipeline getPipeline(Properties props, boolean tokenize) throws Exception { // useGUTime = Boolean.parseBoolean(props.getProperty("gutime", "false")); AnnotationPipeline pipeline = new AnnotationPipeline(); if (tokenize) { pipeline.addAnnotator(new TokenizerAnnotator(false, "en")); pipeline.addAnnotator(new WordsToSentencesAnnotator(false)); } pipeline.addAnnotator(new POSTaggerAnnotator(false)); // pipeline.addAnnotator(new NumberAnnotator(false)); // pipeline.addAnnotator(new QuantifiableEntityNormalizingAnnotator(false, false)); String timeAnnotator = props.getProperty("timeAnnotator", "sutime"); switch (timeAnnotator) { case "gutime": useGUTime = true; pipeline.addAnnotator(new GUTimeAnnotator("gutime", props)); break; case "heideltime": requiredDocDateFormat = "yyyy-MM-dd"; pipeline.addAnnotator(new HeidelTimeAnnotator("heideltime", props)); break; case "sutime": pipeline.addAnnotator(new TimeAnnotator("sutime", props)); break; default: throw new IllegalArgumentException("Unknown timeAnnotator: " + timeAnnotator); } return pipeline; }
/**
 * Builds the sentence-splitting annotator from the supplied configuration.
 * The returned annotator does more than plain sentence splitting, so check the
 * {@code WordsToSentencesAnnotator} implementation before relying on details.
 *
 * @param properties configuration passed unchanged to the
 *                   {@code WordsToSentencesAnnotator} constructor
 * @return a newly constructed {@code WordsToSentencesAnnotator}
 */
public Annotator wordToSentences(Properties properties) {
  final WordsToSentencesAnnotator sentenceSplitter = new WordsToSentencesAnnotator(properties);
  return sentenceSplitter;
}
/**
 * Creates a WordsToSentencesAnnotator that leaves the token stream unsplit,
 * so that all tokens end up in one single sentence.
 *
 * @return A WordsToSentencesAnnotator that performs no splitting.
 */
public static WordsToSentencesAnnotator nonSplitter() {
  // The processor is created through its one-argument boolean constructor with the value true.
  return new WordsToSentencesAnnotator(
      false,
      false,
      new WordToSentenceProcessor<CoreLabel>(true));
}
/**
 * Creates a WordsToSentencesAnnotator backed by a processor that is configured
 * with the newline string {@literal "\n"}.
 * <p>
 * NOTE(review): the three processor arguments (an empty string, an empty set and
 * the singleton set containing the newline) presumably stand for the boundary
 * regex, the boundary followers and the tokens to discard. Confirm against
 * {@code WordToSentenceProcessor}.
 *
 * @param verbose Passed through to the WordsToSentencesAnnotator constructor.
 * @return A WordsToSentencesAnnotator wrapping the newline-configured processor.
 */
public static WordsToSentencesAnnotator newlineSplitter(boolean verbose) {
  return new WordsToSentencesAnnotator(
      new WordToSentenceProcessor<CoreLabel>(
          "",
          Collections.<String>emptySet(),
          Collections.singleton("\n")),
      verbose);
}
/**
 * Creates a WordsToSentencesAnnotator that leaves the token stream unsplit,
 * so that all tokens end up in one single sentence.
 *
 * @param verbose Whether the annotator is verbose.
 * @return A WordsToSentencesAnnotator that performs no splitting.
 */
public static WordsToSentencesAnnotator nonSplitter(boolean verbose) {
  // The processor is created through its one-argument boolean constructor with the value true.
  return new WordsToSentencesAnnotator(
      verbose,
      false,
      new WordToSentenceProcessor<CoreLabel>(true));
}
/** Return a WordsToSentencesAnnotator that splits on newlines (only), which are then deleted. * This constructor counts the lines by putting in empty token lists for empty lines. * It tells the underlying splitter to return empty lists of tokens * and then treats those empty lists as empty lines. We don't * actually include empty sentences in the annotation, though. But they * are used in numbering the sentence. Only this constructor leads to * empty sentences. * * @param nlToken Zero or more new line tokens, which might be a {@literal \n} or the fake * newline tokens returned from the tokenizer. * @return A WordsToSentenceAnnotator. */ public static WordsToSentencesAnnotator newlineSplitter(String... nlToken) { // this constructor will keep empty lines as empty sentences WordToSentenceProcessor<CoreLabel> wts = new WordToSentenceProcessor<>(ArrayUtils.asImmutableSet(nlToken)); return new WordsToSentencesAnnotator(false, true, wts); }
/** Return a WordsToSentencesAnnotator that splits on newlines (only), which are then deleted. * This constructor counts the lines by putting in empty token lists for empty lines. * It tells the underlying splitter to return empty lists of tokens * and then treats those empty lists as empty lines. We don't * actually include empty sentences in the annotation, though. But they * are used in numbering the sentence. Only this constructor leads to * empty sentences. * * @param verbose Whether it is verbose. * @param nlToken Zero or more new line tokens, which might be a {@literal \n} or the fake * newline tokens returned from the tokenizer. * @return A WordsToSentenceAnnotator. */ public static WordsToSentencesAnnotator newlineSplitter(boolean verbose, String ... nlToken) { // this constructor will keep empty lines as empty sentences WordToSentenceProcessor<CoreLabel> wts = new WordToSentenceProcessor<CoreLabel>(ArrayUtils.asImmutableSet(nlToken)); return new WordsToSentencesAnnotator(verbose, true, wts); }
/** * Initializes the tokenizer to detect date columns. */ public void initialize() { Properties props = new Properties(); pipeline.addAnnotator(new TokenizerAnnotator(false) { @Override public Tokenizer<CoreLabel> getTokenizer(Reader r) { // TODO Auto-generated method stub return new PTBTokenizer<CoreLabel>(r, new CoreLabelTokenFactory(), ""); } }); pipeline.addAnnotator(new WordsToSentencesAnnotator(false)); pipeline.addAnnotator(new POSTaggerAnnotator(false)); pipeline.addAnnotator(new TimeAnnotator("sutime", props)); }
public static AnnotationPipeline getPipeline(Properties props, boolean tokenize) throws Exception { // useGUTime = Boolean.parseBoolean(props.getProperty("gutime", "false")); AnnotationPipeline pipeline = new AnnotationPipeline(); if (tokenize) { pipeline.addAnnotator(new PTBTokenizerAnnotator(false)); pipeline.addAnnotator(new WordsToSentencesAnnotator(false)); } pipeline.addAnnotator(new POSTaggerAnnotator(false)); // pipeline.addAnnotator(new NumberAnnotator(false)); // pipeline.addAnnotator(new QuantifiableEntityNormalizingAnnotator(false, false)); String timeAnnotator = props.getProperty("timeAnnotator", "sutime"); if ("gutime".equals(timeAnnotator)) { useGUTime = true; pipeline.addAnnotator(new GUTimeAnnotator()); } else if ("heideltime".equals(timeAnnotator)) { requiredDocDateFormat = "yyyy-MM-dd"; pipeline.addAnnotator(new HeidelTimeAnnotator("heideltime", props)); } else if ("sutime".equals(timeAnnotator)){ pipeline.addAnnotator(new TimeAnnotator("sutime", props)); } else { throw new IllegalArgumentException("Unknown timeAnnotator: " + timeAnnotator); } return pipeline; }
/**
 * Assembles a fresh pipeline made of four annotators, added in this order:
 * a {@code TokenizerAnnotator} for "en", a {@code WordsToSentencesAnnotator},
 * a {@code POSTaggerAnnotator} and a {@code TimeAnnotator}.
 *
 * @return the newly built pipeline
 */
private static AnnotationPipeline makeNumericPipeline() {
  final AnnotationPipeline numericPipeline = new AnnotationPipeline();

  // Tokens first, then sentences, then part-of-speech tags.
  numericPipeline.addAnnotator(new TokenizerAnnotator(false, "en"));
  numericPipeline.addAnnotator(new WordsToSentencesAnnotator(false));
  numericPipeline.addAnnotator(new POSTaggerAnnotator(false));

  // The time annotator goes last.
  numericPipeline.addAnnotator(new TimeAnnotator(true));

  return numericPipeline;
}
// Register the PTB tokenizer annotator followed by the words-to-sentences annotator on ap; both receive the same (disabled) verbose flag.
boolean verbose = false; ap.addAnnotator(new PTBTokenizerAnnotator(verbose)); ap.addAnnotator(new WordsToSentencesAnnotator(verbose));
// Register the PTB tokenizer annotator followed by the words-to-sentences annotator on ap; both receive the same (disabled) verbose flag.
boolean verbose = false; ap.addAnnotator(new PTBTokenizerAnnotator(verbose)); ap.addAnnotator(new WordsToSentencesAnnotator(verbose));
public static AnnotationPipeline getPipeline(Properties props, boolean tokenize) throws Exception { // useGUTime = Boolean.parseBoolean(props.getProperty("gutime", "false")); AnnotationPipeline pipeline = new AnnotationPipeline(); if (tokenize) { pipeline.addAnnotator(new TokenizerAnnotator(false, "en")); pipeline.addAnnotator(new WordsToSentencesAnnotator(false)); } pipeline.addAnnotator(new POSTaggerAnnotator(false)); // pipeline.addAnnotator(new NumberAnnotator(false)); // pipeline.addAnnotator(new QuantifiableEntityNormalizingAnnotator(false, false)); String timeAnnotator = props.getProperty("timeAnnotator", "sutime"); switch (timeAnnotator) { case "gutime": useGUTime = true; pipeline.addAnnotator(new GUTimeAnnotator("gutime", props)); break; case "heideltime": requiredDocDateFormat = "yyyy-MM-dd"; pipeline.addAnnotator(new HeidelTimeAnnotator("heideltime", props)); break; case "sutime": pipeline.addAnnotator(new TimeAnnotator("sutime", props)); break; default: throw new IllegalArgumentException("Unknown timeAnnotator: " + timeAnnotator); } return pipeline; }