edu.stanford.nlp.pipeline.WordsToSentencesAnnotator Java code examples

/**
 * Creates the sentence-splitting annotator from the supplied configuration.
 * The annotator does a bunch of other things besides sentence splitting, so be careful
 * to check its implementation before relying on it.
 *
 * @param properties Configuration passed unchanged to the WordsToSentencesAnnotator constructor.
 * @return A newly constructed WordsToSentencesAnnotator.
 */
public Annotator wordToSentences(Properties properties) {
 final WordsToSentencesAnnotator sentenceSplitter = new WordsToSentencesAnnotator(properties);
 return sentenceSplitter;
}

                   "false"))) {
 WordsToSentencesAnnotator wts =
  WordsToSentencesAnnotator.newlineSplitter(false);
 return wts;
} else {
 WordsToSentencesAnnotator wts =
  new WordsToSentencesAnnotator(false);
  wts.setSentenceBoundaryToDiscard(new HashSet<String>
                   (Arrays.asList(toks)));
 if (bounds != null){
  String [] toks = bounds.split(",");
  wts.addHtmlSentenceBoundaryToDiscard(new HashSet<String>
                     (Arrays.asList(toks)));
  wts.setOneSentence(Boolean.parseBoolean(isOneSentence));

if (whitespaceTokenization) {
 if (System.getProperty("line.separator").equals("\n")) {
  return WordsToSentencesAnnotator.newlineSplitter(false, "\n");
 } else {
  return WordsToSentencesAnnotator.newlineSplitter(false, System.getProperty("line.separator"), "\n");
 return WordsToSentencesAnnotator.newlineSplitter(false, PTBTokenizer.getNewlineToken());
 return WordsToSentencesAnnotator.nonSplitter(false);
return new WordsToSentencesAnnotator(false, boundaryTokenRegex, boundariesToDiscard, htmlElementsToDiscard,
    nlsb, boundaryMultiTokenRegex, tokenRegexesToDiscard);

/** Build a WordsToSentencesAnnotator that leaves the token stream unsplit,
 *  so the whole input comes back as exactly one sentence.
 *  The annotator is created with both of its boolean flags set to false.
 *
 *  @return A WordsToSentencesAnnotator.
 */
public static WordsToSentencesAnnotator nonSplitter() {
 // The processor is constructed with `true`, which per the contract above yields a single sentence.
 return new WordsToSentencesAnnotator(false, false, new WordToSentenceProcessor<CoreLabel>(true));
}

/** Build a WordsToSentencesAnnotator whose only sentence boundaries are newlines;
 *  the newline tokens themselves are removed.
 *  Lines are counted by having the underlying splitter hand back empty token lists
 *  for empty lines. Those empty lists are treated as empty lines: they take part in
 *  sentence numbering but are not added to the annotation as sentences.
 *  This is the only construction path that produces empty sentences.
 *
 *  @param  nlToken Zero or more new line tokens, which might be a {@literal \n} or the fake
 *                 newline tokens returned from the tokenizer.
 *  @return A WordsToSentencesAnnotator.
 */
public static WordsToSentencesAnnotator newlineSplitter(String... nlToken) {
 // Using this processor constructor is what keeps empty lines as empty sentences.
 final WordToSentenceProcessor<CoreLabel> lineSplitter =
     new WordToSentenceProcessor<CoreLabel>(ArrayUtils.asImmutableSet(nlToken));
 return new WordsToSentencesAnnotator(false, true, lineSplitter);
}

/**
 * Assemble a fresh pipeline made of four annotators, added in this order:
 * tokenizer (constructed with "en"), sentence splitter, POS tagger, time annotator.
 *
 * @return The newly built AnnotationPipeline.
 */
private static AnnotationPipeline makeNumericPipeline() {
 final AnnotationPipeline numericPipeline = new AnnotationPipeline();

 // Token-level stages.
 numericPipeline.addAnnotator(new TokenizerAnnotator(false, "en"));
 numericPipeline.addAnnotator(new WordsToSentencesAnnotator(false));

 // Stages added after sentence splitting.
 numericPipeline.addAnnotator(new POSTaggerAnnotator(false));
 numericPipeline.addAnnotator(new TimeAnnotator(true));

 return numericPipeline;
}

// Verbosity flag handed to both annotator constructors below; fixed to false here.
final boolean verbose = false;
// Register the tokenizer (constructed with "en") first, then the sentence splitter.
// NOTE(review): `ap` is declared outside this excerpt, presumably an AnnotationPipeline; confirm.
ap.addAnnotator(new TokenizerAnnotator(verbose, "en"));
ap.addAnnotator(new WordsToSentencesAnnotator(verbose));

 /**
  * Assemble an annotation pipeline that ends with a configurable time annotator.
  * When {@code tokenize} is set, a tokenizer and a sentence splitter are added first.
  * A POS tagger is always added. The final annotator is chosen by the
  * {@code timeAnnotator} property, which defaults to {@code "sutime"}.
  * Choosing {@code "gutime"} also sets {@code useGUTime}; choosing {@code "heideltime"}
  * also sets {@code requiredDocDateFormat} to {@code "yyyy-MM-dd"}.
  *
  * @param props Configuration; read for {@code timeAnnotator} and passed to the chosen time annotator.
  * @param tokenize Whether to add the tokenizer and sentence splitter stages.
  * @return The assembled pipeline.
  * @throws IllegalArgumentException If {@code timeAnnotator} is not one of the three known names.
  */
 public static AnnotationPipeline getPipeline(Properties props, boolean tokenize) throws Exception {
  // Formerly: useGUTime = Boolean.parseBoolean(props.getProperty("gutime", "false"));
  final AnnotationPipeline timePipeline = new AnnotationPipeline();
  if (tokenize) {
   timePipeline.addAnnotator(new TokenizerAnnotator(false, "en"));
   timePipeline.addAnnotator(new WordsToSentencesAnnotator(false));
  }
  timePipeline.addAnnotator(new POSTaggerAnnotator(false));
  // Disabled stages, kept for reference:
  //    timePipeline.addAnnotator(new NumberAnnotator(false));
  //    timePipeline.addAnnotator(new QuantifiableEntityNormalizingAnnotator(false, false));

  final String choice = props.getProperty("timeAnnotator", "sutime");
  switch (choice) {
   case "sutime":
    timePipeline.addAnnotator(new TimeAnnotator("sutime", props));
    break;
   case "heideltime":
    requiredDocDateFormat = "yyyy-MM-dd";
    timePipeline.addAnnotator(new HeidelTimeAnnotator("heideltime", props));
    break;
   case "gutime":
    useGUTime = true;
    timePipeline.addAnnotator(new GUTimeAnnotator("gutime", props));
    break;
   default:
    throw new IllegalArgumentException("Unknown timeAnnotator: " + choice);
  }
  return timePipeline;
 }

/**
 * Creates the sentence-splitting annotator from the supplied configuration.
 * The annotator does a bunch of other things besides sentence splitting, so be careful
 * to check its implementation before relying on it.
 *
 * @param properties Configuration passed unchanged to the WordsToSentencesAnnotator constructor.
 * @return A newly constructed WordsToSentencesAnnotator.
 */
public Annotator wordToSentences(Properties properties) {
 final WordsToSentencesAnnotator sentenceSplitter = new WordsToSentencesAnnotator(properties);
 return sentenceSplitter;
}

/** Build a WordsToSentencesAnnotator that leaves the token stream unsplit,
 *  so the whole input comes back as exactly one sentence.
 *  The annotator is created with both of its boolean flags set to false.
 *
 *  @return A WordsToSentencesAnnotator.
 */
public static WordsToSentencesAnnotator nonSplitter() {
 // The processor is constructed with `true`, which per the contract above yields a single sentence.
 return new WordsToSentencesAnnotator(false, false, new WordToSentenceProcessor<CoreLabel>(true));
}

/** Build a WordsToSentencesAnnotator around a processor configured with an empty string,
 *  an empty set, and a set containing only {@code "\n"}.
 *  NOTE(review): going by the method name, this presumably splits on newline tokens only;
 *  confirm the meaning of the three processor arguments against WordToSentenceProcessor.
 *
 *  @param verbose Passed through to the annotator constructor.
 *  @return A WordsToSentencesAnnotator.
 */
public static WordsToSentencesAnnotator newlineSplitter(boolean verbose) {
 return new WordsToSentencesAnnotator(
   new WordToSentenceProcessor<CoreLabel>(
     "",
     Collections.<String>emptySet(),
     Collections.singleton("\n")),
   verbose);
}

/** Build a WordsToSentencesAnnotator that leaves the token stream unsplit,
 *  so the whole input comes back as exactly one sentence.
 *
 *  @param verbose Whether it is verbose.
 *  @return A WordsToSentencesAnnotator.
 */
public static WordsToSentencesAnnotator nonSplitter(boolean verbose) {
 // The processor is constructed with `true`, which per the contract above yields a single sentence.
 return new WordsToSentencesAnnotator(verbose, false, new WordToSentenceProcessor<CoreLabel>(true));
}

/** Build a WordsToSentencesAnnotator whose only sentence boundaries are newlines;
 *  the newline tokens themselves are removed.
 *  Lines are counted by having the underlying splitter hand back empty token lists
 *  for empty lines. Those empty lists are treated as empty lines: they take part in
 *  sentence numbering but are not added to the annotation as sentences.
 *  This is the only construction path that produces empty sentences.
 *
 *  @param  nlToken Zero or more new line tokens, which might be a {@literal \n} or the fake
 *                 newline tokens returned from the tokenizer.
 *  @return A WordsToSentencesAnnotator.
 */
public static WordsToSentencesAnnotator newlineSplitter(String... nlToken) {
 // Using this processor constructor is what keeps empty lines as empty sentences.
 final WordToSentenceProcessor<CoreLabel> lineSplitter =
     new WordToSentenceProcessor<CoreLabel>(ArrayUtils.asImmutableSet(nlToken));
 return new WordsToSentencesAnnotator(false, true, lineSplitter);
}

/** Build a WordsToSentencesAnnotator whose only sentence boundaries are newlines;
 *  the newline tokens themselves are removed.
 *  Lines are counted by having the underlying splitter hand back empty token lists
 *  for empty lines. Those empty lists are treated as empty lines: they take part in
 *  sentence numbering but are not added to the annotation as sentences.
 *  This is the only construction path that produces empty sentences.
 *
 *  @param verbose Whether it is verbose.
 *  @param  nlToken Zero or more new line tokens, which might be a {@literal \n} or the fake
 *                 newline tokens returned from the tokenizer.
 *  @return A WordsToSentencesAnnotator.
 */
public static WordsToSentencesAnnotator newlineSplitter(boolean verbose, String ... nlToken) {
 // Using this processor constructor is what keeps empty lines as empty sentences.
 return new WordsToSentencesAnnotator(
     verbose,
     true,
     new WordToSentenceProcessor<CoreLabel>(ArrayUtils.asImmutableSet(nlToken)));
}

/**
 * Initializes the tokenizer to detect date columns.
 * Adds four annotators to {@code pipeline}, in order: a tokenizer whose
 * {@code getTokenizer} is overridden to return a PTBTokenizer, a sentence splitter,
 * a POS tagger, and a time annotator named "sutime" that is given empty properties.
 */
public void initialize() {
  final TokenizerAnnotator ptbBackedTokenizer = new TokenizerAnnotator(false) {
    @Override
    public Tokenizer<CoreLabel> getTokenizer(Reader r) {
      // Always tokenize with PTBTokenizer; the third argument is an empty string
      // (presumably the tokenizer options; confirm against PTBTokenizer).
      return new PTBTokenizer<CoreLabel>(r, new CoreLabelTokenFactory(), "");
    }
  };
  pipeline.addAnnotator(ptbBackedTokenizer);
  pipeline.addAnnotator(new WordsToSentencesAnnotator(false));
  pipeline.addAnnotator(new POSTaggerAnnotator(false));

  // The time annotator receives an empty configuration.
  final Properties emptyConfig = new Properties();
  pipeline.addAnnotator(new TimeAnnotator("sutime", emptyConfig));
}

 /**
  * Assemble an annotation pipeline that ends with a configurable time annotator.
  * When {@code tokenize} is set, a PTB tokenizer and a sentence splitter are added first.
  * A POS tagger is always added. The final annotator is chosen by the
  * {@code timeAnnotator} property, which defaults to {@code "sutime"}.
  * Choosing {@code "gutime"} also sets {@code useGUTime}; choosing {@code "heideltime"}
  * also sets {@code requiredDocDateFormat} to {@code "yyyy-MM-dd"}.
  *
  * @param props Configuration; read for {@code timeAnnotator} and passed to some time annotators.
  * @param tokenize Whether to add the tokenizer and sentence splitter stages.
  * @return The assembled pipeline.
  * @throws IllegalArgumentException If {@code timeAnnotator} is not one of the three known names.
  */
 public static AnnotationPipeline getPipeline(Properties props, boolean tokenize) throws Exception
 {
  // Formerly: useGUTime = Boolean.parseBoolean(props.getProperty("gutime", "false"));
  final AnnotationPipeline timePipeline = new AnnotationPipeline();
  if (tokenize) {
   timePipeline.addAnnotator(new PTBTokenizerAnnotator(false));
   timePipeline.addAnnotator(new WordsToSentencesAnnotator(false));
  }
  timePipeline.addAnnotator(new POSTaggerAnnotator(false));
  // Disabled stages, kept for reference:
  //    timePipeline.addAnnotator(new NumberAnnotator(false));
  //    timePipeline.addAnnotator(new QuantifiableEntityNormalizingAnnotator(false, false));

  final String choice = props.getProperty("timeAnnotator", "sutime");
  if ("gutime".equals(choice)) {
   useGUTime = true;
   timePipeline.addAnnotator(new GUTimeAnnotator());
   return timePipeline;
  }
  if ("heideltime".equals(choice)) {
   requiredDocDateFormat = "yyyy-MM-dd";
   timePipeline.addAnnotator(new HeidelTimeAnnotator("heideltime", props));
   return timePipeline;
  }
  if ("sutime".equals(choice)) {
   timePipeline.addAnnotator(new TimeAnnotator("sutime", props));
   return timePipeline;
  }
  // None of the known annotator names matched.
  throw new IllegalArgumentException("Unknown timeAnnotator: " + choice);
 }

/**
 * Assemble a fresh pipeline made of four annotators, added in this order:
 * tokenizer (constructed with "en"), sentence splitter, POS tagger, time annotator.
 *
 * @return The newly built AnnotationPipeline.
 */
private static AnnotationPipeline makeNumericPipeline() {
 final AnnotationPipeline numericPipeline = new AnnotationPipeline();

 // Token-level stages.
 numericPipeline.addAnnotator(new TokenizerAnnotator(false, "en"));
 numericPipeline.addAnnotator(new WordsToSentencesAnnotator(false));

 // Stages added after sentence splitting.
 numericPipeline.addAnnotator(new POSTaggerAnnotator(false));
 numericPipeline.addAnnotator(new TimeAnnotator(true));

 return numericPipeline;
}

// Verbosity flag handed to both annotator constructors below; set to false here.
boolean verbose = false;
// Register the PTB tokenizer annotator first, then the sentence splitter.
// NOTE(review): `ap` is declared outside this excerpt, presumably an AnnotationPipeline; confirm.
ap.addAnnotator(new PTBTokenizerAnnotator(verbose));
ap.addAnnotator(new WordsToSentencesAnnotator(verbose));

// Verbosity flag handed to both annotator constructors below; set to false here.
boolean verbose = false;
// Register the PTB tokenizer annotator first, then the sentence splitter.
// NOTE(review): `ap` is declared outside this excerpt, presumably an AnnotationPipeline; confirm.
ap.addAnnotator(new PTBTokenizerAnnotator(verbose));
ap.addAnnotator(new WordsToSentencesAnnotator(verbose));

 /**
  * Assemble an annotation pipeline that ends with a configurable time annotator.
  * When {@code tokenize} is set, a tokenizer and a sentence splitter are added first.
  * A POS tagger is always added. The final annotator is chosen by the
  * {@code timeAnnotator} property, which defaults to {@code "sutime"}.
  * Choosing {@code "gutime"} also sets {@code useGUTime}; choosing {@code "heideltime"}
  * also sets {@code requiredDocDateFormat} to {@code "yyyy-MM-dd"}.
  *
  * @param props Configuration; read for {@code timeAnnotator} and passed to the chosen time annotator.
  * @param tokenize Whether to add the tokenizer and sentence splitter stages.
  * @return The assembled pipeline.
  * @throws IllegalArgumentException If {@code timeAnnotator} is not one of the three known names.
  */
 public static AnnotationPipeline getPipeline(Properties props, boolean tokenize) throws Exception {
  // Formerly: useGUTime = Boolean.parseBoolean(props.getProperty("gutime", "false"));
  final AnnotationPipeline timePipeline = new AnnotationPipeline();
  if (tokenize) {
   timePipeline.addAnnotator(new TokenizerAnnotator(false, "en"));
   timePipeline.addAnnotator(new WordsToSentencesAnnotator(false));
  }
  timePipeline.addAnnotator(new POSTaggerAnnotator(false));
  // Disabled stages, kept for reference:
  //    timePipeline.addAnnotator(new NumberAnnotator(false));
  //    timePipeline.addAnnotator(new QuantifiableEntityNormalizingAnnotator(false, false));

  final String choice = props.getProperty("timeAnnotator", "sutime");
  switch (choice) {
   case "sutime":
    timePipeline.addAnnotator(new TimeAnnotator("sutime", props));
    break;
   case "heideltime":
    requiredDocDateFormat = "yyyy-MM-dd";
    timePipeline.addAnnotator(new HeidelTimeAnnotator("heideltime", props));
    break;
   case "gutime":
    useGUTime = true;
    timePipeline.addAnnotator(new GUTimeAnnotator("gutime", props));
    break;
   default:
    throw new IllegalArgumentException("Unknown timeAnnotator: " + choice);
  }
  return timePipeline;
 }

Javadoc

This class assumes that there is a List under the TokensAnnotation field, runs it through edu.stanford.nlp.process.WordToSentenceProcessor, and puts the new List under the SentencesAnnotation field.

Most used methods

<init>
newlineSplitter
Return a WordsToSentencesAnnotator that splits on newlines (only), which are then deleted. This cons
addHtmlSentenceBoundaryToDiscard
nonSplitter
Return a WordsToSentencesAnnotator that never splits the token stream. You just get one sentence.
setOneSentence
setSentenceBoundaryToDiscard

Popular in Java

Finding current android device location
getApplicationContext (Context)
scheduleAtFixedRate (Timer)
runOnUiThread (Activity)
IOException (java.io)
Signals a general, I/O-related error. Error details may be specified when calling the constructor, a
PrintStream (java.io)
Fake signature of an existing Java class.
TimeUnit (java.util.concurrent)
A TimeUnit represents time durations at a given unit of granularity and provides utility methods to
BorderLayout (java.awt)
A border layout lays out a container, arranging and resizing its components to fit in five regions:
FlowLayout (java.awt)
A flow layout arranges components in a left-to-right flow, much like lines of text in a paragraph. F
Get (org.apache.hadoop.hbase.client)
Used to perform Get operations on a single row. To get everything for a row, instantiate a Get objec
Top Vim plugins

How to use WordsToSentencesAnnotator in edu.stanford.nlp.pipeline

Best Java code snippets using edu.stanford.nlp.pipeline.WordsToSentencesAnnotator (Showing top 20 results out of 315)

How to use
WordsToSentencesAnnotator
in
edu.stanford.nlp.pipeline