Refine search
public static void loadConllFile(String inFile, List<CoreMap> sents, List<DependencyTree> trees, boolean unlabeled, boolean cPOS) CoreLabelTokenFactory tf = new CoreLabelTokenFactory(false); if (sentenceTokens.size() > 0) { trees.add(tree); CoreMap sentence = new CoreLabel(); sentence.set(CoreAnnotations.TokensAnnotation.class, sentenceTokens); sents.add(sentence); CoreLabel token = tf.makeToken(word, 0, 0); token.setTag(pos); token.set(CoreAnnotations.CoNLLDepParentIndexAnnotation.class, head); token.set(CoreAnnotations.CoNLLDepTypeAnnotation.class, depType); sentenceTokens.add(token);
private static int tokReader(Reader r, PrintWriter out, Pattern parseInsideBegin, Pattern parseInsideEnd, String options, boolean preserveLines, boolean dump, boolean lowerCase) { int numTokens = 0; PTBTokenizer<CoreLabel> tokenizer = new PTBTokenizer<CoreLabel>(r, new CoreLabelTokenFactory(), options); boolean printing = parseInsideBegin == null; // start off printing, unless you're looking for a start entity boolean beginLine = true; while (tokenizer.hasNext()) { CoreLabel obj = tokenizer.next(); String str = obj.get(TextAnnotation.class); if (lowerCase) { str = str.toLowerCase(Locale.ENGLISH); obj.set(TextAnnotation.class, str); if (dump) { str = obj.toString();
for (PTBTokenizer<CoreLabel> tokenizer = new PTBTokenizer<>(r, new CoreLabelTokenFactory(), options); tokenizer.hasNext(); ) { CoreLabel obj = tokenizer.next(); String origStr = obj.get(CoreAnnotations.TextAnnotation.class); String str; if (lowerCase) { str = origStr.toLowerCase(Locale.ENGLISH); obj.set(CoreAnnotations.TextAnnotation.class, str); } else { str = origStr; if (dump) { str = obj.toShorterString();
while (offsetBegin < token.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class) || offsetBegin >= token.get(CoreAnnotations.CharacterOffsetEndAnnotation.class)) { output.add(token); if (offsetEnd < token.get(CoreAnnotations.CharacterOffsetEndAnnotation.class)) { output.add(tokenFactory.makeToken(text.substring(token.beginPosition(), offsetBegin), token.beginPosition(), offsetBegin-token.beginPosition())); output.add(tokenFactory.makeToken(text.substring(offsetBegin,offsetEnd), offsetBegin, offsetEnd-offsetBegin)); output.add(tokenFactory.makeToken(text.substring(offsetEnd,token.endPosition()), offsetEnd, token.endPosition()-offsetEnd)); } else { output.add(tokenFactory.makeToken(text.substring(token.beginPosition(), offsetBegin), token.beginPosition(), offsetBegin-token.beginPosition())); output.add(tokenFactory.makeToken(text.substring(offsetBegin,token.endPosition()), offsetBegin, token.endPosition()-offsetBegin)); } else if (offsetEnd < token.get(CoreAnnotations.CharacterOffsetEndAnnotation.class)) { output.add(tokenFactory.makeToken(text.substring(token.beginPosition(),offsetEnd), token.beginPosition(), offsetEnd-token.beginPosition())); output.add(tokenFactory.makeToken(text.substring(offsetEnd,token.endPosition()), offsetEnd, token.endPosition()-offsetEnd)); } else {
/**
 * Constructs a new TokenizerFactory that returns CoreLabel objects and
 * uses the options passed in.
 *
 * @param options A String of options. For the default, recommended
 *                options for PTB-style tokenization compatibility, pass
 *                in an empty String.
 * @return A TokenizerFactory that returns CoreLabel objects
 */
public static PTBTokenizerFactory<CoreLabel> newCoreLabelTokenizerFactory(String options) {
  return new PTBTokenizerFactory<>(new CoreLabelTokenFactory(), options);
}
CoreMap newChunk; if (tokenFactory != null) { newChunk = tokenFactory.makeToken(); } else { newChunk = new Annotation(""); cl.setValue(cl.word()); cl.setOriginalText(cl.word());
tf = PTBTokenizer.factory(new CoreLabelTokenFactory(), "ptb3Escaping=false"); } else if (customTokenizer) { tf = PTBTokenizer.factory(new CoreLabelTokenFactory(), options.getProperty("tokenizerOptions")); } else if (printOriginalText) { tf = PTBTokenizer.factory(new CoreLabelTokenFactory(), "invertible=true"); } else if (whitespaceTokenization) { List<String> whitespaceDelims = sentenceDelims = whitespaceDelims.toArray(new String[whitespaceDelims.size()]); } else { tf = PTBTokenizer.factory(new CoreLabelTokenFactory(), ""); CoreLabel cl = (CoreLabel) word; if ( ! printSpace) { pw.print(cl.get(CoreAnnotations.BeforeAnnotation.class)); printSpace = true; pw.print(cl.get(CoreAnnotations.OriginalTextAnnotation.class)); pw.print(cl.get(CoreAnnotations.AfterAnnotation.class)); } else { if (printSpace) pw.print(" ");
/**
 * Tokenizes the input CAS document text with a PTB tokenizer and, for each token,
 * overwrites its original character span with the token's word form via
 * {@code replace}.
 * NOTE(review): {@code aOutput} is not read here — presumably {@code replace}
 * writes into it; confirm against the enclosing class.
 *
 * @param aInput  CAS whose document text is tokenized
 * @param aOutput CAS receiving the result (via {@code replace})
 * @throws AnalysisEngineProcessException per the UIMA processing contract
 */
@Override
public void process(JCas aInput, JCas aOutput) throws AnalysisEngineProcessException {
  // "invertible" preserves original text/offsets, so beginPosition/endPosition
  // map back onto the input document text.
  Tokenizer<CoreLabel> tokenizer = new PTBTokenizer<CoreLabel>(new StringReader(
      aInput.getDocumentText()), new CoreLabelTokenFactory(), "invertible");
  for (CoreLabel label : tokenizer.tokenize()) {
    replace(label.beginPosition(), label.endPosition(), label.word());
  }
}
// end of enclosing class (header outside this view)
}
/**
 * Builds a sentence CoreMap from a list of word strings: the sentence text is
 * the words joined by single spaces, and each word becomes one CoreLabel token
 * carrying a TextAnnotation.
 *
 * @param sentWords the surface forms of the sentence's words, in order
 * @return an Annotation holding the joined text and its TokensAnnotation list
 */
private static CoreMap wordsToSentence(List<String> sentWords) {
  Annotation sentence = new Annotation(StringUtils.join(sentWords, " "));
  List<CoreLabel> tokenList = new ArrayList<>(sentWords.size());
  for (int i = 0; i < sentWords.size(); i++) {
    CoreLabel tok = tokenFactory.makeToken();
    tok.set(CoreAnnotations.TextAnnotation.class, sentWords.get(i));
    tokenList.add(tok);
  }
  sentence.set(CoreAnnotations.TokensAnnotation.class, tokenList);
  return sentence;
}
currentSentence.getEndCharOffset()); CoreLabelTokenFactory tf = new CoreLabelTokenFactory(); CoreLabel stanfordTok = tf.makeToken(form, tokStart, tokLength); stanfordTok.setIndex(tokIndex++); stanfordTokens.add(stanfordTok);
/**
 * Splits a text blob into sentence strings: PTB-tokenizes the blob, groups the
 * tokens into sentences with WordToSentenceProcessor, then slices the original
 * blob at each sentence's final character offset and trims the slice.
 *
 * @param blob the raw text to segment; may be null
 * @return the trimmed sentence substrings in order, or null when blob is null
 */
public static List<String> segmenter(final String blob) {
  if (blob == null) {
    return null;
  }
  // Disable currency normalization and PTB escaping so token end offsets
  // line up with positions in the original blob.
  TokenizerFactory<CoreLabel> factory = PTBTokenizer
      .factory(new CoreLabelTokenFactory(), "normalizeCurrency=false,ptb3Escaping=false");
  Tokenizer<CoreLabel> tok = factory.getTokenizer(new StringReader(blob));
  List<CoreLabel> allTokens = new ArrayList<>();
  while (tok.hasNext()) {
    allTokens.add(tok.next());
  }
  List<String> result = new ArrayList<>();
  int sliceStart = 0;
  for (List<CoreLabel> sent : new WordToSentenceProcessor<CoreLabel>().process(allTokens)) {
    // Each sentence's raw span runs from the previous end to its last token's end.
    int sliceEnd = sent.get(sent.size() - 1).endPosition();
    result.add(blob.substring(sliceStart, sliceEnd).trim());
    sliceStart = sliceEnd;
  }
  return result;
}
/**
 * Constructs a CoreLabel with a corresponding BEGIN position and span length.
 * Delegates to {@code makeToken(String, String, int, int)}, passing the token
 * text as both the text and its own original text (no substring of a source
 * document is taken).
 *
 * @param tokenText the token's surface form
 * @param begin     character offset where the token starts
 * @param length    number of characters the token spans
 * @return the newly constructed CoreLabel
 */
@Override
public CoreLabel makeToken(String tokenText, int begin, int length) {
  return makeToken(tokenText, tokenText, begin, length);
}
public DataBag exec(Tuple input) throws IOException { if (input == null || input.size() < 1 || input.isNull(0)) return null; // Output bag DataBag bagOfTokens = bagFactory.newDefaultBag(); StringReader textInput = new StringReader(input.get(0).toString()); PTBTokenizer ptbt = new PTBTokenizer(textInput, new CoreLabelTokenFactory(), ""); for (CoreLabel label; ptbt.hasNext(); ) { label = (CoreLabel)ptbt.next(); Tuple termText = tupleFactory.newTuple(label.toString()); bagOfTokens.add(termText); } return bagOfTokens; } }
m = parseInsidePattern.matcher(""); // create once as performance hack for (PTBTokenizer<CoreLabel> tokenizer = new PTBTokenizer<CoreLabel>(r, new CoreLabelTokenFactory(), options); tokenizer.hasNext(); ) { CoreLabel obj = tokenizer.next(); String origStr = obj.get(CoreAnnotations.TextAnnotation.class); String str; if (lowerCase) { str = origStr.toLowerCase(Locale.ENGLISH); obj.set(CoreAnnotations.TextAnnotation.class, str); } else { str = origStr; if (dump) { str = obj.toString();
/**
 * Returns a TokenizerFactory for French tokenization that produces CoreLabel
 * tokens, built over a default CoreLabelTokenFactory.
 *
 * @return a new FrenchTokenizerFactory producing CoreLabel objects
 */
public static TokenizerFactory<CoreLabel> newTokenizerFactory() {
  return new FrenchTokenizerFactory<>(new CoreLabelTokenFactory());
}
CoreMap newChunk; if (tokenFactory != null) { newChunk = tokenFactory.makeToken(); } else { newChunk = new Annotation(""); cl.setValue(cl.word()); cl.setOriginalText(cl.word());
/**
 * Constructs a sentence CoreMap out of word strings. The words, joined with
 * single spaces, become the sentence text; each word is wrapped in a CoreLabel
 * token whose TextAnnotation is set, and the token list is attached as the
 * sentence's TokensAnnotation.
 *
 * @param sentWords ordered surface forms of the sentence's words
 * @return an Annotation carrying the joined text and its token list
 */
private static CoreMap wordsToSentence(List<String> sentWords) {
  final String joined = StringUtils.join(sentWords, " ");
  final Annotation result = new Annotation(joined);
  final List<CoreLabel> labels = new ArrayList<>(sentWords.size());
  for (final String word : sentWords) {
    final CoreLabel label = tokenFactory.makeToken();
    label.set(CoreAnnotations.TextAnnotation.class, word);
    labels.add(label);
  }
  result.set(CoreAnnotations.TokensAnnotation.class, labels);
  return result;
}
currentSentence.getEndCharOffset()); CoreLabelTokenFactory tf = new CoreLabelTokenFactory(); CoreLabel stanfordTok = tf.makeToken(form, tokStart, tokLength); stanfordTok.setIndex(tokIndex++); stanfordTokens.add(stanfordTok);
CoreMap newChunk; if (tokenFactory != null) { newChunk = tokenFactory.makeToken(chunkText, firstCharOffset, lastCharOffset); } else { newChunk = new Annotation(chunkText);