Refine search
public static void loadConllFile(String inFile, List<CoreMap> sents, List<DependencyTree> trees, boolean unlabeled, boolean cPOS) CoreLabelTokenFactory tf = new CoreLabelTokenFactory(false); if (sentenceTokens.size() > 0) { trees.add(tree); CoreMap sentence = new CoreLabel(); sentence.set(CoreAnnotations.TokensAnnotation.class, sentenceTokens); sents.add(sentence); CoreLabel token = tf.makeToken(word, 0, 0); token.setTag(pos); token.set(CoreAnnotations.CoNLLDepParentIndexAnnotation.class, head); token.set(CoreAnnotations.CoNLLDepTypeAnnotation.class, depType); sentenceTokens.add(token);
private static int tokReader(Reader r, PrintWriter out, Pattern parseInsideBegin, Pattern parseInsideEnd, String options, boolean preserveLines, boolean dump, boolean lowerCase) { int numTokens = 0; PTBTokenizer<CoreLabel> tokenizer = new PTBTokenizer<CoreLabel>(r, new CoreLabelTokenFactory(), options); boolean printing = parseInsideBegin == null; // start off printing, unless you're looking for a start entity boolean beginLine = true; while (tokenizer.hasNext()) { CoreLabel obj = tokenizer.next(); String str = obj.get(TextAnnotation.class); if (lowerCase) { str = str.toLowerCase(Locale.ENGLISH); obj.set(TextAnnotation.class, str); if (dump) { str = obj.toString();
for (PTBTokenizer<CoreLabel> tokenizer = new PTBTokenizer<>(r, new CoreLabelTokenFactory(), options); tokenizer.hasNext(); ) { CoreLabel obj = tokenizer.next(); String origStr = obj.get(CoreAnnotations.TextAnnotation.class); String str; if (lowerCase) { str = origStr.toLowerCase(Locale.ENGLISH); obj.set(CoreAnnotations.TextAnnotation.class, str); } else { str = origStr; if (dump) { str = obj.toShorterString();
while (offsetBegin < token.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class) || offsetBegin >= token.get(CoreAnnotations.CharacterOffsetEndAnnotation.class)) { output.add(token); if (offsetEnd < token.get(CoreAnnotations.CharacterOffsetEndAnnotation.class)) { output.add(tokenFactory.makeToken(text.substring(token.beginPosition(), offsetBegin), token.beginPosition(), offsetBegin-token.beginPosition())); output.add(tokenFactory.makeToken(text.substring(offsetBegin,offsetEnd), offsetBegin, offsetEnd-offsetBegin)); output.add(tokenFactory.makeToken(text.substring(offsetEnd,token.endPosition()), offsetEnd, token.endPosition()-offsetEnd)); } else { output.add(tokenFactory.makeToken(text.substring(token.beginPosition(), offsetBegin), token.beginPosition(), offsetBegin-token.beginPosition())); output.add(tokenFactory.makeToken(text.substring(offsetBegin,token.endPosition()), offsetBegin, token.endPosition()-offsetBegin)); } else if (offsetEnd < token.get(CoreAnnotations.CharacterOffsetEndAnnotation.class)) { output.add(tokenFactory.makeToken(text.substring(token.beginPosition(),offsetEnd), token.beginPosition(), offsetEnd-token.beginPosition())); output.add(tokenFactory.makeToken(text.substring(offsetEnd,token.endPosition()), offsetEnd, token.endPosition()-offsetEnd)); } else {
/**
 * Constructs a new TokenizerFactory that returns CoreLabel objects and
 * uses the options passed in.
 *
 * @param options A String of options. For the default, recommended
 *                options for PTB-style tokenization compatibility, pass
 *                in an empty String.
 * @return A TokenizerFactory that returns CoreLabel objects
 */
public static PTBTokenizerFactory<CoreLabel> newCoreLabelTokenizerFactory(String options) {
  return new PTBTokenizerFactory<>(new CoreLabelTokenFactory(), options);
}
CoreMap newChunk; if (tokenFactory != null) { newChunk = tokenFactory.makeToken(); } else { newChunk = new Annotation(""); cl.setValue(cl.word()); cl.setOriginalText(cl.word());
tf = PTBTokenizer.factory(new CoreLabelTokenFactory(), "ptb3Escaping=false"); } else if (customTokenizer) { tf = PTBTokenizer.factory(new CoreLabelTokenFactory(), options.getProperty("tokenizerOptions")); } else if (printOriginalText) { tf = PTBTokenizer.factory(new CoreLabelTokenFactory(), "invertible=true"); } else if (whitespaceTokenization) { List<String> whitespaceDelims = sentenceDelims = whitespaceDelims.toArray(new String[whitespaceDelims.size()]); } else { tf = PTBTokenizer.factory(new CoreLabelTokenFactory(), ""); CoreLabel cl = (CoreLabel) word; if ( ! printSpace) { pw.print(cl.get(CoreAnnotations.BeforeAnnotation.class)); printSpace = true; pw.print(cl.get(CoreAnnotations.OriginalTextAnnotation.class)); pw.print(cl.get(CoreAnnotations.AfterAnnotation.class)); } else { if (printSpace) pw.print(" ");
/**
 * Tokenizes the input CAS document text with a PTB tokenizer and, for each token,
 * overwrites its original character span with the token's word form via
 * {@code replace}.
 * NOTE(review): {@code aOutput} is not read here — presumably {@code replace}
 * writes into it; confirm against the enclosing class.
 *
 * @param aInput  CAS whose document text is tokenized
 * @param aOutput CAS receiving the result (via {@code replace})
 * @throws AnalysisEngineProcessException per the UIMA processing contract
 */
@Override
public void process(JCas aInput, JCas aOutput) throws AnalysisEngineProcessException {
  // "invertible" preserves original text/offsets, so beginPosition/endPosition
  // map back onto the input document text.
  Tokenizer<CoreLabel> tokenizer = new PTBTokenizer<CoreLabel>(new StringReader(
      aInput.getDocumentText()), new CoreLabelTokenFactory(), "invertible");
  for (CoreLabel label : tokenizer.tokenize()) {
    replace(label.beginPosition(), label.endPosition(), label.word());
  }
}
// end of enclosing class (header outside this view)
}
/**
 * Builds a sentence CoreMap from a list of word strings: the sentence text is
 * the words joined by single spaces, and each word becomes one CoreLabel token
 * carrying a TextAnnotation.
 *
 * @param sentWords the surface forms of the sentence's words, in order
 * @return an Annotation holding the joined text and its TokensAnnotation list
 */
private static CoreMap wordsToSentence(List<String> sentWords) {
  Annotation sentence = new Annotation(StringUtils.join(sentWords, " "));
  List<CoreLabel> tokenList = new ArrayList<>(sentWords.size());
  for (int i = 0; i < sentWords.size(); i++) {
    CoreLabel tok = tokenFactory.makeToken();
    tok.set(CoreAnnotations.TextAnnotation.class, sentWords.get(i));
    tokenList.add(tok);
  }
  sentence.set(CoreAnnotations.TokensAnnotation.class, tokenList);
  return sentence;
}
currentSentence.getEndCharOffset()); CoreLabelTokenFactory tf = new CoreLabelTokenFactory(); CoreLabel stanfordTok = tf.makeToken(form, tokStart, tokLength); stanfordTok.setIndex(tokIndex++); stanfordTokens.add(stanfordTok);
/**
 * Splits a text blob into sentence strings: PTB-tokenizes the blob, groups the
 * tokens into sentences with WordToSentenceProcessor, then slices the original
 * blob at each sentence's final character offset and trims the slice.
 *
 * @param blob the raw text to segment; may be null
 * @return the trimmed sentence substrings in order, or null when blob is null
 */
public static List<String> segmenter(final String blob) {
  if (blob == null) {
    return null;
  }
  // Disable currency normalization and PTB escaping so token end offsets
  // line up with positions in the original blob.
  TokenizerFactory<CoreLabel> factory = PTBTokenizer
      .factory(new CoreLabelTokenFactory(), "normalizeCurrency=false,ptb3Escaping=false");
  Tokenizer<CoreLabel> tok = factory.getTokenizer(new StringReader(blob));
  List<CoreLabel> allTokens = new ArrayList<>();
  while (tok.hasNext()) {
    allTokens.add(tok.next());
  }
  List<String> result = new ArrayList<>();
  int sliceStart = 0;
  for (List<CoreLabel> sent : new WordToSentenceProcessor<CoreLabel>().process(allTokens)) {
    // Each sentence's raw span runs from the previous end to its last token's end.
    int sliceEnd = sent.get(sent.size() - 1).endPosition();
    result.add(blob.substring(sliceStart, sliceEnd).trim());
    sliceStart = sliceEnd;
  }
  return result;
}
/**
 * Constructs a CoreLabel with a corresponding BEGIN position and span length.
 * Delegates to {@code makeToken(String, String, int, int)}, passing the token
 * text as both the text and its own original text (no substring of a source
 * document is taken).
 *
 * @param tokenText the token's surface form
 * @param begin     character offset where the token starts
 * @param length    number of characters the token spans
 * @return the newly constructed CoreLabel
 */
@Override
public CoreLabel makeToken(String tokenText, int begin, int length) {
  return makeToken(tokenText, tokenText, begin, length);
}
public DataBag exec(Tuple input) throws IOException { if (input == null || input.size() < 1 || input.isNull(0)) return null; // Output bag DataBag bagOfTokens = bagFactory.newDefaultBag(); StringReader textInput = new StringReader(input.get(0).toString()); PTBTokenizer ptbt = new PTBTokenizer(textInput, new CoreLabelTokenFactory(), ""); for (CoreLabel label; ptbt.hasNext(); ) { label = (CoreLabel)ptbt.next(); Tuple termText = tupleFactory.newTuple(label.toString()); bagOfTokens.add(termText); } return bagOfTokens; } }
m = parseInsidePattern.matcher(""); // create once as performance hack for (PTBTokenizer<CoreLabel> tokenizer = new PTBTokenizer<CoreLabel>(r, new CoreLabelTokenFactory(), options); tokenizer.hasNext(); ) { CoreLabel obj = tokenizer.next(); String origStr = obj.get(CoreAnnotations.TextAnnotation.class); String str; if (lowerCase) { str = origStr.toLowerCase(Locale.ENGLISH); obj.set(CoreAnnotations.TextAnnotation.class, str); } else { str = origStr; if (dump) { str = obj.toString();
/**
 * Returns a TokenizerFactory for French tokenization that produces CoreLabel
 * tokens, built over a default CoreLabelTokenFactory.
 *
 * @return a new FrenchTokenizerFactory producing CoreLabel objects
 */
public static TokenizerFactory<CoreLabel> newTokenizerFactory() {
  return new FrenchTokenizerFactory<>(new CoreLabelTokenFactory());
}
CoreMap newChunk; if (tokenFactory != null) { newChunk = tokenFactory.makeToken(); } else { newChunk = new Annotation(""); cl.setValue(cl.word()); cl.setOriginalText(cl.word());
/**
 * Constructs a sentence CoreMap out of word strings. The words, joined with
 * single spaces, become the sentence text; each word is wrapped in a CoreLabel
 * token whose TextAnnotation is set, and the token list is attached as the
 * sentence's TokensAnnotation.
 *
 * @param sentWords ordered surface forms of the sentence's words
 * @return an Annotation carrying the joined text and its token list
 */
private static CoreMap wordsToSentence(List<String> sentWords) {
  final String joined = StringUtils.join(sentWords, " ");
  final Annotation result = new Annotation(joined);
  final List<CoreLabel> labels = new ArrayList<>(sentWords.size());
  for (final String word : sentWords) {
    final CoreLabel label = tokenFactory.makeToken();
    label.set(CoreAnnotations.TextAnnotation.class, word);
    labels.add(label);
  }
  result.set(CoreAnnotations.TokensAnnotation.class, labels);
  return result;
}
currentSentence.getEndCharOffset()); CoreLabelTokenFactory tf = new CoreLabelTokenFactory(); CoreLabel stanfordTok = tf.makeToken(form, tokStart, tokLength); stanfordTok.setIndex(tokIndex++); stanfordTokens.add(stanfordTok);
CoreMap newChunk; if (tokenFactory != null) { newChunk = tokenFactory.makeToken(chunkText, firstCharOffset, lastCharOffset); } else { newChunk = new Annotation(chunkText);