/**
 * Joins the {@code word()} values of the given labels into one String,
 * separated by the given glue string.
 *
 * @param l    The tokens whose words are joined (e.g., a sentence)
 * @param glue Separator inserted between consecutive words
 * @return The words joined by glue
 */
public static String joinWords(Iterable<? extends HasWord> l, String glue) {
  // Pre-size the builder when the input size is cheaply known; 64 is a guess otherwise.
  // Wildcard on the Collection cast avoids the raw type (size() needs no element type).
  StringBuilder sb = new StringBuilder(l instanceof Collection ? ((Collection<?>) l).size() : 64);
  boolean first = true;
  for (HasWord o : l) {
    if ( ! first) {
      sb.append(glue);
    } else {
      first = false;
    }
    sb.append(o.word());
  }
  return sb.toString();
}
/**
 * Returns the display text for a label: its {@code word()} when the label
 * is a {@code HasWord} with a non-null word, otherwise its {@code value()}.
 *
 * @param label The label to extract text from
 * @return The word if available, else the value
 */
protected String getText(Label label) {
  if (label instanceof HasWord) {
    String w = ((HasWord) label).word();
    if (w != null) {
      return w;
    }
  }
  // Not a HasWord, or its word was null: fall back to the label value.
  return label.value();
}
/**
 * Extracts the textual content of a token, which may be a {@code HasWord},
 * a {@code String}, or a {@code CoreMap} carrying a TextAnnotation.
 *
 * @param o The token object
 * @return Its textual content
 * @throws RuntimeException If the token is of an unsupported type
 */
@SuppressWarnings("OverlyStrongTypeCast")
private static String getString(Object o) {
  if (o instanceof HasWord) {
    HasWord h = (HasWord) o;
    return h.word();
  } else if (o instanceof String) {
    return (String) o;
  } else if (o instanceof CoreMap) {
    return ((CoreMap) o).get(CoreAnnotations.TextAnnotation.class);
  } else {
    // Message updated to mention CoreMap, which the branch above accepts.
    throw new RuntimeException("Expected token to be either Word, String, or CoreMap.");
  }
}
/**
 * Returns the substring of the sentence from start (inclusive)
 * to end (exclusive).
 *
 * @param start Leftmost index of the substring
 * @param end Rightmost index of the ngram
 * @return The ngram as a String. Currently returns null if one of the indices is out of bounds.
 *     But maybe it should exception instead.
 */
public static <T> String extractNgram(List<T> list, int start, int end) {
  if (start < 0 || end > list.size() || start >= end) {
    return null;
  }
  final StringBuilder ngram = new StringBuilder();
  for (int i = start; i < end; i++) {
    // Only separate with a space once something has actually been appended.
    if (ngram.length() != 0) {
      ngram.append(' ');
    }
    T item = list.get(i);
    ngram.append(item instanceof HasWord ? ((HasWord) item).word() : item.toString());
  }
  return ngram.toString();
}
/** * Splits the Word w on the character splitChar. */ private HasWord splitTag(HasWord w) { if (splitChar == 0) { return w; } String s = w.word(); int split = s.lastIndexOf(splitChar); if (split <= 0) { // == 0 isn't allowed - no empty words! return w; } String word = s.substring(0, split); String tag = s.substring(split + 1, s.length()); return new TaggedWord(word, tag); }
/**
 * Returns a presentable version of the given PTB-tokenized words.
 * Pass in a List of Words or a Document and this method will
 * take the word() values (to prevent additional text from creeping in, e.g., POS tags),
 * and call {@link #ptb2Text(String)} on the output.
 *
 * @param ptbWords A list of HasWord objects
 * @return A presentable version of the given PTB-tokenized words
 */
public static String labelList2Text(List<? extends HasWord> ptbWords) {
  List<String> strings = new ArrayList<>(ptbWords.size());
  for (HasWord w : ptbWords) {
    strings.add(w.word());
  }
  return ptb2Text(strings);
}
/**
 * Creates a new CategoryWordTag label from an existing label.
 * The oldLabel value() -- i.e., category -- is used for the new label.
 * The tag and word are initialized iff the current label implements
 * HasTag and HasWord respectively.
 *
 * @param oldLabel The label to use as a basis of this Label
 */
public CategoryWordTag(Label oldLabel) {
  super(oldLabel);
  // Copy over tag and word only when the source label actually carries them.
  if (oldLabel instanceof HasTag) {
    tag = ((HasTag) oldLabel).tag();
  }
  if (oldLabel instanceof HasWord) {
    word = ((HasWord) oldLabel).word();
  }
}
/**
 * Prints each document token (padded to width 10) followed by the state
 * assigned to that position in each sample sequence.
 * NOTE(review): {@code samples} is a raw List (of int[]); the raw type is
 * kept to preserve the existing method signature.
 */
public void printSamples(List samples, PrintStream out) {
  for (int pos = 0; pos < document.size(); pos++) {
    HasWord token = (HasWord) document.get(pos);
    String text = (token == null) ? "null" : token.word();
    out.print(StringUtils.padOrTrim(text, 10));
    for (Object sample : samples) {
      int[] sequence = (int[]) sample;
      out.print(" " + StringUtils.padLeft(sequence[pos], 2));
    }
    out.println();
  }
}
/**
 * Escapes each word in the sentence via {@code ATBTreeUtils.escape}.
 * NOTE(review): the copy below is shallow — the returned list is a new
 * object, but its HasWord elements are shared with the input, so the
 * {@code setWord} calls mutate the caller's tokens as well.
 */
public List<HasWord> apply(List<HasWord> in) { List<HasWord> escaped = new ArrayList<>(in); for (HasWord word : escaped) { /* mutates the shared token in place */ word.setWord(ATBTreeUtils.escape(word.word())); } return escaped; }
/**
 * Parses one annotation line of the form "LABEL tok1 tok2 ..." and records,
 * for every position in {@code tokens} where the sequence tok1..tokN matches
 * consecutively, a span (start, start + N - 1) mapped to LABEL.
 *
 * @param spanToLabels Output map from (inclusive) token-index spans to labels
 * @param tokens       The tokenized sentence to search in
 * @param line         Whitespace-separated: a label followed by its token sequence
 * @throws RuntimeException If the line has a label but no tokens after it
 */
public static void extractLabels(Map<Pair<Integer, Integer>, String> spanToLabels, List<HasWord> tokens, String line) { String[] pieces = line.trim().split("\\s+"); if (pieces.length == 0) { return; } if (pieces.length == 1) { String error = "Found line with label " + line + " but no tokens to associate with that line"; throw new RuntimeException(error); } //TODO: BUG: The pieces are tokenized differently than the splitting, e.g., on possessive markers as in "actors' expenses" /* pieces[0] is the label, so the candidate window is pieces.length - 1 tokens wide; the bound below is tokens.size() - (pieces.length - 1) + 1, i.e. the last valid start position plus one. */ for (int i = 0; i < tokens.size() - pieces.length + 2; ++i) { boolean found = true; for (int j = 1; j < pieces.length; ++j) { if (!tokens.get(i + j - 1).word().equals(pieces[j])) { found = false; break; } } if (found) { /* span is inclusive on both ends */ spanToLabels.put(new Pair<>(i, i + pieces.length - 1), pieces[0]); } } }
/** <i>Note:</i> At present this clobbers the input list items. * This should be fixed. */ public List<HasWord> apply(List<HasWord> arg) { List<HasWord> ans = new ArrayList<>(arg); for (HasWord wd : ans) { String w = wd.word(); Matcher m2 = p2.matcher(w); // log.info("Escaper: w is " + w); if (m2.find()) { // log.info(" Found pattern."); w = m2.replaceAll("$1"); // log.info(" Changed it to: " + w); } String newW = UTF8EquivalenceFunction.replaceAscii(w); wd.setWord(newW); } return ans; }
/**
 * Americanize the HasWord or String coming in.
 *
 * @param w A HasWord or String to convert to American if needed.
 * @return Either the input or an Americanized version of it.
 */
@Override
public HasWord apply(HasWord w) {
  String original = w.word();
  String american = americanize(original, capitalizeTimex);
  // Only mutate the token when the spelling actually changed.
  if (!american.equals(original)) {
    w.setWord(american);
  }
  return w;
}
/** Converts an input list of {@link HasWord} in IBM Arabic to
 * LDC ATBv3 representation.
 * NOTE(review): only the list itself is copied before escaping — the copy
 * is shallow, so the shared HasWord elements ARE mutated in place and the
 * caller's tokens change too. The previous claim that the input is safely
 * copied was inaccurate.
 *
 * @param sentence A collection of type {@link edu.stanford.nlp.ling.Word}
 * @return A new list containing the (shared, now escaped) words.
 * @throws RuntimeException If a word is mapped to null
 */
@Override public List<HasWord> apply(List<HasWord> sentence) { List<HasWord> newSentence = new ArrayList<>(sentence); for (HasWord wd : newSentence) wd.setWord(apply(wd.word())); return newSentence; }
@Override @SuppressWarnings("unchecked") protected T getNext() { try { T nextToken = null; // Depending on the orthographic normalization options, // some tokens can be obliterated. In this case, keep iterating // until we see a non-zero length token. do { nextToken = (T) lexer.next(); } while (nextToken != null && nextToken.word().length() == 0); return nextToken; } catch (IOException e) { throw new RuntimeIOException(e); } }
/**
 * Escapes each token's word in place and, if configured, fixes quotes
 * across the whole sentence.
 *
 * @param input must be a List of objects of type HasWord
 */
@Override
public List<HasWord> process(List<? extends IN> input) {
  List<HasWord> result = new ArrayList<>();
  for (IN token : input) {
    // Mutates the token in place, then collects it.
    token.setWord(escapeString(token.word()));
    result.add(token);
  }
  return fixQuotes ? fixQuotes(result) : result;
}
private Distribution<Integer> getSegmentedWordLengthDistribution(Treebank tb) { // CharacterLevelTagExtender ext = new CharacterLevelTagExtender(); ClassicCounter<Integer> c = new ClassicCounter<>(); for (Tree gold : tb) { StringBuilder goldChars = new StringBuilder(); ArrayList goldYield = gold.yield(); for (Object aGoldYield : goldYield) { Word word = (Word) aGoldYield; goldChars.append(word); } List<HasWord> ourWords = segment(goldChars.toString()); for (HasWord ourWord : ourWords) { c.incrementCount(Integer.valueOf(ourWord.word().length())); } } return Distribution.getDistribution(c); }
@Override public boolean test(Dependency<G, D, N> d) { /* log.info("DRF: Checking " + d + ": hasWord?: " + (d.dependent() instanceof HasWord) + "; value: " + ((d.dependent() instanceof HasWord)? ((HasWord) d.dependent()).word(): d.dependent().value())); */ if (d == null) { return false; } String word = null; if (d.dependent() instanceof HasWord) { word = ((HasWord) d.dependent()).word(); } if (word == null) { word = d.dependent().value(); } // log.info("Dep: kid is " + ((MapLabel) d.dependent()).toString("value{map}")); return wordRejectFilter.test(word); }
/**
 * Construct a fall through tree in case we can't parse this sentence.
 *
 * @param words Words of the sentence that didn't parse
 * @return A tree with X for all the internal nodes.
 *     Preterminals have the right tag if the words are tagged.
 */
public static Tree xTree(List<? extends HasWord> words) {
  TreeFactory tf = new LabeledScoredTreeFactory();
  List<Tree> preterminals = new ArrayList<>();
  for (HasWord hw : words) {
    Tree leaf = tf.newLeaf(hw.word());
    // Use the word's tag when available, otherwise the dummy tag "XX".
    String tag = "XX";
    if (hw instanceof HasTag && ((HasTag) hw).tag() != null) {
      tag = ((HasTag) hw).tag();
    }
    preterminals.add(tf.newTreeNode(tag, Collections.singletonList(leaf)));
  }
  return tf.newTreeNode("X", preterminals);
}
/** Turns a sentence into a flat phrasal tree.
 *  The structure is S -> tag*. And then each tag goes to a word.
 *  The tag is either found from the label or made "WD".
 *  The tag and phrasal node have a StringLabel.
 *
 *  @param s The Sentence to make the Tree from
 *  @param lf The LabelFactory with which to create the new Tree labels
 *  @return The one phrasal level Tree
 */
public static Tree toFlatTree(List<? extends HasWord> s, LabelFactory lf) {
  List<Tree> preterminals = new ArrayList<>(s.size());
  for (HasWord hw : s) {
    Tree leaf = new LabeledScoredTreeNode(lf.newLabel(hw.word()));
    // TaggedWords keep their own tag; everything else gets the dummy tag "WD".
    Label tagLabel = (hw instanceof TaggedWord)
        ? new StringLabel(((TaggedWord) hw).tag())
        : lf.newLabel("WD");
    preterminals.add(new LabeledScoredTreeNode(tagLabel, Collections.singletonList(leaf)));
  }
  return new LabeledScoredTreeNode(new StringLabel("S"), preterminals);
}
/**
 * Do nothing other than decorate the label with @.
 * NOTE(review): assumes the tree's label implements both HasWord and
 * HasTag; a ClassCastException results otherwise — confirm callers
 * guarantee this.
 */
private static Label makeSimpleSyntheticLabel(Tree t) {
  Label lab = t.label();
  String category = '@' + lab.value();
  return new CategoryWordTag(category, ((HasWord) lab).word(), ((HasTag) lab).tag());
}