private static String wordShapeChris4Short(String s, int len, Collection<String> knownLCWords) { int sbLen = (knownLCWords != null) ? len + 1: len; // markKnownLC makes String 1 longer final StringBuilder sb = new StringBuilder(sbLen); boolean nonLetters = false; for (int i = 0; i < len; i++) { char c = s.charAt(i); char m = chris4equivalenceClass(c); for (String gr : greek) { if (s.startsWith(gr, i)) { m = 'g'; //System.out.println(s + " :: " + s.substring(i+1)); i += gr.length() - 1; // System.out.println("Position skips to " + i); break; } } if (m != 'x' && m != 'X') { nonLetters = true; } sb.append(m); } if (knownLCWords != null) { if ( ! nonLetters && knownLCWords.contains(s.toLowerCase())) { sb.append('k'); } } // System.out.println(s + " became " + sb); return sb.toString(); }
/**
 * Computes the Dan2 word shape of a String and marks it when the String
 * contains a Greek letter, a distinction that is useful for biomedical text.
 *
 * @param s The String to find the word shape of
 * @param knownLCWords Passed through unchanged to {@code wordShapeDan2}
 * @return The {@code wordShapeDan2} result, with {@code "-GREEK"} appended
 *     when {@code containsGreekLetter(s)} is true
 */
private static String wordShapeDan2Bio(String s, Collection<String> knownLCWords) {
  final boolean hasGreek = containsGreekLetter(s);
  final String shape = wordShapeDan2(s, knownLCWords);
  return hasGreek ? shape + "-GREEK" : shape;
}
if (knownLCWords != null && dontUseLC(wordShaper)) { knownLCWords = null; return inStr; case WORDSHAPEDAN1: return wordShapeDan1(inStr); case WORDSHAPECHRIS1: return wordShapeChris1(inStr); case WORDSHAPEDAN2: return wordShapeDan2(inStr, knownLCWords); case WORDSHAPEDAN2USELC: return wordShapeDan2(inStr, knownLCWords); case WORDSHAPEDAN2BIO: return wordShapeDan2Bio(inStr, knownLCWords); case WORDSHAPEDAN2BIOUSELC: return wordShapeDan2Bio(inStr, knownLCWords); case WORDSHAPEJENNY1: return wordShapeJenny1(inStr, knownLCWords); case WORDSHAPEJENNY1USELC: return wordShapeJenny1(inStr, knownLCWords); case WORDSHAPECHRIS2: return wordShapeChris2(inStr, false, knownLCWords); case WORDSHAPECHRIS2USELC: return wordShapeChris2(inStr, false, knownLCWords); case WORDSHAPECHRIS3: return wordShapeChris2(inStr, true, knownLCWords); case WORDSHAPECHRIS3USELC: return wordShapeChris2(inStr, true, knownLCWords); case WORDSHAPECHRIS4:
/**
 * Word shape in the spirit of Dan2 that makes fewer distinctions in the
 * middle of long words while keeping extra distinctions for short words.
 * The shape of the first and last 2 (i.e., BOUNDARY_SIZE) characters is
 * preserved exactly, and the shapes occurring between them are recorded
 * (perhaps only if they are different).
 *
 * <p>Words of at most {@code BOUNDARY_SIZE * 2} characters are handled by
 * {@code wordShapeChris2Short}; all longer words go to
 * {@code wordShapeChris2Long}.
 *
 * @param s The String to find the word shape of
 * @param omitIfInBoundary If true, character classes present in the
 *     first or last two (i.e., BOUNDARY_SIZE) letters of the word are not
 *     also registered as classes that appear in the middle of the word.
 * @param knownLCWords If non-null and non-empty, tag with a "k" suffix words
 *     that are in this list when lowercased (representing that the word is
 *     "known" as a lowercase word).
 * @return A word shape for the word.
 */
private static String wordShapeChris2(String s, boolean omitIfInBoundary, Collection<String> knownLCWords) {
  final int length = s.length();
  final boolean isShort = length <= BOUNDARY_SIZE * 2;
  return isShort
      ? wordShapeChris2Short(s, length, knownLCWords)
      : wordShapeChris2Long(s, omitIfInBoundary, length, knownLCWords);
}
/**
 * Computes the word shape of a String using the shaper identified by the
 * given int constant. This is a convenience overload that forwards to the
 * three-argument {@code wordShape} with {@code null} as the third argument.
 *
 * @param inStr String to calculate word shape of
 * @param wordShaper Constant for which shaping formula to use
 * @return The wordshape String
 */
public static String wordShape(String inStr, int wordShaper) {
  final String shape = wordShape(inStr, wordShaper, null);
  return shape;
}
/**
 * Usage: {@code java edu.stanford.nlp.process.WordShapeClassifier
 * [-wordShape name] string+ }<br>
 * where {@code name} is an argument to {@code lookupShaper}.
 * Known names have patterns along the lines of: dan[12](bio)?(UseLC)?,
 * jenny1(useLC)?, chris[1234](useLC)?, cluster1.
 * If you don't specify a word shape function, you get chris1.
 *
 * <p>Each remaining argument is printed followed by its word shape. An
 * empty first argument is treated as an ordinary string rather than a flag.
 *
 * @param args Command-line arguments, as above.
 */
public static void main(String[] args) {
  int i = 0;
  int classifierToUse = WORDSHAPECHRIS1;

  if (args.length == 0) {
    System.out.println("edu.stanford.nlp.process.WordShapeClassifier [-wordShape name] string+");
  } else if (args[0].startsWith("-")) {
    // startsWith is safe when the first argument is the empty string,
    // whereas charAt(0) would throw StringIndexOutOfBoundsException.
    if (args[0].equals("-wordShape") && args.length >= 2) {
      classifierToUse = lookupShaper(args[1]);
      i += 2; // skip the flag and its value
    } else {
      log.info("Unknown flag: " + args[0]);
      i++; // skip just the unrecognized flag
    }
  }

  for (; i < args.length; i++) {
    System.out.print(args[i] + ": ");
    System.out.println(wordShape(args[i], classifierToUse));
  }
}
ExtractorWordShapeConjunction(int left, int right, String wsc) { super(); this.left = left; this.right = right; wordShaper = WordShapeClassifier.lookupShaper(wsc); name = "ExtractorWordShapeConjunction(" + left + ',' + right + ',' + wsc + ')'; }
/**
 * Convenience overload that forwards to the three-argument
 * {@code wordShapeChris4} with {@code false} as the second argument and
 * {@code null} as the third.
 *
 * @param s The String to find the word shape of
 * @return The result of the three-argument {@code wordShapeChris4}
 */
public static String wordShapeChris4(String s) {
  return wordShapeChris4(s, false, null);
}
/**
 * Looks up the class recorded in {@code lexicon} for a word.
 *
 * <p>Before the lookup the word is lowercased unless {@code cased} is set,
 * and, when {@code numberEquivalence} is set, it is passed through
 * {@code WordShapeClassifier.wordShape} with {@code WORDSHAPEDIGITS}.
 *
 * @param word The word to look up
 * @return The lexicon entry for the normalized word, or
 *     {@code unknownWordClass} when the lexicon returns {@code null}
 */
public String distSimClass(String word) {
  String key = cased ? word : word.toLowerCase();
  if (numberEquivalence) {
    key = WordShapeClassifier.wordShape(key, WordShapeClassifier.WORDSHAPEDIGITS);
  }
  final String found = lexicon.get(key);
  return (found != null) ? found : unknownWordClass;
}
/**
 * Word shape in the spirit of Dan2 that makes fewer distinctions in the
 * middle of long words while keeping extra distinctions for short words.
 * The shape of the first and last 2 (i.e., BOUNDARY_SIZE) characters is
 * preserved exactly, and the shapes occurring between them are recorded
 * (perhaps only if they are different).
 *
 * <p>Words of at most {@code BOUNDARY_SIZE * 2} characters are handled by
 * {@code wordShapeChris2Short}; all longer words go to
 * {@code wordShapeChris2Long}.
 *
 * @param s The String to find the word shape of
 * @param omitIfInBoundary If true, character classes present in the
 *     first or last two (i.e., BOUNDARY_SIZE) letters of the word are not
 *     also registered as classes that appear in the middle of the word.
 * @param knownLCWords If non-null and non-empty, tag with a "k" suffix words
 *     that are in this list when lowercased (representing that the word is
 *     "known" as a lowercase word).
 * @return A word shape for the word.
 */
private static String wordShapeChris2(String s, boolean omitIfInBoundary, Collection<String> knownLCWords) {
  final int length = s.length();
  final boolean isShort = length <= BOUNDARY_SIZE * 2;
  return isShort
      ? wordShapeChris2Short(s, length, knownLCWords)
      : wordShapeChris2Long(s, omitIfInBoundary, length, knownLCWords);
}
/**
 * Usage: {@code java edu.stanford.nlp.process.WordShapeClassifier
 * [-wordShape name] string+ }<br>
 * where {@code name} is an argument to {@code lookupShaper}.
 * Known names have patterns along the lines of: dan[12](bio)?(UseLC)?,
 * jenny1(useLC)?, chris[1234](useLC)?. If you don't specify a word shape
 * function, you get chris1.
 *
 * <p>Each remaining argument is printed followed by its word shape. An
 * empty first argument is treated as an ordinary string rather than a flag.
 *
 * @param args Command-line arguments, as above.
 */
public static void main(String[] args) {
  int i = 0;
  int classifierToUse = WORDSHAPECHRIS1;

  if (args.length == 0) {
    System.out.println("edu.stanford.nlp.process.WordShapeClassifier [-wordShape name] string+");
  } else if (args[0].startsWith("-")) {
    // startsWith is safe when the first argument is the empty string,
    // whereas charAt(0) would throw StringIndexOutOfBoundsException.
    if (args[0].equals("-wordShape") && args.length >= 2) {
      classifierToUse = lookupShaper(args[1]);
      i += 2; // skip the flag and its value
    } else {
      System.err.println("Unknown flag: " + args[0]);
      i++; // skip just the unrecognized flag
    }
  }

  for (; i < args.length; i++) {
    System.out.print(args[i] + ": ");
    System.out.println(wordShape(args[i], classifierToUse));
  }
}
ExtractorWordShapeClassifier(int position, String wsc) { super(position, false); wordShaper = WordShapeClassifier.lookupShaper(wsc); name = "ExtractorWordShapeClassifier(" + position+ ',' + wsc + ')'; }
/**
 * Convenience overload that forwards to the three-argument
 * {@code wordShapeChris4} with {@code false} as the second argument and
 * {@code null} as the third.
 *
 * @param s The String to find the word shape of
 * @return The result of the three-argument {@code wordShapeChris4}
 */
public static String wordShapeChris4(String s) {
  return wordShapeChris4(s, false, null);
}
if (knownLCWords != null && dontUseLC(wordShaper)) { knownLCWords = null; return inStr; case WORDSHAPEDAN1: return wordShapeDan1(inStr); case WORDSHAPECHRIS1: return wordShapeChris1(inStr); case WORDSHAPEDAN2: return wordShapeDan2(inStr, knownLCWords); case WORDSHAPEDAN2USELC: return wordShapeDan2(inStr, knownLCWords); case WORDSHAPEDAN2BIO: return wordShapeDan2Bio(inStr, knownLCWords); case WORDSHAPEDAN2BIOUSELC: return wordShapeDan2Bio(inStr, knownLCWords); case WORDSHAPEJENNY1: return wordShapeJenny1(inStr, knownLCWords); case WORDSHAPEJENNY1USELC: return wordShapeJenny1(inStr, knownLCWords); case WORDSHAPECHRIS2: return wordShapeChris2(inStr, false, knownLCWords); case WORDSHAPECHRIS2USELC: return wordShapeChris2(inStr, false, knownLCWords); case WORDSHAPECHRIS3: return wordShapeChris2(inStr, true, knownLCWords); case WORDSHAPECHRIS3USELC: return wordShapeChris2(inStr, true, knownLCWords); case WORDSHAPECHRIS4:
/**
 * Concatenates the word shapes of the words at positions {@code left}
 * through {@code right} (inclusive), separated by {@code '|'}.
 *
 * @param h History used to look up each word
 * @param pH Holder from which the words are fetched via {@code getWord}
 * @return The joined word shapes; empty when {@code left > right}
 */
@Override
String extract(History h, PairsHolder pH) {
  final StringBuilder joined = new StringBuilder();
  for (int pos = left; pos <= right; pos++) {
    final String word = pH.getWord(h, pos);
    final String shape = WordShapeClassifier.wordShape(word, wordShaper);
    if (pos > left) {
      joined.append('|');
    }
    joined.append(shape);
  }
  return joined.toString();
}
/**
 * Computes the Dan2 word shape of a String and marks it when the String
 * contains a Greek letter, a distinction that is useful for biomedical text.
 *
 * @param s The String to find the word shape of
 * @param knownLCWords Passed through unchanged to {@code wordShapeDan2}
 * @return The {@code wordShapeDan2} result, with {@code "-GREEK"} appended
 *     when {@code containsGreekLetter(s)} is true
 */
private static String wordShapeDan2Bio(String s, Collection<String> knownLCWords) {
  final boolean hasGreek = containsGreekLetter(s);
  final String shape = wordShapeDan2(s, knownLCWords);
  return hasGreek ? shape + "-GREEK" : shape;
}
/**
 * Word shape in the spirit of Dan2 that makes fewer distinctions in the
 * middle of long words while keeping extra distinctions for short words.
 * The shape of the first and last 2 (i.e., BOUNDARY_SIZE) characters is
 * preserved exactly, and the shapes occurring between them are recorded
 * (perhaps only if they are different).
 *
 * <p>Words of at most {@code BOUNDARY_SIZE * 2} characters are handled by
 * {@code wordShapeChris2Short}; all longer words go to
 * {@code wordShapeChris2Long}.
 *
 * @param s The String to find the word shape of
 * @param omitIfInBoundary If true, character classes present in the
 *     first or last two (i.e., BOUNDARY_SIZE) letters of the word are not
 *     also registered as classes that appear in the middle of the word.
 * @param knownLCWords If non-null and non-empty, tag with a "k" suffix words
 *     that are in this list when lowercased (representing that the word is
 *     "known" as a lowercase word).
 * @return A word shape for the word.
 */
private static String wordShapeChris2(String s, boolean omitIfInBoundary, Collection<String> knownLCWords) {
  final int length = s.length();
  final boolean isShort = length <= BOUNDARY_SIZE * 2;
  return isShort
      ? wordShapeChris2Short(s, length, knownLCWords)
      : wordShapeChris2Long(s, omitIfInBoundary, length, knownLCWords);
}
/**
 * Usage: {@code java edu.stanford.nlp.process.WordShapeClassifier
 * [-wordShape name] string+ }<br>
 * where {@code name} is an argument to {@code lookupShaper}.
 * Known names have patterns along the lines of: dan[12](bio)?(UseLC)?,
 * jenny1(useLC)?, chris[1234](useLC)?, cluster1.
 * If you don't specify a word shape function, you get chris1.
 *
 * <p>Each remaining argument is printed followed by its word shape. An
 * empty first argument is treated as an ordinary string rather than a flag.
 *
 * @param args Command-line arguments, as above.
 */
public static void main(String[] args) {
  int i = 0;
  int classifierToUse = WORDSHAPECHRIS1;

  if (args.length == 0) {
    System.out.println("edu.stanford.nlp.process.WordShapeClassifier [-wordShape name] string+");
  } else if (args[0].startsWith("-")) {
    // startsWith is safe when the first argument is the empty string,
    // whereas charAt(0) would throw StringIndexOutOfBoundsException.
    if (args[0].equals("-wordShape") && args.length >= 2) {
      classifierToUse = lookupShaper(args[1]);
      i += 2; // skip the flag and its value
    } else {
      System.err.println("Unknown flag: " + args[0]);
      i++; // skip just the unrecognized flag
    }
  }

  for (; i < args.length; i++) {
    System.out.print(args[i] + ": ");
    System.out.println(wordShape(args[i], classifierToUse));
  }
}
myFlags[col].useSplitNGrams = Boolean.parseBoolean(val); } else if (key.equals("wordShape")) { myFlags[col].wordShape = WordShapeClassifier.lookupShaper(val); } else if (key.equals("splitWordShape")) { myFlags[col].splitWordShape = WordShapeClassifier.lookupShaper(val); } else if (key.equals("useSplitPrefixSuffixNGrams")) { myFlags[col].useSplitPrefixSuffixNGrams = Boolean.parseBoolean(val);
for (int i = 0; i < len; i++) { char c = s.charAt(i); char m = chris4equivalenceClass(c); int iIncr = 0; for (String gr : greek) {