edu.stanford.nlp.process.WordShapeClassifier java code examples

/**
 * Builds a character-by-character shape for a short word: each position is
 * mapped through {@code chris4equivalenceClass}, except that any entry of
 * {@code greek} found at the current position is emitted as a single
 * {@code 'g'} and consumed as a unit.
 *
 * @param s The String to find the word shape of
 * @param len Number of leading characters of {@code s} to process
 *            (NOTE(review): presumably always {@code s.length()} -- confirm
 *            against the caller)
 * @param knownLCWords If non-null, a "k" suffix is appended when every
 *                     emitted class is 'x' or 'X' and the lowercased word
 *                     is contained in this collection
 * @return The word shape of {@code s}
 */
private static String wordShapeChris4Short(String s, int len, Collection<String> knownLCWords) {
 // Reserve one extra slot for the possible trailing 'k' marker.
 final int capacity = (knownLCWords == null) ? len : len + 1;
 final StringBuilder shape = new StringBuilder(capacity);
 boolean onlyLetterClasses = true;
 int pos = 0;
 while (pos < len) {
  char shapeClass = chris4equivalenceClass(s.charAt(pos));
  int consumed = 1;
  for (String greekName : greek) {
   if (s.startsWith(greekName, pos)) {
    // The whole greek-letter name collapses into one 'g' symbol.
    shapeClass = 'g';
    consumed = greekName.length();
    break;
   }
  }
  if ( ! (shapeClass == 'x' || shapeClass == 'X')) {
   onlyLetterClasses = false;
  }
  shape.append(shapeClass);
  pos += consumed;
 }
 if (knownLCWords != null && onlyLetterClasses && knownLCWords.contains(s.toLowerCase())) {
  shape.append('k');
 }
 return shape.toString();
}

/**
 * Returns the {@code wordShapeDan2} shape of the String (a fine-grained
 * shape that equivalence classes lower case, upper case and digits,
 * collapses sequences of the same type, and keeps all punctuation), with
 * the suffix "-GREEK" added when {@code containsGreekLetter} reports a
 * greek letter embedded in the String.  This is useful for bio text.
 *
 * @param s The String to find the word shape of
 * @param knownLCWords Passed through unchanged to {@code wordShapeDan2}
 * @return The Dan2 shape, possibly followed by "-GREEK"
 */
private static String wordShapeDan2Bio(String s, Collection<String> knownLCWords) {
 final boolean hasGreek = containsGreekLetter(s);
 final String baseShape = wordShapeDan2(s, knownLCWords);
 return hasGreek ? baseShape + "-GREEK" : baseShape;
}

if (knownLCWords != null && dontUseLC(wordShaper)) {
 knownLCWords = null;
  return inStr;
 case WORDSHAPEDAN1:
  return wordShapeDan1(inStr);
 case WORDSHAPECHRIS1:
  return wordShapeChris1(inStr);
 case WORDSHAPEDAN2:
  return wordShapeDan2(inStr, knownLCWords);
 case WORDSHAPEDAN2USELC:
  return wordShapeDan2(inStr, knownLCWords);
 case WORDSHAPEDAN2BIO:
  return wordShapeDan2Bio(inStr, knownLCWords);
 case WORDSHAPEDAN2BIOUSELC:
  return wordShapeDan2Bio(inStr, knownLCWords);
 case WORDSHAPEJENNY1:
  return wordShapeJenny1(inStr, knownLCWords);
 case WORDSHAPEJENNY1USELC:
  return wordShapeJenny1(inStr, knownLCWords);
 case WORDSHAPECHRIS2:
  return wordShapeChris2(inStr, false, knownLCWords);
 case WORDSHAPECHRIS2USELC:
  return wordShapeChris2(inStr, false, knownLCWords);
 case WORDSHAPECHRIS3:
  return wordShapeChris2(inStr, true, knownLCWords);
 case WORDSHAPECHRIS3USELC:
  return wordShapeChris2(inStr, true, knownLCWords);
 case WORDSHAPECHRIS4:

/**
 * This one picks up on Dan2 ideas, but seeks to make fewer distinctions
 * mid sequence by sorting for long words, while maintaining extra
 * distinctions for short words. It exactly preserves the character shape
 * of the first and last 2 (i.e., BOUNDARY_SIZE) characters and then
 * records the shapes that occur between them (perhaps only if they are
 * different).
 * <p>
 * Words of at most {@code BOUNDARY_SIZE * 2} characters are handled by
 * {@code wordShapeChris2Short}; longer words by {@code wordShapeChris2Long}.
 *
 * @param s The String to find the word shape of
 * @param omitIfInBoundary If true, character classes present in the
 *                         first or last two (i.e., BOUNDARY_SIZE) letters
 *                         of the word are not also registered
 *                         as classes that appear in the middle of the word.
 * @param knownLCWords If non-null and non-empty, tag with a "k" suffix words
 *                    that are in this list when lowercased (representing
 *                    that the word is "known" as a lowercase word).
 * @return A word shape for the word.
 */
private static String wordShapeChris2(String s, boolean omitIfInBoundary, Collection<String> knownLCWords) {
 final int length = s.length();
 final boolean isLongWord = length > BOUNDARY_SIZE * 2;
 return isLongWord
   ? wordShapeChris2Long(s, omitIfInBoundary, length, knownLCWords)
   : wordShapeChris2Short(s, length, knownLCWords);
}

/**
 * Specify the String and the int identifying which word shaper to
 * use and this returns the result of using that wordshaper on the String.
 * This is a convenience overload: it forwards to the three-argument
 * {@code wordShape}, passing {@code null} as the third argument.
 *
 * @param inStr String to calculate word shape of
 * @param wordShaper Constant for which shaping formula to use
 * @return The wordshape String
 */
public static String wordShape(String inStr, int wordShaper) {
 return wordShape(inStr, wordShaper, null);
}

/**
 * Command-line entry point: prints each remaining argument followed by
 * its word shape, one per line.
 * <p>
 * Usage: {@code java edu.stanford.nlp.process.WordShapeClassifier
 * [-wordShape name] string+ }<br>
 * where {@code name} is an argument to {@code lookupShaper}.
 * Known names have patterns along the lines of: dan[12](bio)?(UseLC)?,
 * jenny1(useLC)?, chris[1234](useLC)?, cluster1.
 * If you don't specify a word shape function, you get chris1.
 * With no arguments at all, a usage line is printed. Any other leading
 * argument that starts with '-' is reported as an unknown flag and skipped.
 *
 * @param args Command-line arguments, as above.
 */
public static void main(String[] args) {
 int i = 0;
 int classifierToUse = WORDSHAPECHRIS1;
 if (args.length == 0) {
  System.out.println("edu.stanford.nlp.process.WordShapeClassifier [-wordShape name] string+");
 } else if (args[0].startsWith("-")) {
  // startsWith, unlike charAt(0), does not throw when the first argument is
  // the empty string; an empty argument is simply shaped like any other word.
  if (args[0].equals("-wordShape") && args.length >= 2) {
   classifierToUse = lookupShaper(args[1]);
   i += 2;
  } else {
   log.info("Unknown flag: " + args[0]);
   i++;
  }
 }
 for (; i < args.length; i++) {
  System.out.print(args[i] + ": ");
  System.out.println(wordShape(args[i], classifierToUse));
 }
}

ExtractorWordShapeConjunction(int left, int right, String wsc) {
 super();
 this.left = left;
 this.right = right;
 wordShaper = WordShapeClassifier.lookupShaper(wsc);
 name = "ExtractorWordShapeConjunction(" + left + ',' + right + ',' + wsc + ')';
}

/**
 * Convenience overload that forwards to the three-argument
 * {@code wordShapeChris4}, passing {@code false} and {@code null} as the
 * remaining arguments.
 * NOTE(review): by analogy with wordShapeChris2 these are presumably
 * omitIfInBoundary and knownLCWords -- confirm against the overload.
 *
 * @param s The String to find the word shape of
 * @return The value returned by the three-argument overload
 */
public static String wordShapeChris4(String s) {
 return wordShapeChris4(s, false, null);
}

/**
 * Looks up the class recorded in {@code lexicon} for a word.
 * The lookup key is the word itself, lowercased first when {@code cased}
 * is false, and then mapped through the WORDSHAPEDIGITS word shaper when
 * {@code numberEquivalence} is set.
 *
 * @param word The word to look up
 * @return The lexicon entry for the key, or {@code unknownWordClass} when
 *         the lookup yields {@code null}
 */
public String distSimClass(String word) {
 String key = cased ? word : word.toLowerCase();
 if (numberEquivalence) {
  key = WordShapeClassifier.wordShape(key, WordShapeClassifier.WORDSHAPEDIGITS);
 }
 final String found = lexicon.get(key);
 return (found != null) ? found : unknownWordClass;
}

/**
 * This one picks up on Dan2 ideas, but seeks to make fewer distinctions
 * mid sequence by sorting for long words, while maintaining extra
 * distinctions for short words. It exactly preserves the character shape
 * of the first and last 2 (i.e., BOUNDARY_SIZE) characters and then
 * records the shapes that occur between them (perhaps only if they are
 * different).
 * <p>
 * The work is delegated: {@code wordShapeChris2Long} handles words longer
 * than {@code BOUNDARY_SIZE * 2} characters, {@code wordShapeChris2Short}
 * handles the rest.
 *
 * @param s The String to find the word shape of
 * @param omitIfInBoundary If true, character classes present in the
 *                         first or last two (i.e., BOUNDARY_SIZE) letters
 *                         of the word are not also registered
 *                         as classes that appear in the middle of the word.
 * @param knownLCWords If non-null and non-empty, tag with a "k" suffix words
 *                    that are in this list when lowercased (representing
 *                    that the word is "known" as a lowercase word).
 * @return A word shape for the word.
 */
private static String wordShapeChris2(String s, boolean omitIfInBoundary, Collection<String> knownLCWords) {
 final int wordLength = s.length();
 if (wordLength > BOUNDARY_SIZE * 2) {
  return wordShapeChris2Long(s, omitIfInBoundary, wordLength, knownLCWords);
 }
 return wordShapeChris2Short(s, wordLength, knownLCWords);
}

/**
 * Command-line entry point: prints each remaining argument followed by
 * its word shape, one per line.
 * <p>
 * Usage: <code>java edu.stanford.nlp.process.WordShapeClassifier
 * [-wordShape name] string+ </code><br>
 * where <code>name</code> is an argument to <code>lookupShaper</code>.
 * Known names have patterns along the lines of: dan[12](bio)?(UseLC)?,
 * jenny1(useLC)?, chris[1234](useLC)?.  If you don't specify a word shape
 * function, you get chris1.
 * With no arguments at all, a usage line is printed. Any other leading
 * argument that starts with '-' is reported as an unknown flag and skipped.
 *
 * @param args Command-line arguments, as above.
 */
public static void main(String[] args) {
 int i = 0;
 int classifierToUse = WORDSHAPECHRIS1;
 if (args.length == 0) {
  System.out.println("edu.stanford.nlp.process.WordShapeClassifier [-wordShape name] string+");
 } else if (args[0].startsWith("-")) {
  // startsWith, unlike charAt(0), does not throw when the first argument is
  // the empty string; an empty argument is simply shaped like any other word.
  if (args[0].equals("-wordShape") && args.length >= 2) {
   classifierToUse = lookupShaper(args[1]);
   i += 2;
  } else {
   System.err.println("Unknown flag: " + args[0]);
   i++;
  }
 }
 for (; i < args.length; i++) {
  System.out.print(args[i] + ": ");
  System.out.println(wordShape(args[i], classifierToUse));
 }
}

ExtractorWordShapeClassifier(int position, String wsc) {
 super(position, false);
 wordShaper = WordShapeClassifier.lookupShaper(wsc);
 name = "ExtractorWordShapeClassifier(" + position+ ',' + wsc + ')';
}

/**
 * Convenience overload that forwards to the three-argument
 * {@code wordShapeChris4}, passing {@code false} and {@code null} as the
 * remaining arguments.
 * NOTE(review): by analogy with wordShapeChris2 these are presumably
 * omitIfInBoundary and knownLCWords -- confirm against the overload.
 *
 * @param s The String to find the word shape of
 * @return The value returned by the three-argument overload
 */
public static String wordShapeChris4(String s) {
 return wordShapeChris4(s, false, null);
}

if (knownLCWords != null && dontUseLC(wordShaper)) {
 knownLCWords = null;
  return inStr;
 case WORDSHAPEDAN1:
  return wordShapeDan1(inStr);
 case WORDSHAPECHRIS1:
  return wordShapeChris1(inStr);
 case WORDSHAPEDAN2:
  return wordShapeDan2(inStr, knownLCWords);
 case WORDSHAPEDAN2USELC:
  return wordShapeDan2(inStr, knownLCWords);
 case WORDSHAPEDAN2BIO:
  return wordShapeDan2Bio(inStr, knownLCWords);
 case WORDSHAPEDAN2BIOUSELC:
  return wordShapeDan2Bio(inStr, knownLCWords);
 case WORDSHAPEJENNY1:
  return wordShapeJenny1(inStr, knownLCWords);
 case WORDSHAPEJENNY1USELC:
  return wordShapeJenny1(inStr, knownLCWords);
 case WORDSHAPECHRIS2:
  return wordShapeChris2(inStr, false, knownLCWords);
 case WORDSHAPECHRIS2USELC:
  return wordShapeChris2(inStr, false, knownLCWords);
 case WORDSHAPECHRIS3:
  return wordShapeChris2(inStr, true, knownLCWords);
 case WORDSHAPECHRIS3USELC:
  return wordShapeChris2(inStr, true, knownLCWords);
 case WORDSHAPECHRIS4:

/**
 * Concatenates the word shapes of the words at positions {@code left}
 * through {@code right} (inclusive), separated by '|'.
 *
 * @param h The history passed to {@code pH.getWord}
 * @param pH Source of the words, queried once per position
 * @return The '|'-joined shapes; empty when {@code left > right}
 */
@Override
String extract(History h, PairsHolder pH) {
 final StringBuilder joined = new StringBuilder();
 int pos = left;
 while (pos <= right) {
  final String word = pH.getWord(h, pos);
  final String shape = WordShapeClassifier.wordShape(word, wordShaper);
  joined.append(shape);
  // A separator follows every shape except the last one in the window.
  if (pos < right) {
   joined.append('|');
  }
  pos++;
 }
 return joined.toString();
}

/**
 * Returns the {@code wordShapeDan2} shape of the String (a fine-grained
 * shape that equivalence classes lower case, upper case and digits,
 * collapses sequences of the same type, and keeps all punctuation), with
 * the suffix "-GREEK" added when {@code containsGreekLetter} reports a
 * greek letter embedded in the String.  This is useful for bio text.
 *
 * @param s The String to find the word shape of
 * @param knownLCWords Passed through unchanged to {@code wordShapeDan2}
 * @return The Dan2 shape, possibly followed by "-GREEK"
 */
private static String wordShapeDan2Bio(String s, Collection<String> knownLCWords) {
 final String greekSuffix = containsGreekLetter(s) ? "-GREEK" : "";
 return wordShapeDan2(s, knownLCWords) + greekSuffix;
}

/**
 * This one picks up on Dan2 ideas, but seeks to make fewer distinctions
 * mid sequence by sorting for long words, while maintaining extra
 * distinctions for short words. It exactly preserves the character shape
 * of the first and last 2 (i.e., BOUNDARY_SIZE) characters and then
 * records the shapes that occur between them (perhaps only if they are
 * different).
 * <p>
 * Short words (at most {@code BOUNDARY_SIZE * 2} characters) go to
 * {@code wordShapeChris2Short}; all others go to {@code wordShapeChris2Long}.
 *
 * @param s The String to find the word shape of
 * @param omitIfInBoundary If true, character classes present in the
 *                         first or last two (i.e., BOUNDARY_SIZE) letters
 *                         of the word are not also registered
 *                         as classes that appear in the middle of the word.
 * @param knownLCWords If non-null and non-empty, tag with a "k" suffix words
 *                    that are in this list when lowercased (representing
 *                    that the word is "known" as a lowercase word).
 * @return A word shape for the word.
 */
private static String wordShapeChris2(String s, boolean omitIfInBoundary, Collection<String> knownLCWords) {
 final int numChars = s.length();
 final int shortLimit = BOUNDARY_SIZE * 2;
 return (numChars <= shortLimit)
   ? wordShapeChris2Short(s, numChars, knownLCWords)
   : wordShapeChris2Long(s, omitIfInBoundary, numChars, knownLCWords);
}

/**
 * Command-line entry point: prints each remaining argument followed by
 * its word shape, one per line.
 * <p>
 * Usage: <code>java edu.stanford.nlp.process.WordShapeClassifier
 * [-wordShape name] string+ </code><br>
 * where <code>name</code> is an argument to <code>lookupShaper</code>.
 * Known names have patterns along the lines of: dan[12](bio)?(UseLC)?,
 * jenny1(useLC)?, chris[1234](useLC)?, cluster1.
 * If you don't specify a word shape function, you get chris1.
 * With no arguments at all, a usage line is printed. Any other leading
 * argument that starts with '-' is reported as an unknown flag and skipped.
 *
 * @param args Command-line arguments, as above.
 */
public static void main(String[] args) {
 int i = 0;
 int classifierToUse = WORDSHAPECHRIS1;
 if (args.length == 0) {
  System.out.println("edu.stanford.nlp.process.WordShapeClassifier [-wordShape name] string+");
 } else if (args[0].startsWith("-")) {
  // startsWith, unlike charAt(0), does not throw when the first argument is
  // the empty string; an empty argument is simply shaped like any other word.
  if (args[0].equals("-wordShape") && args.length >= 2) {
   classifierToUse = lookupShaper(args[1]);
   i += 2;
  } else {
   System.err.println("Unknown flag: " + args[0]);
   i++;
  }
 }
 for (; i < args.length; i++) {
  System.out.print(args[i] + ": ");
  System.out.println(wordShape(args[i], classifierToUse));
 }
}

 myFlags[col].useSplitNGrams = Boolean.parseBoolean(val);
} else if (key.equals("wordShape")) {
 myFlags[col].wordShape = WordShapeClassifier.lookupShaper(val);
} else if (key.equals("splitWordShape")) {
 myFlags[col].splitWordShape = WordShapeClassifier.lookupShaper(val);
} else if (key.equals("useSplitPrefixSuffixNGrams")) {
 myFlags[col].useSplitPrefixSuffixNGrams = Boolean.parseBoolean(val);

for (int i = 0; i < len; i++) {
 char c = s.charAt(i);
 char m = chris4equivalenceClass(c);
 int iIncr = 0;
 for (String gr : greek) {

Javadoc

Provides static methods which map any String to another String indicative of its "word shape" -- e.g., whether capitalized, numeric, etc. Different implementations may implement quite different, normally language specific ideas of what word shapes are useful.

Most used methods

chris4equivalenceClass
containsGreekLetter
Somewhat ad-hoc list of only greek letters that bio people use, partly to avoid false positives on s
dontUseLC
Returns true if the specified word shaper doesn't use known lower case words, even if a list of them
lookupShaper
Look up a shaper by a short String name.
wordShape
Specify the string and the int identifying which word shaper to use and this returns the result of u
wordShapeChris1
This one equivalence classes all strings into one of 24 semantically informed classes, somewhat simi
wordShapeChris2
This one picks up on Dan2 ideas, but seeks to make less distinctions mid sequence by sorting for lon
wordShapeChris2Long
wordShapeChris2Short
wordShapeChris4
This one picks up on Dan2 ideas, but seeks to make less distinctions mid sequence by sorting for lon
wordShapeChris4Long
wordShapeChris4Short

Popular in Java

Making http requests using okhttp
scheduleAtFixedRate (Timer)
setRequestProperty (URLConnection)
addToBackStack (FragmentTransaction)
ServerSocket (java.net)
This class represents a server-side socket that waits for incoming client connections. A ServerSocke
BlockingQueue (java.util.concurrent)
A java.util.Queue that additionally supports operations that wait for the queue to become non-empty
ThreadPoolExecutor (java.util.concurrent)
An ExecutorService that executes each submitted task using one of possibly several pooled threads, n
Servlet (javax.servlet)
Defines methods that all servlets must implement. A servlet is a small Java program that runs within
JCheckBox (javax.swing)
JOptionPane (javax.swing)
Top Sublime Text plugins

How to use WordShapeClassifier in edu.stanford.nlp.process

Best Java code snippets using edu.stanford.nlp.process.WordShapeClassifier (Showing top 20 results out of 315)

How to use
WordShapeClassifier
in
edu.stanford.nlp.process