/**
 * Computes the "Chris2" word shape of a string.
 *
 * <p>This variant builds on the Dan2 ideas, but seeks to make fewer
 * distinctions mid-sequence (by sorting) for long words, while maintaining
 * extra distinctions for short words. It exactly preserves the character
 * shape of the first and last 2 (i.e., {@code BOUNDARY_SIZE}) characters and
 * then records the shapes that occur between them (perhaps only if they are
 * different).
 *
 * <p>Strings of at most {@code BOUNDARY_SIZE * 2} characters are handed to
 * {@code wordShapeChris2Short}; longer strings are handed to
 * {@code wordShapeChris2Long}.
 *
 * @param s The String to find the word shape of
 * @param omitIfInBoundary If true, character classes present in the first or
 *        last two (i.e., {@code BOUNDARY_SIZE}) letters of the word are not
 *        also registered as classes that appear in the middle of the word.
 *        Only forwarded on the long-word path.
 * @param knownLCWords If non-null and non-empty, tag with a "k" suffix words
 *        that are in this list when lowercased (representing that the word is
 *        "known" as a lowercase word).
 * @return A word shape for the word.
 */
private static String wordShapeChris2(String s, boolean omitIfInBoundary, Collection<String> knownLCWords) {
  final int length = s.length();
  return (length <= BOUNDARY_SIZE * 2)
      ? wordShapeChris2Short(s, length, knownLCWords)
      : wordShapeChris2Long(s, omitIfInBoundary, length, knownLCWords);
}
/**
 * Returns the "Chris2" word shape for {@code s}.
 *
 * <p>Picks up on the Dan2 ideas, but tries to make fewer distinctions
 * mid-sequence by sorting for long words, while keeping extra distinctions
 * for short words. The character shape of the first and last 2 (i.e.,
 * {@code BOUNDARY_SIZE}) characters is preserved exactly; shapes occurring
 * between them are then recorded (perhaps only if they are different).
 *
 * @param s The String to find the word shape of
 * @param omitIfInBoundary If true, character classes present in the first or
 *        last two (i.e., {@code BOUNDARY_SIZE}) letters of the word are not
 *        also registered as classes that appear in the middle of the word.
 * @param knownLCWords If non-null and non-empty, tag with a "k" suffix words
 *        that are in this list when lowercased (representing that the word is
 *        "known" as a lowercase word).
 * @return A word shape for the word.
 */
private static String wordShapeChris2(String s, boolean omitIfInBoundary, Collection<String> knownLCWords) {
  int numChars = s.length();

  // Short words (no more than both boundaries together) take the short path.
  if (numChars <= BOUNDARY_SIZE * 2) {
    return wordShapeChris2Short(s, numChars, knownLCWords);
  }

  // Everything longer has a middle section and takes the long path.
  return wordShapeChris2Long(s, omitIfInBoundary, numChars, knownLCWords);
}
/**
 * Finds the "Chris2" word shape of a string.
 *
 * <p>This one follows the Dan2 ideas, but seeks to make fewer distinctions
 * mid-sequence by sorting for long words, and to maintain extra distinctions
 * for short words. It exactly preserves the character shape of the first and
 * last 2 (i.e., {@code BOUNDARY_SIZE}) characters and then records shapes
 * that occur between them (perhaps only if they are different).
 *
 * <p>The work is delegated: a string longer than {@code BOUNDARY_SIZE * 2}
 * goes to {@code wordShapeChris2Long}, any other string goes to
 * {@code wordShapeChris2Short}.
 *
 * @param s The String to find the word shape of
 * @param omitIfInBoundary If true, character classes present in the first or
 *        last two (i.e., {@code BOUNDARY_SIZE}) letters of the word are not
 *        also registered as classes that appear in the middle of the word.
 * @param knownLCWords If non-null and non-empty, tag with a "k" suffix words
 *        that are in this list when lowercased (representing that the word is
 *        "known" as a lowercase word).
 * @return A word shape for the word.
 */
private static String wordShapeChris2(String s, boolean omitIfInBoundary, Collection<String> knownLCWords) {
  final int wordLength = s.length();
  if (wordLength > BOUNDARY_SIZE * 2) {
    return wordShapeChris2Long(s, omitIfInBoundary, wordLength, knownLCWords);
  } else {
    return wordShapeChris2Short(s, wordLength, knownLCWords);
  }
}
/**
 * Produces the "Chris2" word shape for a word.
 *
 * <p>Based on the Dan2 ideas, but intended to make fewer distinctions
 * mid-sequence (by sorting) for long words while maintaining extra
 * distinctions for short words. The character shape of the first and last 2
 * (i.e., {@code BOUNDARY_SIZE}) characters is preserved exactly, and the
 * shapes that occur between them are then recorded (perhaps only if they are
 * different).
 *
 * @param s The String to find the word shape of
 * @param omitIfInBoundary If true, character classes present in the first or
 *        last two (i.e., {@code BOUNDARY_SIZE}) letters of the word are not
 *        also registered as classes that appear in the middle of the word.
 * @param knownLCWords If non-null and non-empty, tag with a "k" suffix words
 *        that are in this list when lowercased (representing that the word is
 *        "known" as a lowercase word).
 * @return A word shape for the word.
 */
private static String wordShapeChris2(String s, boolean omitIfInBoundary, Collection<String> knownLCWords) {
  final int size = s.length();
  // Longest word that consists of nothing but its two boundary regions.
  final int shortWordLimit = BOUNDARY_SIZE * 2;

  final String shape;
  if (size <= shortWordLimit) {
    shape = wordShapeChris2Short(s, size, knownLCWords);
  } else {
    shape = wordShapeChris2Long(s, omitIfInBoundary, size, knownLCWords);
  }
  return shape;
}
/**
 * Determines the "Chris2" word shape of the given string.
 *
 * <p>This picks up on the Dan2 ideas, but seeks to make fewer distinctions
 * mid-sequence by sorting for long words, while maintaining extra
 * distinctions for short words. It exactly preserves the character shape of
 * the first and last 2 (i.e., {@code BOUNDARY_SIZE}) characters, and then
 * records shapes that occur between them (perhaps only if they are
 * different).
 *
 * <p>This method only chooses between the two helper implementations based
 * on the length of {@code s}.
 *
 * @param s The String to find the word shape of
 * @param omitIfInBoundary If true, character classes present in the first or
 *        last two (i.e., {@code BOUNDARY_SIZE}) letters of the word are not
 *        also registered as classes that appear in the middle of the word.
 * @param knownLCWords If non-null and non-empty, tag with a "k" suffix words
 *        that are in this list when lowercased (representing that the word is
 *        "known" as a lowercase word).
 * @return A word shape for the word.
 */
private static String wordShapeChris2(String s, boolean omitIfInBoundary, Collection<String> knownLCWords) {
  final int n = s.length();
  // True when the word is no longer than its leading and trailing boundaries combined.
  final boolean fitsInBoundaries = n <= BOUNDARY_SIZE * 2;
  return fitsInBoundaries
      ? wordShapeChris2Short(s, n, knownLCWords)
      : wordShapeChris2Long(s, omitIfInBoundary, n, knownLCWords);
}