/**
 * Computes the "chris4" word shape of a string by dispatching on its length.
 * <p>
 * This shape function picks up on the Dan2 ideas: for long words it makes
 * fewer distinctions mid-sequence (by sorting), while for short words it
 * keeps extra distinctions by always recording the class of the first and
 * last two characters of the word. Compared to chris2, on which it is based,
 * it uses more Unicode classes, and so collapses things like punctuation
 * more, and might work better with real Unicode.
 * <p>
 * Words of at most {@code BOUNDARY_SIZE * 2} characters are handled by
 * {@code wordShapeChris4Short}; all longer words are handled by
 * {@code wordShapeChris4Long}.
 *
 * @param s The String to find the word shape of
 * @param omitIfInBoundary If true, character classes present in the first or
 *          last two (i.e., BOUNDARY_SIZE) letters of the word are not also
 *          registered as classes that appear in the middle of the word.
 *          Only forwarded on the long-word path.
 * @param knownLCWords If non-null and non-empty, tag with a "k" suffix words
 *          that are in this list when lowercased (representing that the word
 *          is "known" as a lowercase word).
 * @return A word shape for the word.
 */
private static String wordShapeChris4(String s, boolean omitIfInBoundary, Collection<String> knownLCWords) {
  final int length = s.length();
  // Long words: strictly more characters than the two boundary regions combined.
  if (length > BOUNDARY_SIZE * 2) {
    return wordShapeChris4Long(s, omitIfInBoundary, length, knownLCWords);
  }
  // Short words: every character lies within a boundary region.
  return wordShapeChris4Short(s, length, knownLCWords);
}
/**
 * Computes the "chris4" word shape of a string by dispatching on its length.
 * <p>
 * This shape function picks up on the Dan2 ideas: for long words it makes
 * fewer distinctions mid-sequence (by sorting), while for short words it
 * keeps extra distinctions by always recording the class of the first and
 * last two characters of the word. Compared to chris2, on which it is based,
 * it uses more Unicode classes, and so collapses things like punctuation
 * more, and might work better with real Unicode.
 * <p>
 * Words of at most {@code BOUNDARY_SIZE * 2} characters are handled by
 * {@code wordShapeChris4Short}; all longer words are handled by
 * {@code wordShapeChris4Long}.
 *
 * @param s The String to find the word shape of
 * @param omitIfInBoundary If true, character classes present in the first or
 *          last two (i.e., BOUNDARY_SIZE) letters of the word are not also
 *          registered as classes that appear in the middle of the word.
 *          Only forwarded on the long-word path.
 * @param knownLCWords If non-null and non-empty, tag with a "k" suffix words
 *          that are in this list when lowercased (representing that the word
 *          is "known" as a lowercase word).
 * @return A word shape for the word.
 */
private static String wordShapeChris4(String s, boolean omitIfInBoundary, Collection<String> knownLCWords) {
  final int length = s.length();
  // Long words: strictly more characters than the two boundary regions combined.
  if (length > BOUNDARY_SIZE * 2) {
    return wordShapeChris4Long(s, omitIfInBoundary, length, knownLCWords);
  }
  // Short words: every character lies within a boundary region.
  return wordShapeChris4Short(s, length, knownLCWords);
}
/**
 * Computes the "chris4" word shape of a string by dispatching on its length.
 * <p>
 * This shape function picks up on the Dan2 ideas: for long words it makes
 * fewer distinctions mid-sequence (by sorting), while for short words it
 * keeps extra distinctions by always recording the class of the first and
 * last two characters of the word. Compared to chris2, on which it is based,
 * it uses more Unicode classes, and so collapses things like punctuation
 * more, and might work better with real Unicode.
 * <p>
 * Words of at most {@code BOUNDARY_SIZE * 2} characters are handled by
 * {@code wordShapeChris4Short}; all longer words are handled by
 * {@code wordShapeChris4Long}.
 *
 * @param s The String to find the word shape of
 * @param omitIfInBoundary If true, character classes present in the first or
 *          last two (i.e., BOUNDARY_SIZE) letters of the word are not also
 *          registered as classes that appear in the middle of the word.
 *          Only forwarded on the long-word path.
 * @param knownLCWords If non-null and non-empty, tag with a "k" suffix words
 *          that are in this list when lowercased (representing that the word
 *          is "known" as a lowercase word).
 * @return A word shape for the word.
 */
private static String wordShapeChris4(String s, boolean omitIfInBoundary, Collection<String> knownLCWords) {
  final int length = s.length();
  // Long words: strictly more characters than the two boundary regions combined.
  if (length > BOUNDARY_SIZE * 2) {
    return wordShapeChris4Long(s, omitIfInBoundary, length, knownLCWords);
  }
  // Short words: every character lies within a boundary region.
  return wordShapeChris4Short(s, length, knownLCWords);
}
/**
 * Computes the "chris4" word shape of a string by dispatching on its length.
 * <p>
 * This shape function picks up on the Dan2 ideas: for long words it makes
 * fewer distinctions mid-sequence (by sorting), while for short words it
 * keeps extra distinctions by always recording the class of the first and
 * last two characters of the word. Compared to chris2, on which it is based,
 * it uses more Unicode classes, and so collapses things like punctuation
 * more, and might work better with real Unicode.
 * <p>
 * Words of at most {@code BOUNDARY_SIZE * 2} characters are handled by
 * {@code wordShapeChris4Short}; all longer words are handled by
 * {@code wordShapeChris4Long}.
 *
 * @param s The String to find the word shape of
 * @param omitIfInBoundary If true, character classes present in the first or
 *          last two (i.e., BOUNDARY_SIZE) letters of the word are not also
 *          registered as classes that appear in the middle of the word.
 *          Only forwarded on the long-word path.
 * @param knownLCWords If non-null and non-empty, tag with a "k" suffix words
 *          that are in this list when lowercased (representing that the word
 *          is "known" as a lowercase word).
 * @return A word shape for the word.
 */
private static String wordShapeChris4(String s, boolean omitIfInBoundary, Collection<String> knownLCWords) {
  final int length = s.length();
  // Long words: strictly more characters than the two boundary regions combined.
  if (length > BOUNDARY_SIZE * 2) {
    return wordShapeChris4Long(s, omitIfInBoundary, length, knownLCWords);
  }
  // Short words: every character lies within a boundary region.
  return wordShapeChris4Short(s, length, knownLCWords);
}
/**
 * Computes the "chris4" word shape of a string by dispatching on its length.
 * <p>
 * This shape function picks up on the Dan2 ideas: for long words it makes
 * fewer distinctions mid-sequence (by sorting), while for short words it
 * keeps extra distinctions by always recording the class of the first and
 * last two characters of the word. Compared to chris2, on which it is based,
 * it uses more Unicode classes, and so collapses things like punctuation
 * more, and might work better with real Unicode.
 * <p>
 * Words of at most {@code BOUNDARY_SIZE * 2} characters are handled by
 * {@code wordShapeChris4Short}; all longer words are handled by
 * {@code wordShapeChris4Long}.
 *
 * @param s The String to find the word shape of
 * @param omitIfInBoundary If true, character classes present in the first or
 *          last two (i.e., BOUNDARY_SIZE) letters of the word are not also
 *          registered as classes that appear in the middle of the word.
 *          Only forwarded on the long-word path.
 * @param knownLCWords If non-null and non-empty, tag with a "k" suffix words
 *          that are in this list when lowercased (representing that the word
 *          is "known" as a lowercase word).
 * @return A word shape for the word.
 */
private static String wordShapeChris4(String s, boolean omitIfInBoundary, Collection<String> knownLCWords) {
  final int length = s.length();
  // Long words: strictly more characters than the two boundary regions combined.
  if (length > BOUNDARY_SIZE * 2) {
    return wordShapeChris4Long(s, omitIfInBoundary, length, knownLCWords);
  }
  // Short words: every character lies within a boundary region.
  return wordShapeChris4Short(s, length, knownLCWords);
}