/**
 * Applies Unicode normalization to a piece of natural language text, delegating to the
 * normalizer obtained from the environment's linguistics component. The normalization
 * form is NFKC (compatibility decomposition followed by canonical composition).
 *
 * @param input the text to normalize; may be {@code null} or empty
 * @return {@code input} unchanged when it is {@code null} or empty, otherwise the
 *         normalized text
 */
protected String normalize(String input) {
    // Only non-empty text is handed to the normalizer; null and "" pass straight through.
    if (input != null && !input.isEmpty()) {
        return environment.getLinguistics().getNormalizer().normalize(input);
    }
    return input;
}
/**
 * Runs a single token through the processing steps in a fixed order: normalization,
 * lowercasing, optional accent dropping, and optional stemming.
 *
 * @param token         the token text to process
 * @param language      the language handed to the accent-dropping step
 * @param stemMode      stemming is applied for every mode other than {@code StemMode.NONE}
 * @param removeAccents whether accents should be dropped from the token
 * @return the processed token
 */
private String processToken(String token, Language language, StemMode stemMode, boolean removeAccents) {
    // Normalize first, then lowercase the normalized text.
    String processed = LinguisticsCase.toLowerCase(normalizer.normalize(token));
    if (removeAccents) {
        processed = transformer.accentDrop(processed, language);
    }
    // Stemming is the last step and is skipped only for StemMode.NONE.
    return (stemMode == StemMode.NONE) ? processed : stemmer.stem(processed);
}
/**
 * Processes a single token using the supplied stemmer. The steps run in a fixed order:
 * normalization, lowercasing, optional accent dropping, and optional stemming via
 * {@code doStemming}.
 *
 * @param token         the token text to process
 * @param language      the language handed to the accent-dropping step
 * @param stemMode      stemming is applied for every mode other than {@code StemMode.NONE}
 * @param removeAccents whether accents should be dropped from the token
 * @param stemmer       the stemmer passed on to {@code doStemming}
 * @return the processed token
 */
private String processToken(String token, Language language, StemMode stemMode, boolean removeAccents,
                            Stemmer stemmer) {
    final String normalized = normalizer.normalize(token);
    final String lowercased = LinguisticsCase.toLowerCase(normalized);
    final String prepared = removeAccents
            ? transformer.accentDrop(lowercased, language)
            : lowercased;

    // Nothing more to do when stemming is switched off.
    if (stemMode == StemMode.NONE) {
        return prepared;
    }
    return doStemming(prepared, stemmer);
}
// Replace wordData with the result of passing it through normalizer.normalize.
wordData = normalizer.normalize(wordData);
// NOTE(review): this repeats the call on the previous line. If the two statements are
// really adjacent in the same method (rather than excerpts from separate places), the
// second call is presumably redundant - confirm that normalize is idempotent before removing.
wordData = normalizer.normalize(wordData);