@Attribute(required = false, name = "language") private void setLanguageIsoCode(String languageIsoCode) { if (languageIsoCode != null) { final LanguageCode language = LanguageCode.forISOCode(languageIsoCode); if (language != null) { setLanguage(language); } else { // Try by enum name for backward-compatibility setLanguage(LanguageCode.valueOf(languageIsoCode)); } } else { setLanguage(null); } }
/** Returns a human-readable rendering of this constant's name (via StringUtils). */
@Override public String toString() { return StringUtils.identifierToHumanReadable(name()); } }
ExtendedWhitespaceTokenizer.class); for (LanguageCode lc : LanguageCode.values()) for (LanguageCode lc : LanguageCode.values()) "Tokenizer for " + lc.toString() + " (" + lc.getIsoCode() + ") is not available." + " This may degrade clustering quality of " + lc.toString() + " content. Cause: {}"));
final LanguageCode languageCode = language.equals("") ? null : LanguageCode.valueOf(language); final Cluster languageCluster = new Cluster( languageCode != null ? languageCode.toString() : "Unknown Language"); languageCounts.put(languageCode != null ? languageCode.getIsoCode() : "", languageDocuments.size());
/** * Reload all lexical resources associated with the given key. */ private static HashMap<LanguageCode, ILexicalData> reloadResources(ResourceLookup resourceLookup) { // Load lexical resources. ObjectHashSet<MutableCharArray> mergedStopwords = new ObjectHashSet<>(); ArrayList<Pattern> mergedStoplabels = Lists.newArrayList(); HashMap<LanguageCode, ILexicalData> resourceMap = Maps.newHashMap(); for (LanguageCode languageCode : LanguageCode.values()) { final String isoCode = languageCode.getIsoCode(); ObjectHashSet<MutableCharArray> stopwords = toLower(load(resourceLookup, "stopwords." + isoCode)); ArrayList<Pattern> stoplabels = compile(load(resourceLookup, "stoplabels." + isoCode)); mergedStopwords.addAll(stopwords); mergedStoplabels.addAll(stoplabels); resourceMap.put(languageCode, new DefaultLexicalData(stopwords, stoplabels)); } resourceMap.put(null, new DefaultLexicalData(mergedStopwords, mergedStoplabels)); return resourceMap; }
final LanguageCode carrot2Language = LanguageCode.forISOCode(lang); if (carrot2Language != null) { carrotDocument.setLanguage(carrot2Language);
doc.setLanguage(LanguageCode.valueOf(req.Language.trim().toUpperCase()));
/**
 * Serialization hook: exposes the clustering language as its ISO code,
 * or {@code null} when no language is set.
 */
@JsonProperty("language")
@Attribute(required = false, name = "language")
private String getLanguageIsoCode() {
    final LanguageCode current = getLanguage();
    if (current == null) {
        return null;
    }
    return current.getIsoCode();
}
/**
 * Formats a cluster label for final rendering. Feature indices below the
 * word count address single words in {@code allWords}; higher indices
 * address phrases in {@code allPhrases}.
 */
public String format(PreprocessingContext context, int featureIndex) {
    final StringBuilder out = new StringBuilder();
    final char [][] images = context.allWords.image;
    final int singleWordCount = images.length;

    if (featureIndex < singleWordCount) {
        // Single-word feature.
        appendFormatted(out, images[featureIndex], true, false);
        return out.toString();
    }

    // Phrase feature: translate to an index into allPhrases.
    final int [] phraseWords =
        context.allPhrases.wordIndices[featureIndex - singleWordCount];
    final short [] types = context.allWords.type;
    final boolean spaceSeparated =
        context.language.getLanguageCode().usesSpaceDelimiters();

    for (int pos = 0; pos < phraseWords.length; pos++) {
        if (spaceSeparated && pos > 0) {
            out.append(' ');
        }
        final int w = phraseWords[pos];
        appendFormatted(out, images[w], pos == 0, TokenTypeUtils.isCommon(types[w]));
    }
    return out.toString();
}
this.majorityLanguage = entry.getElement().getIsoCode(); languageCounts.put(entry.getElement() != null ? entry.getElement().getIsoCode() : "", entry.getCount());
/** * Build the cluster's label from suffix tree edge indices. */ private String buildLabel(int [] phraseIndices) { // Count the number of terms first. int termsCount = 0; for (int j = 0; j < phraseIndices.length; j += 2) { termsCount += phraseIndices[j + 1] - phraseIndices[j] + 1; } // Extract terms info for the phrase and construct the label. final boolean [] stopwords = new boolean[termsCount]; final char [][] images = new char [termsCount][]; final short [] tokenTypes = context.allWords.type; int k = 0; for (int i = 0; i < phraseIndices.length; i += 2) { for (int j = phraseIndices[i]; j <= phraseIndices[i + 1]; j++, k++) { final int termIndex = sb.input.get(j); images[k] = context.allWords.image[termIndex]; stopwords[k] = TokenTypeUtils.isCommon(tokenTypes[termIndex]); } } return LabelFormatter.format(images, stopwords, context.language.getLanguageCode().usesSpaceDelimiters()); }
for (LanguageCode lc : LanguageCode.values()) factory, identity, stemmerVerifier, logger, "Stemmer for " + lc.toString() + " (" + lc.getIsoCode() + ") is not available." + " This may degrade clustering quality of " + lc.toString() + " content. Cause: {}");
/** Maps a document to its language's enum constant name, or "" when the language is unset. */
public String apply(Document document) { final LanguageCode language = document.getLanguage(); return language != null ? language.name() : ""; } });
/**
 * Create and return an {@link IStemmer} adapter for a
 * {@link SnowballProgram} for a given language code. An identity stemmer is
 * returned (with a warning logged) when no stemmer class is registered for
 * the language or when the stemmer cannot be instantiated — in both cases
 * clustering quality may be degraded, but clustering still proceeds.
 */
public static IStemmer createStemmer(LanguageCode language) {
    final Class<? extends SnowballProgram> stemmerClazz = snowballStemmerClasses
        .get(language);

    if (stemmerClazz == null) {
        log.warn("No Snowball stemmer class for: " + language.name()
            + ". Quality of clustering may be degraded.");
        return IdentityStemmer.INSTANCE;
    }

    try {
        // Class.newInstance() is deprecated (it propagates undeclared checked
        // exceptions); invoke the no-arg constructor explicitly instead.
        return new SnowballStemmerAdapter(
            stemmerClazz.getDeclaredConstructor().newInstance());
    } catch (Exception e) {
        log.warn("Could not instantiate snowball stemmer"
            + " for language: " + language.name()
            + ". Quality of clustering may be degraded.", e);
        return IdentityStemmer.INSTANCE;
    }
} }