@Attribute(required = false, name = "language") private void setLanguageIsoCode(String languageIsoCode) { if (languageIsoCode != null) { final LanguageCode language = LanguageCode.forISOCode(languageIsoCode); if (language != null) { setLanguage(language); } else { // Try by enum name for backward-compatibility setLanguage(LanguageCode.valueOf(languageIsoCode)); } } else { setLanguage(null); } }
/** Returns a human-readable rendering of this constant's name (via StringUtils). */
@Override public String toString() { return StringUtils.identifierToHumanReadable(name()); } }
ExtendedWhitespaceTokenizer.class); for (LanguageCode lc : LanguageCode.values()) for (LanguageCode lc : LanguageCode.values()) "Tokenizer for " + lc.toString() + " (" + lc.getIsoCode() + ") is not available." + " This may degrade clustering quality of " + lc.toString() + " content. Cause: {}"));
final LanguageCode languageCode = language.equals("") ? null : LanguageCode.valueOf(language); final Cluster languageCluster = new Cluster( languageCode != null ? languageCode.toString() : "Unknown Language"); languageCounts.put(languageCode != null ? languageCode.getIsoCode() : "", languageDocuments.size());
/** * Reload all lexical resources associated with the given key. */ private static HashMap<LanguageCode, ILexicalData> reloadResources(ResourceLookup resourceLookup) { // Load lexical resources. ObjectHashSet<MutableCharArray> mergedStopwords = new ObjectHashSet<>(); ArrayList<Pattern> mergedStoplabels = Lists.newArrayList(); HashMap<LanguageCode, ILexicalData> resourceMap = Maps.newHashMap(); for (LanguageCode languageCode : LanguageCode.values()) { final String isoCode = languageCode.getIsoCode(); ObjectHashSet<MutableCharArray> stopwords = toLower(load(resourceLookup, "stopwords." + isoCode)); ArrayList<Pattern> stoplabels = compile(load(resourceLookup, "stoplabels." + isoCode)); mergedStopwords.addAll(stopwords); mergedStoplabels.addAll(stoplabels); resourceMap.put(languageCode, new DefaultLexicalData(stopwords, stoplabels)); } resourceMap.put(null, new DefaultLexicalData(mergedStopwords, mergedStoplabels)); return resourceMap; }
final LanguageCode carrot2Language = LanguageCode.forISOCode(lang); if (carrot2Language != null) { carrotDocument.setLanguage(carrot2Language);
doc.setLanguage(LanguageCode.valueOf(req.Language.trim().toUpperCase()));
/**
 * Serialization hook: exposes the clustering language as its ISO code,
 * or {@code null} when no language is set.
 */
@JsonProperty("language")
@Attribute(required = false, name = "language")
private String getLanguageIsoCode() {
    final LanguageCode current = getLanguage();
    if (current == null) {
        return null;
    }
    return current.getIsoCode();
}
/**
 * Formats a cluster label for final rendering. Feature indices below the
 * word count address single words in {@code allWords}; higher indices
 * address phrases in {@code allPhrases}.
 */
public String format(PreprocessingContext context, int featureIndex) {
    final StringBuilder out = new StringBuilder();
    final char [][] images = context.allWords.image;
    final int singleWordCount = images.length;

    if (featureIndex < singleWordCount) {
        // Single-word feature.
        appendFormatted(out, images[featureIndex], true, false);
        return out.toString();
    }

    // Phrase feature: translate to an index into allPhrases.
    final int [] phraseWords =
        context.allPhrases.wordIndices[featureIndex - singleWordCount];
    final short [] types = context.allWords.type;
    final boolean spaceSeparated =
        context.language.getLanguageCode().usesSpaceDelimiters();

    for (int pos = 0; pos < phraseWords.length; pos++) {
        if (spaceSeparated && pos > 0) {
            out.append(' ');
        }
        final int w = phraseWords[pos];
        appendFormatted(out, images[w], pos == 0, TokenTypeUtils.isCommon(types[w]));
    }
    return out.toString();
}
this.majorityLanguage = entry.getElement().getIsoCode(); languageCounts.put(entry.getElement() != null ? entry.getElement().getIsoCode() : "", entry.getCount());
/** * Build the cluster's label from suffix tree edge indices. */ private String buildLabel(int [] phraseIndices) { // Count the number of terms first. int termsCount = 0; for (int j = 0; j < phraseIndices.length; j += 2) { termsCount += phraseIndices[j + 1] - phraseIndices[j] + 1; } // Extract terms info for the phrase and construct the label. final boolean [] stopwords = new boolean[termsCount]; final char [][] images = new char [termsCount][]; final short [] tokenTypes = context.allWords.type; int k = 0; for (int i = 0; i < phraseIndices.length; i += 2) { for (int j = phraseIndices[i]; j <= phraseIndices[i + 1]; j++, k++) { final int termIndex = sb.input.get(j); images[k] = context.allWords.image[termIndex]; stopwords[k] = TokenTypeUtils.isCommon(tokenTypes[termIndex]); } } return LabelFormatter.format(images, stopwords, context.language.getLanguageCode().usesSpaceDelimiters()); }
for (LanguageCode lc : LanguageCode.values()) factory, identity, stemmerVerifier, logger, "Stemmer for " + lc.toString() + " (" + lc.getIsoCode() + ") is not available." + " This may degrade clustering quality of " + lc.toString() + " content. Cause: {}");
/** Maps a document to its language's enum constant name, or "" when the language is unset. */
public String apply(Document document) { final LanguageCode language = document.getLanguage(); return language != null ? language.name() : ""; } });
/**
 * Create and return an {@link IStemmer} adapter for a
 * {@link SnowballProgram} for a given language code. An identity stemmer is
 * returned (with a warning logged) when no stemmer class is registered for
 * the language or when the stemmer cannot be instantiated — in both cases
 * clustering quality may be degraded, but clustering still proceeds.
 */
public static IStemmer createStemmer(LanguageCode language) {
    final Class<? extends SnowballProgram> stemmerClazz = snowballStemmerClasses
        .get(language);

    if (stemmerClazz == null) {
        log.warn("No Snowball stemmer class for: " + language.name()
            + ". Quality of clustering may be degraded.");
        return IdentityStemmer.INSTANCE;
    }

    try {
        // Class.newInstance() is deprecated (it propagates undeclared checked
        // exceptions); invoke the no-arg constructor explicitly instead.
        return new SnowballStemmerAdapter(
            stemmerClazz.getDeclaredConstructor().newInstance());
    } catch (Exception e) {
        log.warn("Could not instantiate snowball stemmer"
            + " for language: " + language.name()
            + ". Quality of clustering may be degraded.", e);
        return IdentityStemmer.INSTANCE;
    }
} }