private boolean addLemmas() { String word = termAttribute.toString(); WordAnalysis analysis = morphology.analyze(word); Set<String> l = new HashSet<>(5); //l.add(word); analysis.forEach(s -> l.addAll(s.getLemmas())); lemmas = new ArrayDeque<>(l); return true; }
/**
 * Analyzes every token of {@code sentence} after normalizing its quote and hyphen characters.
 *
 * @param sentence input sentence.
 * @return one {@link WordAnalysis} per token, in token order.
 */
public List<WordAnalysis> analyzeSentence(String sentence) {
  String normalized = TextUtil.normalizeQuotesHyphens(sentence);
  List<WordAnalysis> analyses = new ArrayList<>();
  for (Token t : tokenizer.tokenize(normalized)) {
    WordAnalysis tokenAnalysis = analyze(t);
    analyses.add(tokenAnalysis);
  }
  return analyses;
}
/**
 * gRPC endpoint: morphologically analyzes the request's input word and emits a single
 * {@code WordAnalysisProto} message before completing the stream.
 */
@Override
public void analyzeWord(
    WordAnalysisRequest request, StreamObserver<WordAnalysisProto> responseObserver) {
  WordAnalysis analysis = morphology.analyze(request.getInput());
  responseObserver.onNext(toWordAnalysisProto(analysis));
  responseObserver.onCompleted();
}
/**
 * Copies {@code vocabFile} to {@code outFile}, dropping every word that the default Turkish
 * morphology cannot analyze. Dropped words are logged as warnings.
 *
 * @throws IOException if the vocabulary cannot be read or the result cannot be written.
 */
private static void filterVocab(Path vocabFile, Path outFile) throws IOException {
  List<String> words = Files.readAllLines(vocabFile, StandardCharsets.UTF_8);
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  List<String> analyzable = new ArrayList<>();
  for (String word : words) {
    if (morphology.analyze(word).isCorrect()) {
      analyzable.add(word);
    } else {
      Log.warn("Cannot analyze %s", word);
    }
  }
  Files.write(outFile, analyzable, StandardCharsets.UTF_8);
}
/**
 * Returns true when no token of the sentence maps to more than one distinct normalized lemma,
 * i.e. every token is lexically unambiguous.
 */
private boolean unambiguous(String sentence) {
  for (String token : TurkishTokenizer.DEFAULT.tokenizeToStrings(sentence)) {
    Set<String> distinctLemmas = new HashSet<>();
    for (SingleAnalysis single : morphology.analyze(token)) {
      distinctLemmas.add(single.getDictionaryItem().normalizedLemma());
    }
    if (distinctLemmas.size() > 1) {
      return false;
    }
  }
  return true;
}
/**
 * Tries to combine words that are written separately using heuristics. If it cannot combine,
 * returns empty string.
 *
 * <p>Such as:
 * <pre>
 * göndere bilirler -> gönderebilirler
 * elma lar -> elmalar
 * ankara 'ya -> ankara'ya
 * </pre>
 */
String combineCommon(String i1, String i2) {
  String combined = i1 + i2;
  // Apostrophe suffixes ("'ya") and ability suffix forms ("bilirler") usually belong to the
  // previous word; accept the combination if it analyzes.
  if (i2.startsWith("'") || i2.startsWith("bil")) {
    WordAnalysis w = morphology.analyze(combined);
    if (hasAnalysis(w)) {
      return combined;
    }
  }
  // If the second token has no regular analysis on its own, it is likely a detached suffix;
  // accept the combination if the joined form analyzes.
  if (!hasRegularAnalysis(i2)) {
    WordAnalysis w = morphology.analyze(combined);
    if (hasAnalysis(w)) {
      return combined;
    }
  }
  return "";
}
List<String> annotations = new ArrayList<>(); for (String word : words) { WordAnalysis analysis = morphology.analyze(word); if (!analysis.isCorrect()) { Log.warn("Cannot analyze %s", word); i == tokens.size() - 1 ? new ArrayList<>() : tokens.subList(i + 1, tokens.size()); String ending = String.join(" ", morphemes); if (isCorrectAndContainsNoProper(morphology.analyze(stem))) { if (ending.length() > 0) { stemEndings.add(word + " " + stem + " " + ending);
List<String> annotations = new ArrayList<>(); for (String word : words) { WordAnalysis analysis = morphology.analyze(word); if (!analysis.isCorrect()) { Log.warn("Cannot analyze %s", word);
/**
 * Generates unranked spelling suggestions for {@code word}: normalizes and strips apostrophes,
 * asks the decoder for candidates, then keeps only candidates that have an acceptable analysis,
 * formatted to match the input's casing.
 */
private List<String> getUnrankedSuggestions(String word) {
  String normalized = TurkishAlphabet.INSTANCE.normalize(word).replaceAll("['’]", "");
  List<String> candidates = decoder.getSuggestions(normalized, charMatcher);
  WordAnalysisSurfaceFormatter.CaseType caseType = formatter.guessCase(word);
  // Mixed- and lower-case inputs fall back to default casing for formatting.
  boolean fallBackToDefault =
      caseType == WordAnalysisSurfaceFormatter.CaseType.MIXED_CASE
          || caseType == WordAnalysisSurfaceFormatter.CaseType.LOWER_CASE;
  if (fallBackToDefault) {
    caseType = WordAnalysisSurfaceFormatter.CaseType.DEFAULT_CASE;
  }
  // LinkedHashSet preserves decoder order while removing duplicate surface forms.
  Set<String> unique = new LinkedHashSet<>(candidates.size());
  for (String candidate : candidates) {
    for (SingleAnalysis analysis : morphology.analyze(candidate)) {
      if (analysis.isUnknown()) {
        continue;
      }
      if (analysisPredicate != null && !analysisPredicate.test(analysis)) {
        continue;
      }
      unique.add(formatter.formatToCase(analysis, caseType, getApostrophe(word)));
    }
  }
  return new ArrayList<>(unique);
}
/**
 * Collects the non-empty suffix ("ending") parts of all known analyses of the given words into
 * a histogram and returns the endings sorted with the Turkish string comparator.
 */
List<String> getEndingsFromVocabulary(List<String> words) {
  Histogram<String> endingCounts = new Histogram<>(words.size() / 10);
  for (String word : words) {
    for (SingleAnalysis analysis : morphology.analyze(word)) {
      if (analysis.isUnknown()) {
        continue;
      }
      String ending = analysis.getStemAndEnding().ending;
      if (!ending.isEmpty()) {
        endingCounts.add(ending);
      }
    }
  }
  return endingCounts.getSortedList(Turkish.STRING_COMPARATOR_ASC);
}
/**
 * Returns true only if the word has at least one analysis whose dictionary item comes from the
 * internal dictionary (not unknown, not runtime-generated) and is neither a proper noun nor an
 * abbreviation.
 */
boolean hasRegularAnalysis(String s) {
  WordAnalysis a = morphology.analyze(s);
  for (SingleAnalysis single : a) {
    if (single.isUnknown() || single.isRuntime()) {
      continue;
    }
    SecondaryPos pos = single.getDictionaryItem().secondaryPos;
    if (pos != SecondaryPos.ProperNoun && pos != SecondaryPos.Abbreviation) {
      return true;
    }
  }
  return false;
}
/**
 * Returns true if {@code input} is a correctly written word: some acceptable analysis of it,
 * when formatted back with the input's own casing and apostrophe, reproduces the input exactly.
 */
public boolean check(String input) {
  WordAnalysis analyses = morphology.analyze(input);
  WordAnalysisSurfaceFormatter.CaseType caseType = formatter.guessCase(input);
  // Loop-invariant: the apostrophe depends only on the input, so compute it once.
  String apostrophe = getApostrophe(input);
  for (SingleAnalysis analysis : analyses) {
    if (analysis.isUnknown()) {
      continue;
    }
    if (analysisPredicate != null && !analysisPredicate.test(analysis)) {
      continue;
    }
    if (formatter.canBeFormatted(analysis, caseType)) {
      String formatted = formatter.formatToCase(analysis, caseType, apostrophe);
      if (input.equals(formatted)) {
        return true;
      }
    }
  }
  return false;
}
/**
 * Logs and prints the analyses of {@code input}, then registers {@code newItem} as a new stem
 * (invalidating the analysis cache first so the change is visible) and logs/prints the analyses
 * of the same input again for comparison.
 */
private void test(String input, DictionaryItem newItem) throws IOException {
  WordAnalysis parsesBefore = morphology.analyze(input);
  Log.info("Parses for " + input + " before adding " + newItem);
  printResults(parsesBefore);
  // The cache must be dropped, otherwise the next analyze call would return stale results.
  morphology.invalidateCache();
  morphology.getMorphotactics().getStemTransitions().addDictionaryItem(newItem);
  WordAnalysis parsesAfter = morphology.analyze(input);
  Log.info("Parses for " + input + " after adding " + newItem);
  printResults(parsesAfter);
}
return; WordAnalysis analyses = morphology.analyze(word); SingleAnalysis longest = analyses.analysisCount() > 0 ?
for (String line : lines) { for (String s : splitter.split(line)) { WordAnalysis results = parser.analyze(s); total++; if (total % 50000 == 0) {
for (String line : lines) { for (String s : splitter.split(line)) { WordAnalysis results = parser.analyze(s); total++; if (total % 50000 == 0) {
/**
 * Demo: analyzes "kisi" with diacritics ignored during analysis and prints each result.
 */
public static void main(String[] args) throws IOException {
  TurkishMorphology morphology =
      TurkishMorphology.builder()
          .ignoreDiacriticsInAnalysis()
          .setLexicon(RootLexicon.getDefault())
          .build();
  for (SingleAnalysis analysis : morphology.analyze("kisi")) {
    System.out.println(analysis);
  }
}
public static void main(String[] args) { TurkishMorphology morphology = TurkishMorphology.createWithDefaults(); String word = "kutucuğumuz"; Log.info("Word = " + word); Log.info("Results: "); WordAnalysis results = morphology.analyze(word); for (SingleAnalysis result : results) { Log.info(result.formatLong()); Log.info("\tStems = " + result.getStems()); Log.info("\tLemmas = " + result.getLemmas()); } }
/**
 * Demo: analyzes "simidime", then regenerates each analysis with the stem swapped to "poğaça"
 * and logs the regenerated surface forms and analyses.
 */
public static void main(String[] args) {
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  DictionaryItem replacementStem = morphology.getLexicon().getMatchingItems("poğaça").get(0);
  String input = "simidime";
  Log.info("Input Word = " + input);
  for (SingleAnalysis analysis : morphology.analyze(input)) {
    List<Result> regenerated =
        morphology.getWordGenerator().generate(replacementStem, analysis.getMorphemes());
    for (Result item : regenerated) {
      Log.info("Input analysis: " + analysis.formatLong());
      Log.info("After stem change, word = " + item.surface);
      Log.info("After stem change, Analysis = " + item.analysis.formatLong());
    }
  }
}
public static void main(String[] args) { TurkishMorphology morphology = TurkishMorphology.createWithDefaults(); String word = "kalemi"; Log.info("Word = " + word); WordAnalysis results = morphology.analyze(word); for (SingleAnalysis result : results) { Log.info("Lexical and Surface : " + result.formatLong()); Log.info("Only Lexical : " + result.formatLexical()); Log.info("Oflazer style : " + AnalysisFormatters.OFLAZER_STYLE.format(result)); Log.info(); } }