/**
 * Reads a vocabulary file, keeps only the words Zemberek can analyze, and
 * writes the surviving words to the output file. Unanalyzable words are
 * logged as warnings and dropped.
 *
 * @param vocabFile input vocabulary, one word per line, UTF-8 encoded.
 * @param outFile   output path for the analyzable words, UTF-8 encoded.
 * @throws IOException if either file cannot be read or written.
 */
private static void filterVocab(Path vocabFile, Path outFile) throws IOException {
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  List<String> analyzable = new ArrayList<>();
  for (String word : Files.readAllLines(vocabFile, StandardCharsets.UTF_8)) {
    if (morphology.analyze(word).isCorrect()) {
      analyzable.add(word);
    } else {
      Log.warn("Cannot analyze %s", word);
    }
  }
  Files.write(outFile, analyzable, StandardCharsets.UTF_8);
}
/**
 * Rewrites a sentence by replacing each word with the last lemma of its best
 * disambiguated analysis. Words the analyzer does not know are kept verbatim.
 *
 * @param sentence input sentence.
 * @return the sentence rebuilt from lemmas, tokens joined by single spaces.
 */
private String replaceWordsWithLemma(String sentence) {
  List<String> tokens = new ArrayList<>();
  for (SentenceWordAnalysis wordAnalysis : morphology.analyzeAndDisambiguate(sentence)) {
    SingleAnalysis best = wordAnalysis.getBestAnalysis();
    if (best.isUnknown()) {
      // No analysis available; keep the original surface form.
      tokens.add(wordAnalysis.getWordAnalysis().getInput());
    } else {
      // The last lemma is the most derived (longest) one.
      List<String> lemmas = best.getLemmas();
      tokens.add(lemmas.get(lemmas.size() - 1));
    }
  }
  return String.join(" ", tokens);
}
/**
 * Word-generation demo: prints every inflection of "armut" for the cartesian
 * product of number, possessive and case morphemes.
 */
public static void main(String[] args) {
  String[] number = {"A3sg", "A3pl"};
  String[] possessives = {"P1sg", "P2sg", "P3sg"};
  String[] cases = {"Dat", "Loc", "Abl"};
  // A one-word lexicon is enough for generation; caching is unnecessary here.
  TurkishMorphology morphology =
      TurkishMorphology.builder().setLexicon("armut").disableCache().build();
  DictionaryItem item = morphology.getLexicon().getMatchingItems("armut").get(0);
  for (String numberM : number) {
    for (String possessiveM : possessives) {
      for (String caseM : cases) {
        morphology.getWordGenerator()
            .generate(item, numberM, possessiveM, caseM)
            .forEach(result -> System.out.println(result.surface));
      }
    }
  }
}
/**
 * Applies morphological analysis and disambiguation to a sentence.
 *
 * @param sentence Sentence.
 * @return SentenceAnalysis instance.
 */
public SentenceAnalysis analyzeAndDisambiguate(String sentence) {
  // First produce all candidate analyses per word, then pick the best ones.
  List<WordAnalysis> ambiguousAnalyses = analyzeSentence(sentence);
  return disambiguate(sentence, ambiguousAnalyses);
}
/**
 * Collects lemmas from the default lexicon — skipping dummy items,
 * punctuation entries and lemmas shorter than {@code minLength} — and writes
 * the deduplicated, sorted result to "zemberek.vocab".
 *
 * @param minLength minimum lemma length to keep.
 * @throws IOException if the output file cannot be written.
 */
public static void saveLemmas(int minLength) throws IOException {
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  Set<String> lemmas = new HashSet<>();
  for (DictionaryItem item : morphology.getLexicon()) {
    boolean skip = item.attributes.contains(RootAttribute.Dummy)
        || item.lemma.length() < minLength
        || item.primaryPos == PrimaryPos.Punctuation;
    if (!skip) {
      lemmas.add(item.lemma);
    }
  }
  List<String> sorted = new ArrayList<>(lemmas);
  sorted.sort(Turkish.STRING_COMPARATOR_ASC);
  Files.write(Paths.get("zemberek.vocab"), sorted);
}
/**
 * Creates a gRPC service context holding the full tokenizer and the default
 * morphology.
 *
 * @param configuration server configuration to retain.
 */
public ZemberekContext(ZemberekGrpcConfiguration configuration) {
  this.configuration = configuration;
  this.tokenizer = TurkishTokenizer.ALL;
  this.morphology = TurkishMorphology.createWithDefaults();
}
private boolean addLemmas() { String word = termAttribute.toString(); WordAnalysis analysis = morphology.analyze(word); Set<String> l = new HashSet<>(5); //l.add(word); analysis.forEach(s -> l.addAll(s.getLemmas())); lemmas = new ArrayDeque<>(l); return true; }
/**
 * Stem-change demo: analyzes "simidime", then regenerates each analysis's
 * morpheme sequence over the new stem "poğaça" and prints the results.
 */
public static void main(String[] args) {
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  DictionaryItem newStem = morphology.getLexicon().getMatchingItems("poğaça").get(0);
  String word = "simidime";
  Log.info("Input Word = " + word);
  for (SingleAnalysis analysis : morphology.analyze(word)) {
    // Re-generate the same morphemes, but rooted at the new stem.
    List<Result> generated =
        morphology.getWordGenerator().generate(newStem, analysis.getMorphemes());
    for (Result item : generated) {
      Log.info("Input analysis: " + analysis.formatLong());
      Log.info("After stem change, word = " + item.surface);
      Log.info("After stem change, Analysis = " + item.analysis.formatLong());
    }
  }
}
/**
 * Disambiguation demo: prints every candidate analysis per word of a
 * sentence, then the single best analysis per word after disambiguation.
 */
public static void main(String[] args) {
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  String sentence = "Bol baharatlı bir yemek yaptıralım.";
  Log.info("Sentence = " + sentence);
  List<WordAnalysis> analyses = morphology.analyzeSentence(sentence);
  Log.info("Sentence word analysis result:");
  for (WordAnalysis wordAnalysis : analyses) {
    Log.info("Word = " + wordAnalysis.getInput());
    for (SingleAnalysis candidate : wordAnalysis) {
      Log.info(candidate.formatLong());
    }
  }
  SentenceAnalysis result = morphology.disambiguate(sentence, analyses);
  Log.info("\nAfter ambiguity resolution : ");
  result.bestAnalysis().forEach(Log::info);
}
}
/**
 * POS demo: prints the primary part of speech of the best analysis for every
 * word of a sentence.
 */
public static void main(String[] args) {
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  String sentence = "Keşke yarın hava güzel olsa.";
  Log.info("Sentence = " + sentence);
  for (SentenceWordAnalysis wordAnalysis : morphology.analyzeAndDisambiguate(sentence)) {
    PrimaryPos pos = wordAnalysis.getBestAnalysis().getPos();
    Log.info("%s : %s ", wordAnalysis.getWordAnalysis().getInput(), pos);
  }
}
/**
 * Prints analyses of {@code input} before and after {@code newItem} is added
 * to the live morphotactics. The analysis cache is invalidated in between so
 * the second run is not served stale results.
 *
 * @param input   surface form to analyze.
 * @param newItem dictionary item to inject into the stem transitions.
 * @throws IOException propagated from the printing helper.
 */
private void test(String input, DictionaryItem newItem) throws IOException {
  analyzeAndPrint(input, "before adding " + newItem);
  morphology.invalidateCache();
  morphology.getMorphotactics().getStemTransitions().addDictionaryItem(newItem);
  analyzeAndPrint(input, "after adding " + newItem);
}

/** Analyzes {@code input}, logs a stage header, then prints the results. */
private void analyzeAndPrint(String input, String stage) throws IOException {
  WordAnalysis analyses = morphology.analyze(input);
  Log.info("Parses for " + input + " " + stage);
  printResults(analyses);
}
/**
 * Informal-Turkish demo: analyzes "okuycam diyo" with informal analysis
 * enabled, prints the best analyses, then converts them back to formal
 * surface forms.
 */
public static void main(String[] args) {
  TurkishMorphology morphology = TurkishMorphology.builder()
      .setLexicon(RootLexicon.getDefault())
      .useInformalAnalysis()
      .build();
  List<SingleAnalysis> analyses =
      morphology.analyzeAndDisambiguate("okuycam diyo").bestAnalysis();
  for (SingleAnalysis analysis : analyses) {
    System.out.println(analysis.surfaceForm() + "-" + analysis);
  }
  System.out.println("Converting formal surface form:");
  InformalAnalysisConverter converter =
      new InformalAnalysisConverter(morphology.getWordGenerator());
  for (SingleAnalysis analysis : analyses) {
    System.out.println(converter.convert(analysis.surfaceForm(), analysis));
  }
}
}
// NOTE(review): incomplete fragment — the enclosing method, the definitions
// of `analysis`, `candidates` and `current`, and the closing brace of the
// `if` block are outside this view. Code kept byte-identical.
// Analyze the current candidate with the informal/ASCII-tolerant analyzer.
WordAnalysis analyses = informalAsciiTolerantMorphology.analyze(current);
// Regenerate surface forms from an externally produced `analysis`, reusing
// its dictionary item and morpheme sequence.
List<WordGenerator.Result> results = morphology.getWordGenerator().generate(
    analysis.getDictionaryItem(), analysis.getMorphemes());
// Accept `current` when nothing has been collected yet, or when the default
// morphology analyzes it as a correct word.
if (candidates.isEmpty() || morphology.analyze(current).isCorrect()) {
  candidates.add(current);
// NOTE(review): incomplete fragment — the `if` body and the method are not
// closed in this view; `disableUnknownAnalysis`, `morphology` and `input`
// are defined elsewhere. Code kept byte-identical.
@Override public void run() {
  // Start from the default root lexicon; optionally turn off analysis of
  // words that are not found in the dictionary.
  Builder b = TurkishMorphology.builder().setLexicon(RootLexicon.getDefault());
  if (disableUnknownAnalysis) {
    b.disableUnidentifiedTokenAnalyzer();
  // presumably the built morphology is assigned before this call — the
  // fragment continues past this view; TODO confirm against full source.
  SentenceAnalysis analysis = morphology.analyzeAndDisambiguate(input);
// NOTE(review): this span looks like several alternative branches of an
// analysis endpoint (disambiguated deep analysis / best-analysis only /
// plain per-word analysis) concatenated and truncated — it contains
// duplicate declarations and unbalanced braces and would not compile as-is.
// Code kept byte-identical; reconcile against the full source.
if (disambiguate) {
  if (deepWordAnalysis) {
    // Branch 1: full disambiguated analysis, one entry per sentence word.
    SentenceAnalysis results = morphology.analyzeAndDisambiguate(sentence);
    for (SentenceWordAnalysis wordAnalysis : results) {
      List<AnalyzeWordItem> analyze_list = new ArrayList<>();
      // Branch 2 (?): only the best analysis per word.
      List<SingleAnalysis> singleAnalyses = morphology.analyzeAndDisambiguate(sentence)
          .bestAnalysis();
      for (SingleAnalysis analysis : singleAnalyses) {
        // Branch 3 (?): ambiguous per-word analysis without disambiguation.
        List<WordAnalysis> results = morphology.analyzeSentence(sentence);
        for (WordAnalysis wordAnalysis : results) {
          List<AnalyzeWordItem> analyze_list = new ArrayList<>();
          List<WordAnalysis> results = morphology.analyzeSentence(sentence);
          for (WordAnalysis wordAnalysis : results) {
            List<AnalyzeWordItem> analyze_list = new ArrayList<>();
// Build the morphology from the default root lexicon with no extra options.
// NOTE(review): fragment — the `morphology` field declaration and the
// enclosing method/constructor are outside this view.
morphology = TurkishMorphology.builder()
    .setLexicon(RootLexicon.getDefault())
    .build();
/**
 * Removes from {@code input} every line that matches a lemma found in the
 * Zemberek text dictionaries, preserving the original order of the remaining
 * lines, and writes the result to {@code out} as UTF-8.
 *
 * @param input path of the word list to filter, one word per line, UTF-8.
 * @param out   output path for the filtered list.
 * @throws IOException if reading the input or writing the output fails.
 */
private static void removeZemberekDictionaryWordsFromList(Path input, Path out)
    throws IOException {
  // LinkedHashSet: dedupe while keeping the input order for the output.
  LinkedHashSet<String> list = new LinkedHashSet<>(
      Files.readAllLines(input, StandardCharsets.UTF_8));
  System.out.println("Total amount of lines = " + list.size());
  TurkishMorphology morphology = TurkishMorphology.create(
      RootLexicon.builder().addTextDictionaryResources(
          "tr/master-dictionary.dict",
          "tr/non-tdk.dict",
          "tr/proper.dict",
          "tr/proper-from-corpus.dict",
          "tr/abbreviations.dict"
      ).build());
  List<String> toRemove = new ArrayList<>();
  for (DictionaryItem item : morphology.getLexicon()) {
    if (list.contains(item.lemma)) {
      toRemove.add(item.lemma);
    }
  }
  System.out.println("Total amount to remove = " + toRemove.size());
  list.removeAll(toRemove);
  // Files.write with an explicit Charset replaces the legacy
  // PrintWriter(File, String) constructor (string charset name, extra
  // checked-exception path) and matches the UTF-8 I/O style used elsewhere
  // in this file.
  Files.write(out, list, StandardCharsets.UTF_8);
}
/**
 * Analyzes every word of the sentence and applies the rule set to decide
 * among the ambiguous analyses.
 *
 * @param sentence raw input sentence.
 * @return result holding the sentence, its ambiguous analyses, and the
 *     decisions made by the rules.
 */
public ResultSentence disambiguate(String sentence) {
  ResultSentence result =
      new ResultSentence(sentence, analyzer.analyzeSentence(sentence));
  result.makeDecisions(rules);
  return result;
}