/**
 * Reads a vocabulary file, keeps only the words Zemberek can analyze, and
 * writes the surviving words to the output file. Unanalyzable words are
 * logged as warnings and dropped.
 *
 * @param vocabFile input vocabulary, one word per line, UTF-8 encoded.
 * @param outFile   output path for the analyzable words, UTF-8 encoded.
 * @throws IOException if either file cannot be read or written.
 */
private static void filterVocab(Path vocabFile, Path outFile) throws IOException {
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  List<String> analyzable = new ArrayList<>();
  for (String word : Files.readAllLines(vocabFile, StandardCharsets.UTF_8)) {
    if (morphology.analyze(word).isCorrect()) {
      analyzable.add(word);
    } else {
      Log.warn("Cannot analyze %s", word);
    }
  }
  Files.write(outFile, analyzable, StandardCharsets.UTF_8);
}
/**
 * Rewrites a sentence by replacing each word with the last lemma of its best
 * disambiguated analysis. Words the analyzer does not know are kept verbatim.
 *
 * @param sentence input sentence.
 * @return the sentence rebuilt from lemmas, tokens joined by single spaces.
 */
private String replaceWordsWithLemma(String sentence) {
  List<String> tokens = new ArrayList<>();
  for (SentenceWordAnalysis wordAnalysis : morphology.analyzeAndDisambiguate(sentence)) {
    SingleAnalysis best = wordAnalysis.getBestAnalysis();
    if (best.isUnknown()) {
      // No analysis available; keep the original surface form.
      tokens.add(wordAnalysis.getWordAnalysis().getInput());
    } else {
      // The last lemma is the most derived (longest) one.
      List<String> lemmas = best.getLemmas();
      tokens.add(lemmas.get(lemmas.size() - 1));
    }
  }
  return String.join(" ", tokens);
}
/**
 * Word-generation demo: prints every inflection of "armut" for the cartesian
 * product of number, possessive and case morphemes.
 */
public static void main(String[] args) {
  String[] number = {"A3sg", "A3pl"};
  String[] possessives = {"P1sg", "P2sg", "P3sg"};
  String[] cases = {"Dat", "Loc", "Abl"};
  // A one-word lexicon is enough for generation; caching is unnecessary here.
  TurkishMorphology morphology =
      TurkishMorphology.builder().setLexicon("armut").disableCache().build();
  DictionaryItem item = morphology.getLexicon().getMatchingItems("armut").get(0);
  for (String numberM : number) {
    for (String possessiveM : possessives) {
      for (String caseM : cases) {
        morphology.getWordGenerator()
            .generate(item, numberM, possessiveM, caseM)
            .forEach(result -> System.out.println(result.surface));
      }
    }
  }
}
/**
 * Applies morphological analysis and disambiguation to a sentence.
 *
 * @param sentence Sentence.
 * @return SentenceAnalysis instance.
 */
public SentenceAnalysis analyzeAndDisambiguate(String sentence) {
  // First produce all candidate analyses per word, then pick the best ones.
  List<WordAnalysis> ambiguousAnalyses = analyzeSentence(sentence);
  return disambiguate(sentence, ambiguousAnalyses);
}
/**
 * Collects lemmas from the default lexicon — skipping dummy items,
 * punctuation entries and lemmas shorter than {@code minLength} — and writes
 * the deduplicated, sorted result to "zemberek.vocab".
 *
 * @param minLength minimum lemma length to keep.
 * @throws IOException if the output file cannot be written.
 */
public static void saveLemmas(int minLength) throws IOException {
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  Set<String> lemmas = new HashSet<>();
  for (DictionaryItem item : morphology.getLexicon()) {
    boolean skip = item.attributes.contains(RootAttribute.Dummy)
        || item.lemma.length() < minLength
        || item.primaryPos == PrimaryPos.Punctuation;
    if (!skip) {
      lemmas.add(item.lemma);
    }
  }
  List<String> sorted = new ArrayList<>(lemmas);
  sorted.sort(Turkish.STRING_COMPARATOR_ASC);
  Files.write(Paths.get("zemberek.vocab"), sorted);
}
/**
 * Creates a gRPC service context holding the full tokenizer and the default
 * morphology.
 *
 * @param configuration server configuration to retain.
 */
public ZemberekContext(ZemberekGrpcConfiguration configuration) {
  this.configuration = configuration;
  this.tokenizer = TurkishTokenizer.ALL;
  this.morphology = TurkishMorphology.createWithDefaults();
}
private boolean addLemmas() { String word = termAttribute.toString(); WordAnalysis analysis = morphology.analyze(word); Set<String> l = new HashSet<>(5); //l.add(word); analysis.forEach(s -> l.addAll(s.getLemmas())); lemmas = new ArrayDeque<>(l); return true; }
/**
 * Stem-change demo: analyzes "simidime", then regenerates each analysis's
 * morpheme sequence over the new stem "poğaça" and prints the results.
 */
public static void main(String[] args) {
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  DictionaryItem newStem = morphology.getLexicon().getMatchingItems("poğaça").get(0);
  String word = "simidime";
  Log.info("Input Word = " + word);
  for (SingleAnalysis analysis : morphology.analyze(word)) {
    // Re-generate the same morphemes, but rooted at the new stem.
    List<Result> generated =
        morphology.getWordGenerator().generate(newStem, analysis.getMorphemes());
    for (Result item : generated) {
      Log.info("Input analysis: " + analysis.formatLong());
      Log.info("After stem change, word = " + item.surface);
      Log.info("After stem change, Analysis = " + item.analysis.formatLong());
    }
  }
}
/**
 * Disambiguation demo: prints every candidate analysis per word of a
 * sentence, then the single best analysis per word after disambiguation.
 */
public static void main(String[] args) {
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  String sentence = "Bol baharatlı bir yemek yaptıralım.";
  Log.info("Sentence = " + sentence);
  List<WordAnalysis> analyses = morphology.analyzeSentence(sentence);
  Log.info("Sentence word analysis result:");
  for (WordAnalysis wordAnalysis : analyses) {
    Log.info("Word = " + wordAnalysis.getInput());
    for (SingleAnalysis candidate : wordAnalysis) {
      Log.info(candidate.formatLong());
    }
  }
  SentenceAnalysis result = morphology.disambiguate(sentence, analyses);
  Log.info("\nAfter ambiguity resolution : ");
  result.bestAnalysis().forEach(Log::info);
}
}
/**
 * POS demo: prints the primary part of speech of the best analysis for every
 * word of a sentence.
 */
public static void main(String[] args) {
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  String sentence = "Keşke yarın hava güzel olsa.";
  Log.info("Sentence = " + sentence);
  for (SentenceWordAnalysis wordAnalysis : morphology.analyzeAndDisambiguate(sentence)) {
    PrimaryPos pos = wordAnalysis.getBestAnalysis().getPos();
    Log.info("%s : %s ", wordAnalysis.getWordAnalysis().getInput(), pos);
  }
}
/**
 * Prints analyses of {@code input} before and after {@code newItem} is added
 * to the live morphotactics. The analysis cache is invalidated in between so
 * the second run is not served stale results.
 *
 * @param input   surface form to analyze.
 * @param newItem dictionary item to inject into the stem transitions.
 * @throws IOException propagated from the printing helper.
 */
private void test(String input, DictionaryItem newItem) throws IOException {
  analyzeAndPrint(input, "before adding " + newItem);
  morphology.invalidateCache();
  morphology.getMorphotactics().getStemTransitions().addDictionaryItem(newItem);
  analyzeAndPrint(input, "after adding " + newItem);
}

/** Analyzes {@code input}, logs a stage header, then prints the results. */
private void analyzeAndPrint(String input, String stage) throws IOException {
  WordAnalysis analyses = morphology.analyze(input);
  Log.info("Parses for " + input + " " + stage);
  printResults(analyses);
}
/**
 * Informal-Turkish demo: analyzes "okuycam diyo" with informal analysis
 * enabled, prints the best analyses, then converts them back to formal
 * surface forms.
 */
public static void main(String[] args) {
  TurkishMorphology morphology = TurkishMorphology.builder()
      .setLexicon(RootLexicon.getDefault())
      .useInformalAnalysis()
      .build();
  List<SingleAnalysis> analyses =
      morphology.analyzeAndDisambiguate("okuycam diyo").bestAnalysis();
  for (SingleAnalysis analysis : analyses) {
    System.out.println(analysis.surfaceForm() + "-" + analysis);
  }
  System.out.println("Converting formal surface form:");
  InformalAnalysisConverter converter =
      new InformalAnalysisConverter(morphology.getWordGenerator());
  for (SingleAnalysis analysis : analyses) {
    System.out.println(converter.convert(analysis.surfaceForm(), analysis));
  }
}
}
// NOTE(review): incomplete fragment — the enclosing method, the definitions
// of `analysis`, `candidates` and `current`, and the closing brace of the
// `if` block are outside this view. Code kept byte-identical.
// Analyze the current candidate with the informal/ASCII-tolerant analyzer.
WordAnalysis analyses = informalAsciiTolerantMorphology.analyze(current);
// Regenerate surface forms from an externally produced `analysis`, reusing
// its dictionary item and morpheme sequence.
List<WordGenerator.Result> results = morphology.getWordGenerator().generate(
    analysis.getDictionaryItem(), analysis.getMorphemes());
// Accept `current` when nothing has been collected yet, or when the default
// morphology analyzes it as a correct word.
if (candidates.isEmpty() || morphology.analyze(current).isCorrect()) {
  candidates.add(current);
// NOTE(review): incomplete fragment — the `if` body and the method are not
// closed in this view; `disableUnknownAnalysis`, `morphology` and `input`
// are defined elsewhere. Code kept byte-identical.
@Override public void run() {
  // Start from the default root lexicon; optionally turn off analysis of
  // words that are not found in the dictionary.
  Builder b = TurkishMorphology.builder().setLexicon(RootLexicon.getDefault());
  if (disableUnknownAnalysis) {
    b.disableUnidentifiedTokenAnalyzer();
  // presumably the built morphology is assigned before this call — the
  // fragment continues past this view; TODO confirm against full source.
  SentenceAnalysis analysis = morphology.analyzeAndDisambiguate(input);
// NOTE(review): this span looks like several alternative branches of an
// analysis endpoint (disambiguated deep analysis / best-analysis only /
// plain per-word analysis) concatenated and truncated — it contains
// duplicate declarations and unbalanced braces and would not compile as-is.
// Code kept byte-identical; reconcile against the full source.
if (disambiguate) {
  if (deepWordAnalysis) {
    // Branch 1: full disambiguated analysis, one entry per sentence word.
    SentenceAnalysis results = morphology.analyzeAndDisambiguate(sentence);
    for (SentenceWordAnalysis wordAnalysis : results) {
      List<AnalyzeWordItem> analyze_list = new ArrayList<>();
      // Branch 2 (?): only the best analysis per word.
      List<SingleAnalysis> singleAnalyses = morphology.analyzeAndDisambiguate(sentence)
          .bestAnalysis();
      for (SingleAnalysis analysis : singleAnalyses) {
        // Branch 3 (?): ambiguous per-word analysis without disambiguation.
        List<WordAnalysis> results = morphology.analyzeSentence(sentence);
        for (WordAnalysis wordAnalysis : results) {
          List<AnalyzeWordItem> analyze_list = new ArrayList<>();
          List<WordAnalysis> results = morphology.analyzeSentence(sentence);
          for (WordAnalysis wordAnalysis : results) {
            List<AnalyzeWordItem> analyze_list = new ArrayList<>();
// Build the morphology from the default root lexicon with no extra options.
// NOTE(review): fragment — the `morphology` field declaration and the
// enclosing method/constructor are outside this view.
morphology = TurkishMorphology.builder()
    .setLexicon(RootLexicon.getDefault())
    .build();
/**
 * Removes from {@code input} every line that matches a lemma found in the
 * Zemberek text dictionaries, preserving the original order of the remaining
 * lines, and writes the result to {@code out} as UTF-8.
 *
 * @param input path of the word list to filter, one word per line, UTF-8.
 * @param out   output path for the filtered list.
 * @throws IOException if reading the input or writing the output fails.
 */
private static void removeZemberekDictionaryWordsFromList(Path input, Path out)
    throws IOException {
  // LinkedHashSet: dedupe while keeping the input order for the output.
  LinkedHashSet<String> list = new LinkedHashSet<>(
      Files.readAllLines(input, StandardCharsets.UTF_8));
  System.out.println("Total amount of lines = " + list.size());
  TurkishMorphology morphology = TurkishMorphology.create(
      RootLexicon.builder().addTextDictionaryResources(
          "tr/master-dictionary.dict",
          "tr/non-tdk.dict",
          "tr/proper.dict",
          "tr/proper-from-corpus.dict",
          "tr/abbreviations.dict"
      ).build());
  List<String> toRemove = new ArrayList<>();
  for (DictionaryItem item : morphology.getLexicon()) {
    if (list.contains(item.lemma)) {
      toRemove.add(item.lemma);
    }
  }
  System.out.println("Total amount to remove = " + toRemove.size());
  list.removeAll(toRemove);
  // Files.write with an explicit Charset replaces the legacy
  // PrintWriter(File, String) constructor (string charset name, extra
  // checked-exception path) and matches the UTF-8 I/O style used elsewhere
  // in this file.
  Files.write(out, list, StandardCharsets.UTF_8);
}
/**
 * Analyzes every word of the sentence and applies the rule set to decide
 * among the ambiguous analyses.
 *
 * @param sentence raw input sentence.
 * @return result holding the sentence, its ambiguous analyses, and the
 *     decisions made by the rules.
 */
public ResultSentence disambiguate(String sentence) {
  ResultSentence result =
      new ResultSentence(sentence, analyzer.analyzeSentence(sentence));
  result.makeDecisions(rules);
  return result;
}