private boolean addLemmas() { String word = termAttribute.toString(); WordAnalysis analysis = morphology.analyze(word); Set<String> l = new HashSet<>(5); //l.add(word); analysis.forEach(s -> l.addAll(s.getLemmas())); lemmas = new ArrayDeque<>(l); return true; }
/**
 * Analyzes every token of {@code sentence} after normalizing its quote and hyphen characters.
 *
 * @param sentence input sentence.
 * @return one {@link WordAnalysis} per token, in token order.
 */
public List<WordAnalysis> analyzeSentence(String sentence) {
  String normalized = TextUtil.normalizeQuotesHyphens(sentence);
  List<WordAnalysis> analyses = new ArrayList<>();
  for (Token t : tokenizer.tokenize(normalized)) {
    WordAnalysis tokenAnalysis = analyze(t);
    analyses.add(tokenAnalysis);
  }
  return analyses;
}
/**
 * gRPC endpoint: morphologically analyzes the request's input word and emits a single
 * {@code WordAnalysisProto} message before completing the stream.
 */
@Override
public void analyzeWord(
    WordAnalysisRequest request, StreamObserver<WordAnalysisProto> responseObserver) {
  WordAnalysis analysis = morphology.analyze(request.getInput());
  responseObserver.onNext(toWordAnalysisProto(analysis));
  responseObserver.onCompleted();
}
/**
 * Copies {@code vocabFile} to {@code outFile}, dropping every word that the default Turkish
 * morphology cannot analyze. Dropped words are logged as warnings.
 *
 * @throws IOException if the vocabulary cannot be read or the result cannot be written.
 */
private static void filterVocab(Path vocabFile, Path outFile) throws IOException {
  List<String> words = Files.readAllLines(vocabFile, StandardCharsets.UTF_8);
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  List<String> analyzable = new ArrayList<>();
  for (String word : words) {
    if (morphology.analyze(word).isCorrect()) {
      analyzable.add(word);
    } else {
      Log.warn("Cannot analyze %s", word);
    }
  }
  Files.write(outFile, analyzable, StandardCharsets.UTF_8);
}
/**
 * Returns true when no token of the sentence maps to more than one distinct normalized lemma,
 * i.e. every token is lexically unambiguous.
 */
private boolean unambiguous(String sentence) {
  for (String token : TurkishTokenizer.DEFAULT.tokenizeToStrings(sentence)) {
    Set<String> distinctLemmas = new HashSet<>();
    for (SingleAnalysis single : morphology.analyze(token)) {
      distinctLemmas.add(single.getDictionaryItem().normalizedLemma());
    }
    if (distinctLemmas.size() > 1) {
      return false;
    }
  }
  return true;
}
/**
 * Tries to combine words that are written separately using heuristics. If it cannot combine,
 * returns empty string.
 *
 * <p>Such as:
 * <pre>
 * göndere bilirler -> gönderebilirler
 * elma lar -> elmalar
 * ankara 'ya -> ankara'ya
 * </pre>
 */
String combineCommon(String i1, String i2) {
  String combined = i1 + i2;
  // Apostrophe suffixes ("'ya") and ability suffix forms ("bilirler") usually belong to the
  // previous word; accept the combination if it analyzes.
  if (i2.startsWith("'") || i2.startsWith("bil")) {
    WordAnalysis w = morphology.analyze(combined);
    if (hasAnalysis(w)) {
      return combined;
    }
  }
  // If the second token has no regular analysis on its own, it is likely a detached suffix;
  // accept the combination if the joined form analyzes.
  if (!hasRegularAnalysis(i2)) {
    WordAnalysis w = morphology.analyze(combined);
    if (hasAnalysis(w)) {
      return combined;
    }
  }
  return "";
}
List<String> annotations = new ArrayList<>(); for (String word : words) { WordAnalysis analysis = morphology.analyze(word); if (!analysis.isCorrect()) { Log.warn("Cannot analyze %s", word); i == tokens.size() - 1 ? new ArrayList<>() : tokens.subList(i + 1, tokens.size()); String ending = String.join(" ", morphemes); if (isCorrectAndContainsNoProper(morphology.analyze(stem))) { if (ending.length() > 0) { stemEndings.add(word + " " + stem + " " + ending);
List<String> annotations = new ArrayList<>(); for (String word : words) { WordAnalysis analysis = morphology.analyze(word); if (!analysis.isCorrect()) { Log.warn("Cannot analyze %s", word);
/**
 * Generates unranked spelling suggestions for {@code word}: normalizes and strips apostrophes,
 * asks the decoder for candidates, then keeps only candidates that have an acceptable analysis,
 * formatted to match the input's casing.
 */
private List<String> getUnrankedSuggestions(String word) {
  String normalized = TurkishAlphabet.INSTANCE.normalize(word).replaceAll("['’]", "");
  List<String> candidates = decoder.getSuggestions(normalized, charMatcher);
  WordAnalysisSurfaceFormatter.CaseType caseType = formatter.guessCase(word);
  // Mixed- and lower-case inputs fall back to default casing for formatting.
  boolean fallBackToDefault =
      caseType == WordAnalysisSurfaceFormatter.CaseType.MIXED_CASE
          || caseType == WordAnalysisSurfaceFormatter.CaseType.LOWER_CASE;
  if (fallBackToDefault) {
    caseType = WordAnalysisSurfaceFormatter.CaseType.DEFAULT_CASE;
  }
  // LinkedHashSet preserves decoder order while removing duplicate surface forms.
  Set<String> unique = new LinkedHashSet<>(candidates.size());
  for (String candidate : candidates) {
    for (SingleAnalysis analysis : morphology.analyze(candidate)) {
      if (analysis.isUnknown()) {
        continue;
      }
      if (analysisPredicate != null && !analysisPredicate.test(analysis)) {
        continue;
      }
      unique.add(formatter.formatToCase(analysis, caseType, getApostrophe(word)));
    }
  }
  return new ArrayList<>(unique);
}
/**
 * Collects the non-empty suffix ("ending") parts of all known analyses of the given words into
 * a histogram and returns the endings sorted with the Turkish string comparator.
 */
List<String> getEndingsFromVocabulary(List<String> words) {
  Histogram<String> endingCounts = new Histogram<>(words.size() / 10);
  for (String word : words) {
    for (SingleAnalysis analysis : morphology.analyze(word)) {
      if (analysis.isUnknown()) {
        continue;
      }
      String ending = analysis.getStemAndEnding().ending;
      if (!ending.isEmpty()) {
        endingCounts.add(ending);
      }
    }
  }
  return endingCounts.getSortedList(Turkish.STRING_COMPARATOR_ASC);
}
/**
 * Returns true only if the word has at least one analysis whose dictionary item comes from the
 * internal dictionary (not unknown, not runtime-generated) and is neither a proper noun nor an
 * abbreviation.
 */
boolean hasRegularAnalysis(String s) {
  WordAnalysis a = morphology.analyze(s);
  for (SingleAnalysis single : a) {
    if (single.isUnknown() || single.isRuntime()) {
      continue;
    }
    SecondaryPos pos = single.getDictionaryItem().secondaryPos;
    if (pos != SecondaryPos.ProperNoun && pos != SecondaryPos.Abbreviation) {
      return true;
    }
  }
  return false;
}
/**
 * Returns true if {@code input} is a correctly written word: some acceptable analysis of it,
 * when formatted back with the input's own casing and apostrophe, reproduces the input exactly.
 */
public boolean check(String input) {
  WordAnalysis analyses = morphology.analyze(input);
  WordAnalysisSurfaceFormatter.CaseType caseType = formatter.guessCase(input);
  // Loop-invariant: the apostrophe depends only on the input, so compute it once.
  String apostrophe = getApostrophe(input);
  for (SingleAnalysis analysis : analyses) {
    if (analysis.isUnknown()) {
      continue;
    }
    if (analysisPredicate != null && !analysisPredicate.test(analysis)) {
      continue;
    }
    if (formatter.canBeFormatted(analysis, caseType)) {
      String formatted = formatter.formatToCase(analysis, caseType, apostrophe);
      if (input.equals(formatted)) {
        return true;
      }
    }
  }
  return false;
}
/**
 * Logs and prints the analyses of {@code input}, then registers {@code newItem} as a new stem
 * (invalidating the analysis cache first so the change is visible) and logs/prints the analyses
 * of the same input again for comparison.
 */
private void test(String input, DictionaryItem newItem) throws IOException {
  WordAnalysis parsesBefore = morphology.analyze(input);
  Log.info("Parses for " + input + " before adding " + newItem);
  printResults(parsesBefore);
  // The cache must be dropped, otherwise the next analyze call would return stale results.
  morphology.invalidateCache();
  morphology.getMorphotactics().getStemTransitions().addDictionaryItem(newItem);
  WordAnalysis parsesAfter = morphology.analyze(input);
  Log.info("Parses for " + input + " after adding " + newItem);
  printResults(parsesAfter);
}
return; WordAnalysis analyses = morphology.analyze(word); SingleAnalysis longest = analyses.analysisCount() > 0 ?
for (String line : lines) { for (String s : splitter.split(line)) { WordAnalysis results = parser.analyze(s); total++; if (total % 50000 == 0) {
for (String line : lines) { for (String s : splitter.split(line)) { WordAnalysis results = parser.analyze(s); total++; if (total % 50000 == 0) {
/**
 * Demo: analyzes "kisi" with diacritics ignored during analysis and prints each result.
 */
public static void main(String[] args) throws IOException {
  TurkishMorphology morphology =
      TurkishMorphology.builder()
          .ignoreDiacriticsInAnalysis()
          .setLexicon(RootLexicon.getDefault())
          .build();
  for (SingleAnalysis analysis : morphology.analyze("kisi")) {
    System.out.println(analysis);
  }
}
public static void main(String[] args) { TurkishMorphology morphology = TurkishMorphology.createWithDefaults(); String word = "kutucuğumuz"; Log.info("Word = " + word); Log.info("Results: "); WordAnalysis results = morphology.analyze(word); for (SingleAnalysis result : results) { Log.info(result.formatLong()); Log.info("\tStems = " + result.getStems()); Log.info("\tLemmas = " + result.getLemmas()); } }
/**
 * Demo: analyzes "simidime", then regenerates each analysis with the stem swapped to "poğaça"
 * and logs the regenerated surface forms and analyses.
 */
public static void main(String[] args) {
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  DictionaryItem replacementStem = morphology.getLexicon().getMatchingItems("poğaça").get(0);
  String input = "simidime";
  Log.info("Input Word = " + input);
  for (SingleAnalysis analysis : morphology.analyze(input)) {
    List<Result> regenerated =
        morphology.getWordGenerator().generate(replacementStem, analysis.getMorphemes());
    for (Result item : regenerated) {
      Log.info("Input analysis: " + analysis.formatLong());
      Log.info("After stem change, word = " + item.surface);
      Log.info("After stem change, Analysis = " + item.analysis.formatLong());
    }
  }
}
public static void main(String[] args) { TurkishMorphology morphology = TurkishMorphology.createWithDefaults(); String word = "kalemi"; Log.info("Word = " + word); WordAnalysis results = morphology.analyze(word); for (SingleAnalysis result : results) { Log.info("Lexical and Surface : " + result.formatLong()); Log.info("Only Lexical : " + result.formatLexical()); Log.info("Oflazer style : " + AnalysisFormatters.OFLAZER_STYLE.format(result)); Log.info(); } }