/**
 * Collects lemmas from the default Turkish lexicon and writes them, sorted with the
 * Turkish collation order, to the file "zemberek.vocab".
 * Dummy root items, punctuation items and lemmas shorter than {@code minLength} are skipped.
 *
 * @param minLength minimum lemma length (in characters) required for inclusion
 * @throws IOException if the output file cannot be written
 */
public static void saveLemmas(int minLength) throws IOException {
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  Set<String> lemmas = new HashSet<>();
  for (DictionaryItem item : morphology.getLexicon()) {
    String lemma = item.lemma;
    boolean skip =
        item.attributes.contains(RootAttribute.Dummy)
            || lemma.length() < minLength
            || item.primaryPos == PrimaryPos.Punctuation;
    if (!skip) {
      lemmas.add(lemma);
    }
  }
  List<String> sorted = new ArrayList<>(lemmas);
  sorted.sort(Turkish.STRING_COMPARATOR_ASC);
  Files.write(Paths.get("zemberek.vocab"), sorted);
}
/**
 * Builds a stop-word set from the default lexicon by collecting lemmas whose primary POS
 * belongs to a fixed set of closed word classes (adverbs, conjunctions, determiners,
 * interjections, postpositives, numerals, pronouns and question words).
 * The resulting set preserves Turkish sort order via a LinkedHashSet.
 *
 * @throws IOException if the default lexicon resources cannot be loaded
 */
static TurkishStopWords generateFromDictionary() throws IOException {
  Set<PrimaryPos> stopPos = Sets.newHashSet(
      PrimaryPos.Adverb,
      PrimaryPos.Conjunction,
      PrimaryPos.Determiner,
      PrimaryPos.Interjection,
      PrimaryPos.PostPositive,
      PrimaryPos.Numeral,
      PrimaryPos.Pronoun,
      PrimaryPos.Question);
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  Set<String> lemmas = new HashSet<>();
  for (DictionaryItem item : morphology.getLexicon()) {
    if (stopPos.contains(item.primaryPos)) {
      lemmas.add(item.lemma);
    }
  }
  List<String> sorted = new ArrayList<>(lemmas);
  sorted.sort(Turkish.STRING_COMPARATOR_ASC);
  return new TurkishStopWords(new LinkedHashSet<>(sorted));
}
/**
 * Writes all "regular" lemmas from the default lexicon to "zemberek.vocab", sorted with the
 * Turkish collation order. Dummy root items and punctuation are excluded. For lemmas that
 * contain circumflexed characters, the circumflex-normalized form is added as well.
 *
 * @throws IOException if the output file cannot be written
 */
public static void saveRegular() throws IOException {
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  TurkishAlphabet alphabet = TurkishAlphabet.INSTANCE;
  Set<String> lemmas = new HashSet<>();
  for (DictionaryItem item : morphology.getLexicon()) {
    String lemma = item.lemma;
    if (item.attributes.contains(RootAttribute.Dummy)) {
      continue;
    }
    // Proper noun / abbreviation filtering was considered but is intentionally disabled.
    if (item.primaryPos == PrimaryPos.Punctuation
        /*|| item.secondaryPos == SecondaryPos.ProperNoun
          || item.secondaryPos == SecondaryPos.Abbreviation*/) {
      continue;
    }
    lemmas.add(lemma);
    if (alphabet.containsCircumflex(lemma)) {
      lemmas.add(alphabet.normalizeCircumflex(lemma));
    }
  }
  List<String> sorted = new ArrayList<>(lemmas);
  sorted.sort(Turkish.STRING_COMPARATOR_ASC);
  Files.write(Paths.get("zemberek.vocab"), sorted);
}
/**
 * Removes from the word list in {@code input} every line that also appears as a lemma in the
 * Zemberek text dictionaries, then writes the surviving lines to {@code out} in UTF-8,
 * preserving their original order.
 *
 * <p>Fix: the original wrote through {@code PrintWriter}, which swallows write errors
 * (they are only visible via {@code checkError()}), and used the legacy
 * {@code PrintWriter(File, String)} charset constructor. {@code Files.write} propagates
 * I/O failures as {@link IOException}.
 *
 * @param input UTF-8 text file with one word per line
 * @param out   destination file; overwritten if it exists
 * @throws IOException if either file cannot be read or written
 */
private static void removeZemberekDictionaryWordsFromList(Path input, Path out)
    throws IOException {
  // LinkedHashSet: de-duplicates while keeping first-seen order for the final output.
  LinkedHashSet<String> lines = new LinkedHashSet<>(
      Files.readAllLines(input, StandardCharsets.UTF_8));
  System.out.println("Total amount of lines = " + lines.size());
  TurkishMorphology morphology = TurkishMorphology.create(
      RootLexicon.builder().addTextDictionaryResources(
          "tr/master-dictionary.dict",
          "tr/non-tdk.dict",
          "tr/proper.dict",
          "tr/proper-from-corpus.dict",
          "tr/abbreviations.dict"
      ).build());
  List<String> toRemove = new ArrayList<>();
  for (DictionaryItem item : morphology.getLexicon()) {
    if (lines.contains(item.lemma)) {
      toRemove.add(item.lemma);
    }
  }
  System.out.println("Total amount to remove = " + toRemove.size());
  lines.removeAll(toRemove);
  Files.write(out, lines, StandardCharsets.UTF_8);
}
/**
 * Filters the person-names dictionary, keeping only entries that are not already present in
 * the main Zemberek lexicons, and writes the sorted result to a reduced dictionary file.
 * Paths are hard-coded to a local development checkout.
 *
 * @throws IOException if either dictionary file cannot be read or written
 */
static void foobar() throws IOException {
  Path source = Paths
      .get("/home/aaa/projects/zemberek-nlp/morphology/src/main/resources/tr/person-names.dict");
  Path target = Paths
      .get(
          "/home/aaa/projects/zemberek-nlp/morphology/src/main/resources/tr/person-names-reduced.dict");
  List<String> sourceLines = Files.readAllLines(source);
  TurkishMorphology morphology = TurkishMorphology.create(
      RootLexicon.builder().addTextDictionaryResources(
          "tr/master-dictionary.dict",
          "tr/non-tdk.dict",
          "tr/proper.dict",
          "tr/proper-from-corpus.dict",
          "tr/abbreviations.dict").build());
  List<String> kept = new ArrayList<>();
  for (String line : sourceLines) {
    if (line.trim().length() == 0) {
      continue;
    }
    // Collapse runs of spaces before parsing the dictionary line.
    String normalized = line.replaceAll("[ ]+", " ").trim();
    DictionaryItem parsed = TurkishDictionaryLoader.loadFromString(normalized);
    if (!morphology.getLexicon().containsItem(parsed)) {
      kept.add(normalized.trim());
    }
  }
  kept.sort(Turkish.STRING_COMPARATOR_ASC);
  Files.write(target, kept);
}
/**
 * Generates inflected forms of the noun "armut" for every combination of a small set of
 * number, possessive and case morphemes, printing each generated surface form.
 */
public static void main(String[] args) {
  String[] numberMorphemes = {"A3sg", "A3pl"};
  String[] possessiveMorphemes = {"P1sg", "P2sg", "P3sg"};
  String[] caseMorphemes = {"Dat", "Loc", "Abl"};
  TurkishMorphology morphology =
      TurkishMorphology.builder().setLexicon("armut").disableCache().build();
  DictionaryItem item = morphology.getLexicon().getMatchingItems("armut").get(0);
  for (String num : numberMorphemes) {
    for (String poss : possessiveMorphemes) {
      for (String cas : caseMorphemes) {
        List<Result> results = morphology.getWordGenerator().generate(item, num, poss, cas);
        for (Result result : results) {
          System.out.println(result.surface);
        }
      }
    }
  }
}
/**
 * Round-trips the default lexicon through the protobuf {@code Dictionary} format and logs
 * size and timing information for each stage (serialize to temp file, read back, parse,
 * rebuild a {@code RootLexicon}).
 *
 * <p>Fix: the output stream is now opened in try-with-resources; the original leaked the
 * stream (and left the temp file handle open) if {@code write()} threw.
 *
 * @throws IOException if the temp file cannot be created, written or read
 */
private static void serializeDeserializeTest() throws IOException {
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  RootLexicon lexicon = morphology.getLexicon();

  Dictionary.Builder builder = Dictionary.newBuilder();
  for (DictionaryItem item : lexicon.getAllItems()) {
    builder.addItems(convertToProto(item));
  }
  Dictionary dictionary = builder.build();
  System.out.println("Total size of serialized dictionary: " + dictionary.getSerializedSize());

  Path f = Files.createTempFile("lexicon", ".bin");
  try (BufferedOutputStream bos =
      new BufferedOutputStream(new FileOutputStream(f.toFile()))) {
    bos.write(dictionary.toByteArray());
  }

  long start = System.currentTimeMillis();
  byte[] serialized = Files.readAllBytes(f);
  long end = System.currentTimeMillis();
  Log.info("Dictionary loaded in %d ms.", (end - start));

  start = System.currentTimeMillis();
  Dictionary readDictionary = Dictionary.parseFrom(serialized);
  end = System.currentTimeMillis();
  Log.info("Dictionary deserialized in %d ms.", (end - start));
  System.out.println("Total size of read dictionary: " + readDictionary.getSerializedSize());

  start = System.currentTimeMillis();
  RootLexicon loadedLexicon = new RootLexicon();
  for (LexiconProto.DictionaryItem item : readDictionary.getItemsList()) {
    loadedLexicon.add(convertToDictionaryItem(item));
  }
  end = System.currentTimeMillis();
  Log.info("RootLexicon generated in %d ms.", (end - start));
}
.setLexicon(morphology.getLexicon()) .useInformalAnalysis() .ignoreDiacriticsInAnalysis()
/**
 * Demonstrates stem replacement: analyzes "simidime", then regenerates the word with the
 * stem "poğaça" using the same morpheme sequence, logging each source analysis together
 * with the generated surface form and its analysis.
 */
public static void main(String[] args) {
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  DictionaryItem newStem = morphology.getLexicon().getMatchingItems("poğaça").get(0);
  String word = "simidime";
  Log.info("Input Word = " + word);
  for (SingleAnalysis analysis : morphology.analyze(word)) {
    List<Result> generated =
        morphology.getWordGenerator().generate(newStem, analysis.getMorphemes());
    generated.forEach(g -> {
      Log.info("Input analysis: " + analysis.formatLong());
      Log.info("After stem change, word = " + g.surface);
      Log.info("After stem change, Analysis = " + g.analysis.formatLong());
    });
  }
}
/**
 * Generates inflected forms of the noun "armut" for every combination of a small set of
 * number, possessive and case morphemes, printing each generated surface form.
 */
public static void main(String[] args) {
  String[] numbers = {"A3sg", "A3pl"};
  String[] possessives = {"P1sg", "P2sg", "P3sg"};
  String[] cases = {"Dat", "Loc", "Abl"};
  TurkishMorphology morphology =
      TurkishMorphology.builder().addDictionaryLines("armut").disableCache().build();
  DictionaryItem item = morphology.getLexicon().getMatchingItems("armut").get(0);
  for (String n : numbers) {
    for (String p : possessives) {
      for (String c : cases) {
        morphology.getWordGenerator()
            .generate(item, n, p, c)
            .forEach(r -> System.out.println(r.surface));
      }
    }
  }
}
/**
 * Registers the "/generate_word" HTTP POST endpoint. The endpoint reads a "word" parameter
 * (a dictionary line for the stem) and a "morphemes" parameter (morpheme ids separated by
 * '+', '|' or '→'), generates matching surface forms, and returns them as JSON.
 */
public void initializeController() {
  post("/generate_word", (req, res) -> {
    // NOTE(review): queryParams may return null if a parameter is missing; that would
    // surface as an NPE below — confirm the framework guarantees or add validation.
    String word = req.queryParams("word");
    String morphemes = req.queryParams("morphemes");
    // Normalize alternative separators to '+' so one split handles every input style.
    morphemes = morphemes.replace('|', '+');
    morphemes = morphemes.replace('→', '+');
    // Builds a throw-away morphology whose lexicon contains only the posted word;
    // caching is disabled since every request creates a fresh instance.
    TurkishMorphology morphology =
        TurkishMorphology.builder().addDictionaryLines(word).disableCache().build();
    // NOTE(review): assumes the posted word always yields at least one dictionary item;
    // get(0) throws IndexOutOfBoundsException otherwise — confirm expected behavior.
    DictionaryItem item = morphology.getLexicon().getMatchingItems(word).get(0);
    List<GenerateWordResult> wordResults = new ArrayList<>();
    List<Result> results =
        morphology.getWordGenerator().generate(item, morphemes.split("\\+"));
    for (Result generateResult : results) {
      GenerateWordResult wordResult = new GenerateWordResult();
      wordResult.word = word;
      // Lexical morpheme sequence, without surface forms.
      wordResult.no_surface = generateResult.analysis.formatMorphemesLexical();
      wordResult.surface = generateResult.surface;
      wordResult.analysis = generateResult.analysis.formatLong();
      wordResults.add(wordResult);
    }
    return jsonConverter.toJson(wordResults);
  });
}
}
/**
 * Demonstrates stem replacement: analyzes "simidime", then regenerates the word with the
 * stem "poğaça" using the same morpheme sequence, logging each source analysis together
 * with the generated surface form and its analysis.
 */
public static void main(String[] args) {
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  DictionaryItem replacement = morphology.getLexicon().getMatchingItems("poğaça").get(0);
  String input = "simidime";
  Log.info("Input Word = " + input);
  WordAnalysis analyses = morphology.analyze(input);
  for (SingleAnalysis analysis : analyses) {
    for (Result generated :
        morphology.getWordGenerator().generate(replacement, analysis.getMorphemes())) {
      Log.info("Input analysis: " + analysis.formatLong());
      Log.info("After stem change, word = " + generated.surface);
      Log.info("After stem change, Analysis = " + generated.analysis.formatLong());
    }
  }
}