/**
 * Creates a context that keeps the supplied configuration and initializes the
 * tokenizer with {@code TurkishTokenizer.ALL} and the morphology with
 * {@code TurkishMorphology.createWithDefaults()}.
 *
 * @param configuration configuration object stored on this context
 */
public ZemberekContext(ZemberekGrpcConfiguration configuration) {
  this.configuration = configuration;
  this.tokenizer = TurkishTokenizer.ALL;
  this.morphology = TurkishMorphology.createWithDefaults();
}
/**
 * Creates a context with the {@code TurkishTokenizer.ALL} tokenizer and a
 * morphology built by {@code TurkishMorphology.createWithDefaults()}.
 * No configuration is assigned by this constructor.
 */
public ZemberekContext() {
  this.tokenizer = TurkishTokenizer.ALL;
  this.morphology = TurkishMorphology.createWithDefaults();
}
/** Builds the single shared morphology instance using the library defaults. */
Singleton() {
  this.morphology = TurkishMorphology.createWithDefaults();
}
}
/**
 * Creates the statistics helper and initializes its parser with
 * {@code TurkishMorphology.createWithDefaults()}.
 *
 * @throws IOException declared by this constructor's signature
 */
public AmbiguityStats() throws IOException {
  this.parser = TurkishMorphology.createWithDefaults();
}
/**
 * Rewrites every input line through {@code replaceWordsWithLemma} and
 * {@code removeNonWords}, lower-cases it with the Turkish locale, and writes
 * the resulting lines to {@code lemmasPath}.
 *
 * <p>The {@code morphology} field is (re)assigned with a default instance
 * before the lines are processed.
 *
 * @param lines input lines to convert
 * @param lemmasPath file that receives the converted lines
 * @throws IOException if the output file cannot be written
 */
private void generateSetWithLemmas(List<String> lines, Path lemmasPath) throws IOException {
  morphology = TurkishMorphology.createWithDefaults();
  // Each line passes through the same three steps, in the same order, as one fused mapping.
  List<String> lemmas = lines.stream()
      .map(line -> removeNonWords(replaceWordsWithLemma(line)).toLowerCase(Turkish.LOCALE))
      .collect(Collectors.toList());
  Files.write(lemmasPath, lemmas);
}
static void saveUnambigious() throws IOException { Path goldTest = Paths.get("data/gold/gold-test.sentences"); //Path goldTest = Paths.get("data/gold/test.txt"); Path goldTestOut = Paths.get("data/gold/gold-test.txt"); TurkishMorphology morphology = TurkishMorphology.createWithDefaults(); saveUnambiguous(clean(Files.readAllLines(goldTest)), morphology, goldTestOut); }
/**
 * Collects the distinct lemmas of the default lexicon, sorts them with
 * {@code Turkish.STRING_COMPARATOR_ASC}, and writes them to
 * {@code zemberek.vocab}.
 *
 * <p>Items are skipped when they carry {@code RootAttribute.Dummy}, when their
 * lemma is shorter than {@code minLength}, or when their primary POS is
 * {@code Punctuation}.
 *
 * @param minLength minimum lemma length (in {@code String.length()} units) to keep
 * @throws IOException if the vocabulary file cannot be written
 */
public static void saveLemmas(int minLength) throws IOException {
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  Set<String> uniqueLemmas = new HashSet<>();
  for (DictionaryItem entry : morphology.getLexicon()) {
    boolean skip = entry.attributes.contains(RootAttribute.Dummy)
        || entry.lemma.length() < minLength
        || entry.primaryPos == PrimaryPos.Punctuation;
    if (!skip) {
      uniqueLemmas.add(entry.lemma);
    }
  }
  List<String> sorted = new ArrayList<>(uniqueLemmas);
  sorted.sort(Turkish.STRING_COMPARATOR_ASC);
  Files.write(Paths.get("zemberek.vocab"), sorted);
}
/**
 * Copies to {@code outFile} only those words of {@code vocabFile} whose
 * morphological analysis reports {@code isCorrect()}; every rejected word is
 * logged as a warning. Both files are handled as UTF-8.
 *
 * @param vocabFile input file with one word per line
 * @param outFile output file receiving the accepted words, in input order
 * @throws IOException if reading or writing fails
 */
private static void filterVocab(Path vocabFile, Path outFile) throws IOException {
  List<String> candidates = Files.readAllLines(vocabFile, StandardCharsets.UTF_8);
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  List<String> accepted = new ArrayList<>();
  for (String candidate : candidates) {
    WordAnalysis wordAnalysis = morphology.analyze(candidate);
    if (wordAnalysis.isCorrect()) {
      accepted.add(candidate);
    } else {
      Log.warn("Cannot analyze %s", candidate);
    }
  }
  Files.write(outFile, accepted, StandardCharsets.UTF_8);
}
public static void main(String[] args) throws IOException { // assumes you generated a model in my-model directory. Path modelRoot = Paths.get("my-model"); TurkishMorphology morphology = TurkishMorphology.createWithDefaults(); PerceptronNer ner = PerceptronNer.loadModel(modelRoot, morphology); String sentence = "Ali Kaan yarın İstanbul'a gidecek."; NerSentence result = ner.findNamedEntities(sentence); List<NamedEntity> namedEntities = result.getNamedEntities(); for (NamedEntity namedEntity : namedEntities) { System.out.println(namedEntity); } }
/**
 * Opens the corpus index at a fixed local path, wraps it in an
 * {@code AmbiguousExampleFinder}, and runs {@code extractSentences} with a
 * default morphology.
 *
 * @param args unused
 * @throws Exception propagated from index access or sentence extraction
 */
public static void main(String[] args) throws Exception {
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  CorpusSearcher searcher =
      new CorpusSearcher(Paths.get("/home/aaa/data/zemberek/corpus-index"));
  extractSentences(morphology, new AmbiguousExampleFinder(searcher));
}
/**
 * Builds a {@code DistanceBasedStemmer} from a binary word-vector file and a
 * binary distance list, paired with a default morphology.
 *
 * @param vector binary file read with {@code WordVector.loadFromBinary}
 * @param distances binary file read with {@code DistanceList.readFromBinary}
 * @param vocabFile vocabulary file passed along to the distance list reader
 * @return a stemmer holding the word-to-vector map, the distance list and the morphology
 * @throws IOException if any of the files cannot be read
 */
public static DistanceBasedStemmer load(Path vector, Path distances, Path vocabFile)
    throws IOException {
  Log.info("Loading vector file.");
  List<WordVector> vectors = WordVector.loadFromBinary(vector);
  // Index vectors by their word; a later duplicate word replaces the earlier entry.
  Map<String, WordVector> vectorsByWord = new HashMap<>(vectors.size());
  vectors.forEach(v -> vectorsByWord.put(v.word, v));

  Log.info("Loading distances.");
  DistanceList distanceList = DistanceList.readFromBinary(distances, vocabFile);

  return new DistanceBasedStemmer(
      vectorsByWord, distanceList, TurkishMorphology.createWithDefaults());
}
public static void main(String[] args) throws IOException { // you will need ner-train and ner-test files to run this example. Path trainPath = Paths.get("ner-train"); Path testPath = Paths.get("ner-test"); Path modelRoot = Paths.get("my-model"); NerDataSet trainingSet = NerDataSet.load(trainPath, AnnotationStyle.BRACKET); Log.info(trainingSet.info()); // prints information NerDataSet testSet = NerDataSet.load(testPath, AnnotationStyle.BRACKET); Log.info(testSet.info()); TurkishMorphology morphology = TurkishMorphology.createWithDefaults(); // Training occurs here. Result is a PerceptronNer instance. // There will be 7 iterations with 0.1 learning rate. PerceptronNer ner = new PerceptronNerTrainer(morphology) .train(trainingSet, testSet, 13, 0.1f); Files.createDirectories(modelRoot); ner.saveModelAsText(modelRoot); }
/**
 * Analyzes and disambiguates a sample sentence, then logs each input word
 * together with the primary POS of its best analysis.
 *
 * @param args unused
 */
public static void main(String[] args) {
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();

  String sentence = "Keşke yarın hava güzel olsa.";
  Log.info("Sentence = " + sentence);

  SentenceAnalysis disambiguated = morphology.analyzeAndDisambiguate(sentence);
  for (SentenceWordAnalysis wordResult : disambiguated) {
    PrimaryPos pos = wordResult.getBestAnalysis().getPos();
    String input = wordResult.getWordAnalysis().getInput();
    Log.info("%s : %s ", input, pos);
  }
}
/**
 * Logs every analysis of every word in a sample sentence, then runs
 * disambiguation and logs the best analysis chosen for each word.
 *
 * @param args unused
 */
public static void main(String[] args) {
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();

  String sentence = "Bol baharatlı bir yemek yaptıralım.";
  Log.info("Sentence = " + sentence);

  // First pass: all candidate analyses per word.
  List<WordAnalysis> wordAnalyses = morphology.analyzeSentence(sentence);
  Log.info("Sentence word analysis result:");
  for (WordAnalysis wordAnalysis : wordAnalyses) {
    Log.info("Word = " + wordAnalysis.getInput());
    for (SingleAnalysis candidate : wordAnalysis) {
      Log.info(candidate.formatLong());
    }
  }

  // Second pass: pick one analysis per word.
  SentenceAnalysis disambiguated = morphology.disambiguate(sentence, wordAnalyses);
  Log.info("\nAfter ambiguity resolution : ");
  disambiguated.bestAnalysis().forEach(Log::info);
}
}
/**
 * Trains a perceptron NER model on the training set, saves it as text, then
 * evaluates it on the test set and writes an evaluation report.
 *
 * @param trainPath training data in bracket annotation style
 * @param testPath test data in bracket annotation style
 * @param modelRoot directory (created if missing) where the model is saved
 * @param reportPath destination passed to the evaluation report writer
 * @throws IOException if data cannot be read or results cannot be written
 */
public static void trainAndTest(
    Path trainPath, Path testPath, Path modelRoot, Path reportPath) throws IOException {

  NerDataSet trainingSet = NerDataSet.load(trainPath, AnnotationStyle.BRACKET);
  Log.info(trainingSet.info());

  NerDataSet testSet = NerDataSet.load(testPath, AnnotationStyle.BRACKET);
  Log.info(testSet.info());

  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();

  // Train with 7 iterations and a 0.1 learning rate; the test set is the second argument.
  PerceptronNerTrainer trainer = new PerceptronNerTrainer(morphology);
  PerceptronNer ner = trainer.train(trainingSet, testSet, 7, 0.1f);

  Files.createDirectories(modelRoot);
  ner.saveModelAsText(modelRoot);

  Log.info("Testing %d sentences.", testSet.sentences.size());
  NerDataSet predicted = ner.evaluate(testSet);
  PerceptronNerTrainer.evaluationReport(testSet, predicted, reportPath);
  Log.info("Done.");
}
/**
 * Analyzes a sample word and logs, for each analysis, its long format along
 * with its stems and lemmas.
 *
 * @param args unused
 */
public static void main(String[] args) {
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();

  String word = "kutucuğumuz";
  Log.info("Word = " + word);
  Log.info("Results: ");

  WordAnalysis analyses = morphology.analyze(word);
  for (SingleAnalysis analysis : analyses) {
    Log.info(analysis.formatLong());
    Log.info("\tStems = " + analysis.getStems());
    Log.info("\tLemmas = " + analysis.getLemmas());
  }
}
/**
 * Demonstrates the spell checker: logs whether a few sample words are
 * accepted by {@code check}, then logs the suggestions produced by
 * {@code suggestForWord} for a few misspelled words.
 *
 * @param args unused
 * @throws IOException if the spell checker cannot be created
 */
public static void main(String[] args) throws IOException {
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  TurkishSpellChecker spellChecker = new TurkishSpellChecker(morphology);

  Log.info("Check if written correctly.");
  String[] toCheck = {"Ankara'ya", "Ankar'aya", "yapbileceksen", "yapabileceğinizden"};
  for (String candidate : toCheck) {
    Log.info(candidate + " -> " + spellChecker.check(candidate));
  }
  Log.info();

  Log.info("Give suggestions.");
  String[] misspelled = {"Kraamanda", "okumuştk", "yapbileceksen", "oukyamıyorum"};
  for (String candidate : misspelled) {
    Log.info(candidate + " -> " + spellChecker.suggestForWord(candidate));
  }
}
}
/**
 * Analyzes a sample word and logs each analysis in three formats: long
 * (lexical and surface), lexical only, and Oflazer style.
 *
 * @param args unused
 */
public static void main(String[] args) {
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();

  String word = "kalemi";
  Log.info("Word = " + word);

  WordAnalysis analyses = morphology.analyze(word);
  for (SingleAnalysis analysis : analyses) {
    Log.info("Lexical and Surface : " + analysis.formatLong());
    Log.info("Only Lexical : " + analysis.formatLexical());
    Log.info("Oflazer style : " + AnalysisFormatters.OFLAZER_STYLE.format(analysis));
    Log.info(); // separator between analyses
  }
}
/**
 * Analyzes the word "simidime" and, for each analysis, regenerates the word
 * with the first lexicon item matching "poğaça" as the stem while reusing the
 * morphemes of the analysis. The input analysis and every generated result
 * are logged.
 *
 * @param args unused
 */
public static void main(String[] args) {
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  // The first matching lexicon item is used as the replacement stem.
  DictionaryItem replacementStem = morphology.getLexicon().getMatchingItems("poğaça").get(0);

  String word = "simidime";
  Log.info("Input Word = " + word);

  WordAnalysis analyses = morphology.analyze(word);
  for (SingleAnalysis analysis : analyses) {
    List<Result> regenerated =
        morphology.getWordGenerator().generate(replacementStem, analysis.getMorphemes());
    for (Result generated : regenerated) {
      Log.info("Input analysis: " + analysis.formatLong());
      Log.info("After stem change, word = " + generated.surface);
      Log.info("After stem change, Analysis = " + generated.analysis.formatLong());
    }
  }
}
/**
 * Runs three {@code AddNewDictionaryItem.test} scenarios against a default
 * morphology: two proper-noun items and one verb item.
 *
 * @param args unused
 * @throws IOException declared by this method's signature
 */
public static void main(String[] args) throws IOException {
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  AddNewDictionaryItem app = new AddNewDictionaryItem(morphology);

  Log.info("Proper Noun Test - 1 :");
  DictionaryItem meydan = new DictionaryItem(
      "Meydan", "meydan", "meydan", PrimaryPos.Noun, SecondaryPos.ProperNoun);
  app.test("Meydan'a", meydan);
  Log.info("----");

  Log.info("Proper Noun Test - 2 :");
  DictionaryItem meeeydan = new DictionaryItem(
      "Meeeydan", "meeeydan", "meeeydan", PrimaryPos.Noun, SecondaryPos.ProperNoun);
  app.test("Meeeydan'a", meeeydan);
  Log.info("----");

  Log.info("Verb Test : ");
  DictionaryItem tweetlemek = new DictionaryItem(
      "tweetlemek", "tweetle", "tivitle", PrimaryPos.Verb, SecondaryPos.None);
  app.test("tweetleyeyazdım", tweetlemek);
}