/**
 * Collects lemmas from the default Turkish lexicon and writes them, sorted with the
 * Turkish collation order, to the file "zemberek.vocab".
 * Dummy root items, punctuation items and lemmas shorter than {@code minLength} are skipped.
 *
 * @param minLength minimum lemma length (in characters) required for inclusion
 * @throws IOException if the output file cannot be written
 */
public static void saveLemmas(int minLength) throws IOException {
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  Set<String> lemmas = new HashSet<>();
  for (DictionaryItem item : morphology.getLexicon()) {
    String lemma = item.lemma;
    boolean skip =
        item.attributes.contains(RootAttribute.Dummy)
            || lemma.length() < minLength
            || item.primaryPos == PrimaryPos.Punctuation;
    if (!skip) {
      lemmas.add(lemma);
    }
  }
  List<String> sorted = new ArrayList<>(lemmas);
  sorted.sort(Turkish.STRING_COMPARATOR_ASC);
  Files.write(Paths.get("zemberek.vocab"), sorted);
}
/**
 * Builds a stop-word set from the default lexicon by collecting lemmas whose primary POS
 * belongs to a fixed set of closed word classes (adverbs, conjunctions, determiners,
 * interjections, postpositives, numerals, pronouns and question words).
 * The resulting set preserves Turkish sort order via a LinkedHashSet.
 *
 * @throws IOException if the default lexicon resources cannot be loaded
 */
static TurkishStopWords generateFromDictionary() throws IOException {
  Set<PrimaryPos> stopPos = Sets.newHashSet(
      PrimaryPos.Adverb,
      PrimaryPos.Conjunction,
      PrimaryPos.Determiner,
      PrimaryPos.Interjection,
      PrimaryPos.PostPositive,
      PrimaryPos.Numeral,
      PrimaryPos.Pronoun,
      PrimaryPos.Question);
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  Set<String> lemmas = new HashSet<>();
  for (DictionaryItem item : morphology.getLexicon()) {
    if (stopPos.contains(item.primaryPos)) {
      lemmas.add(item.lemma);
    }
  }
  List<String> sorted = new ArrayList<>(lemmas);
  sorted.sort(Turkish.STRING_COMPARATOR_ASC);
  return new TurkishStopWords(new LinkedHashSet<>(sorted));
}
/**
 * Writes all "regular" lemmas from the default lexicon to "zemberek.vocab", sorted with the
 * Turkish collation order. Dummy root items and punctuation are excluded. For lemmas that
 * contain circumflexed characters, the circumflex-normalized form is added as well.
 *
 * @throws IOException if the output file cannot be written
 */
public static void saveRegular() throws IOException {
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  TurkishAlphabet alphabet = TurkishAlphabet.INSTANCE;
  Set<String> lemmas = new HashSet<>();
  for (DictionaryItem item : morphology.getLexicon()) {
    String lemma = item.lemma;
    if (item.attributes.contains(RootAttribute.Dummy)) {
      continue;
    }
    // Proper noun / abbreviation filtering was considered but is intentionally disabled.
    if (item.primaryPos == PrimaryPos.Punctuation
        /*|| item.secondaryPos == SecondaryPos.ProperNoun
          || item.secondaryPos == SecondaryPos.Abbreviation*/) {
      continue;
    }
    lemmas.add(lemma);
    if (alphabet.containsCircumflex(lemma)) {
      lemmas.add(alphabet.normalizeCircumflex(lemma));
    }
  }
  List<String> sorted = new ArrayList<>(lemmas);
  sorted.sort(Turkish.STRING_COMPARATOR_ASC);
  Files.write(Paths.get("zemberek.vocab"), sorted);
}
/**
 * Removes from the word list in {@code input} every line that also appears as a lemma in the
 * Zemberek text dictionaries, then writes the surviving lines to {@code out} in UTF-8,
 * preserving their original order.
 *
 * <p>Fix: the original wrote through {@code PrintWriter}, which swallows write errors
 * (they are only visible via {@code checkError()}), and used the legacy
 * {@code PrintWriter(File, String)} charset constructor. {@code Files.write} propagates
 * I/O failures as {@link IOException}.
 *
 * @param input UTF-8 text file with one word per line
 * @param out   destination file; overwritten if it exists
 * @throws IOException if either file cannot be read or written
 */
private static void removeZemberekDictionaryWordsFromList(Path input, Path out)
    throws IOException {
  // LinkedHashSet: de-duplicates while keeping first-seen order for the final output.
  LinkedHashSet<String> lines = new LinkedHashSet<>(
      Files.readAllLines(input, StandardCharsets.UTF_8));
  System.out.println("Total amount of lines = " + lines.size());
  TurkishMorphology morphology = TurkishMorphology.create(
      RootLexicon.builder().addTextDictionaryResources(
          "tr/master-dictionary.dict",
          "tr/non-tdk.dict",
          "tr/proper.dict",
          "tr/proper-from-corpus.dict",
          "tr/abbreviations.dict"
      ).build());
  List<String> toRemove = new ArrayList<>();
  for (DictionaryItem item : morphology.getLexicon()) {
    if (lines.contains(item.lemma)) {
      toRemove.add(item.lemma);
    }
  }
  System.out.println("Total amount to remove = " + toRemove.size());
  lines.removeAll(toRemove);
  Files.write(out, lines, StandardCharsets.UTF_8);
}
/**
 * Filters the person-names dictionary, keeping only entries that are not already present in
 * the main Zemberek lexicons, and writes the sorted result to a reduced dictionary file.
 * Paths are hard-coded to a local development checkout.
 *
 * @throws IOException if either dictionary file cannot be read or written
 */
static void foobar() throws IOException {
  Path source = Paths
      .get("/home/aaa/projects/zemberek-nlp/morphology/src/main/resources/tr/person-names.dict");
  Path target = Paths
      .get(
          "/home/aaa/projects/zemberek-nlp/morphology/src/main/resources/tr/person-names-reduced.dict");
  List<String> sourceLines = Files.readAllLines(source);
  TurkishMorphology morphology = TurkishMorphology.create(
      RootLexicon.builder().addTextDictionaryResources(
          "tr/master-dictionary.dict",
          "tr/non-tdk.dict",
          "tr/proper.dict",
          "tr/proper-from-corpus.dict",
          "tr/abbreviations.dict").build());
  List<String> kept = new ArrayList<>();
  for (String line : sourceLines) {
    if (line.trim().length() == 0) {
      continue;
    }
    // Collapse runs of spaces before parsing the dictionary line.
    String normalized = line.replaceAll("[ ]+", " ").trim();
    DictionaryItem parsed = TurkishDictionaryLoader.loadFromString(normalized);
    if (!morphology.getLexicon().containsItem(parsed)) {
      kept.add(normalized.trim());
    }
  }
  kept.sort(Turkish.STRING_COMPARATOR_ASC);
  Files.write(target, kept);
}
/**
 * Generates inflected forms of the noun "armut" for every combination of a small set of
 * number, possessive and case morphemes, printing each generated surface form.
 */
public static void main(String[] args) {
  String[] numberMorphemes = {"A3sg", "A3pl"};
  String[] possessiveMorphemes = {"P1sg", "P2sg", "P3sg"};
  String[] caseMorphemes = {"Dat", "Loc", "Abl"};
  TurkishMorphology morphology =
      TurkishMorphology.builder().setLexicon("armut").disableCache().build();
  DictionaryItem item = morphology.getLexicon().getMatchingItems("armut").get(0);
  for (String num : numberMorphemes) {
    for (String poss : possessiveMorphemes) {
      for (String cas : caseMorphemes) {
        List<Result> results = morphology.getWordGenerator().generate(item, num, poss, cas);
        for (Result result : results) {
          System.out.println(result.surface);
        }
      }
    }
  }
}
/**
 * Round-trips the default lexicon through the protobuf {@code Dictionary} format and logs
 * size and timing information for each stage (serialize to temp file, read back, parse,
 * rebuild a {@code RootLexicon}).
 *
 * <p>Fix: the output stream is now opened in try-with-resources; the original leaked the
 * stream (and left the temp file handle open) if {@code write()} threw.
 *
 * @throws IOException if the temp file cannot be created, written or read
 */
private static void serializeDeserializeTest() throws IOException {
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  RootLexicon lexicon = morphology.getLexicon();

  Dictionary.Builder builder = Dictionary.newBuilder();
  for (DictionaryItem item : lexicon.getAllItems()) {
    builder.addItems(convertToProto(item));
  }
  Dictionary dictionary = builder.build();
  System.out.println("Total size of serialized dictionary: " + dictionary.getSerializedSize());

  Path f = Files.createTempFile("lexicon", ".bin");
  try (BufferedOutputStream bos =
      new BufferedOutputStream(new FileOutputStream(f.toFile()))) {
    bos.write(dictionary.toByteArray());
  }

  long start = System.currentTimeMillis();
  byte[] serialized = Files.readAllBytes(f);
  long end = System.currentTimeMillis();
  Log.info("Dictionary loaded in %d ms.", (end - start));

  start = System.currentTimeMillis();
  Dictionary readDictionary = Dictionary.parseFrom(serialized);
  end = System.currentTimeMillis();
  Log.info("Dictionary deserialized in %d ms.", (end - start));
  System.out.println("Total size of read dictionary: " + readDictionary.getSerializedSize());

  start = System.currentTimeMillis();
  RootLexicon loadedLexicon = new RootLexicon();
  for (LexiconProto.DictionaryItem item : readDictionary.getItemsList()) {
    loadedLexicon.add(convertToDictionaryItem(item));
  }
  end = System.currentTimeMillis();
  Log.info("RootLexicon generated in %d ms.", (end - start));
}
.setLexicon(morphology.getLexicon()) .useInformalAnalysis() .ignoreDiacriticsInAnalysis()
/**
 * Demonstrates stem replacement: analyzes "simidime", then regenerates the word with the
 * stem "poğaça" using the same morpheme sequence, logging each source analysis together
 * with the generated surface form and its analysis.
 */
public static void main(String[] args) {
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  DictionaryItem newStem = morphology.getLexicon().getMatchingItems("poğaça").get(0);
  String word = "simidime";
  Log.info("Input Word = " + word);
  for (SingleAnalysis analysis : morphology.analyze(word)) {
    List<Result> generated =
        morphology.getWordGenerator().generate(newStem, analysis.getMorphemes());
    generated.forEach(g -> {
      Log.info("Input analysis: " + analysis.formatLong());
      Log.info("After stem change, word = " + g.surface);
      Log.info("After stem change, Analysis = " + g.analysis.formatLong());
    });
  }
}
/**
 * Generates inflected forms of the noun "armut" for every combination of a small set of
 * number, possessive and case morphemes, printing each generated surface form.
 */
public static void main(String[] args) {
  String[] numbers = {"A3sg", "A3pl"};
  String[] possessives = {"P1sg", "P2sg", "P3sg"};
  String[] cases = {"Dat", "Loc", "Abl"};
  TurkishMorphology morphology =
      TurkishMorphology.builder().addDictionaryLines("armut").disableCache().build();
  DictionaryItem item = morphology.getLexicon().getMatchingItems("armut").get(0);
  for (String n : numbers) {
    for (String p : possessives) {
      for (String c : cases) {
        morphology.getWordGenerator()
            .generate(item, n, p, c)
            .forEach(r -> System.out.println(r.surface));
      }
    }
  }
}
/**
 * Registers the "/generate_word" HTTP POST endpoint. The endpoint reads a "word" parameter
 * (a dictionary line for the stem) and a "morphemes" parameter (morpheme ids separated by
 * '+', '|' or '→'), generates matching surface forms, and returns them as JSON.
 */
public void initializeController() {
  post("/generate_word", (req, res) -> {
    // NOTE(review): queryParams may return null if a parameter is missing; that would
    // surface as an NPE below — confirm the framework guarantees or add validation.
    String word = req.queryParams("word");
    String morphemes = req.queryParams("morphemes");
    // Normalize alternative separators to '+' so one split handles every input style.
    morphemes = morphemes.replace('|', '+');
    morphemes = morphemes.replace('→', '+');
    // Builds a throw-away morphology whose lexicon contains only the posted word;
    // caching is disabled since every request creates a fresh instance.
    TurkishMorphology morphology =
        TurkishMorphology.builder().addDictionaryLines(word).disableCache().build();
    // NOTE(review): assumes the posted word always yields at least one dictionary item;
    // get(0) throws IndexOutOfBoundsException otherwise — confirm expected behavior.
    DictionaryItem item = morphology.getLexicon().getMatchingItems(word).get(0);
    List<GenerateWordResult> wordResults = new ArrayList<>();
    List<Result> results =
        morphology.getWordGenerator().generate(item, morphemes.split("\\+"));
    for (Result generateResult : results) {
      GenerateWordResult wordResult = new GenerateWordResult();
      wordResult.word = word;
      // Lexical morpheme sequence, without surface forms.
      wordResult.no_surface = generateResult.analysis.formatMorphemesLexical();
      wordResult.surface = generateResult.surface;
      wordResult.analysis = generateResult.analysis.formatLong();
      wordResults.add(wordResult);
    }
    return jsonConverter.toJson(wordResults);
  });
}
}
/**
 * Demonstrates stem replacement: analyzes "simidime", then regenerates the word with the
 * stem "poğaça" using the same morpheme sequence, logging each source analysis together
 * with the generated surface form and its analysis.
 */
public static void main(String[] args) {
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  DictionaryItem replacement = morphology.getLexicon().getMatchingItems("poğaça").get(0);
  String input = "simidime";
  Log.info("Input Word = " + input);
  WordAnalysis analyses = morphology.analyze(input);
  for (SingleAnalysis analysis : analyses) {
    for (Result generated :
        morphology.getWordGenerator().generate(replacement, analysis.getMorphemes())) {
      Log.info("Input analysis: " + analysis.formatLong());
      Log.info("After stem change, word = " + generated.surface);
      Log.info("After stem change, Analysis = " + generated.analysis.formatLong());
    }
  }
}