/**
 * Lemmatizes every token using the part-of-speech tag found at the same index.
 *
 * @param tokens
 *          the tokens to lemmatize
 * @param postags
 *          the tags, read at the same indices as {@code tokens}
 * @return a new array with one lemma per token, in token order
 */
public String[] lemmatize(final String[] tokens, final String[] postags) {
  final String[] result = new String[tokens.length];
  for (int position = 0; position < result.length; position++) {
    // Single-token lookup; the overload taking two Strings does the dictionary access.
    result[position] = this.lemmatize(tokens[position], postags[position]);
  }
  return result;
}
/**
 * Builds the lemmatizer shared by the tests from the multi-lemma dictionary
 * found on the classpath.
 *
 * @throws Exception
 *           if the lemmatizer cannot be constructed from the resource
 */
@BeforeClass
public static void loadDictionary() throws Exception {
  // try-with-resources so the resource stream is closed once the lemmatizer
  // has been constructed (and also if construction fails). The type is
  // fully qualified to avoid depending on an import.
  try (java.io.InputStream dictionaryStream =
      DictionaryLemmatizerTest.class.getResourceAsStream(
          "/opennlp/tools/lemmatizer/smalldictionarymulti.dict")) {
    dictionaryLemmatizer = new DictionaryLemmatizer(dictionaryStream);
  }
}
/**
 * Collects, for every token, all lemmas returned by {@code getAllLemmas} for
 * the token and the tag stored at the same index.
 *
 * @param tokens
 *          the tokens to lemmatize
 * @param posTags
 *          the tags, read at the same indices as {@code tokens}
 * @return one list of lemmas per token, in token order
 */
public List<List<String>> lemmatize(final List<String> tokens, final List<String> posTags) {
  final List<List<String>> lemmasPerToken = new ArrayList<>();
  int index = 0;
  while (index < tokens.size()) {
    final String token = tokens.get(index);
    final String tag = posTags.get(index);
    lemmasPerToken.add(this.getAllLemmas(token, tag));
    index++;
  }
  return lemmasPerToken;
}
/**
 * Creates a lemmatizer backed by a tab separated dictionary.
 *
 * Each line of the input is expected to read {@code word<TAB>postag<TAB>lemma}.
 * When a word,postag pair has several possible lemmas, they are listed in the
 * third column separated by {@code #}:
 * {@code word<TAB>postag<TAB>lemma01#lemma02#lemma03}.
 *
 * @param dictionary
 *          the input dictionary via inputstream; it is handed to {@code init}
 * @throws IOException
 *           if {@code init} fails with one
 */
public DictionaryLemmatizer(final InputStream dictionary) throws IOException {
  this.init(dictionary);
}
/** * Lookup lemma in a dictionary. Outputs "O" if not found. * * @param word * the token * @param postag * the postag * @return the lemma */ private String lemmatize(final String word, final String postag) { String lemma; final List<String> keys = this.getDictKeys(word, postag); // lookup lemma as value of the map final List<String> keyValues = this.dictMap.get(keys); if ( keyValues != null && !keyValues.isEmpty()) { lemma = keyValues.get(0); } else { lemma = "O"; } return lemma; }
/**
 * Creates a lemmatizer from a dictionary file. The file is opened, passed to
 * {@code init} as a stream, and the stream is closed before the constructor
 * returns.
 *
 * @param dictionaryFile
 *          the dictionary file to read
 * @throws IOException
 *           if the file cannot be opened or {@code init} fails with one
 */
public DictionaryLemmatizer(File dictionaryFile) throws IOException {
  try (InputStream dictionaryStream = new FileInputStream(dictionaryFile)) {
    this.init(dictionaryStream);
  }
}
/** * Lookup every lemma for a word,pos tag in a dictionary. Outputs "O" if not * found. * * @param word * the token * @param postag * the postag * @return every lemma */ private List<String> getAllLemmas(final String word, final String postag) { List<String> lemmasList = new ArrayList<>(); final List<String> keys = this.getDictKeys(word, postag); // lookup lemma as value of the map final List<String> keyValues = this.dictMap.get(keys); if (keyValues != null && !keyValues.isEmpty()) { lemmasList.addAll(keyValues); } else { lemmasList.add("O"); } return lemmasList; } }
@Test public void testForNullPointerException() { List<String> sentence = Arrays.asList("The","dogs","were","running","and","barking", "down","the","street"); List<String> sentencePOS = Arrays.asList("DT","NNS","VBD","VBG","CC","VBG","RP","DT","NN"); List<List<String>> expectedLemmas = new ArrayList<>(); expectedLemmas.add(Arrays.asList("the")); expectedLemmas.add(Arrays.asList("dog")); expectedLemmas.add(Arrays.asList("is")); expectedLemmas.add(Arrays.asList("run,run")); expectedLemmas.add(Arrays.asList("and")); expectedLemmas.add(Arrays.asList("bark,bark")); expectedLemmas.add(Arrays.asList("down")); expectedLemmas.add(Arrays.asList("the")); expectedLemmas.add(Arrays.asList("street")); List<List<String>> actualLemmas = dictionaryLemmatizer.lemmatize(sentence, sentencePOS); for (int i = 0; i < sentence.size(); i++) { // don't compare cases where the word is not in the dictionary... if (!actualLemmas.get(0).get(0).equals("O")) Assert.assertEquals(expectedLemmas.get(i), actualLemmas.get(i)); } }
/**
 * Creates a lemmatizer backed by a tab separated dictionary.
 *
 * Each line of the input is expected to read {@code word<TAB>postag<TAB>lemma}.
 * When a word,postag pair has several possible lemmas, they are listed in the
 * third column separated by {@code #}:
 * {@code word<TAB>postag<TAB>lemma01#lemma02#lemma03}.
 *
 * @param dictionary
 *          the input dictionary via inputstream; it is handed to {@code init}
 * @throws IOException
 *           if {@code init} fails with one
 */
public DictionaryLemmatizer(final InputStream dictionary) throws IOException {
  this.init(dictionary);
}
/** * Lookup lemma in a dictionary. Outputs "O" if not found. * * @param word * the token * @param postag * the postag * @return the lemma */ private String lemmatize(final String word, final String postag) { String lemma; final List<String> keys = this.getDictKeys(word, postag); // lookup lemma as value of the map final List<String> keyValues = this.dictMap.get(keys); if ( keyValues != null && !keyValues.isEmpty()) { lemma = keyValues.get(0); } else { lemma = "O"; } return lemma; }
/**
 * Builds the lemmatizer shared by the tests from the small dictionary found
 * on the classpath.
 *
 * @throws Exception
 *           if the lemmatizer cannot be constructed from the resource
 */
@BeforeClass
public static void loadDictionary() throws Exception {
  // try-with-resources so the resource stream is closed once the lemmatizer
  // has been constructed (and also if construction fails). The type is
  // fully qualified to avoid depending on an import.
  try (java.io.InputStream dictionaryStream =
      DictionaryLemmatizerTest.class.getResourceAsStream(
          "/opennlp/tools/lemmatizer/smalldictionary.dict")) {
    dictionaryLemmatizer = new DictionaryLemmatizer(dictionaryStream);
  }
}
/**
 * Collects, for every token, all lemmas returned by {@code getAllLemmas} for
 * the token and the tag stored at the same index.
 *
 * @param tokens
 *          the tokens to lemmatize
 * @param posTags
 *          the tags, read at the same indices as {@code tokens}
 * @return one list of lemmas per token, in token order
 */
public List<List<String>> lemmatize(final List<String> tokens, final List<String> posTags) {
  final List<List<String>> lemmasPerToken = new ArrayList<>();
  int index = 0;
  while (index < tokens.size()) {
    final String token = tokens.get(index);
    final String tag = posTags.get(index);
    lemmasPerToken.add(this.getAllLemmas(token, tag));
    index++;
  }
  return lemmasPerToken;
}
@Test public void testForNullPointerException() { String[] sentence = new String[]{"The","dogs","were","running","and","barking","down","the","street"}; String[] sentencePOS = new String[]{"DT","NNS","VBD","VBG","CC","VBG","RP","DT","NN"}; String[] expectedLemma = new String[]{"the","dog","is","run","and","bark","down","the","street"}; String[] actualLemma = dictionaryLemmatizer.lemmatize(sentence, sentencePOS); for (int i = 0;i < sentence.length;i++) { // don't compare cases where the word is not in the dictionary... if (!actualLemma[i].equals("O")) Assert.assertEquals(expectedLemma[i], actualLemma[i]); } }
/**
 * Creates a lemmatizer backed by a tab separated dictionary.
 *
 * Each line of the input is expected to read {@code word<TAB>postag<TAB>lemma}.
 * When a word,postag pair has several possible lemmas, they are listed in the
 * third column separated by {@code #}:
 * {@code word<TAB>postag<TAB>lemma01#lemma02#lemma03}.
 *
 * @param dictionary
 *          the input dictionary via inputstream; it is handed to {@code init}
 * @throws IOException
 *           if {@code init} fails with one
 */
public DictionaryLemmatizer(final InputStream dictionary) throws IOException {
  this.init(dictionary);
}
/** * Lookup lemma in a dictionary. Outputs "O" if not found. * * @param word * the token * @param postag * the postag * @return the lemma */ private String lemmatize(final String word, final String postag) { String lemma; final List<String> keys = this.getDictKeys(word, postag); // lookup lemma as value of the map final List<String> keyValues = this.dictMap.get(keys); if ( keyValues != null && !keyValues.isEmpty()) { lemma = keyValues.get(0); } else { lemma = "O"; } return lemma; }
/**
 * Creates a new lemmatizer that uses a dictionary.
 *
 * The dictionary file is opened, handed to a {@link DictionaryLemmatizer} as a
 * stream, and the stream is closed before this constructor returns, including
 * when building the dictionary lemmatizer fails.
 *
 * @param dictionary The full path to the dictionary file.
 * @throws IOException Thrown if the dictionary cannot be opened.
 */
public DefaultLemmatizer(String dictionary) throws IOException {
  isModelBased = false;
  // try-with-resources: previously the stream leaked when the
  // DictionaryLemmatizer constructor threw, because close() was only reached
  // on the success path.
  try (InputStream dictLemmatizer = new FileInputStream(dictionary)) {
    lemmatizer = new DictionaryLemmatizer(dictLemmatizer);
  }
}
/**
 * Collects, for every token, all lemmas returned by {@code getAllLemmas} for
 * the token and the tag stored at the same index.
 *
 * @param tokens
 *          the tokens to lemmatize
 * @param posTags
 *          the tags, read at the same indices as {@code tokens}
 * @return one list of lemmas per token, in token order
 */
public List<List<String>> lemmatize(final List<String> tokens, final List<String> posTags) {
  final List<List<String>> lemmasPerToken = new ArrayList<>();
  int index = 0;
  while (index < tokens.size()) {
    final String token = tokens.get(index);
    final String tag = posTags.get(index);
    lemmasPerToken.add(this.getAllLemmas(token, tag));
    index++;
  }
  return lemmasPerToken;
}
/**
 * Lemmatizes every token using the part-of-speech tag found at the same index.
 *
 * @param tokens
 *          the tokens to lemmatize
 * @param postags
 *          the tags, read at the same indices as {@code tokens}
 * @return a new array with one lemma per token, in token order
 */
public String[] lemmatize(final String[] tokens, final String[] postags) {
  final String[] result = new String[tokens.length];
  for (int position = 0; position < result.length; position++) {
    // Single-token lookup; the overload taking two Strings does the dictionary access.
    result[position] = this.lemmatize(tokens[position], postags[position]);
  }
  return result;
}
/**
 * Creates a lemmatizer from a dictionary file. The file is opened, passed to
 * {@code init} as a stream, and the stream is closed before the constructor
 * returns.
 *
 * @param dictionaryFile
 *          the dictionary file to read
 * @throws IOException
 *           if the file cannot be opened or {@code init} fails with one
 */
public DictionaryLemmatizer(File dictionaryFile) throws IOException {
  try (InputStream dictionaryStream = new FileInputStream(dictionaryFile)) {
    this.init(dictionaryStream);
  }
}
/** * Lookup every lemma for a word,pos tag in a dictionary. Outputs "O" if not * found. * * @param word * the token * @param postag * the postag * @return every lemma */ private List<String> getAllLemmas(final String word, final String postag) { List<String> lemmasList = new ArrayList<>(); final List<String> keys = this.getDictKeys(word, postag); // lookup lemma as value of the map final List<String> keyValues = this.dictMap.get(keys); if (keyValues != null && !keyValues.isEmpty()) { lemmasList.addAll(keyValues); } else { lemmasList.add("O"); } return lemmasList; } }