/**
 * Creates an iterable over all n-grams of length {@code minN} through {@code maxN}
 * built from the given token sequence.
 *
 * @param tokens
 *            An iterable of tokens.
 * @param minN
 *            minimum n-gram length.
 * @param maxN
 *            maximum n-gram length.
 */
public NGramStringListIterable(Iterable<String> tokens, int minN, int maxN)
{
    // All n-grams are computed eagerly here; iteration later just walks the list.
    this.nGramList = createNGramList(tokens, minN, maxN);
}
private List<List<String>> createNGramList(Iterable<String> tokens, int minN, int maxN) { if (minN > maxN) { throw new IllegalArgumentException("minN needs to be smaller or equal than maxN."); } List<List<String>> nGrams = new ArrayList<List<String>>(); // fill token list List<String> tokenList = new ArrayList<String>(); for (String t : tokens) { tokenList.add(t); } for (int k = minN; k <= maxN; k++) { // if the number of tokens is less than k => break if (tokenList.size() < k) { break; } nGrams.addAll(getNGrams(tokenList, k)); } return nGrams; }
/**
 * Collects POS n-grams over all POS annotations covered by the focus annotation,
 * ignoring sentence boundaries.
 *
 * @param jcas
 *            the CAS to read annotations from.
 * @param focus
 *            the annotation whose covered POS tags are used.
 * @param useCanonical
 *            if true, use the coarse-grained POS class name instead of the raw tag value.
 * @param minN
 *            minimum n-gram length.
 * @param maxN
 *            maximum n-gram length.
 * @return frequency distribution of the joined POS n-gram strings.
 */
private static FrequencyDistribution<String> documentBasedDistribution(JCas jcas, Annotation focus,
        boolean useCanonical, int minN, int maxN)
{
    FrequencyDistribution<String> distribution = new FrequencyDistribution<String>();

    List<String> tags = new ArrayList<String>();
    for (POS pos : selectCovered(jcas, POS.class, focus)) {
        // Canonical = simple class name (coarse category), otherwise the original tag.
        tags.add(useCanonical ? pos.getClass().getSimpleName() : pos.getPosValue());
    }

    String[] tagArray = tags.toArray(new String[tags.size()]);
    for (List<String> ngram : new NGramStringListIterable(tagArray, minN, maxN)) {
        distribution.inc(StringUtils.join(ngram, NGRAM_GLUE));
    }
    return distribution;
}
/**
 * Counts POS n-grams sentence by sentence over the whole document; n-grams never
 * cross sentence boundaries.
 *
 * @param jcas
 *            the CAS to read annotations from.
 * @param minN
 *            minimum n-gram length.
 * @param maxN
 *            maximum n-gram length.
 * @param useCanonical
 *            if true, use the coarse-grained POS class name instead of the raw tag value.
 * @return frequency distribution of the joined POS n-gram strings.
 */
public static FrequencyDistribution<String> getDocumentPosNgrams(JCas jcas, int minN, int maxN,
        boolean useCanonical)
{
    FrequencyDistribution<String> distribution = new FrequencyDistribution<String>();
    for (Sentence sentence : select(jcas, Sentence.class)) {
        List<String> tags = new ArrayList<String>();
        for (POS pos : JCasUtil.selectCovered(jcas, POS.class, sentence)) {
            // Canonical = simple class name (coarse category), otherwise the original tag.
            tags.add(useCanonical ? pos.getClass().getSimpleName() : pos.getPosValue());
        }
        String[] tagArray = tags.toArray(new String[tags.size()]);
        for (List<String> ngram : new NGramStringListIterable(tagArray, minN, maxN)) {
            distribution.inc(StringUtils.join(ngram, NGRAM_GLUE));
        }
    }
    return distribution;
}
/**
 * Counts POS n-grams per sentence over the whole document; n-grams do not cross
 * sentence boundaries.
 *
 * @param jcas
 *            the CAS to read annotations from.
 * @param minN
 *            minimum n-gram length.
 * @param maxN
 *            maximum n-gram length.
 * @param useCanonical
 *            if true, use the coarse-grained POS class name instead of the raw tag value.
 * @return frequency distribution of the joined POS n-gram strings.
 */
public static FrequencyDistribution<String> getDocumentPosNgrams(JCas jcas, int minN, int maxN,
        boolean useCanonical)
{
    FrequencyDistribution<String> posNgrams = new FrequencyDistribution<String>();
    for (Sentence s : select(jcas, Sentence.class)) {
        List<String> postagstrings = new ArrayList<String>();
        for (POS p : JCasUtil.selectCovered(jcas, POS.class, s)) {
            if (useCanonical) {
                // Coarse-grained category: the POS annotation's simple class name.
                postagstrings.add(p.getClass().getSimpleName());
            }
            else {
                // Fine-grained: the original tagset value.
                postagstrings.add(p.getPosValue());
            }
        }
        String[] posarray = postagstrings.toArray(new String[postagstrings.size()]);
        for (List<String> ngram : new NGramStringListIterable(posarray, minN, maxN)) {
            posNgrams.inc(StringUtils.join(ngram, NGRAM_GLUE));
        }
    }
    return posNgrams;
}
/**
 * Builds the n-gram iterable from a token sequence; every n-gram of length
 * {@code minN} up to {@code maxN} is pre-computed on construction.
 *
 * @param tokens
 *            An iterable of tokens.
 * @param minN
 *            minimum n-gram length.
 * @param maxN
 *            maximum n-gram length.
 */
public NGramStringListIterable(Iterable<String> tokens, int minN, int maxN)
{
    nGramList = createNGramList(tokens, minN, maxN);
}
private List<List<String>> createNGramList(Iterable<String> tokens, int minN, int maxN) { if (minN > maxN) { throw new IllegalArgumentException("minN needs to be smaller or equal than maxN."); } List<List<String>> nGrams = new ArrayList<List<String>>(); // fill token list List<String> tokenList = new ArrayList<String>(); for (String t : tokens) { tokenList.add(t); } for (int k = minN; k <= maxN; k++) { // if the number of tokens is less than k => break if (tokenList.size() < k) { break; } nGrams.addAll(getNGrams(tokenList, k)); } return nGrams; }
/**
 * Collects POS n-grams sentence by sentence within the sentences covered by the
 * focus annotation; n-grams do not cross sentence boundaries.
 *
 * @param jcas
 *            the CAS to read annotations from.
 * @param focus
 *            the annotation whose covered sentences are processed.
 * @param useCanonical
 *            if true, use the coarse-grained POS class name instead of the raw tag value.
 * @param minN
 *            minimum n-gram length.
 * @param maxN
 *            maximum n-gram length.
 * @return frequency distribution of the joined POS n-gram strings.
 */
private static FrequencyDistribution<String> sentenceBasedDistribution(JCas jcas, Annotation focus,
        boolean useCanonical, int minN, int maxN)
{
    FrequencyDistribution<String> distribution = new FrequencyDistribution<String>();
    for (Sentence sentence : selectCovered(jcas, Sentence.class, focus)) {
        List<String> tags = new ArrayList<String>();
        for (POS pos : selectCovered(jcas, POS.class, sentence)) {
            // Canonical = simple class name (coarse category), otherwise the original tag.
            tags.add(useCanonical ? pos.getClass().getSimpleName() : pos.getPosValue());
        }
        String[] tagArray = tags.toArray(new String[tags.size()]);
        for (List<String> ngram : new NGramStringListIterable(tagArray, minN, maxN)) {
            distribution.inc(StringUtils.join(ngram, NGRAM_GLUE));
        }
    }
    return distribution;
}
/**
 * Creates an iterable over all n-grams of length {@code minN} through {@code maxN}
 * built from the given token array.
 *
 * @param tokens
 *            An array of tokens.
 * @param minN
 *            minimum n-gram length.
 * @param maxN
 *            maximum n-gram length.
 */
public NGramStringListIterable(String[] tokens, int minN, int maxN)
{
    // Wraps the array as a (fixed-size) list view and delegates to the list-based builder.
    this.nGramList = createNGramList(Arrays.asList(tokens), minN, maxN);
}
// NOTE(review): fragment of a larger method — the closing braces lie outside this view.
// Enumerates n-grams over the keyword list and counts each joined n-gram string.
for (List<String> ngram : new NGramStringListIterable(
        keywordList.toArray(new String[keywordList.size()]), minN, maxN)) {
    String ngramString = StringUtils.join(ngram, GLUE);
    documentNgrams.inc(ngramString);
/**
 * Builds the n-gram iterable from a token array; every n-gram of length
 * {@code minN} up to {@code maxN} is pre-computed on construction.
 *
 * @param tokens
 *            An array of tokens.
 * @param minN
 *            minimum n-gram length.
 * @param maxN
 *            maximum n-gram length.
 */
public NGramStringListIterable(String[] tokens, int minN, int maxN)
{
    // View the array as a list and reuse the list-based n-gram builder.
    List<String> tokenView = Arrays.asList(tokens);
    nGramList = createNGramList(tokenView, minN, maxN);
}
public static FrequencyDistribution<String> getDocumentNgrams( JCas jcas, boolean lowerCaseNGrams, boolean filterPartialMatches, int minN, int maxN, Set<String> stopwords) { FrequencyDistribution<String> documentNgrams = new FrequencyDistribution<String>(); for (Sentence s : select(jcas, Sentence.class)) { // TODO parameterize type for (List<String> ngram : new NGramStringListIterable(toText(selectCovered(Token.class, s)), minN, maxN)) { if (passesNgramFilter(ngram, stopwords, filterPartialMatches)) { String ngramString = StringUtils.join(ngram, NGRAM_GLUE); if (lowerCaseNGrams) { ngramString = ngramString.toLowerCase(); } documentNgrams.inc(ngramString); } } } return documentNgrams; }
// NOTE(review): fragment of a larger method — the loop body continues outside this view.
// Enumerates n-grams over the keyword list and joins each one into a single string.
for (List<String> ngram : new NGramStringListIterable(
        keywordList.toArray(new String[keywordList.size()]), minN, maxN)) {
    String ngramString = StringUtils.join(ngram, GLUE);
// NOTE(review): fragment of a larger method — closing braces are outside this view.
// Per sentence: extract annotation values as strings, then enumerate their n-grams;
// presumably lower() normalizes case — confirm against the full method.
for (Sentence s : select(jcas, Sentence.class)) {
    List<String> strings = valuesToText(jcas, s, annotationClass.getName());
    for (List<String> ngram : new NGramStringListIterable(strings, minN, maxN)) {
        if (lowerCaseNGrams) {
            ngram = lower(ngram);
// NOTE(review): fragment of a larger method — the closing brace is outside this view.
// Counts each joined n-gram of the (presumably phonetically encoded) token array.
for (List<String> ngram : new NGramStringListIterable(array, minN, maxN)) {
    phoneticNgrams.inc(StringUtils.join(ngram, NGRAM_GLUE));
// NOTE(review): two unrelated loop headers from different methods — they cannot nest
// (duplicate variable name); bodies and closing braces are outside this view.
// First: token n-grams within a sentence. Second: token n-grams within a focus annotation.
for (List<String> ngram : new NGramStringListIterable(toText(selectCovered(
        Token.class, s)), minN, maxN)) {
for (List<String> ngram : new NGramStringListIterable(toText(selectCovered(Token.class,
        focusAnnotation)), minN, maxN)) {
// NOTE(review): fragment of a larger method — the closing brace is outside this view.
// Counts each joined n-gram of the (presumably phonetically encoded) token array.
for (List<String> ngram : new NGramStringListIterable(array, minN, maxN)) {
    phoneticNgrams.inc(StringUtils.join(ngram, NGRAM_GLUE));
// NOTE(review): fragment of a larger method — closing braces are outside this view.
// Per sentence covered by the target: extract annotation values as strings, then
// enumerate their n-grams; presumably lower() normalizes case — confirm in full method.
for (Sentence s : selectCovered(jcas, Sentence.class, aTarget)) {
    List<String> strings = valuesToText(jcas, s, annotationClass.getName());
    for (List<String> ngram : new NGramStringListIterable(strings, minN, maxN)) {
        if (lowerCaseNGrams) {
            ngram = lower(ngram);
// NOTE(review): two unrelated loop headers from different methods — they cannot nest
// (duplicate variable name); bodies and closing braces are outside this view.
// First: token n-grams within a sentence. Second: token n-grams within a focus annotation.
for (List<String> ngram : new NGramStringListIterable(toText(selectCovered(
        Token.class, s)), minN, maxN)) {
for (List<String> ngram : new NGramStringListIterable(toText(selectCovered(Token.class,
        focusAnnotation)), minN, maxN)) {
// NOTE(review): two unrelated loop headers from different methods — they cannot nest
// (duplicate variable name); bodies and closing braces are outside this view.
// First: token n-grams within a sentence. Second: token n-grams within a focus annotation.
for (List<String> ngram : new NGramStringListIterable(
        toText(selectCovered(Token.class, s)), minN, maxN)) {
for (List<String> ngram : new NGramStringListIterable(
        toText(selectCovered(Token.class, focusAnnotation)), minN, maxN)) {