/**
 * Builds the n-gram list for the given tokens by delegating to
 * {@code createNGramList} and stores it in {@code nGramList}.
 *
 * @param tokens
 *            An iterable of tokens.
 * @param minN
 *            the minimal n-gram length.
 * @param maxN
 *            the maximal n-gram length.
 * @throws IllegalArgumentException
 *             if {@code minN} is greater than {@code maxN}.
 */
public NGramStringIterable(Iterable<String> tokens, int minN, int maxN)
{
    List<String> generated = createNGramList(tokens, minN, maxN);
    nGramList = generated;
}
for (String ngram : new NGramStringIterable(words, 1, 3)) { ngrams.add(ngram);
private List<String> createNGramList(Iterable<String> tokens, int minN, int maxN) { if (minN > maxN) { throw new IllegalArgumentException("minN needs to be smaller or equal than maxN."); } List<String> nGrams = new ArrayList<String>(); // fill token list List<String> tokenList = new ArrayList<String>(); for (String t : tokens) { tokenList.add(t); } for (int k = minN; k <= maxN; k++) { // if the number of tokens is less than k => break if (tokenList.size() < k) { break; } nGrams.addAll(getNGrams(tokenList, k)); } return nGrams; }
for (String trigram : new NGramStringIterable(words, 3, 3)) { trigrams.add(trigram);
private List<String> createNGramList(Iterable<String> tokens, int minN, int maxN) { if (minN > maxN) { throw new IllegalArgumentException("minN needs to be smaller or equal than maxN."); } List<String> nGrams = new ArrayList<String>(); // fill token list List<String> tokenList = new ArrayList<String>(); for (String t : tokens) { tokenList.add(t); } for (int k = minN; k <= maxN; k++) { // if the number of tokens is less than k => break if (tokenList.size() < k) { break; } nGrams.addAll(getNGrams(tokenList, k)); } return nGrams; }
for (String trigram : new NGramStringIterable(words, 3, 3)) { trigrams.add(trigram);
/**
 * Creates the instance from an iterable of tokens. The n-gram list is
 * produced by {@code createNGramList} and kept in {@code nGramList}.
 *
 * @param tokens
 *            An iterable of tokens.
 * @param minN
 *            the minimal n-gram length.
 * @param maxN
 *            the maximal n-gram length.
 * @throws IllegalArgumentException
 *             if {@code minN} is greater than {@code maxN}.
 */
public NGramStringIterable(Iterable<String> tokens, int minN, int maxN)
{
    List<String> ngramsForTokens = createNGramList(tokens, minN, maxN);
    nGramList = ngramsForTokens;
}
/**
 * Counts the n-grams found in the given CAS and hands the counts to
 * {@code add(cfd)}.
 * <p>
 * For every annotation of {@code sentenceType} and every input path, the
 * annotations of the path's type that are covered by the sentence are
 * selected and turned into strings by {@code createStringList}. Their
 * n-grams of each length from {@code minNgramLength} to
 * {@code maxNgramLength} are then counted in a conditional frequency
 * distribution whose condition is the n-gram length.
 *
 * @param jcas
 *            the JCas whose annotations are read.
 * @param inputPaths
 *            the input paths; each is split at the first {@code "/"} and
 *            the part before it is used as the type name.
 * @param sentenceType
 *            the annotation type that delimits the token sequences.
 * @throws IOException
 *             if {@code createStringList} fails with an
 *             {@code AnalysisEngineProcessException} (kept as the cause),
 *             or if thrown by the {@code add} overload that receives the
 *             counts (not shown here).
 */
public void add(JCas jcas, Set<String> inputPaths, Type sentenceType)
    throws IOException
{
    ConditionalFrequencyDistribution<Integer, String> counts =
            new ConditionalFrequencyDistribution<Integer, String>();
    CAS cas = jcas.getCas();

    for (AnnotationFS sentence : CasUtil.select(cas, sentenceType)) {
        for (String inputPath : inputPaths) {
            // At most two parts: the type name and the remainder of the path.
            String[] pathParts = inputPath.split("/", 2);
            Type tokenType = getInputType(cas, pathParts[0]);
            List<AnnotationFS> covered = CasUtil.selectCovered(cas, tokenType, sentence);

            List<String> coveredStrings;
            try {
                coveredStrings = createStringList(covered, pathParts);
            }
            catch (AnalysisEngineProcessException e) {
                throw new IOException(e);
            }

            // One pass per n-gram length, so each length is its own condition.
            int n = minNgramLength;
            while (n <= maxNgramLength) {
                counts.incAll(n, new NGramStringIterable(coveredStrings, n, n));
                n++;
            }
        }
    }

    add(counts);
}
/**
 * Builds the n-gram list for an array of tokens. The array is wrapped with
 * {@code Arrays.asList} and passed to {@code createNGramList}.
 *
 * @param tokens
 *            An array of tokens.
 * @param minN
 *            the minimal n-gram length.
 * @param maxN
 *            the maximal n-gram length.
 * @throws IllegalArgumentException
 *             if {@code minN} is greater than {@code maxN}.
 */
public NGramStringIterable(String[] tokens, int minN, int maxN)
{
    List<String> tokenView = Arrays.asList(tokens);
    nGramList = createNGramList(tokenView, minN, maxN);
}
/** * @param tokens An array of tokens. * @param minN * the minimal n-gram length. * @param maxN * the maximal n-gram length. */ public NGramStringIterable(String[] tokens, int minN, int maxN) { this.nGramList = createNGramList(Arrays.asList(tokens), minN, maxN); }