/**
 * Demo entry point: tokenizes a fixed sample sentence and prints every
 * n-gram that {@code NGramIterable} yields for it with minN and maxN both
 * set to 2, one n-gram per line.
 *
 * @param args command-line arguments (not used)
 */
public static void main(String[] args) {
    final String sentence = "This is a simple example sentence .";
    // NOTE(review): single-argument StringUtils.split presumably splits on whitespace - confirm.
    final String[] words = StringUtils.split(sentence);
    final NGramIterable bigrams = new NGramIterable(words, 2, 2);
    for (String bigram : bigrams) {
        System.out.println(bigram);
    }
}
}
/** * @param tokens An iterable of tokens. */ public NGramIterable(Iterable<String> tokens, int minN, int maxN) { if (minN > maxN) { throw new IllegalArgumentException("minN needs to be smaller or equal than maxN."); } this.nGramList = createNGramList(tokens, minN, maxN); }
private List<String> createNGramList(Iterable<String> tokens, int minN, int maxN) { List<String> nGrams = new ArrayList<String>(); // fill token list List<String> tokenList = new ArrayList<String>(); for (String t : tokens) { tokenList.add(t); } for (int k = minN; k <= maxN; k++) { // if the number of tokens is less than k => break if (tokenList.size() < k) { break; } nGrams.addAll(getNGrams(tokenList, k)); } return nGrams; }
/**
 * Builds the conditional frequency distribution from the Brown corpus.
 * For every n from {@code minN} to {@code maxN} (inclusive), the n-grams
 * of each corpus sentence are added as samples under the condition n.
 *
 * @param minN the smallest n-gram size to count
 * @param maxN the largest n-gram size to count; must not be less than
 *             {@code minN}
 * @throws IllegalArgumentException if {@code minN} is greater than
 *                                  {@code maxN}
 * @throws Exception declared by this constructor's signature; presumably
 *                   raised while accessing the corpus (confirm against
 *                   {@code BrownCorpus})
 */
public BrownProvider(int minN, int maxN) throws Exception {
    // Validate the range first so an invalid call fails fast, before any
    // corpus object is created.
    if (minN > maxN) {
        throw new IllegalArgumentException("minN > maxN");
    }

    BrownCorpus brownCorpus = new BrownCorpus();
    cfd = new ConditionalFrequencyDistribution<Integer, String>();

    for (int n = minN; n <= maxN; n++) {
        // getSentences() is queried anew for every n, exactly as before;
        // whether its result can be safely reused across passes is not
        // visible from here.
        for (Sentence sentence : brownCorpus.getSentences()) {
            cfd.addSamples(n, new NGramIterable(sentence.getTokens(), n, n));
        }
    }
}
/** * @param tokens An array of tokens. */ public NGramIterable(String[] tokens, int minN, int maxN) { if (minN > maxN) { throw new IllegalArgumentException("minN needs to be smaller or equal than maxN."); } this.nGramList = createNGramList(Arrays.asList(tokens), minN, maxN); }
/**
 * Iterates a {@code NGramIterable} built with minN = maxN = 2 over a
 * sample sentence, printing each n-gram, and checks that the first one is
 * "This is" and that six n-grams are produced in total.
 */
@Test
public void ngramTest() {
    String[] words = "This is a simple example sentence .".split(" ");

    int seen = 0;
    for (String bigram : new NGramIterable(words, 2, 2)) {
        // Only the very first n-gram is compared against a fixed value.
        if (seen == 0) {
            assertEquals("This is", bigram);
        }
        System.out.println(bigram);
        seen++;
    }

    assertEquals(6, seen);
}
}