/** Returns {@code true} while there are more elements to iterate over. */
public boolean hasNext() {
  return index < size();
}
/**
 * Finds the position of the first occurrence of {@code token} in {@code sentence}.
 *
 * @param sentence the token sequence to scan
 * @param token the token to look for
 * @return the index of the first match, or {@code -1} if the token is absent
 */
private static int indexOf(StringList sentence, String token) {
  int position = -1;
  for (int i = 0; position < 0 && i < sentence.size(); i++) {
    if (token.equals(sentence.getToken(i))) {
      position = i;
    }
  }
  return position;
}
/**
 * Gets the (n-1)-gram formed by dropping the last token of the given ngram.
 *
 * @param ngram an ngram
 * @return the leading (n-1)-gram, or {@code null} if the input has a single token
 */
public static StringList getNMinusOneTokenFirst(StringList ngram) {
  String[] head = new String[ngram.size() - 1];
  for (int i = 0; i < head.length; i++) {
    head[i] = ngram.getToken(i);
  }
  return head.length == 0 ? null : new StringList(head);
}
/**
 * Gets the (n-1)-gram formed by dropping the first token of the given ngram.
 *
 * @param ngram an ngram
 * @return the trailing (n-1)-gram, or {@code null} if the input has a single token
 */
public static StringList getNMinusOneTokenLast(StringList ngram) {
  String[] tail = new String[ngram.size() - 1];
  for (int i = 0; i < tail.length; i++) {
    tail[i] = ngram.getToken(i + 1);
  }
  return tail.length == 0 ? null : new StringList(tail);
}
/** * Initializes the current instance with the given * {@link StringList} {@link Iterator}. * * @param tokenLists */ public Index(Iterator<StringList> tokenLists) { while (tokenLists.hasNext()) { StringList tokens = tokenLists.next(); for (int i = 0; i < tokens.size(); i++) { this.tokens.add(tokens.getToken(i)); } } }
/**
 * Predicts the most probable continuation of the given token prefix by scoring
 * every stored ngram appended to the prefix and keeping the highest-probability one.
 *
 * @param tokens the prefix tokens
 * @return a copy of the most probable next ngram's tokens, or {@code null}
 *         if no candidate scored above negative infinity
 */
@Override
public String[] predictNextTokens(String... tokens) {
  double maxProb = Double.NEGATIVE_INFINITY;
  String[] token = null;

  for (StringList ngram : this) {
    // candidate sequence = supplied prefix followed by this ngram's tokens
    String[] sequence = new String[tokens.length + ngram.size()];
    // bulk-copy the prefix instead of a manual element loop
    System.arraycopy(tokens, 0, sequence, 0, tokens.length);
    for (int i = 0; i < ngram.size(); i++) {
      sequence[tokens.length + i] = ngram.getToken(i);
    }

    double v = calculateProbability(sequence);
    if (v > maxProb) {
      maxProb = v;
      token = new String[ngram.size()];
      for (int i = 0; i < ngram.size(); i++) {
        token[i] = ngram.getToken(i);
      }
    }
  }
  return token;
}
/**
 * Compares two token lists element-wise, ignoring the case of the tokens.
 *
 * <p>Note: case-insensitive comparison can behave unexpectedly under some locales.
 *
 * @param tokens the token list to compare against
 * @return {@code true} if both lists have the same size and all tokens are
 *         equal ignoring case, otherwise {@code false}
 */
public boolean compareToIgnoreCase(StringList tokens) {
  if (size() != tokens.size()) {
    return false;
  }
  for (int i = 0; i < size(); i++) {
    if (getToken(i).compareToIgnoreCase(tokens.getToken(i)) != 0) {
      return false;
    }
  }
  return true;
}
/**
 * Predicts the most probable continuation of the given token prefix: each stored
 * ngram is appended to the prefix, scored, and the highest-scoring ngram is returned.
 *
 * @param tokens the prefix tokens
 * @return the most probable next ngram, or {@code null} if none scored
 *         above negative infinity
 */
@Override
public StringList predictNextTokens(StringList tokens) {
  StringList best = null;
  double bestProb = Double.NEGATIVE_INFINITY;
  int prefixLength = tokens.size();

  for (StringList ngram : this) {
    // build the candidate: prefix tokens followed by this ngram's tokens
    String[] candidate = new String[prefixLength + ngram.size()];
    for (int i = 0; i < prefixLength; i++) {
      candidate[i] = tokens.getToken(i);
    }
    for (int i = 0; i < ngram.size(); i++) {
      candidate[prefixLength + i] = ngram.getToken(i);
    }

    double probability = calculateProbability(new StringList(candidate));
    if (probability > bestProb) {
      bestProb = probability;
      best = ngram;
    }
  }
  return best;
}
/**
 * Counts how many sentences contain the given ngram.
 *
 * <p>NOTE(review): only the FIRST occurrence of the ngram's head token in each
 * sentence is probed, so a match starting at a later position is missed and each
 * sentence contributes at most one count — kept as-is to preserve behavior;
 * confirm whether callers rely on this before changing it.
 *
 * @param ngram the ngram to count
 * @param sentences the sentences to search
 * @return the number of sentences whose first head-token occurrence starts the ngram
 */
private static Double count(StringList ngram, Iterable<StringList> sentences) {
  // primitive accumulator avoids re-boxing a Double on every increment; box once on return
  int count = 0;
  for (StringList sentence : sentences) {
    int idx0 = indexOf(sentence, ngram.getToken(0));
    if (idx0 >= 0 && sentence.size() >= idx0 + ngram.size()) {
      boolean match = true;
      // stop comparing as soon as one token differs
      for (int i = 1; match && i < ngram.size(); i++) {
        match = sentence.getToken(idx0 + i).equals(ngram.getToken(i));
      }
      if (match) {
        count++;
      }
    }
  }
  return (double) count;
}
/**
 * Gets the ngrams of dimension {@code size} of a certain input sequence of tokens.
 *
 * @param sequence a sequence of tokens
 * @param size the size of the resulting ngrams; {@code -1} (or any size at least
 *             the sequence length) returns the whole sequence as the only ngram
 * @return all the possible ngrams of the given size derivable from the input sequence
 */
public static Collection<StringList> getNGrams(StringList sequence, int size) {
  Collection<StringList> ngrams = new LinkedList<>();
  if (size == -1 || size >= sequence.size()) {
    ngrams.add(sequence);
  } else {
    for (int i = 0; i <= sequence.size() - size; i++) {
      // fresh array per window so every StringList is backed by its own data,
      // even if the StringList constructor keeps the array reference
      String[] window = new String[size];
      for (int j = 0; j < size; j++) {
        window[j] = sequence.getToken(i + j);
      }
      ngrams.add(new StringList(window));
    }
  }
  return ngrams;
}
/**
 * Calculates the maximum-likelihood probability of a unigram in a vocabulary:
 * the unigram's count divided by the total number of tokens in the set.
 *
 * <p>NOTE(review): an empty vocabulary makes the divisor zero (NaN result) —
 * presumably callers always pass a non-empty set; verify.
 *
 * @param word the only word in the unigram
 * @param set the vocabulary
 * @return the maximum likelihood probability
 */
public static double calculateUnigramMLProbability(String word, Collection<StringList> set) {
  double totalTokens = 0d;
  for (StringList sentence : set) {
    totalTokens += sentence.size();
  }
  return count(new StringList(word), set) / totalTokens;
}
/**
 * Generates the context for a document using character ngrams.
 *
 * @param document document to extract context from
 * @return the generated context
 */
@Override
public String[] getContext(CharSequence document) {
  NGramModel charNgrams = new NGramModel();
  charNgrams.add(normalizer.normalize(document), minLength, maxLength);

  Collection<String> context = new ArrayList<>();
  for (StringList ngram : charNgrams) {
    if (ngram.size() > 0) {
      context.add(ngram.getToken(0));
    }
  }
  return context.toArray(new String[0]);
}
}
/**
 * Adds lower-cased character-ngram features for the token at {@code index}.
 *
 * @param features the feature list to append to
 * @param tokens the tokens of the current sequence
 * @param index the position of the token to featurize
 * @param preds the previous predictions (unused here)
 */
public void createFeatures(List<String> features, String[] tokens, int index, String[] preds) {
  NGramModel ngrams = new NGramModel();
  ngrams.add(tokens[index], minLength, maxLength);

  for (StringList ngram : ngrams) {
    if (ngram.size() > 0) {
      features.add("ng=" + StringUtil.toLowerCase(ngram.getToken(0)));
    }
  }
}
}
private void init(InputStream in) throws IOException { DictionaryEntryPersistor.create(in, entry -> { String operationString = entry.getAttributes().getValue("operation"); StringList word = entry.getTokens(); if (word.size() != 1) throw new InvalidFormatException("Each entry must have exactly one token! " + word); // parse operation Operation operation = Operation.parse(operationString); if (operation == null) throw new InvalidFormatException("Unknown operation type: " + operationString); operationTable.put(word.getToken(0), operation); }); }
private double stupidBackoff(StringList ngram) { int count = getCount(ngram); StringList nMinusOneToken = NGramUtils.getNMinusOneTokenFirst(ngram); if (nMinusOneToken == null || nMinusOneToken.size() == 0) { return (double) count / (double) size(); } else if (count > 0) { double countM1 = getCount(nMinusOneToken); if (countM1 == 0d) { countM1 = size(); // to avoid Infinite if n-1grams do not exist } return (double) count / countM1; } else { return 0.4 * stupidBackoff(NGramUtils.getNMinusOneTokenLast(ngram)); } }
/**
 * Generates the context for a document: the superclass context plus
 * token-ngram (sizes 1..3) features over the normalized, tokenized text.
 *
 * @param document document to extract context from
 * @return the generated context
 */
@Override
public String[] getContext(CharSequence document) {
  // typed list instead of a raw ArrayList
  List<String> context = new ArrayList<>(Arrays.asList(super.getContext(document)));

  CharSequence normalized = this.normalizer.normalize(document);
  String[] words = SimpleTokenizer.INSTANCE.tokenize(normalized.toString());

  if (words.length > 0) {
    NGramModel tokenNgramModel = new NGramModel();
    tokenNgramModel.add(new StringList(words), 1, 3);
    // enhanced-for over the model replaces the raw Iterator + cast
    for (StringList tokenList : tokenNgramModel) {
      if (tokenList.size() > 0) {
        context.add("tg=" + tokenList.toString());
      }
    }
  }
  return context.toArray(new String[0]);
}
}
/**
 * Tests {@link StringList#getToken(int)}: tokens are retrievable by position.
 */
@Test
public void testGetToken() {
  StringList tokens = new StringList("a", "b");

  Assert.assertEquals(2, tokens.size());
  Assert.assertEquals("a", tokens.getToken(0));
  Assert.assertEquals("b", tokens.getToken(1));
}