/**
 * Finds the position of the first occurrence of {@code token} in {@code sentence}.
 *
 * @param sentence the token sequence to search
 * @param token the token to look for
 * @return the index of the first matching token, or {@code -1} if absent
 */
private static int indexOf(StringList sentence, String token) {
  int position = 0;
  while (position < sentence.size()) {
    if (token.equals(sentence.getToken(position))) {
      return position;
    }
    position++;
  }
  return -1;
}
/**
 * Returns the next token and advances the iteration position.
 *
 * @return the token at the current position
 * @throws NoSuchElementException if no further tokens remain
 */
public String next() {
  if (!hasNext()) {
    throw new NoSuchElementException();
  }
  return getToken(index++);
}
/**
 * Initializes the current instance with the given
 * {@link StringList} {@link Iterator}.
 *
 * @param tokenLists an iterator over the token lists whose tokens are
 *                   copied, in order, into this index
 */
public Index(Iterator<StringList> tokenLists) {
  // Flatten every StringList into this instance's token collection.
  while (tokenLists.hasNext()) {
    StringList tokens = tokenLists.next();
    for (int i = 0; i < tokens.size(); i++) {
      this.tokens.add(tokens.getToken(i));
    }
  }
}
/**
 * Returns a human-readable representation of this token list: the tokens,
 * comma separated, enclosed in square brackets, e.g. {@code [a,b,c]}.
 *
 * @return the string form of this token list
 */
@Override
public String toString() {
  StringBuilder sb = new StringBuilder();
  sb.append('[');
  for (int i = 0; i < size(); i++) {
    // Emit the separator before every token except the first.
    if (i > 0) {
      sb.append(',');
    }
    sb.append(getToken(i));
  }
  sb.append(']');
  return sb.toString();
}
}
/**
 * Counts the sentences in which {@code ngram} occurs, anchored at the first
 * occurrence of the ngram's initial token within each sentence.
 *
 * <p>NOTE(review): only the first occurrence of the ngram's first token is
 * inspected, so each sentence contributes at most 1 and later match positions
 * are ignored — confirm this is the intended semantics.
 *
 * @param ngram the ngram to look for; must contain at least one token
 * @param sentences the sentences to scan
 * @return the number of sentences with a match at the anchored position
 */
private static Double count(StringList ngram, Iterable<StringList> sentences) {
  // Accumulate in a primitive to avoid boxing a new Double on every increment.
  double count = 0d;
  for (StringList sentence : sentences) {
    int idx0 = indexOf(sentence, ngram.getToken(0));
    if (idx0 >= 0 && sentence.size() >= idx0 + ngram.size()) {
      boolean match = true;
      for (int i = 1; i < ngram.size(); i++) {
        if (!sentence.getToken(idx0 + i).equals(ngram.getToken(i))) {
          match = false;
          break; // remaining tokens cannot change the outcome
        }
      }
      if (match) {
        count++;
      }
    }
  }
  return count;
}
/**
 * Get the ngrams of dimension n of a certain input sequence of tokens.
 *
 * @param sequence a sequence of tokens
 * @param size the size of the resulting ngrams
 * @return all the possible ngrams of the given size derivable from the input sequence
 */
public static Collection<StringList> getNGrams(StringList sequence, int size) {
  Collection<StringList> ngrams = new LinkedList<>();
  if (size == -1 || size >= sequence.size()) {
    // A sentinel (-1) or oversized window yields the whole sequence as the only ngram.
    ngrams.add(sequence);
    return ngrams;
  }
  int lastStart = sequence.size() - size;
  for (int start = 0; start <= lastStart; start++) {
    String[] window = new String[size];
    for (int offset = 0; offset < size; offset++) {
      window[offset] = sequence.getToken(start + offset);
    }
    ngrams.add(new StringList(window));
  }
  return ngrams;
}
/**
 * Compares two token lists, ignoring the case of the tokens.
 *
 * <p>Note: This can cause problems with some locales.
 *
 * @param tokens the token list to compare against
 * @return true if the lists are identical ignoring case, otherwise false
 */
public boolean compareToIgnoreCase(StringList tokens) {
  if (size() != tokens.size()) {
    return false;
  }
  for (int i = 0; i < size(); i++) {
    if (getToken(i).compareToIgnoreCase(tokens.getToken(i)) != 0) {
      return false;
    }
  }
  return true;
}
/**
 * Generates the context for a document using character ngrams.
 *
 * @param document document to extract context from
 * @return the generated context
 */
@Override
public String[] getContext(CharSequence document) {
  NGramModel model = new NGramModel();
  model.add(normalizer.normalize(document), minLength, maxLength);

  Collection<String> context = new ArrayList<>();
  for (StringList tokenList : model) {
    if (tokenList.size() > 0) {
      context.add(tokenList.getToken(0));
    }
  }
  // Zero-length array argument: toArray allocates the correctly sized result.
  return context.toArray(new String[0]);
}
}
/**
 * Adds character-ngram features (prefixed with "ng=") for the token at
 * {@code index} to the feature list.
 *
 * @param features the feature list to append to
 * @param tokens the tokens of the sentence
 * @param index the index of the token to generate features for
 * @param preds not used by this generator
 */
public void createFeatures(List<String> features, String[] tokens, int index, String[] preds) {
  NGramModel ngrams = new NGramModel();
  ngrams.add(tokens[index], minLength, maxLength);
  for (StringList ngram : ngrams) {
    if (ngram.size() > 0) {
      features.add("ng=" + StringUtil.toLowerCase(ngram.getToken(0)));
    }
  }
}
}
/**
 * Gets the (n-1)th ngram of a given ngram, that is the same ngram except
 * the last word in the ngram.
 *
 * @param ngram a ngram
 * @return the shortened ngram, or {@code null} if {@code ngram} has fewer
 *         than two tokens
 */
public static StringList getNMinusOneTokenFirst(StringList ngram) {
  // Guard: an empty ngram would otherwise trigger new String[-1]
  // (NegativeArraySizeException); a 1-token ngram already returned null.
  if (ngram.size() < 2) {
    return null;
  }
  String[] tokens = new String[ngram.size() - 1];
  for (int i = 0; i < tokens.length; i++) {
    tokens[i] = ngram.getToken(i);
  }
  return new StringList(tokens);
}
/**
 * Builds the (n+1)-gram obtained by appending {@code word} to {@code ngram}.
 *
 * @param ngram the base ngram
 * @param word the token to append
 * @return a new ngram one token longer than the input
 */
private static StringList getNPlusOneNgram(StringList ngram, String word) {
  int n = ngram.size();
  String[] extended = new String[n + 1];
  for (int i = 0; i < n; i++) {
    extended[i] = ngram.getToken(i);
  }
  extended[n] = word;
  return new StringList(extended);
}
/**
 * Adds NGrams up to the specified length to the current instance.
 *
 * @param ngram the tokens to build the uni-grams, bi-grams, tri-grams, ..
 *     from.
 * @param minLength - minimal length
 * @param maxLength - maximal length
 * @throws IllegalArgumentException if a length is below 1, or if
 *     {@code minLength} exceeds {@code maxLength}
 */
public void add(StringList ngram, int minLength, int maxLength) {
  if (minLength < 1 || maxLength < 1) {
    throw new IllegalArgumentException("minLength and maxLength param must be at least 1. "
        + "minLength=" + minLength + ", maxLength= " + maxLength);
  }
  if (minLength > maxLength) {
    throw new IllegalArgumentException("minLength param must not be larger than "
        + "maxLength param. minLength=" + minLength + ", maxLength= " + maxLength);
  }

  // For each requested length, slide a window of that length over the tokens.
  for (int length = minLength; length <= maxLength; length++) {
    for (int start = 0; start + length <= ngram.size(); start++) {
      String[] grams = new String[length];
      for (int offset = 0; offset < length; offset++) {
        grams[offset] = ngram.getToken(start + offset);
      }
      add(new StringList(grams));
    }
  }
}
/**
 * Gets the (n-1)th ngram of a given ngram, that is the same ngram except
 * the first word in the ngram.
 *
 * @param ngram a ngram
 * @return the shortened ngram, or {@code null} if {@code ngram} has fewer
 *         than two tokens
 */
public static StringList getNMinusOneTokenLast(StringList ngram) {
  // Guard: an empty ngram would otherwise trigger new String[-1]
  // (NegativeArraySizeException); a 1-token ngram already returned null.
  if (ngram.size() < 2) {
    return null;
  }
  String[] tokens = new String[ngram.size() - 1];
  for (int i = 1; i < ngram.size(); i++) {
    tokens[i - 1] = ngram.getToken(i);
  }
  return new StringList(tokens);
}
/**
 * Predicts the most probable continuation of {@code tokens} among the ngrams
 * stored in this model.
 *
 * @param tokens the already observed tokens
 * @return the tokens of the highest-probability ngram continuation, or
 *         {@code null} if the model contains no ngrams
 */
@Override
public String[] predictNextTokens(String... tokens) {
  double bestProb = Double.NEGATIVE_INFINITY;
  String[] bestTokens = null;

  for (StringList ngram : this) {
    // Candidate sequence = observed tokens followed by this ngram's tokens.
    String[] candidate = new String[tokens.length + ngram.size()];
    System.arraycopy(tokens, 0, candidate, 0, tokens.length);
    for (int i = 0; i < ngram.size(); i++) {
      candidate[tokens.length + i] = ngram.getToken(i);
    }

    double prob = calculateProbability(candidate);
    if (prob > bestProb) {
      bestProb = prob;
      bestTokens = new String[ngram.size()];
      for (int i = 0; i < ngram.size(); i++) {
        bestTokens[i] = ngram.getToken(i);
      }
    }
  }
  return bestTokens;
}
throw new InvalidFormatException("Each entry must have exactly one token! " + word);
// NOTE(review): fragment of an entry-consuming lambda — the enclosing method and
// the guard condition for the throw above are not visible in this chunk; the line
// below presumably maps each single-token entry to its tags — confirm in context.
newPosDict.dictionary.put(word.getToken(0), tags);
});
/**
 * Predicts the most probable {@link StringList} continuation of {@code tokens}
 * among the ngrams stored in this model.
 *
 * @param tokens the already observed tokens
 * @return the highest-probability ngram continuation, or {@code null} if the
 *         model contains no ngrams
 */
@Override
public StringList predictNextTokens(StringList tokens) {
  double bestProb = Double.NEGATIVE_INFINITY;
  StringList best = null;

  for (StringList ngram : this) {
    // Candidate sequence = observed tokens followed by this ngram's tokens.
    String[] candidate = new String[tokens.size() + ngram.size()];
    for (int i = 0; i < tokens.size(); i++) {
      candidate[i] = tokens.getToken(i);
    }
    for (int i = 0; i < ngram.size(); i++) {
      candidate[tokens.size() + i] = ngram.getToken(i);
    }

    double prob = calculateProbability(new StringList(candidate));
    if (prob > bestProb) {
      bestProb = prob;
      best = ngram;
    }
  }
  return best;
}
private void init(InputStream in) throws IOException { DictionaryEntryPersistor.create(in, entry -> { String operationString = entry.getAttributes().getValue("operation"); StringList word = entry.getTokens(); if (word.size() != 1) throw new InvalidFormatException("Each entry must have exactly one token! " + word); // parse operation Operation operation = Operation.parse(operationString); if (operation == null) throw new InvalidFormatException("Unknown operation type: " + operationString); operationTable.put(word.getToken(0), operation); }); }
// Fragment of a sample-stream test: each group reads the next sample,
// checks it is present, and verifies its first token.
assertEquals("Smith", personName.getToken(0));
personName = sampleStream.read();
assertNotNull(personName);
assertEquals("Johnson", personName.getToken(0));
personName = sampleStream.read();
assertNotNull(personName);
assertEquals("Williams", personName.getToken(0));
personName = sampleStream.read();
assertNotNull(personName);
assertEquals("Jones", personName.getToken(0));
personName = sampleStream.read();
assertNotNull(personName);
assertEquals("Brown", personName.getToken(0));
// NOTE(review): the next assertion re-checks the same sample with a different
// expected token — a `personName = sampleStream.read()` (plus the null check)
// appears to be missing here; confirm against the original test source.
assertEquals("Mary", personName.getToken(0));
personName = sampleStream.read();
assertNotNull(personName);
assertEquals("Patricia", personName.getToken(0));
personName = sampleStream.read();
assertNotNull(personName);
assertEquals("Linda", personName.getToken(0));
personName = sampleStream.read();
assertNotNull(personName);
assertEquals("Barbara", personName.getToken(0));
personName = sampleStream.read();
assertNotNull(personName);
assertEquals("Elizabeth", personName.getToken(0));
// NOTE(review): same pattern as above — a read() before this assertion seems
// to be missing; verify in context.
assertEquals("James", personName.getToken(0));