/**
 * Creates the iterable and computes its n-gram list up front by delegating to
 * {@code createNGramList}.
 *
 * @param tokens
 *            the tokens from which the n-grams are built
 * @param minN
 *            the shortest n-gram length to generate
 * @param maxN
 *            the longest n-gram length to generate
 */
public NGramStringListIterable(Iterable<String> tokens, int minN, int maxN)
{
    nGramList = createNGramList(tokens, minN, maxN);
}
/**
 * Creates the iterable and computes its n-gram list up front by delegating to
 * {@code createNGramList}.
 *
 * @param tokens
 *            the tokens from which the n-grams are built
 * @param minN
 *            the shortest n-gram length to generate
 * @param maxN
 *            the longest n-gram length to generate
 */
public NGramStringIterable(Iterable<String> tokens, int minN, int maxN)
{
    nGramList = createNGramList(tokens, minN, maxN);
}
/**
 * Creates the iterable and computes the character n-gram list of the given token up front
 * by delegating to {@code createNGramList}.
 *
 * @param token
 *            the string whose characters are combined into n-grams
 * @param minN
 *            the shortest n-gram length to generate
 * @param maxN
 *            the longest n-gram length to generate
 */
public CharacterNGramStringIterable(String token, int minN, int maxN)
{
    nGramList = createNGramList(token, minN, maxN);
}
/**
 * Counts the character n-grams of a text after decorating it with boundary markers.
 * <p>
 * {@code CHAR_WORD_BEGIN} is put in front of the text, {@code CHAR_WORD_END} behind it, and
 * every single space is replaced by {@code CHAR_WORD_BEGIN + " " + CHAR_WORD_END}.
 *
 * @param text
 *            the text to extract character n-grams from
 * @param lowerCaseNgrams
 *            if {@code true}, the text is lower-cased before the markers are inserted
 * @param minN
 *            the shortest n-gram length to generate
 * @param maxN
 *            the longest n-gram length to generate
 * @return a frequency distribution over all generated character n-grams
 */
private FrequencyDistribution<String> getCharacterNgramsFromCasText(String text,
        boolean lowerCaseNgrams, int minN, int maxN)
{
    if (lowerCaseNgrams) {
        // NOTE(review): toLowerCase() without a Locale depends on the JVM default locale.
        text = text.toLowerCase();
    }

    // String.replace substitutes literally. replaceAll would interpret '$' and '\' inside the
    // marker constants as group references/escapes of a regex replacement template.
    // NOTE(review): the begin marker lands before the space and the end marker after it --
    // confirm that this ordering is intended.
    text = CHAR_WORD_BEGIN + text.replace(" ", CHAR_WORD_BEGIN + " " + CHAR_WORD_END)
            + CHAR_WORD_END;

    FrequencyDistribution<String> charNgrams = new FrequencyDistribution<String>();
    for (String charNgram : new CharacterNGramStringIterable(text, minN, maxN)) {
        charNgrams.inc(charNgram);
    }
    return charNgrams;
}
}
private List<String> createNGramList(Iterable<String> tokens, int minN, int maxN) { if (minN > maxN) { throw new IllegalArgumentException("minN needs to be smaller or equal than maxN."); } List<String> nGrams = new ArrayList<String>(); // fill token list List<String> tokenList = new ArrayList<String>(); for (String t : tokens) { tokenList.add(t); } for (int k = minN; k <= maxN; k++) { // if the number of tokens is less than k => break if (tokenList.size() < k) { break; } nGrams.addAll(getNGrams(tokenList, k)); } return nGrams; }
private List<List<String>> createNGramList(Iterable<String> tokens, int minN, int maxN) { if (minN > maxN) { throw new IllegalArgumentException("minN needs to be smaller or equal than maxN."); } List<List<String>> nGrams = new ArrayList<List<String>>(); // fill token list List<String> tokenList = new ArrayList<String>(); for (String t : tokens) { tokenList.add(t); } for (int k = minN; k <= maxN; k++) { // if the number of tokens is less than k => break if (tokenList.size() < k) { break; } nGrams.addAll(getNGrams(tokenList, k)); } return nGrams; }
private List<String> createNGramList(String token, int minN, int maxN) { if (minN > maxN) { throw new IllegalArgumentException("minN needs to be smaller or equal than maxN."); } List<String> nGrams = new ArrayList<String>(); // fill character list List<String> charList = new ArrayList<String>(); for (char c : token.toCharArray()) { charList.add(Character.toString(c)); } for (int k = minN; k <= maxN; k++) { // if the number of tokens is less than k => break if (charList.size() < k) { break; } nGrams.addAll(getNGrams(charList, k)); } return nGrams; }
/**
 * Counts the character n-grams found in the document text of a CAS.
 *
 * @param jcas
 *            the CAS whose document text is processed
 * @param lowerCaseNgrams
 *            if {@code true}, each n-gram is lower-cased before it is counted
 * @param minN
 *            the shortest n-gram length to generate
 * @param maxN
 *            the longest n-gram length to generate
 * @return a frequency distribution over the generated character n-grams
 */
public static FrequencyDistribution<String> getDocumentCharacterNgrams(JCas jcas,
        boolean lowerCaseNgrams, int minN, int maxN)
{
    FrequencyDistribution<String> counts = new FrequencyDistribution<String>();
    String documentText = jcas.getDocumentText();

    for (String gram : new CharacterNGramStringIterable(documentText, minN, maxN)) {
        counts.inc(lowerCaseNgrams ? gram.toLowerCase() : gram);
    }
    return counts;
}
/**
 * Creates the iterable and computes its n-gram list up front by delegating to
 * {@code createNGramList}.
 *
 * @param tokens
 *            the tokens from which the n-grams are built
 * @param minN
 *            the shortest n-gram length to generate
 * @param maxN
 *            the longest n-gram length to generate
 */
public NGramStringListIterable(Iterable<String> tokens, int minN, int maxN)
{
    nGramList = createNGramList(tokens, minN, maxN);
}
/**
 * Creates the iterable and computes its n-gram list up front by delegating to
 * {@code createNGramList}.
 *
 * @param tokens
 *            the tokens from which the n-grams are built
 * @param minN
 *            the shortest n-gram length to generate
 * @param maxN
 *            the longest n-gram length to generate
 */
public NGramStringIterable(Iterable<String> tokens, int minN, int maxN)
{
    nGramList = createNGramList(tokens, minN, maxN);
}
/**
 * Creates the iterable and computes the character n-gram list of the given token up front
 * by delegating to {@code createNGramList}.
 *
 * @param token
 *            the string whose characters are combined into n-grams
 * @param minN
 *            the shortest n-gram length to generate
 * @param maxN
 *            the longest n-gram length to generate
 */
public CharacterNGramStringIterable(String token, int minN, int maxN)
{
    nGramList = createNGramList(token, minN, maxN);
}
private List<String> createNGramList(Iterable<String> tokens, int minN, int maxN) { if (minN > maxN) { throw new IllegalArgumentException("minN needs to be smaller or equal than maxN."); } List<String> nGrams = new ArrayList<String>(); // fill token list List<String> tokenList = new ArrayList<String>(); for (String t : tokens) { tokenList.add(t); } for (int k = minN; k <= maxN; k++) { // if the number of tokens is less than k => break if (tokenList.size() < k) { break; } nGrams.addAll(getNGrams(tokenList, k)); } return nGrams; }
private List<List<String>> createNGramList(Iterable<String> tokens, int minN, int maxN) { if (minN > maxN) { throw new IllegalArgumentException("minN needs to be smaller or equal than maxN."); } List<List<String>> nGrams = new ArrayList<List<String>>(); // fill token list List<String> tokenList = new ArrayList<String>(); for (String t : tokens) { tokenList.add(t); } for (int k = minN; k <= maxN; k++) { // if the number of tokens is less than k => break if (tokenList.size() < k) { break; } nGrams.addAll(getNGrams(tokenList, k)); } return nGrams; }
private List<String> createNGramList(String token, int minN, int maxN) { if (minN > maxN) { throw new IllegalArgumentException("minN needs to be smaller or equal than maxN."); } List<String> nGrams = new ArrayList<String>(); // fill character list List<String> charList = new ArrayList<String>(); for (char c : token.toCharArray()) { charList.add(Character.toString(c)); } for (int k = minN; k <= maxN; k++) { // if the number of tokens is less than k => break if (charList.size() < k) { break; } nGrams.addAll(getNGrams(charList, k)); } return nGrams; }
/**
 * Counts the character n-grams over the text covered by an annotation.
 * <p>
 * Before the n-grams are generated, {@code boundaryBegin} is put in front of the covered
 * text and {@code boundaryEnd} behind it, e.g. to mark the begin and end of the sequence.
 * Both boundary characters are always added.
 *
 * @param focusAnnotation
 *            the annotation whose covered text is processed
 * @param lowerCaseNgrams
 *            if {@code true}, each n-gram is lower-cased before it is counted
 * @param minN
 *            the shortest n-gram length to generate
 * @param maxN
 *            the longest n-gram length to generate
 * @param boundaryBegin
 *            the character placed in front of the covered text
 * @param boundaryEnd
 *            the character placed behind the covered text
 * @return a frequency distribution over the generated character n-grams
 */
public static FrequencyDistribution<String> getAnnotationCharacterNgrams(
        Annotation focusAnnotation, boolean lowerCaseNgrams, int minN, int maxN,
        char boundaryBegin, char boundaryEnd)
{
    FrequencyDistribution<String> counts = new FrequencyDistribution<String>();
    String markedSpan = boundaryBegin + focusAnnotation.getCoveredText() + boundaryEnd;

    for (String gram : new CharacterNGramStringIterable(markedSpan, minN, maxN)) {
        counts.inc(lowerCaseNgrams ? gram.toLowerCase() : gram);
    }
    return counts;
}
/**
 * Creates the iterable from a token array and computes its n-gram list up front. The array
 * is wrapped with {@code Arrays.asList} and passed to {@code createNGramList}.
 *
 * @param tokens
 *            the tokens from which the n-grams are built
 * @param minN
 *            the shortest n-gram length to generate
 * @param maxN
 *            the longest n-gram length to generate
 */
public NGramStringListIterable(String[] tokens, int minN, int maxN)
{
    nGramList = createNGramList(Arrays.asList(tokens), minN, maxN);
}
/**
 * Creates the iterable from a token array and computes its n-gram list up front. The array
 * is wrapped with {@code Arrays.asList} and passed to {@code createNGramList}.
 *
 * @param tokens
 *            the tokens from which the n-grams are built
 * @param minN
 *            the shortest n-gram length to generate
 * @param maxN
 *            the longest n-gram length to generate
 */
public NGramStringIterable(String[] tokens, int minN, int maxN)
{
    nGramList = createNGramList(Arrays.asList(tokens), minN, maxN);
}
for (String charNgram : new CharacterNGramStringIterable(boundaryBegin + text + boundaryEnd, minN, maxN)) { charNgrams.inc(charNgram);
/**
 * Creates the iterable from a token array and computes its n-gram list up front. The array
 * is wrapped with {@code Arrays.asList} and passed to {@code createNGramList}.
 *
 * @param tokens
 *            the tokens from which the n-grams are built
 * @param minN
 *            the shortest n-gram length to generate
 * @param maxN
 *            the longest n-gram length to generate
 */
public NGramStringListIterable(String[] tokens, int minN, int maxN)
{
    nGramList = createNGramList(Arrays.asList(tokens), minN, maxN);
}
/**
 * Creates the iterable from a token array and computes its n-gram list up front. The array
 * is wrapped with {@code Arrays.asList} and passed to {@code createNGramList}.
 *
 * @param tokens
 *            the tokens from which the n-grams are built
 * @param minN
 *            the shortest n-gram length to generate
 * @param maxN
 *            the longest n-gram length to generate
 */
public NGramStringIterable(String[] tokens, int minN, int maxN)
{
    nGramList = createNGramList(Arrays.asList(tokens), minN, maxN);
}