/**
 * Builds a frequency distribution of character n-grams over {@code text} after
 * inserting word-boundary markers.
 *
 * <p>Processing steps, in order:
 * <ol>
 * <li>if {@code lowerCaseNgrams} is set, the whole text is lower-cased;</li>
 * <li>every space character is replaced by
 * {@code CHAR_WORD_BEGIN + " " + CHAR_WORD_END};</li>
 * <li>the result is prefixed with {@code CHAR_WORD_BEGIN} and suffixed with
 * {@code CHAR_WORD_END};</li>
 * <li>each n-gram produced by {@code CharacterNGramStringIterable} over the marked
 * text is counted.</li>
 * </ol>
 *
 * @param text
 *            the text to extract character n-grams from
 * @param lowerCaseNgrams
 *            whether to lower-case the text before the markers are inserted
 * @param minN
 *            forwarded to {@code CharacterNGramStringIterable} (presumably the smallest
 *            n-gram length)
 * @param maxN
 *            forwarded to {@code CharacterNGramStringIterable} (presumably the largest
 *            n-gram length)
 * @return the counts of all character n-grams of the marked text
 */
private FrequencyDistribution<String> getCharacterNgramsFromCasText(
        String text, boolean lowerCaseNgrams, int minN, int maxN) {
    if (lowerCaseNgrams) {
        // NOTE(review): the no-arg toLowerCase() uses the JVM default locale, so the
        // produced n-grams can differ between machines - confirm this is intended.
        text = text.toLowerCase();
    }

    // String.replace performs a literal substitution. The former replaceAll call treated
    // '$' and '\' inside the replacement as group references/escapes, which would throw
    // or corrupt the output if a marker constant ever contained one of them.
    // NOTE(review): the markers around a space are BEGIN + " " + END, i.e. the word that
    // precedes the space is followed by the BEGIN marker - verify the order is intended.
    text = CHAR_WORD_BEGIN
            + text.replace(" ", CHAR_WORD_BEGIN + " " + CHAR_WORD_END)
            + CHAR_WORD_END;

    FrequencyDistribution<String> charNgrams = new FrequencyDistribution<String>();
    for (String charNgram : new CharacterNGramStringIterable(text, minN, maxN)) {
        charNgrams.inc(charNgram);
    }
    return charNgrams;
}
}
/**
 * Counts the character n-grams contained in the document text of the given CAS.
 *
 * <p>The n-grams are generated from the unmodified document text; when
 * {@code lowerCaseNgrams} is set, each generated n-gram is lower-cased individually
 * before it is counted.
 *
 * @param jcas
 *            the CAS whose document text is processed
 * @param lowerCaseNgrams
 *            whether each n-gram is lower-cased before counting
 * @param minN
 *            forwarded to {@code CharacterNGramStringIterable} (presumably the smallest
 *            n-gram length)
 * @param maxN
 *            forwarded to {@code CharacterNGramStringIterable} (presumably the largest
 *            n-gram length)
 * @return the frequency distribution of the (optionally lower-cased) n-grams
 */
public static FrequencyDistribution<String> getDocumentCharacterNgrams(JCas jcas,
        boolean lowerCaseNgrams, int minN, int maxN) {
    final FrequencyDistribution<String> counts = new FrequencyDistribution<String>();
    final String documentText = jcas.getDocumentText();

    for (String ngram : new CharacterNGramStringIterable(documentText, minN, maxN)) {
        // Case folding happens per n-gram, after the n-gram has been cut out.
        counts.inc(lowerCaseNgrams ? ngram.toLowerCase() : ngram);
    }

    return counts;
}
/**
 * Creates a frequency distribution of character n-grams over the text covered by an
 * annotation.
 *
 * <p>Before the n-grams are generated, {@code boundaryBegin} is prepended and
 * {@code boundaryEnd} is appended to the covered text, e.g. to mark the 'begin of
 * sequence' and 'end of sequence' of the span. Both characters are always added; this
 * method has no way to leave them out.
 *
 * <p>When {@code lowerCaseNgrams} is set, each generated n-gram is lower-cased
 * individually before it is counted, which also applies to boundary characters that
 * are part of the n-gram.
 *
 * @param focusAnnotation
 *            the annotation whose covered text is processed
 * @param lowerCaseNgrams
 *            whether each n-gram is lower-cased before counting
 * @param minN
 *            forwarded to {@code CharacterNGramStringIterable} (presumably the smallest
 *            n-gram length)
 * @param maxN
 *            forwarded to {@code CharacterNGramStringIterable} (presumably the largest
 *            n-gram length)
 * @param boundaryBegin
 *            character placed in front of the covered text
 * @param boundaryEnd
 *            character placed behind the covered text
 * @return the frequency distribution of the (optionally lower-cased) n-grams
 */
public static FrequencyDistribution<String> getAnnotationCharacterNgrams(
        Annotation focusAnnotation, boolean lowerCaseNgrams, int minN, int maxN,
        char boundaryBegin, char boundaryEnd) {
    final FrequencyDistribution<String> counts = new FrequencyDistribution<String>();

    // char + String + char: plain string concatenation, evaluated left to right.
    final String markedSpan = boundaryBegin + focusAnnotation.getCoveredText() + boundaryEnd;

    for (String ngram : new CharacterNGramStringIterable(markedSpan, minN, maxN)) {
        counts.inc(lowerCaseNgrams ? ngram.toLowerCase() : ngram);
    }

    return counts;
}
for (String charNgram : new CharacterNGramStringIterable(boundaryBegin + text + boundaryEnd, minN, maxN)) { charNgrams.inc(charNgram);