/**
 * Checks whether the last {@code count} chars of the given buffer are all letters or digits,
 * as decided by {@code characterClasses}. The buffer is examined char by char, from the end towards the start.
 *
 * @param count the number of trailing chars to check; when this is zero or negative nothing is checked
 * @param b the buffer to inspect
 * @return false if the buffer holds fewer than {@code count} chars or any checked char is not a letter or digit,
 *         true otherwise
 */
private boolean areWordCharactersBackwards(int count, StringBuilder b) {
    int lastIndex = b.length() - 1;
    for (int offset = 0; offset < count; offset++) {
        int position = lastIndex - offset;
        // Running off the start of the buffer means there were not enough chars to check
        if (position < 0 || ! characterClasses.isLetterOrDigit(b.charAt(position))) {
            return false;
        }
    }
    return true;
}
/**
 * Returns whether the given code point is a latin digit.
 * Digits which are not latin are rejected because Java does not consistently parse them into numbers.
 */
public boolean isLatinDigit(int c) {
    if ( ! Character.isDigit(c)) return false;
    return isLatin(c);
}
if (characterClasses.isLetter(c)) { digitsOnly = false; } else if (characterClasses.isLatinDigit(c)) {
/**
 * Creates an instance using the simple implementations of each component:
 * normalizer, transformer, detector, character classes and gram splitter.
 */
@Inject
@SuppressWarnings("deprecation")
public SimpleLinguistics() {
    normalizer = new SimpleNormalizer();
    transformer = new SimpleTransformer();
    detector = new SimpleDetector();
    // The character classes must be assigned before the gram splitter, which is constructed from them
    characterClasses = new CharacterClasses();
    gramSplitter = new GramSplitter(characterClasses);
}
/**
 * Finds the first code point in the given string which is not a letter or digit
 * according to {@code characterClasses}.
 *
 * @param s the string to scan
 * @return the char index at which the first non-word code point starts,
 *         or -1 if every code point in the string is a letter or digit
 */
private int indexOfNonWordChar(String s) {
    int i = 0;
    while (i < s.length()) {
        int codePoint = s.codePointAt(i);
        if ( ! characterClasses.isLetterOrDigit(codePoint)) {
            return i;
        }
        // Advance past both chars of a surrogate pair: stepping a single char would examine
        // the low surrogate on its own and wrongly report it as a non-word character
        i += Character.charCount(codePoint);
    }
    return -1;
}
/** * Returns true for code points which are letters in unicode 3 or 4, plus some additional characters * which are useful to view as letters even though not defined as such in unicode. */ public boolean isLetter(int c) { if (java.lang.Character.isLetter(c)) return true; if (Character.isDigit(c) && ! isLatin(c)) return true; // Not considering these digits, so treat them as letters // if (c == '_') return true; // Ticket 3864695, some CJK punctuation YST defined as word characters if (c == '\u3008' || c == '\u3009' || c == '\u300a' || c == '\u300b' || c == '\u300c' || c == '\u300d' || c == '\u300e' || c == '\u300f' || c == '\u3010' || c == '\u3011') { return true; } int type = java.lang.Character.getType(c); return type == java.lang.Character.NON_SPACING_MARK || type == java.lang.Character.COMBINING_SPACING_MARK || type == java.lang.Character.ENCLOSING_MARK; }
/**
 * Returns whether the value returned by {@code characters.peek()} is a letter or digit
 * according to {@code characterClasses}.
 * NOTE(review): presumably peek() returns the upcoming character without consuming it - confirm against the iterator.
 */
private boolean nextIsLetterOrDigit(MatchTokenStrippingCharacterIterator characters) { return characterClasses.isLetterOrDigit(characters.peek()); }
private Gram findNext() { while (i < input.length() && !characterClasses.isLetterOrDigit(input.codePointAt(i))) { i++; isFirstAfterSeparator = true;
characters.next(); else if ( ! characterClasses.isLetterOrDigit(c)) { if (atBoldingSeparator) inBolding=!inBolding;
if (characterClasses.isLetterOrDigit(c) || (c == '\'' && acceptApostropheAsWordCharacter(currentIndex))) { i = consumeWordOrNumber(i, currentIndex);