/**
 * Decides whether the current token is kept: it is accepted only when its term
 * characters are absent from {@code stopWords}.
 *
 * @return {@code true} if {@code stopWords} does not contain the current term,
 *         {@code false} otherwise
 */
@Override
protected boolean accept() {
  // Look the term up directly in the attribute's backing buffer (no String is built).
  final boolean isStopWord = stopWords.contains(termAtt.buffer(), 0, termAtt.length());
  return !isStopWord;
}
final CharArraySet stopWords = SearchFieldAnalyzer.getStopWords(); for (String word : words) { if (stopWords.contains(word)) { continue;
/**
 * Reports a match only when the superclass matches and the first {@code len}
 * characters of {@code s} are not listed in {@code exceptions}.
 *
 * @param s   character buffer holding the candidate
 * @param len number of characters of {@code s} to consider
 * @return {@code true} if {@code super.matches} succeeds and the candidate is not an exception
 */
@Override
public boolean matches(char s[], int len) {
  if (!super.matches(s, len)) {
    return false;
  }
  // The superclass matched; an entry in the exception set vetoes the match.
  final boolean isException = exceptions.contains(s, 0, len);
  return !isException;
}
} // closes an enclosing scope whose opening lies outside this view
private int rule17(char s[], int len) { if (len > 4 && endsWith(s, len, "ηστε")) { len -= 4; if (exc17.contains(s, 0, len)) len += 3; // add back the -ηστ } return len; }
/**
 * Accepts the current token only when its term characters are present in {@code words}.
 *
 * @return {@code true} if {@code words} contains the current term, {@code false} otherwise
 */
@Override
public boolean accept() {
  final boolean isListed = words.contains(termAtt.buffer(), 0, termAtt.length());
  return isListed;
}
} // closes an enclosing scope whose opening lies outside this view
/**
 * Reports whether the current term is absent from {@code protectedTerms}.
 *
 * @return {@code true} if {@code protectedTerms} does not contain the current term,
 *         {@code false} if it does
 */
@Override
protected boolean shouldFilter() {
  return !protectedTerms.contains(termAtt.buffer(), 0, termAtt.length());
}
private int rule4(char s[], int len) { if (len > 3 && (endsWith(s, len, "εωσ") || endsWith(s, len, "εων"))) { len -= 3; if (exc4.contains(s, 0, len)) len++; // add back -ε } return len; }
/**
 * Checks whether the current token's term is listed in {@code commonWords}.
 *
 * @return {@code true} if {@code commonWords} is non-null and contains the current term,
 *         {@code false} otherwise
 */
private boolean isCommon() {
  if (commonWords == null) {
    return false; // no common-word set configured: nothing is common
  }
  return commonWords.contains(termAttribute.buffer(), 0, termAttribute.length());
}
/**
 * Reports whether the current term is present in {@code keywordSet}.
 *
 * @return {@code true} if {@code keywordSet} contains the current term, {@code false} otherwise
 */
@Override
protected boolean isKeyword() {
  final char[] buffer = termAtt.buffer();
  final int length = termAtt.length();
  return keywordSet.contains(buffer, 0, length);
}
/**
 * Computes the stems of the given word with duplicates removed.
 *
 * <p>The result of {@code stem(word, length)} is returned as-is when it holds fewer than
 * two entries. Otherwise the stems are filtered through a {@code CharArraySet} created
 * with {@code dictionary.ignoreCase}, keeping the first occurrence of each stem in its
 * original order.
 *
 * @param word   characters of the word to stem
 * @param length number of characters of {@code word} to use
 * @return the stems of the word without duplicates
 */
public List<CharsRef> uniqueStems(char word[], int length) {
  final List<CharsRef> allStems = stem(word, length);
  if (allStems.size() <= 1) {
    return allStems; // zero or one stem cannot contain duplicates
  }

  final CharArraySet seen = new CharArraySet(8, dictionary.ignoreCase);
  final List<CharsRef> unique = new ArrayList<>();
  for (final CharsRef candidate : allStems) {
    if (seen.contains(candidate)) {
      continue;
    }
    unique.add(candidate);
    seen.add(candidate);
  }
  return unique;
}
private int rule16(char s[], int len) { boolean removed = false; if (len > 4 && endsWith(s, len, "ησου")) { len -= 4; removed = true; } else if (len > 3 && (endsWith(s, len, "ησε") || endsWith(s, len, "ησα"))) { len -= 3; removed = true; } if (removed && exc16.contains(s, 0, len)) len += 2; // add back -ησ return len; }
/** * Increments the {@link TokenStream} with a {@link CharTermAttribute} without elisioned start */ @Override public final boolean incrementToken() throws IOException { if (input.incrementToken()) { char[] termBuffer = termAtt.buffer(); int termLength = termAtt.length(); int index = -1; for (int i = 0; i < termLength; i++) { char ch = termBuffer[i]; if (ch == '\'' || ch == '\u2019') { index = i; break; } } // An apostrophe has been found. If the prefix is an article strip it off. if (index >= 0 && articles.contains(termBuffer, 0, index)) { termAtt.copyBuffer(termBuffer, index + 1, termLength - (index + 1)); } return true; } else { return false; } } }
/**
 * Handles the suffixes "ησουμε" / "ηθουμε" (term longer than six characters) and "ουμε"
 * (term longer than four characters). A matching suffix is dropped; if the remaining
 * stem is listed in {@code exc19}, the three characters 'ο', 'υ', 'μ' are written into
 * {@code s} directly after the stem and counted in the returned length.
 *
 * @param s   character buffer holding the term; may be modified in place
 * @param len current length of the term within {@code s}
 * @return the length after this rule has been applied
 */
private int rule19(char s[], int len) {
  final int suffixLen;
  if (len > 6 && (endsWith(s, len, "ησουμε") || endsWith(s, len, "ηθουμε"))) {
    suffixLen = 6;
  } else if (len > 4 && endsWith(s, len, "ουμε")) {
    suffixLen = 4;
  } else {
    return len; // no suffix of this rule present
  }

  final int stemLen = len - suffixLen;
  if (!exc19.contains(s, 0, stemLen)) {
    return stemLen;
  }

  // Exception stems get -ουμ appended in place of the removed suffix.
  s[stemLen] = 'ο';
  s[stemLen + 1] = 'υ';
  s[stemLen + 2] = 'μ';
  return stemLen + 3;
}
/**
 * Handles the suffixes "ησουνε" / "ηθουνε" (term longer than six characters) and "ουνε"
 * (term longer than four characters). A matching suffix is dropped; if the remaining
 * stem is listed in {@code exc18}, the three characters 'ο', 'υ', 'ν' are written into
 * {@code s} directly after the stem and counted in the returned length.
 *
 * @param s   character buffer holding the term; may be modified in place
 * @param len current length of the term within {@code s}
 * @return the length after this rule has been applied
 */
private int rule18(char s[], int len) {
  final int suffixLen;
  if (len > 6 && (endsWith(s, len, "ησουνε") || endsWith(s, len, "ηθουνε"))) {
    suffixLen = 6;
  } else if (len > 4 && endsWith(s, len, "ουνε")) {
    suffixLen = 4;
  } else {
    return len; // no suffix of this rule present
  }

  final int stemLen = len - suffixLen;
  if (!exc18.contains(s, 0, stemLen)) {
    return stemLen;
  }

  // Exception stems get -ουν appended in place of the removed suffix.
  s[stemLen] = 'ο';
  s[stemLen + 1] = 'υ';
  s[stemLen + 2] = 'ν';
  return stemLen + 3;
}
@Override public boolean incrementToken() throws IOException { while (input.incrementToken()) { final char term[] = termAttribute.buffer(); final int length = termAttribute.length(); final int posIncrement = posIncAttribute.getPositionIncrement(); if (posIncrement > 0) { previous.clear(); } boolean duplicate = (posIncrement == 0 && previous.contains(term, 0, length)); // clone the term, and add to the set of seen terms. char saved[] = new char[length]; System.arraycopy(term, 0, saved, 0, length); previous.add(saved); if (!duplicate) { return true; } } return false; }
private int rule6(char s[], int len) { boolean removed = false; if (len > 3 && (endsWith(s, len, "ικα") || endsWith(s, len, "ικο"))) { len -= 3; removed = true; } else if (len > 4 && (endsWith(s, len, "ικου") || endsWith(s, len, "ικων"))) { len -= 4; removed = true; } if (removed) { if (endsWithVowel(s, len) || exc6.contains(s, 0, len)) len += 2; // add back -ικ } return len; }
private int rule7(char s[], int len) { if (len == 5 && endsWith(s, len, "αγαμε")) return len - 1; if (len > 7 && endsWith(s, len, "ηθηκαμε")) len -= 7; else if (len > 6 && endsWith(s, len, "ουσαμε")) len -= 6; else if (len > 5 && (endsWith(s, len, "αγαμε") || endsWith(s, len, "ησαμε") || endsWith(s, len, "ηκαμε"))) len -= 5; if (len > 3 && endsWith(s, len, "αμε")) { len -= 3; if (exc7.contains(s, 0, len)) len += 2; // add back -αμ } return len; }
private int rule13(char s[], int len) { if (len > 6 && endsWith(s, len, "ηθηκεσ")) { len -= 6; } else if (len > 5 && (endsWith(s, len, "ηθηκα") || endsWith(s, len, "ηθηκε"))) { len -= 5; } boolean removed = false; if (len > 4 && endsWith(s, len, "ηκεσ")) { len -= 4; removed = true; } else if (len > 3 && (endsWith(s, len, "ηκα") || endsWith(s, len, "ηκε"))) { len -= 3; removed = true; } if (removed && (exc13.contains(s, 0, len) || endsWith(s, len, "σκωλ") || endsWith(s, len, "σκουλ") || endsWith(s, len, "ναρθ") || endsWith(s, len, "σφ") || endsWith(s, len, "οθ") || endsWith(s, len, "πιθ"))) { len += 2; // add back the -ηκ } return len; }
private int rule14(char s[], int len) { boolean removed = false; if (len > 5 && endsWith(s, len, "ουσεσ")) { len -= 5; removed = true; } else if (len > 4 && (endsWith(s, len, "ουσα") || endsWith(s, len, "ουσε"))) { len -= 4; removed = true; } if (removed && (exc14.contains(s, 0, len) || endsWithVowel(s, len) || endsWith(s, len, "ποδαρ") || endsWith(s, len, "βλεπ") || endsWith(s, len, "πανταχ") || endsWith(s, len, "φρυδ") || endsWith(s, len, "μαντιλ") || endsWith(s, len, "μαλλ") || endsWith(s, len, "κυματ") || endsWith(s, len, "λαχ") || endsWith(s, len, "ληγ") || endsWith(s, len, "φαγ") || endsWith(s, len, "ομ") || endsWith(s, len, "πρωτ"))) { len += 3; // add back -ουσ } return len; }