TokenStream tokenStream = analyzer.tokenStream(fieldName, reader);
OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
TermAttribute termAttribute = tokenStream.getAttribute(TermAttribute.class);
// Standard Lucene 3.x consumption loop; callers normally reset() the
// stream before iterating and end()/close() it afterwards.
while (tokenStream.incrementToken()) {
    int startOffset = offsetAttribute.startOffset();
    int endOffset = offsetAttribute.endOffset();
    String term = termAttribute.term();
}
String value = termAtt.term();
String phonetic = null;
try {
public static String removeStopWordsAndStem(String input) throws IOException {
    Set<String> stopWords = new HashSet<String>();
    stopWords.add("a");
    stopWords.add("I");
    stopWords.add("the");
    // Tokenize, drop the stop words, then apply the Porter stemmer
    TokenStream tokenStream = new StandardTokenizer(Version.LUCENE_30, new StringReader(input));
    tokenStream = new StopFilter(true, tokenStream, stopWords);
    tokenStream = new PorterStemFilter(tokenStream);
    StringBuilder sb = new StringBuilder();
    TermAttribute termAttr = tokenStream.getAttribute(TermAttribute.class);
    while (tokenStream.incrementToken()) {
        if (sb.length() > 0) {
            sb.append(" ");
        }
        sb.append(termAttr.term());
    }
    return sb.toString();
}
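For illustration, a quick check of what the helper above produces; the sample sentence is made up:

// Hypothetical input: StopFilter drops "the" and "I" (the stop set is
// matched case-sensitively), then PorterStemFilter reduces
// "books" -> "book" and "borrowed" -> "borrow".
String normalized = removeStopWordsAndStem("the books I borrowed");
// normalized is now "book borrow"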
@Override
public boolean incrementToken() throws IOException {
    if (input.incrementToken()) {
        // Replace the term text with a marker term and carry the
        // original doc id in the token's payload
        String docid = termAtt.term();
        termAtt.setTermBuffer(LsiIndex.PAYLOAD_TERM_TEXT);
        payAtt.setPayload(new Payload(PayloadEncoder.encodePayloadId(docid)));
        return true;
    }
    return false;
}
}
final int startOffset = offsetAtt.startOffset();
final int endOffset = offsetAtt.endOffset();
final String text = termAtt.term();
return new AToken() {
    @Override
@Override
public boolean incrementToken() throws IOException {
    if (input.incrementToken()) {
        System.out.println("TERM: " + termAtt.term());
        return true;
    }
    return false;
}
}
private List<String> tokenize(TokenStream stream) throws IOException {
    List<String> tokens = new ArrayList<String>();
    TermAttribute term = (TermAttribute) stream.addAttribute(TermAttribute.class);
    while (stream.incrementToken()) {
        // Not sure if we can somehow use termBuffer() to get a char[]
        // so we do not have to create a new String for each term
        tokens.add(term.term());
    }
    return tokens;
}
result.append(term.term());
result.append(" ");
return false;

terms = termAttr.term().split("_");
private void splitIntoTokens() {
    String term = termAtt.term();
    String[] termParts = splitTerm(term);
    if (termParts.length > 1) {
        int termPos = offsetAtt.startOffset();
        int fromIndex = 0; // search forward so repeated parts get distinct offsets
        for (int i = 0; i < termParts.length; i++) {
            String termPart = termParts[i];
            int partIndex = term.indexOf(termPart, fromIndex);
            int termPartPos = termPos + partIndex;
            int termPartEndPos = termPartPos + termPart.length();
            fromIndex = partIndex + termPart.length();
            Token newToken = new Token(termPart, termPartPos, termPartEndPos);
            newToken.setPositionIncrement(0); // all parts share the original position
            tokens.add(newToken);
        }
    }
}
/**
 * Extracts terms from text.
 *
 * @param text the text to tokenize
 * @return a map of terms to their start offsets in the text
 * @throws IOException if tokenization fails
 */
public static Map<String, List<Integer>> extractTextTerms(String text) throws IOException {
    Map<String, List<Integer>> terms = new HashMap<String, List<Integer>>();
    TokenStream tokenStream = fileAnalyzer.tokenStream(Field.CONTENTS.toString(), new StringReader(text));
    TermAttribute termAtt = (TermAttribute) tokenStream.addAttribute(TermAttribute.class);
    OffsetAttribute offsetAtt = (OffsetAttribute) tokenStream.addAttribute(OffsetAttribute.class);
    while (tokenStream.incrementToken()) {
        String termText = termAtt.term().toLowerCase(Locale.ENGLISH);
        int offset = offsetAtt.startOffset();
        List<Integer> offsets = terms.get(termText);
        if (offsets == null) {
            offsets = new LinkedList<Integer>();
            terms.put(termText, offsets);
        }
        offsets.add(offset);
    }
    tokenStream.close();
    return terms;
}
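A hedged usage sketch of extractTextTerms; the input string is hypothetical and assumes fileAnalyzer does not treat these words as stop words:

Map<String, List<Integer>> offsets = extractTextTerms("Hello hello world");
// offsets.get("hello") -> [0, 6], offsets.get("world") -> [12]
// (keys are lowercased by the method itself)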
private final KeywordTokenizer keywordTokenizer = new KeywordTokenizer(new StringReader(""));
private final ASCIIFoldingFilter asciiFoldingFilter = new ASCIIFoldingFilter(keywordTokenizer);
private final TermAttribute termAttribute = (TermAttribute) asciiFoldingFilter.getAttribute(TermAttribute.class);

public String process(String line) {
    if (line != null) {
        try {
            // KeywordTokenizer emits the whole line as a single token,
            // which ASCIIFoldingFilter then folds to ASCII
            keywordTokenizer.reset(new StringReader(line));
            if (asciiFoldingFilter.incrementToken()) {
                return termAttribute.term();
            }
        } catch (IOException e) {
            logger.warn("Failed to parse: " + line, e);
        }
    }
    return null;
}
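A usage sketch for process(); the sample string is made up:

String folded = process("Café au lait"); // returns "Cafe au lait"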
try {
    while (tok.incrementToken()) {
        String token = term.term();
        if (!StringUtils.isEmpty(token))
            tokenList.add(token);
/**
 * Creates a query to find the intervals a number is in.
 * @param name The name of the field to search.
 * @param value The search value.
 * @param precisionStep The precision step used when indexing the field.
 */
public InNumericIntervalQuery(final String name, final long value, final int precisionStep) {
    super(true);
    this.value = value;
    TokenStream stream = new NumericTokenStream(precisionStep).setLongValue(value);
    try {
        stream.reset();
        TermAttribute termAtt = stream.getAttribute(TermAttribute.class);
        while (stream.incrementToken()) {
            // One SHOULD clause per trie term produced for the value
            this.add(new TermQuery(new Term(name, termAtt.term())),
                     BooleanClause.Occur.SHOULD);
        }
    } catch (IOException e) {
        throw new IllegalStateException(
            "This should never happen - NumericTokenStream does no IO.");
    }
}
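A hypothetical call site (the field name and searcher are assumptions); the precision step must match the one used at index time:

Query query = new InNumericIntervalQuery("price", 42L, 4);
TopDocs hits = searcher.search(query, 10); // searcher is an assumed IndexSearcher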
String word = term.term();
tokenCount++;
if (tokenCount > maxNumTokensParsed) {
public List<TmHit> searchSimpleConcordance(String query, int threshold, int max, Metadata metadata) {
    if (threshold < 0 || threshold > 100) {
        throw new IllegalArgumentException("threshold must be between 0 and 100");
    }
    float searchThreshold = (float) threshold;

    // Create a basic n-gram analyzer to tokenize the query
    TokenStream queryTokenStream = defaultFuzzyAnalyzer.tokenStream(
        TranslationUnitField.SOURCE.name(), new StringReader(query));
    // Get the TermAttribute from the TokenStream
    TermAttribute termAtt = (TermAttribute) queryTokenStream.addAttribute(TermAttribute.class);

    SimpleConcordanceFuzzyQuery fQuery = new SimpleConcordanceFuzzyQuery(searchThreshold);
    try {
        queryTokenStream.reset();
        while (queryTokenStream.incrementToken()) {
            Term t = new Term(TranslationUnitField.SOURCE.name(), termAtt.term());
            fQuery.add(t);
        }
        queryTokenStream.end();
        queryTokenStream.close();
    } catch (IOException e) {
        throw new OkapiIOException(e.getMessage(), e);
    }
    return getConcordanceHits(max, fQuery, query, metadata);
}
/**
 * {@inheritDoc}
 */
public Query createPhraseQuery(String field, String value) {
    NlsNullPointerException.checkNotNull("field", field);
    NlsNullPointerException.checkNotNull("value", value);
    SearchFieldConfiguration fieldConfiguration = getSearchFields().getOrCreateFieldConfiguration(field);
    SearchFieldType fieldType = fieldConfiguration.getType();
    Query result;
    if (fieldType == SearchFieldType.TEXT) {
        PhraseQuery phraseQuery = new PhraseQuery();
        result = phraseQuery;
        try {
            TokenStream tokenStream = this.analyzer.tokenStream(field, new StringReader(value));
            TermAttribute termAttribute = tokenStream.getAttribute(TermAttribute.class);
            while (tokenStream.incrementToken()) {
                phraseQuery.add(new Term(field, termAttribute.term()));
            }
        } catch (IOException e) {
            throw new RuntimeIoException(e, IoMode.READ);
        }
    } else {
        result = new TermQuery(createTerm(field, value));
    }
    return result;
}
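For illustration, a hypothetical call (the field name and value are made up); for a TEXT field the analyzer's tokens become the phrase terms:

Query query = createPhraseQuery("title", "quick brown fox");
// for a TEXT field this yields the PhraseQuery title:"quick brown fox"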
String tokenText = termAtt.term();
...
WikipediaTokenizer tf = new WikipediaTokenizer(new StringReader(test));
int count = 0;
int numItalics = 0;
int numBoldItalics = 0;
int numCategory = 0;
int numCitation = 0;
TermAttribute termAtt = tf.addAttribute(TermAttribute.class);
TypeAttribute typeAtt = tf.addAttribute(TypeAttribute.class);
while (tf.incrementToken()) {
    String tokText = termAtt.term();
    String expectedType = (String) tcm.get(tokText);
    assertTrue("expectedType is null and it shouldn't be for: " + tf.toString(),
        expectedType != null);
    assertTrue(typeAtt.type() + " is not equal to " + expectedType + " for " + tf.toString(),
        typeAtt.type().equals(expectedType));
    count++;
    if (typeAtt.type().equals(WikipediaTokenizer.ITALICS)) {
        numItalics++;
    } else if (typeAtt.type().equals(WikipediaTokenizer.BOLD_ITALICS)) {
        numBoldItalics++;
    } else if (typeAtt.type().equals(WikipediaTokenizer.CATEGORY)) {
        numCategory++;
    } else if (typeAtt.type().equals(WikipediaTokenizer.CITATION)) {
        numCitation++;
    }
}
...