// Tokenize the reader's contents, reading each token's term text and character offsets.
// The Lucene TokenStream consumer contract requires reset() before the first
// incrementToken(), then end() after the last token, then close(); the original
// snippet omitted all three, which fails on modern tokenizers and leaks the stream.
TokenStream tokenStream = analyzer.tokenStream(fieldName, reader);
OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
TermAttribute termAttribute = tokenStream.getAttribute(TermAttribute.class);
try {
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        int startOffset = offsetAttribute.startOffset();
        int endOffset = offsetAttribute.endOffset();
        String term = termAttribute.term();
    }
    tokenStream.end(); // records final offset state
} finally {
    tokenStream.close(); // release resources so the analyzer can be reused
}
public final class LuceneUtils { public static List<String> parseKeywords(Analyzer analyzer, String field, String keywords) { List<String> result = new ArrayList<String>(); TokenStream stream = analyzer.tokenStream(field, new StringReader(keywords)); try { while(stream.incrementToken()) { result.add(stream.getAttribute(TermAttribute.class).term()); } } catch(IOException e) { // not thrown b/c we're using a string reader... } return result; } }
@Override protected Word nextWord() { try { if(this.stream.incrementToken()) { return new AnalysisWord(this.stream.getAttribute(CharTermAttribute.class)); } } catch (IOException e) { throw new TokenizerException(e); } return null; } }
@Override protected Word nextWord() { try { if(this.stream.incrementToken()) { return new AnalysisWord(this.stream.getAttribute(CharTermAttribute.class)); } } catch (IOException e) { throw new TokenizerException(e); } return null; } }
public final class LuceneUtil { private LuceneUtil() {} public static List<String> tokenizeString(Analyzer analyzer, String string) { List<String> result = new ArrayList<String>(); try { TokenStream stream = analyzer.tokenStream(null, new StringReader(string)); stream.reset(); while (stream.incrementToken()) { result.add(stream.getAttribute(CharTermAttribute.class).toString()); } } catch (IOException e) { // not thrown b/c we're using a string reader... throw new RuntimeException(e); } return result; } }
/**
 * Creates simple boolean query from the cached tokenstream contents.
 * Every token becomes a term of a single synonym query over {@code field}.
 */
protected Query analyzeBoolean(String field, TokenStream stream) throws IOException {
    TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
    stream.reset();
    List<Term> collected = new ArrayList<>();
    while (stream.incrementToken()) {
        // Term copies the (reused) BytesRef, so collecting across iterations is safe.
        Term term = new Term(field, termAtt.getBytesRef());
        collected.add(term);
    }
    Term[] asArray = collected.toArray(new Term[0]);
    return newSynonymQuery(asArray);
}
/**
 * Creates simple term query from the cached tokenstream contents.
 * The stream is expected to hold exactly one token; absence is a programming error.
 */
protected Query analyzeTerm(String field, TokenStream stream) throws IOException {
    TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
    stream.reset();
    boolean hasToken = stream.incrementToken();
    if (!hasToken) {
        // Caller guarantees a token is present; an empty stream indicates a bug upstream.
        throw new AssertionError();
    }
    Term term = new Term(field, termAtt.getBytesRef());
    return newTermQuery(term);
}
// Fragment: iterate the analyzed tokens (the loop body continues beyond this view).
// getAttribute returns the single CharTermAttribute instance that the stream
// overwrites on every incrementToken(); reset() must precede the first incrementToken().
CharTermAttribute token = tokenStream.getAttribute(CharTermAttribute.class); tokenStream.reset(); while (tokenStream.incrementToken()) {
/** * Creates a span query from the tokenstream. In the case of a single token, a simple <code>SpanTermQuery</code> is * returned. When multiple tokens, an ordered <code>SpanNearQuery</code> with slop 0 is returned. */
// NOTE(review): unlike the sibling analyze* methods, this one never calls in.reset()
// before incrementToken() — presumably the caller resets/owns the stream; confirm,
// otherwise modern TokenStreams throw IllegalStateException here.
// A null TermToBytesRefAttribute means the stream produces no indexable terms, so null is returned.
protected SpanQuery createSpanQuery(TokenStream in, String field) throws IOException { TermToBytesRefAttribute termAtt = in.getAttribute(TermToBytesRefAttribute.class); if (termAtt == null) { return null; } List<SpanTermQuery> terms = new ArrayList<>(); while (in.incrementToken()) { terms.add(new SpanTermQuery(new Term(field, termAtt.getBytesRef()))); } if (terms.isEmpty()) { return null; } else if (terms.size() == 1) { return terms.get(0); } else { return new SpanNearQuery(terms.toArray(new SpanTermQuery[0]), 0, true); } }
// Fragment: standard Lucene consume pattern (loop body not visible in this chunk).
// The attribute is fetched once up front — it is the same reused instance for the
// stream's lifetime — and reset() is correctly called before the first incrementToken().
CharTermAttribute token = tokenStream.getAttribute(CharTermAttribute.class); tokenStream.reset(); while (tokenStream.incrementToken()) {
public static List<String> keywords( String source ) { List<String> keywords = new ArrayList<String>(); TokenStream ts = null; try { ts = analyzer.tokenStream( "keywords", new StringReader( source ) ); ts.reset(); while ( ts.incrementToken() ) { keywords.add( ts.getAttribute( CharTermAttribute.class ).toString() ); } ts.end(); } catch ( IOException e ) { logger.error( "Error getting keywords ", e ); } finally { try { ts.close(); } catch (IOException ignored) {} } return keywords; } }
/**
 * Creates complex boolean query from the cached tokenstream contents.
 * Tokens at the same position (increment 0) are grouped as synonyms; each group
 * is added to the boolean query under {@code operator}.
 */
protected Query analyzeMultiBoolean(String field, TokenStream stream, BooleanClause.Occur operator) throws IOException {
    BooleanQuery.Builder builder = newBooleanQuery();
    List<Term> synonyms = new ArrayList<>();
    TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
    PositionIncrementAttribute posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        int increment = posIncrAtt.getPositionIncrement();
        if (increment != 0) {
            // New position: flush the synonym group accumulated so far.
            add(builder, synonyms, operator);
            synonyms.clear();
        }
        synonyms.add(new Term(field, termAtt.getBytesRef()));
    }
    // Flush the trailing group (no-op when the stream was empty).
    add(builder, synonyms, operator);
    return builder.build();
}
// Fragment (surrounding method not visible): configure the multi-phrase query
// builder's slop and grab the stream's term/position attributes; `position` starts
// at -1 so the first token's position increment lands it at index 0.
mpqb.setSlop(slop); TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class); PositionIncrementAttribute posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class); int position = -1;
/**
 * Creates simple phrase query from the cached tokenstream contents.
 * Token positions honor position increments when {@code enablePositionIncrements}
 * is set; otherwise tokens are packed at consecutive positions.
 */
protected Query analyzePhrase(String field, TokenStream stream, int slop) throws IOException {
    PhraseQuery.Builder phrase = new PhraseQuery.Builder();
    phrase.setSlop(slop);
    TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
    PositionIncrementAttribute posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class);
    stream.reset();
    // Start at -1 so the first token's increment places it at position 0.
    int position = -1;
    while (stream.incrementToken()) {
        position += enablePositionIncrements ? posIncrAtt.getPositionIncrement() : 1;
        phrase.add(new Term(field, termAtt.getBytesRef()), position);
    }
    return phrase.build();
}
@Override public Set<String> segment(String text) { Set<String> result = InsertionOrderUtil.newSet(); Reader reader = new StringReader(text); try (TokenStream tokenStream = ANALYZER.tokenStream("text", reader)) { tokenStream.reset(); CharTermAttribute term = null; while (tokenStream.incrementToken()) { term = tokenStream.getAttribute(CharTermAttribute.class); result.add(term.toString()); } } catch (Exception e) { throw new HugeException("SmartCN segment text '%s' failed", e, text); } return result; } }
/**
 * Builds a phrase query over {@code field} from the stream's tokens.
 * When {@code enablePosIncrements} is true, gaps reported by the stream are
 * preserved in the phrase positions; otherwise positions are consecutive.
 */
@Override
public Query phraseQuery(String field, TokenStream stream, int slop, boolean enablePosIncrements) throws IOException {
    PhraseQuery.Builder builder = new PhraseQuery.Builder();
    builder.setSlop(slop);
    TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
    PositionIncrementAttribute posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class);
    // -1 so the first token's increment yields position 0.
    int position = -1;
    stream.reset();
    while (stream.incrementToken()) {
        if (enablePosIncrements) {
            position += posIncrAtt.getPositionIncrement();
        } else {
            position++;
        }
        Term term = new Term(field, termAtt.getBytesRef());
        builder.add(term, position);
    }
    return builder.build();
}
// Finalizes the wrapped stream once it is exhausted: end() must run first so the
// input's attributes carry their final (end-of-stream) state before we snapshot them.
private void finishInnerStream() throws IOException {
    input.end();
    inputStreamExhausted = true;
    // check for gaps at the end of the tokenstream: carry the inner stream's final
    // position increment onto the synthetic end token.
    endToken.posIncAtt.setPositionIncrement(this.incAtt.getPositionIncrement());
    // Copy the inner stream's final offsets onto the end token as well.
    OffsetAttribute inputOffsets = input.getAttribute(OffsetAttribute.class);
    endToken.offsetAtt.setOffset(inputOffsets.startOffset(), inputOffsets.endOffset());
}
/**
 * Verifies the common-tokens analyzer's CJK handling: the sample text should
 * produce 7 distinct tokens, including the CJK bigram "林斯" exactly once.
 */
@Test
public void testCJKFilter() throws Exception {
    String s = "then quickbrownfoxjumpedoverthelazy dogss dog 2000 普林斯顿大学";
    Analyzer analyzer = analyzerManager.getCommonTokensAnalyzer();
    TokenStream ts = analyzer.tokenStream(FIELD, s);
    CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
    ts.reset();
    Map<String, Integer> tokens = new HashMap<>();
    while (ts.incrementToken()) {
        String t = termAtt.toString();
        // Straightforward count-up; replaces the original's convoluted
        // "count = (count == null) ? count = 0 : count;" nested assignment.
        Integer count = tokens.get(t);
        tokens.put(t, count == null ? 1 : count + 1);
    }
    ts.end();
    ts.close();
    assertEquals(7, tokens.size());
    // Integer.valueOf instead of the deprecated new Integer(int) constructor.
    assertEquals(Integer.valueOf(1), tokens.get("林斯"));
}
/**
 * Verifies the general analyzer truncates output at the configured token limit:
 * 1,001,000 input words must yield exactly 1,000,000 tokens.
 */
@Test
public void testTokenCountFilter() throws Exception {
    AnalyzerManager analyzerManager = AnalyzerManager.newInstance(1000000);
    // Presize: 1,001,000 repetitions of "the " = 4,004,000 chars.
    StringBuilder sb = new StringBuilder(4_004_000);
    for (int i = 0; i < 1001000; i++) {
        sb.append("the ");
    }
    TokenStream ts = analyzerManager.getGeneralAnalyzer().tokenStream("f", sb.toString());
    ts.reset();
    int tokens = 0;
    while (ts.incrementToken()) {
        tokens++;
    }
    // Complete the TokenStream contract (the original leaked the stream and
    // left an unused CharTermAttribute local).
    ts.end();
    ts.close();
    assertEquals(1000000, tokens);
}
/**
 * Verifies the general analyzer lowercases input and keeps stop-like words:
 * "the", "and" and "dog" must all appear among the emitted terms.
 */
@Test
public void testGeneral() throws Exception {
    AnalyzerManager analyzerManager = AnalyzerManager.newInstance(100000);
    Analyzer general = analyzerManager.getGeneralAnalyzer();
    TokenStream ts = general.tokenStream("f", "tHe quick aaaa aaa anD dirty dog");
    ts.reset();
    CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
    Set<String> seen = new HashSet<>();
    while (ts.incrementToken()) {
        seen.add(termAtt.toString());
    }
    ts.end();
    ts.close();
    for (String expected : new String[] {"the", "and", "dog"}) {
        assertTrue(seen.contains(expected));
    }
}