private void fillCache() throws IOException {
  while (input.incrementToken()) {
    cache.add(captureState());
  }
  // capture final state
  input.end();
  finalState = captureState();
}
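This is essentially the caching pattern used by Lucene's CachingTokenFilter: every token's attribute state is captured, then end() is called once so the end-of-stream state can be replayed later. For context, a minimal sketch of the replay side, assuming the same cache and finalState fields; the iterator field is hypothetical:

private Iterator<AttributeSource.State> iterator;

@Override
public boolean incrementToken() {
  if (iterator == null) {
    iterator = cache.iterator();    // start replaying the captured states
  }
  if (!iterator.hasNext()) {
    return false;                   // cached tokens exhausted
  }
  restoreState(iterator.next());    // replay one captured token state
  return true;
}

@Override
public void end() {
  if (finalState != null) {
    restoreState(finalState);       // restore the state captured after input.end()
  }
}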
TokenStream stream = analyzer.tokenStream(null, new StringReader(text));
CharTermAttribute cattr = stream.addAttribute(CharTermAttribute.class);
stream.reset();
while (stream.incrementToken()) {
  System.out.println(cattr.toString());
}
stream.end();
stream.close();
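Since TokenStream implements Closeable, the same consume cycle (reset, incrementToken loop, end, close) is usually wrapped in try-with-resources so close() runs even when tokenization throws. A self-contained sketch; the StandardAnalyzer and field name are illustrative choices:

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class PrintTokens {
  public static void main(String[] args) throws IOException {
    Analyzer analyzer = new StandardAnalyzer();
    // The String overload of tokenStream() wraps the text in a reader internally.
    try (TokenStream stream = analyzer.tokenStream("body", "The quick brown fox")) {
      CharTermAttribute cattr = stream.addAttribute(CharTermAttribute.class);
      stream.reset();                      // required before the first incrementToken()
      while (stream.incrementToken()) {
        System.out.println(cattr.toString());
      }
      stream.end();                        // records end-of-stream offsets
    }                                      // close() handled by try-with-resources
  }
}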
public static List<String> keywords(String source) {
  List<String> keywords = new ArrayList<String>();
  TokenStream ts = null;
  try {
    ts = analyzer.tokenStream("keywords", new StringReader(source));
    ts.reset();
    while (ts.incrementToken()) {
      keywords.add(ts.getAttribute(CharTermAttribute.class).toString());
    }
    ts.end();
  } catch (IOException e) {
    logger.error("Error getting keywords ", e);
  } finally {
    // Guard against an NPE when tokenStream() itself threw and ts was never assigned.
    if (ts != null) {
      try {
        ts.close();
      } catch (IOException ignored) {}
    }
  }
  return keywords;
}
private SToken[] getTokens(String text) throws IOException {
  // FIXME: somehow integrate the cycle below into getSummary to save the cloning and memory;
  // also, creating Tokens is suboptimal with 3.0.0; this whole class could be replaced by the highlighter
  ArrayList<SToken> result = new ArrayList<>();
  try (TokenStream ts = analyzer.tokenStream("full", text)) {
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      SToken t = new SToken(term.buffer(), 0, term.length(), offset.startOffset(), offset.endOffset());
      result.add(t);
    }
    ts.end();
  }
  return result.toArray(new SToken[result.size()]);
}
final TermToBytesRefAttribute termAtt = ts.addAttribute(TermToBytesRefAttribute.class);
ts.reset();
if (ts.incrementToken() == false) {
  throw new IllegalStateException("The normalization token stream is "
      + "expected to produce exactly 1 token, but got 0 for analyzer "
      + this + " and input \"" + text + "\"");
}
final BytesRef term = BytesRef.deepCopyOf(termAtt.getBytesRef());
if (ts.incrementToken()) {
  throw new IllegalStateException("The normalization token stream is "
      + "expected to produce exactly 1 token, but got 2+ for analyzer "
      + this + " and input \"" + text + "\"");
}
ts.end();
return term;
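This fragment appears to come from Lucene's Analyzer#normalize, which runs only the normalization chain (e.g. lowercasing) over a single term and insists on exactly one output token. A hedged usage sketch, assuming Lucene 7+ where normalize(String, String) is public:

// Normalize a raw query term the same way the index side would (illustrative values).
Analyzer analyzer = new StandardAnalyzer();
BytesRef normalized = analyzer.normalize("body", "QUICK");
String termText = normalized.utf8ToString();  // "quick" after StandardAnalyzer's lowercasing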
int prevIncr = 1;
int state = -1;
while (in.incrementToken()) {
  int currentIncr = posIncAtt.getPositionIncrement();
  if (pos == -1 && currentIncr < 1) {
    // ... (lines elided in this search hit)
  }
}
// ...
in.end();
// ...
if (state != -1) {
  builder.setAccept(state, true);
}
termsHashPerField.start(field, first);
// ... (lines elided in this search hit)
while (stream.incrementToken()) {
  // ...
}
// ...
stream.end();
Position posData = null;
int maxOffset = 0;
while (in.incrementToken()) {
  int posInc = posIncAtt.getPositionIncrement();
  if (preservePositionIncrements == false && posInc > 1) {
    // ... (lines elided in this search hit)
  }
}
// ...
in.end();
/**
 * @param text the text to analyze
 * @return the distinct tokens produced by the analyzer, in order of first occurrence
 */
public Set<String> getToken(String text) {
  Set<String> list = new LinkedHashSet<>();
  if (CommonUtils.notEmpty(text)) {
    try (StringReader stringReader = new StringReader(text);
        TokenStream tokenStream = dao.getAnalyzer().tokenStream(CommonConstants.BLANK, stringReader)) {
      CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
      tokenStream.reset();
      while (tokenStream.incrementToken()) {
        list.add(charTermAttribute.toString());
      }
      tokenStream.end();
      return list;
    } catch (IOException e) {
      return list;
    }
  }
  return list;
}
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
ts.reset();
while (ts.incrementToken()) {
  skipTerms.add(new Term(fieldName, termAtt.toString()));
}
ts.end();
while (ts.incrementToken()) {
  String word = termAtt.toString();
  tokenCount++;
  // ... (lines elided in this search hit)
}
ts.end();
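The hit above elides the rest of the loop body; a self-contained sketch of the counting pattern it appears to implement (the helper name and the eventual use of word are assumptions; Lucene imports as in the earlier example):

// Hypothetical helper: count the tokens an analyzer produces for some text.
static int countTokens(Analyzer analyzer, String field, String text) throws IOException {
  int tokenCount = 0;
  try (TokenStream ts = analyzer.tokenStream(field, text)) {
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      String word = termAtt.toString();  // the original snippet presumably consumes this
      tokenCount++;
    }
    ts.end();
  }
  return tokenCount;
}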
@Override
public boolean incrementToken() throws IOException {
  // Advance past any exhausted sources, ending each one and accumulating
  // its final offset so the next source's offsets can be rebased.
  while (sources[currentSource].incrementToken() == false) {
    if (currentSource >= sources.length - 1) {
      return false;
    }
    sources[currentSource].end();
    OffsetAttribute att = sourceOffsets[currentSource];
    if (att != null) {
      offsetIncrement += att.endOffset();
    }
    currentSource++;
  }
  clearAttributes();
  sources[currentSource].copyTo(this);
  offsetAtt.setOffset(offsetAtt.startOffset() + offsetIncrement,
      offsetAtt.endOffset() + offsetIncrement);
  return true;
}
map.put(field, tokenMap);
while (ts.incrementToken()) {
  String token = termAtt.toString();
  MutableInt cnt = tokenMap.get(token);
  // ... (lines elided in this search hit)
}
ts.end();
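The elided loop body evidently inserts a new counter or increments an existing one; a self-contained sketch of that term-frequency pattern, assuming MutableInt is org.apache.commons.lang3.mutable.MutableInt (the helper name is illustrative):

// Hypothetical helper: per-term frequencies for one field's text.
static Map<String, MutableInt> termFrequencies(Analyzer analyzer, String field, String text)
    throws IOException {
  Map<String, MutableInt> tokenMap = new HashMap<>();
  try (TokenStream ts = analyzer.tokenStream(field, text)) {
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      String token = termAtt.toString();
      MutableInt cnt = tokenMap.get(token);
      if (cnt == null) {
        tokenMap.put(token, new MutableInt(1));  // first occurrence of this term
      } else {
        cnt.increment();                         // repeat occurrence
      }
    }
    ts.end();
  }
  return tokenMap;
}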
private static int findGoodEndForNoHighlightExcerpt(int noMatchSize, Analyzer analyzer,
    String fieldName, String contents) throws IOException {
  try (TokenStream tokenStream = analyzer.tokenStream(fieldName, contents)) {
    if (!tokenStream.hasAttribute(OffsetAttribute.class)) {
      // Can't split on term boundaries without offsets
      return -1;
    }
    int end = -1;
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
      OffsetAttribute attr = tokenStream.getAttribute(OffsetAttribute.class);
      if (attr.endOffset() >= noMatchSize) {
        // Jump to the end of this token if it wouldn't put us past the boundary
        if (attr.endOffset() == noMatchSize) {
          end = noMatchSize;
        }
        return end;
      }
      end = attr.endOffset();
    }
    tokenStream.end();
    // We've exhausted the token stream so we should just highlight everything.
    return end;
  }
}
@Override
protected AToken computeNext() {
  try {
    if (!tkstream.incrementToken()) {
      tkstream.end();
      tkstream.close();
      return endOfData();
    }
    // ... (remainder of method elided in this search hit)
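The hit is cut off mid-method; a sketch of the complete Guava AbstractIterator pattern it follows, where endOfData() signals exhaustion (the AToken(String) constructor and termAtt field are assumptions):

@Override
protected AToken computeNext() {
  try {
    if (!tkstream.incrementToken()) {
      tkstream.end();
      tkstream.close();
      return endOfData();                    // tells AbstractIterator we are done
    }
    return new AToken(termAtt.toString());   // hypothetical: wrap the current term
  } catch (IOException e) {
    throw new UncheckedIOException(e);       // computeNext() cannot throw IOException
  }
}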
@Test
public void testCJKFilter() throws Exception {
  String s = "then quickbrownfoxjumpedoverthelazy dogss dog 2000 普林斯顿大学";
  Analyzer analyzer = analyzerManager.getCommonTokensAnalyzer();
  TokenStream ts = analyzer.tokenStream(FIELD, s);
  CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
  ts.reset();
  Map<String, Integer> tokens = new HashMap<>();
  while (ts.incrementToken()) {
    String t = termAtt.toString();
    Integer count = tokens.get(t);
    count = (count == null) ? 0 : count;
    count++;
    tokens.put(t, count);
  }
  ts.end();
  ts.close();
  assertEquals(7, tokens.size());
  assertEquals(new Integer(1), tokens.get("林斯"));
}
@Test
public void testCommon() throws Exception {
  AnalyzerManager analyzerManager = AnalyzerManager.newInstance(100000);
  Analyzer common = analyzerManager.getCommonTokensAnalyzer();
  TokenStream ts = common.tokenStream("f", "the 5,000.12 5000 and dirty dog");
  ts.reset();
  CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
  Set<String> seen = new HashSet<>();
  while (ts.incrementToken()) {
    String t = termAtt.toString();
    if (AlphaIdeographFilterFactory.isAlphabetic(t.toCharArray(), t.length()) && t.contains("5")) {
      fail("Shouldn't have found a numeric");
    }
    seen.add(termAtt.toString());
  }
  ts.end();
  ts.close();
  assertTrue(seen.contains("dirty"));
  assertFalse(seen.contains("the"));
}
@Test
public void testGeneral() throws Exception {
  AnalyzerManager analyzerManager = AnalyzerManager.newInstance(100000);
  Analyzer general = analyzerManager.getGeneralAnalyzer();
  TokenStream ts = general.tokenStream("f", "tHe quick aaaa aaa anD dirty dog");
  ts.reset();
  CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
  Set<String> seen = new HashSet<>();
  while (ts.incrementToken()) {
    seen.add(termAtt.toString());
  }
  ts.end();
  ts.close();
  assertTrue(seen.contains("the"));
  assertTrue(seen.contains("and"));
  assertTrue(seen.contains("dog"));
}
if (input.incrementToken()) {
  // ... (lines elided in this search hit)
}
// ...
input.end();
finalPosInc = posIncAtt.getPositionIncrement();
finalOffset = offsetAtt.endOffset();
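This last hit also splices non-adjacent lines: end() may only be called once incrementToken() has returned false, after which trailing position increments (e.g. from removed stopwords) and the final offset become visible. A sketch of that contract, keeping the fragment's field names as assumptions:

// Drain the wrapped stream, then capture the end-of-stream attributes.
while (input.incrementToken()) {
  // ... consume or buffer each token ...
}
input.end();                                        // legal only after exhaustion
int finalPosInc = posIncAtt.getPositionIncrement(); // increments left over after the last token
int finalOffset = offsetAtt.endOffset();            // one past the last consumed character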