public final class LuceneUtils {

    public static List<String> parseKeywords(Analyzer analyzer, String field, String keywords) {
        List<String> result = new ArrayList<String>();
        TokenStream stream = analyzer.tokenStream(field, new StringReader(keywords));
        try {
            stream.reset(); // the TokenStream contract requires reset() before incrementToken()
            while (stream.incrementToken()) {
                result.add(stream.getAttribute(TermAttribute.class).term());
            }
        } catch (IOException e) {
            // not thrown b/c we're using a string reader...
        }
        return result;
    }
}
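For comparison, here is a minimal sketch of the same helper against the current attribute API, assuming Lucene 4+ (where TermAttribute was removed in favor of CharTermAttribute and the reset()/end()/close() lifecycle is enforced):

public static List<String> parseKeywords(Analyzer analyzer, String field, String keywords) {
    List<String> result = new ArrayList<>();
    try (TokenStream stream = analyzer.tokenStream(field, new StringReader(keywords))) {
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            result.add(termAtt.toString());
        }
        stream.end();
    } catch (IOException e) {
        // not expected: the input is an in-memory string
    }
    return result;
}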
/**
 * Closes the CPE Index.
 */
@Override
public synchronized void close() {
    // decrement the reference count; only tear the index down once the last user closes it
    // (the original read usageCount.get() - 1 without ever decrementing, so the index leaked)
    final int count = INSTANCE.usageCount.decrementAndGet();
    if (count <= 0) {
        INSTANCE.usageCount.set(0);
        if (searchingAnalyzer != null) {
            searchingAnalyzer.close();
            searchingAnalyzer = null;
        }
        if (indexReader != null) {
            try {
                indexReader.close();
            } catch (IOException ex) {
                LOGGER.trace("", ex);
            }
            indexReader = null;
        }
        queryParser = null;
        indexSearcher = null;
        if (index != null) {
            index.close();
            index = null;
        }
    }
}
tokReader = new StringReader(field.stringValue());
tokens = analyzer.reusableTokenStream(field.name(), tokReader);

if (position > 0) {
    position += analyzer.getPositionIncrementGap(field.name());
}

tokens.reset(); // reset the TokenStream to the first token

offsetAttribute = (OffsetAttribute) tokens.addAttribute(OffsetAttribute.class);
termAttribute = (CharTermAttribute) tokens.addAttribute(CharTermAttribute.class);
posIncrAttribute = (PositionIncrementAttribute) tokens.addAttribute(PositionIncrementAttribute.class);

while (tokens.incrementToken()) {
    Term term = new Term(field.name(), termAttribute.toString());
    ThriftTerm tterm = new ThriftTerm(term.field())
            .setText(ByteBuffer.wrap(term.text().getBytes("UTF-8")))
            .setIs_binary(false);

    position += (posIncrAttribute.getPositionIncrement() - 1);
    offsetVector.add(lastOffset + offsetAttribute.startOffset());
    offsetVector.add(lastOffset + offsetAttribute.endOffset());
}
try (TokenStream stream = tokenStream = field.tokenStream(docState.analyzer, tokenStream)) {
    stream.reset();
    invertState.setAttributeSource(stream);
    termsHashPerField.start(field, first);

    while (stream.incrementToken()) {
        int posIncr = invertState.posIncrAttribute.getPositionIncrement();
        invertState.position += posIncr;
        if (invertState.position < invertState.lastPosition) {
            throw new IllegalArgumentException("position increments (and gaps) must be >= 0 (got " + posIncr
                    + ") for field '" + field.name() + "'");
        }

        int startOffset = invertState.offset + invertState.offsetAttribute.startOffset();
        int endOffset = invertState.offset + invertState.offsetAttribute.endOffset();
        if (startOffset < invertState.lastStartOffset || endOffset < startOffset) {
            throw new IllegalArgumentException("startOffset must be non-negative, and endOffset must be >= startOffset, "
                    + "and offsets must not go backwards: startOffset=" + startOffset + ", endOffset=" + endOffset);
        }
        // ... write the token to the index ...
    }

    // account for the trailing position increment / final offset reported by end()
    stream.end();
    invertState.position += invertState.posIncrAttribute.getPositionIncrement();
    invertState.offset += invertState.offsetAttribute.endOffset();
}

// add the analyzer-defined gaps between multiple values of the same field
invertState.position += docState.analyzer.getPositionIncrementGap(fieldInfo.name);
invertState.offset += docState.analyzer.getOffsetGap(fieldInfo.name);
/**
 * Tokenizes the given text with the DAO's analyzer.
 *
 * @param text the text to analyze
 * @return the distinct tokens, in order of first occurrence
 */
public Set<String> getToken(String text) {
    Set<String> list = new LinkedHashSet<>();
    if (CommonUtils.notEmpty(text)) {
        try (StringReader stringReader = new StringReader(text);
                TokenStream tokenStream = dao.getAnalyzer().tokenStream(CommonConstants.BLANK, stringReader)) {
            CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
            tokenStream.reset();
            while (tokenStream.incrementToken()) {
                list.add(charTermAttribute.toString());
            }
            tokenStream.end();
            return list;
        } catch (IOException e) {
            return list;
        }
    }
    return list;
}
@Test
public void testParentPathSearchingTokenization() throws Exception {
    try {
        TokenStream ts = parentPathSearchingAnalyzer.tokenStream("text", new StringReader("/jcr:a/b/jcr:c"));
        assertTokenStreamContents(ts, new String[]{"/jcr:a/b"});
    } finally {
        parentPathSearchingAnalyzer.close();
    }
}
testM.invoke(testC, testA.tokenStream("refs", new StringReader(input)),
        output, null, null, null, null, null, input.length(), true);

System.out.println("Testing full with " + name);
testM.invoke(testC, testA.tokenStream("full", new StringReader(input)),
        output, null, null, null, null, null, input.length(), true);
@SuppressWarnings("MismatchedQueryAndUpdateOfCollection") @Test public void testLemmatization() throws Exception { final TokenStream ts = analyzer.tokenStream("foo", new StringReader("מינהל")); ts.reset(); Set<String> terms = new HashSet<>(); while (ts.incrementToken()) { CharTermAttribute att = ts.getAttribute(CharTermAttribute.class); terms.add(new String(att.buffer(), 0, att.length())); //System.out.println(new String(att.buffer(), 0, att.length())); } }
public static void main(String[] args) throws IOException {
    String TEST_STR = "Hé jij И! раскази и повѣсти. Ст]' Дѣдо Нисторъ. Ива";

    try (Analyzer a = new BLStandardAnalyzer()) {
        TokenStream ts = a.tokenStream("test", new StringReader(TEST_STR));
        CharTermAttribute ta = ts.addAttribute(CharTermAttribute.class);
        ts.reset(); // required before incrementToken()
        while (ts.incrementToken()) {
            System.out.println(new String(ta.buffer(), 0, ta.length()));
        }

        TokenStream ts2 = a.tokenStream(ComplexFieldUtil.propertyField("test", null, "s"), new StringReader(TEST_STR));
        ta = ts2.addAttribute(CharTermAttribute.class);
        ts2.reset();
        while (ts2.incrementToken()) {
            System.out.println(new String(ta.buffer(), 0, ta.length()));
        }
    }
}
public class Tokens {

    private static void printTokens(String string, Analyzer analyzer) throws IOException {
        System.out.println("Using " + analyzer.getClass().getName());
        TokenStream ts = analyzer.tokenStream("default", new StringReader(string));
        OffsetAttribute offsetAttribute = ts.addAttribute(OffsetAttribute.class);
        CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class);
        ts.reset(); // required before incrementToken()
        while (ts.incrementToken()) {
            int startOffset = offsetAttribute.startOffset();
            int endOffset = offsetAttribute.endOffset();
            String term = charTermAttribute.toString();
            System.out.println(term + " (" + startOffset + " " + endOffset + ")");
        }
        System.out.println();
    }

    // QueryParser.parse can throw ParseException, so main must declare it
    public static void main(String[] args) throws IOException, ParseException {
        printTokens("foo-bar 1-2-3", new StandardAnalyzer(Version.LUCENE_40));
        printTokens("foo-bar 1-2-3", new ClassicAnalyzer(Version.LUCENE_40));

        QueryParser standardQP = new QueryParser(Version.LUCENE_40, "", new StandardAnalyzer(Version.LUCENE_40));
        BooleanQuery q1 = (BooleanQuery) standardQP.parse("someField:(foo\\-bar\\ 1\\-2\\-3)");
        System.out.println(q1.toString() + " # of clauses:" + q1.getClauses().length);
    }
}
public Iterator<AToken> parseDocumentField(String fieldName, String content) {
    final TokenStream tkstream = analyzer.tokenStream(fieldName, new StringReader(content));
    final TermAttribute termAtt = tkstream.addAttribute(TermAttribute.class);
    final PositionIncrementAttribute posIncrAttribute = tkstream.addAttribute(PositionIncrementAttribute.class);
    final OffsetAttribute offsetAtt = tkstream.addAttribute(OffsetAttribute.class);
    // ...
private Token[] parseText(String text) throws IOException {
    if (text == null || text.trim().equals(""))
        return new Token[0];

    final ArrayList result = new ArrayList();
    final TokenStream ts = analyzer.tokenStream(DocumentBuilder.CONTENT_FIELD_NAME, new StringReader(text));
    // pre-2.9 consumption style: next() returns a fresh Token until the stream is exhausted
    for (Token token = ts.next(); token != null; token = ts.next()) {
        result.add(token);
    }
    return (Token[]) result.toArray(new Token[result.size()]);
}
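The Token/next() API above was removed in Lucene 3.0. On any current Lucene the same method would be written against the attribute API; a minimal sketch, assuming Lucene 4+ and the same surrounding class (Token is no longer a consumer-facing type, so this version returns plain term strings):

private String[] parseText(String text) throws IOException {
    if (text == null || text.trim().isEmpty()) {
        return new String[0];
    }
    final List<String> result = new ArrayList<>();
    try (TokenStream ts = analyzer.tokenStream(DocumentBuilder.CONTENT_FIELD_NAME, new StringReader(text))) {
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            result.add(termAtt.toString());
        }
        ts.end();
    }
    return result.toArray(new String[0]);
}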
public static List<String> tokenizedTermValues(Analyzer analyzer, String field, String text) throws IOException {
    final List<String> tokenList = new ArrayList<String>();
    final TokenStream stream = analyzer.tokenStream(field, new StringReader(text));
    try {
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            String s = new String(term.buffer(), 0, term.length());
            tokenList.add(s);
        }
        stream.end();
    } finally {
        stream.close();
    }
    return tokenList;
}
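A quick usage sketch for the helper above; the analyzer choice, field name, and input are illustrative rather than from the original source:

try (Analyzer analyzer = new StandardAnalyzer()) {
    List<String> terms = tokenizedTermValues(analyzer, "title", "Quick Brown Fox");
    System.out.println(terms); // [quick, brown, fox]
}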
try (TokenStream ts = analyzer.tokenStream(fieldName, text)) {
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        skipTerms.add(new Term(fieldName, termAtt.toString()));
    }
    ts.end();
}

// alternative path: the terms are read back from the index rather than re-analyzed
BytesRef text;
while ((text = termsEnum.next()) != null) {
    skipTerms.add(new Term(fieldName, text.utf8ToString()));
}
private SToken[] getTokens(String text) throws IOException {
    //FIXME somehow integrate below cycle to getSummary to save the cloning and memory,
    //also creating Tokens is suboptimal with 3.0.0, this whole class could be replaced by highlighter
    ArrayList<SToken> result = new ArrayList<>();
    try (TokenStream ts = analyzer.tokenStream("full", text)) {
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            SToken t = new SToken(term.buffer(), 0, term.length(), offset.startOffset(), offset.endOffset());
            result.add(t);
        }
        ts.end();
    }
    return result.toArray(new SToken[result.size()]);
}
@Override
@JsonIgnore
final public Query getQuery(final QueryContext queryContext) throws IOException {
    final BooleanQuery.Builder builder = new BooleanQuery.Builder();
    final String resolvedField = resolveField(queryContext.getFieldMap());
    try (final TokenStream tokenStream = queryContext.getQueryAnalyzer().tokenStream(resolvedField, query_string)) {
        final CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
        final PositionIncrementAttribute posIncrAttribute = tokenStream.getAttribute(PositionIncrementAttribute.class);
        tokenStream.reset();
        int pos = 0;
        while (tokenStream.incrementToken()) {
            final String charTerm = charTermAttribute.toString();
            int start = pos - distance;
            if (start < 0)
                start = 0;
            final int end = pos + distance + 1;
            for (int i = start; i < end; i++) {
                // the boost decays with how far the candidate position i is from the token's own position
                final float dist = Math.abs(i - pos) + 1;
                final float boost = 1 / dist;
                final SpanTermQuery spanTermQuery = new SpanTermQuery(new Term(resolvedField, charTerm));
                Query query = new BoostQuery(new SpanPositionRangeQuery(spanTermQuery, i, i + 1), boost);
                builder.add(new BooleanClause(query, BooleanClause.Occur.SHOULD));
            }
            pos += posIncrAttribute.getPositionIncrement();
        }
        return builder.build();
    }
}
try (TokenStream ts = analyzer.tokenStream("", text)) { CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class); ts.reset(); reuse.clear(); while (ts.incrementToken()) { int length = termAtt.length(); if (length == 0) { throw new IllegalArgumentException("term: " + text + " analyzed to a zero-length token"); if (posIncAtt.getPositionIncrement() != 1) { throw new IllegalArgumentException("term: " + text + " analyzed to a token (" + termAtt + ") with position increment != 1 (got: " + posIncAtt.getPositionIncrement() + ")"); reuse.setLength(reuse.length() + 1); System.arraycopy(termAtt.buffer(), 0, reuse.chars(), end, length); reuse.setLength(reuse.length() + length);
private ArrayList<Data> analyze(Analyzer analyzer1) throws IOException {
    ArrayList<Data> results = new ArrayList<>(50);
    TokenStream ts = analyzer1.tokenStream("foo", text);
    ts.reset();
    while (ts.incrementToken()) {
        Data data = new Data();
        OffsetAttribute offsetAttribute = ts.getAttribute(OffsetAttribute.class);
        data.startOffset = offsetAttribute.startOffset();
        data.endOffset = offsetAttribute.endOffset();
        data.positionLength = ts.getAttribute(PositionLengthAttribute.class).getPositionLength();
        data.positionIncGap = ts.getAttribute(PositionIncrementAttribute.class).getPositionIncrement();
        data.tokenType = ts.getAttribute(HebrewTokenTypeAttribute.class).getType().toString();
        data.term = ts.getAttribute(CharTermAttribute.class).toString();
        // hasAttribute is the supported way to test for an optional attribute;
        // getAttribute throws rather than returning null when the attribute is absent
        if (ts.hasAttribute(KeywordAttribute.class))
            data.isKeyword = ts.getAttribute(KeywordAttribute.class).isKeyword();
        // System.out.println(data.term + " " + data.tokenType);
        results.add(data);
    }
    ts.close();
    return results;
}
@Override
protected boolean doProcess(Record record) {
    try {
        List outputValues = record.get(outputFieldName);
        for (Object value : record.get(inputFieldName)) {
            reader.setValue(value.toString());
            TokenStream tokenStream = analyzer.tokenStream("content", reader);
            CharTermAttribute token = tokenStream.addAttribute(CharTermAttribute.class);
            tokenStream.reset();
            while (tokenStream.incrementToken()) {
                if (token.length() > 0) { // incrementToken() updates the token!
                    String tokenStr = new String(token.buffer(), 0, token.length());
                    outputValues.add(tokenStr);
                }
            }
            tokenStream.end();
            tokenStream.close();
        }
    } catch (IOException e) {
        throw new MorphlineRuntimeException(e);
    }

    // pass record to next command in chain:
    return super.doProcess(record);
}
private static int findGoodEndForNoHighlightExcerpt(int noMatchSize, Analyzer analyzer, String fieldName,
        String contents) throws IOException {
    try (TokenStream tokenStream = analyzer.tokenStream(fieldName, contents)) {
        if (!tokenStream.hasAttribute(OffsetAttribute.class)) {
            // Can't split on term boundaries without offsets
            return -1;
        }
        int end = -1;
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            OffsetAttribute attr = tokenStream.getAttribute(OffsetAttribute.class);
            if (attr.endOffset() >= noMatchSize) {
                // Jump to the end of this token if it wouldn't put us past the boundary
                if (attr.endOffset() == noMatchSize) {
                    end = noMatchSize;
                }
                return end;
            }
            end = attr.endOffset();
        }
        tokenStream.end();
        // We've exhausted the token stream so we should just highlight everything.
        return end;
    }
}
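A hypothetical call site, to make the contract concrete: the method returns the last token boundary at or before noMatchSize, or -1 when offsets are unavailable or the very first token already crosses the budget. The analyzer, field name, 100-character budget, and fallback below are illustrative, not from the original source:

try (Analyzer analyzer = new StandardAnalyzer()) {
    String contents = "Lucene is a search library. It indexes text and answers queries quickly.";
    int cut = findGoodEndForNoHighlightExcerpt(100, analyzer, "body", contents);
    // fall back to an empty excerpt if no clean token boundary was found
    String excerpt = cut < 0 ? "" : contents.substring(0, cut);
    System.out.println(excerpt);
}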