Refine search
@Override protected AToken computeNext() { try { if (!tkstream.incrementToken()) { tkstream.end(); tkstream.close(); return endOfData(); final int startOffset = offsetAtt.startOffset(); final int endOffset = offsetAtt.endOffset(); final String text = termAtt.term(); return new AToken() { @Override
/**
 * Replaces each document-id token with the shared payload marker term,
 * storing the original id in the token's payload instead.
 *
 * @return true if a token was produced, false at end of stream
 * @throws IOException if the wrapped stream fails
 */
@Override
public boolean incrementToken() throws IOException {
  if (input.incrementToken()) {
    // Move the doc id out of the term text and into the payload so every
    // emitted token shares the single PAYLOAD_TERM_TEXT term.
    String docid = termAtt.term();
    termAtt.setTermBuffer(LsiIndex.PAYLOAD_TERM_TEXT);
    payAtt.setPayload(new Payload(PayloadEncoder.encodePayloadId(docid)));
    return true;
  }
  // no dead "result" local: return the end-of-stream answer directly
  return false;
}
}
/**
 * Demo: tokenizes a fixed string with Lucene's StandardAnalyzer and encodes
 * the resulting terms into a sparse feature vector, then prints it.
 *
 * @param args unused
 * @throws IOException if the token stream fails
 */
public static void main(String[] args) throws IOException {
  FeatureVectorEncoder encoder = new StaticWordValueEncoder("text");
  Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_31);
  StringReader in = new StringReader("text to magically vectorize");
  TokenStream ts = analyzer.tokenStream("body", in);
  TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
  Vector v1 = new RandomAccessSparseVector(100);
  // TokenStream contract: reset() before the first incrementToken()
  ts.reset();
  while (ts.incrementToken()) {
    char[] termBuffer = termAtt.termBuffer();
    int termLen = termAtt.termLength();
    String w = new String(termBuffer, 0, termLen);
    encoder.addToVector(w, 1, v1);
  }
  // end() finalizes offsets; close() releases the analyzer's resources
  ts.end();
  ts.close();
  System.out.printf("%s\n", new SequentialAccessSparseVector(v1));
}
public List<TmHit> searchSimpleConcordance(String query, int threshold, int max, Metadata metadata) { if (threshold < 0 || threshold > 100) { throw new IllegalArgumentException(""); } float searchThreshold = (float) threshold; if (threshold < 0) searchThreshold = 0.0f; if (threshold > 100) searchThreshold = 100.0f; // create basic ngram analyzer to tokenize query TokenStream queryTokenStream = defaultFuzzyAnalyzer.tokenStream(TranslationUnitField.SOURCE .name(), new StringReader(query)); // get the TermAttribute from the TokenStream TermAttribute termAtt = (TermAttribute) queryTokenStream.addAttribute(TermAttribute.class); SimpleConcordanceFuzzyQuery fQuery = new SimpleConcordanceFuzzyQuery(searchThreshold); try { queryTokenStream.reset(); while (queryTokenStream.incrementToken()) { Term t = new Term(TranslationUnitField.SOURCE.name(), termAtt.term()); fQuery.add(t); } queryTokenStream.end(); queryTokenStream.close(); } catch (IOException e) { throw new OkapiIOException(e.getMessage(), e); } return getConcordanceHits(max, fQuery, query, metadata); }
TermAttribute termAttribute = tokenStream.addAttribute(TermAttribute.class); PayloadAttribute payloadAttribute = tokenStream.addAttribute(PayloadAttribute.class); tokenStream.incrementToken(); tokenStream.end(); tokenStream.close(); String value = new String(termAttribute.termBuffer(), 0, termAttribute.termLength()); if (value.startsWith(namePrefix)) { String path = getNamespaceMappings().translatePath(p); value = FieldNames.createNamedValue(path, rawValue); termAttribute.setTermBuffer(value); PropertyMetaData pdm = PropertyMetaData .fromByteArray(payloadAttribute
// Tokenize the reader's contents for the given field and walk the stream,
// reading each token's offsets and text through the attribute API.
// NOTE(review): getAttribute() requires the analyzer chain to have added
// these attributes already, and reset() is never called before
// incrementToken() — pre-4.0 Lucene tolerated this; confirm for the
// target version.
TokenStream tokenStream = analyzer.tokenStream(fieldName, reader);
OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
TermAttribute termAttribute = tokenStream.getAttribute(TermAttribute.class);
while (tokenStream.incrementToken()) {
    // character offsets into the original input for the current token
    int startOffset = offsetAttribute.startOffset();
    int endOffset = offsetAttribute.endOffset();
    // values are computed but unused here — this reads as example/skeleton code
    String term = termAttribute.term();
}
private List<String> tokenize(TokenStream stream) throws IOException { List<String> tokens = new ArrayList<String>(); TermAttribute term = (TermAttribute) stream .addAttribute(TermAttribute.class); while (stream.incrementToken()) { // Not sure if we somehow can use termBuffer() to get a char[] // so we do no have to create a new String for each term tokens.add(term.term()); } return tokens; }
ts.reset(); TermAttribute termAtt = (TermAttribute) ts .addAttribute(TermAttribute.class); while (ts.incrementToken()) { if (printGate != null && printGate.filter(wordsCount)) { list.add(new CToken(termAtt.term(), wordsCount));
/** * Creates a query to find intervals a number is in. * @param name The name of the field to search. * @param value The search value. * @param precisionStep The precision step used when indexing the field. */ public InNumericIntervalQuery(final String name, final long value, final int precisionStep) { super(true); this.value = value; TokenStream stream = new NumericTokenStream(precisionStep) .setLongValue(value); try { stream.reset(); while (stream.incrementToken()) { this.add( new TermQuery( new Term(name, stream.getAttribute(TermAttribute.class).term())), BooleanClause.Occur.SHOULD); } } catch (IOException e) { throw new IllegalStateException( "This should never happen - NumericTokenStream does no IO."); } }
/**
 * Rewrites each token's text by applying the configured regex replacement —
 * every occurrence or only the first, depending on the filter's mode.
 *
 * @return true while the wrapped stream yields tokens
 * @throws IOException if the wrapped stream fails
 */
@Override
public boolean incrementToken() throws IOException {
  if (!input.incrementToken()) {
    return false;
  }
  // wrap the term buffer without copying it into a String
  CharSequence current = CharBuffer.wrap(termAtt.termBuffer(), 0, termAtt.termLength());
  Matcher matcher = p.matcher(current);
  String rewritten = all ? matcher.replaceAll(replacement) : matcher.replaceFirst(replacement);
  termAtt.setTermBuffer(rewritten);
  return true;
}
/** * Extracts terms from text * * @param text * @return a map of terms to their offsets in text * @throws IOException */ public static Map<String, List<Integer>> extractTextTerms(String text) throws IOException { Map<String, List<Integer>> terms = new HashMap<String, List<Integer>>(); TokenStream tokenStream = fileAnalyzer.tokenStream(Field.CONTENTS.toString(), new StringReader(text)); TermAttribute termAtt = (TermAttribute) tokenStream.addAttribute(TermAttribute.class); OffsetAttribute offsetAtt = (OffsetAttribute) tokenStream.addAttribute(OffsetAttribute.class); while(tokenStream.incrementToken()) { String termText = termAtt.term().toLowerCase(Locale.ENGLISH);// t.termText().toLowerCase(Locale.ENGLISH); int offset = offsetAtt.startOffset(); List<Integer> offsets = terms.get(termText); if( offsets == null ) { offsets = new LinkedList<Integer>(); terms.put(termText, offsets); } offsets.add(offset); } tokenStream.close(); return terms; }
/**
 * Appends the wildcard operator character to every token passing through.
 *
 * @return true while the wrapped stream yields tokens
 * @throws IOException if the wrapped stream fails
 */
public final boolean incrementToken() throws IOException {
  if (!input.incrementToken()) {
    return false;
  }
  final int newLen = termAtt.termLength() + 1;
  // resizeTermBuffer returns the (possibly reallocated) live buffer
  final char[] buffer = termAtt.resizeTermBuffer(newLen);
  buffer[newLen - 1] = WILDCARD_OPERATOR;
  termAtt.setTermLength(newLen);
  return true;
}
/**
 * Debug pass-through filter: prints each term to stdout and forwards the
 * token unchanged.
 *
 * @return true while the wrapped stream yields tokens
 * @throws IOException if the wrapped stream fails
 */
@Override
public boolean incrementToken() throws IOException {
  boolean hasNext = input.incrementToken();
  if (hasNext) {
    System.out.println("TERM: " + termAtt.term());
  }
  return hasNext;
}
}
/**
 * Keeps only tokens whose text is contained in the configured word set;
 * all other tokens are silently skipped.
 *
 * @return true when a matching token was emitted, false at end of stream
 * @throws IOException if the wrapped stream fails
 */
@Override
public boolean incrementToken() throws IOException {
  for (;;) {
    if (!input.incrementToken()) {
      return false; // upstream exhausted
    }
    if (words.contains(termAtt.termBuffer(), 0, termAtt.termLength())) {
      return true; // token accepted
    }
    // otherwise drop it and keep scanning
  }
}
}
if (!input.incrementToken()) return false; if (termAtt.termLength()==0) return true; String value = termAtt.term(); String phonetic = null; try { termAtt.setTermBuffer(phonetic); return true; termAtt.setTermBuffer(phonetic); return true;
/**
 * Emits a reversed, marker-suffixed form of each token; when
 * {@code withOriginal} is set, the untouched original is also emitted on the
 * next call, at the same position (position increment 0).
 */
@Override public boolean incrementToken() throws IOException {
  // A previously captured original token is pending: emit it now.
  if( save != null ) {
    // clearAttributes(); // not currently necessary
    restoreState(save);
    save = null;
    return true;
  }
  if (!input.incrementToken()) return false;

  // pass through zero-length terms
  int oldLen = termAtt.termLength();
  if (oldLen ==0) return true;
  int origOffset = posAtt.getPositionIncrement();
  // Capture the original's state BEFORE the term buffer is mutated below;
  // the saved state carries position increment 0 so it stacks on the
  // reversed token when restored on the next call.
  if (withOriginal == true){
    posAtt.setPositionIncrement(0);
    save = captureState();
  }
  char [] buffer = termAtt.resizeTermBuffer(oldLen + 1);
  buffer[oldLen] = markerChar;
  //String reversed = reverseAndMark(value, markerChar);
  ReverseStringFilter.reverse(buffer, oldLen + 1);
  // The reversed token itself keeps the original position increment.
  posAtt.setPositionIncrement(origOffset);
  termAtt.setTermBuffer(buffer, 0, oldLen +1);
  return true;
}
/**
 * Fills the Lucene token with the current token text.
 *
 * @param t the term attribute to populate from the lexer's internal buffer
 *          ({@code zzBuffer}), spanning {@code zzStartRead} (inclusive) to
 *          {@code zzMarkedPos} (exclusive)
 */
final void getText(TermAttribute t) {
  t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
}
return false; terms = termAttr.term().split("_"); termAttr.setTermBuffer(terms[pos++]); if (pos == terms.length) { terms = null;
private void splitIntoTokens() { String term = termAtt.term(); String[] termParts = splitTerm(term); if(termParts.length > 1) { int termPos = offsetAtt.startOffset(); for (int i = 0; i < termParts.length; i++) { String termPart = termParts[i]; int termPartPos = termPos + term.indexOf(termPart); int termPartEndPos = termPartPos + termPart.length(); Token newToken = new Token(termPart, termPartPos, termPartEndPos); newToken.setPositionIncrement(0); // in the same position tokens.add( newToken ); } } }
/**
 * Emits one prefix-coded term per interval segment, advancing through the
 * segment list on each call.
 *
 * @return true while segments remain, false once the list is exhausted
 */
@Override
public boolean incrementToken() {
  if (this.i >= this.segments.size()) {
    return false;
  }
  clearAttributes();
  IntervalSegment segment = this.segments.get(this.i++);
  // encode the segment's start value at its shift into the term buffer
  char[] buf = this.termAtt.resizeTermBuffer(NumericUtils.BUF_SIZE_LONG);
  int encodedLength =
      NumericUtils.longToPrefixCoded(segment.getStart(), segment.getShift(), buf);
  this.termAtt.setTermLength(encodedLength);
  return true;
}