public void add(String field, String content) throws IOException {
    memoryIndex.addField(field, content, generalAnalyzer);
    //memoryIndex.addField(field + ALPHA_IDEOGRAPH_SUFFIX,
    //    content, alphaIdeographAnalyzer);
    count(field);
    //count(field + ALPHA_IDEOGRAPH_SUFFIX);
}
/**
 * Equivalent to <code>addField(fieldName, stream, 1.0f)</code>.
 *
 * @param fieldName
 *            a name to be associated with the text
 * @param stream
 *            the token stream to retrieve tokens from
 */
public void addField(String fieldName, TokenStream stream) {
    addField(fieldName, stream, 1.0f);
}
/**
 * Adds a lucene {@link IndexableField} to the MemoryIndex using the provided analyzer
 * @param field the field to add
 * @param analyzer the analyzer to use for term analysis
 * @throws IllegalArgumentException if the field is a DocValues or Point field, as these
 *                                  structures are not supported by MemoryIndex
 */
public void addField(IndexableField field, Analyzer analyzer) {
    addField(field, analyzer, 1.0f);
}
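// A minimal usage sketch for the addField(IndexableField, Analyzer) overload above.
// The field name, text, analyzer, and query are invented for illustration and are
// not part of the snippets in this section.
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.memory.MemoryIndex;
import org.apache.lucene.search.TermQuery;

public class AddFieldExample {
    public static void main(String[] args) {
        MemoryIndex mi = new MemoryIndex();
        // TextField is tokenized and indexed, so addField accepts it;
        // a DocValues or Point field would throw IllegalArgumentException
        mi.addField(new TextField("body", "quick brown fox", Field.Store.NO),
                new StandardAnalyzer());
        float score = mi.search(new TermQuery(new Term("body", "fox")));
        System.out.println(score > 0.0f); // true
    }
}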
/**
 * Iterates over the given token stream and adds the resulting terms to the index;
 * Equivalent to adding a tokenized, indexed, termVectorStored, unstored,
 * Lucene {@link org.apache.lucene.document.Field}.
 * Finally closes the token stream. Note that untokenized keywords can be added with this method via
 * {@link #keywordTokenStream(Collection)}, the Lucene <code>KeywordTokenizer</code> or similar utilities.
 *
 * @param fieldName
 *            a name to be associated with the text
 * @param stream
 *            the token stream to retrieve tokens from.
 */
public void addField(String fieldName, TokenStream stream) {
    addField(fieldName, stream, 0);
}
/**
 * Iterates over the given token stream and adds the resulting terms to the index;
 * Equivalent to adding a tokenized, indexed, termVectorStored, unstored,
 * Lucene {@link org.apache.lucene.document.Field}.
 * Finally closes the token stream. Note that untokenized keywords can be added with this method via
 * {@link #keywordTokenStream(Collection)}, the Lucene <code>KeywordTokenizer</code> or similar utilities.
 *
 * @param fieldName
 *            a name to be associated with the text
 * @param stream
 *            the token stream to retrieve tokens from.
 * @param positionIncrementGap
 *            the position increment gap if fields with the same name are added more than once
 */
public void addField(String fieldName, TokenStream stream, int positionIncrementGap) {
    addField(fieldName, stream, positionIncrementGap, 1);
}
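// A hedged sketch of what the position increment gap buys you, assuming a Lucene
// version that has the addField(String, TokenStream, int) overload above. The field
// name "f" and the gap of 100 are made up for the example: adding two values for the
// same field with a large gap keeps phrase queries from matching across the boundary.
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.index.memory.MemoryIndex;
import org.apache.lucene.search.PhraseQuery;

public class PositionGapExample {
    public static void main(String[] args) {
        MemoryIndex mi = new MemoryIndex();
        WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer();
        // two values for the same field, separated by a 100-position gap
        mi.addField("f", analyzer.tokenStream("f", "foo bar"), 100);
        mi.addField("f", analyzer.tokenStream("f", "baz qux"), 100);
        // "bar baz" spans the value boundary, so the phrase cannot match
        System.out.println(mi.search(new PhraseQuery("f", "bar", "baz")) == 0.0f); // true
        // "foo bar" lies within a single value and matches
        System.out.println(mi.search(new PhraseQuery("f", "foo", "bar")) > 0.0f);  // true
    }
}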
/**
 * Iterates over the given token stream and adds the resulting terms to the index;
 * Equivalent to adding a tokenized, indexed, termVectorStored, unstored,
 * Lucene {@link org.apache.lucene.document.Field}.
 * Finally closes the token stream. Note that untokenized keywords can be added with this method via
 * {@link #keywordTokenStream(Collection)}, the Lucene <code>KeywordTokenizer</code> or similar utilities.
 *
 * @param fieldName
 *            a name to be associated with the text
 * @param stream
 *            the token stream to retrieve tokens from.
 * @param boost
 *            the boost factor for hits for this field
 *
 * @see org.apache.lucene.document.Field#setBoost(float)
 */
public void addField(String fieldName, TokenStream stream, float boost) {
    addField(fieldName, stream, boost, 0);
}
/**
 * Builds a MemoryIndex from a lucene {@link Document} using an analyzer
 * @param document the document to index
 * @param analyzer the analyzer to use
 * @param storeOffsets <code>true</code> if offsets should be stored
 * @param storePayloads <code>true</code> if payloads should be stored
 * @param maxReusedBytes the number of bytes that should remain in the internal memory pools
 *                       after {@link #reset()} is called
 * @return a MemoryIndex
 */
public static MemoryIndex fromDocument(Document document, Analyzer analyzer, boolean storeOffsets,
                                       boolean storePayloads, long maxReusedBytes) {
    MemoryIndex mi = new MemoryIndex(storeOffsets, storePayloads, maxReusedBytes);
    for (IndexableField field : document) {
        mi.addField(field, analyzer);
    }
    return mi;
}
/**
 * Builds a MemoryIndex from a lucene {@link Document} using an analyzer
 * @param document the document to index
 * @param analyzer the analyzer to use
 * @param storeOffsets <code>true</code> if offsets should be stored
 * @param storePayloads <code>true</code> if payloads should be stored
 * @param maxReusedBytes the number of bytes that should remain in the internal memory pools
 *                       after {@link #reset()} is called
 * @return a MemoryIndex
 */
public static MemoryIndex fromDocument(Iterable<? extends IndexableField> document, Analyzer analyzer,
                                       boolean storeOffsets, boolean storePayloads, long maxReusedBytes) {
    MemoryIndex mi = new MemoryIndex(storeOffsets, storePayloads, maxReusedBytes);
    for (IndexableField field : document) {
        mi.addField(field, analyzer);
    }
    return mi;
}
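// A brief usage sketch for fromDocument. The document content, analyzer, query, and
// the 1 MB pool size are invented for illustration; since Document implements
// Iterable<IndexableField>, either overload above applies.
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.memory.MemoryIndex;
import org.apache.lucene.search.TermQuery;

public class FromDocumentExample {
    public static void main(String[] args) {
        Document doc = new Document();
        doc.add(new TextField("title", "memory index demo", Field.Store.NO));
        // store offsets but not payloads; keep up to 1 MB pooled across reset() calls
        MemoryIndex mi = MemoryIndex.fromDocument(doc, new StandardAnalyzer(), true, false, 1024 * 1024);
        float score = mi.search(new TermQuery(new Term("title", "demo")));
        System.out.println(score > 0.0f); // true
    }
}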
/**
 * Convenience method; Tokenizes the given field text and adds the resulting
 * terms to the index; Equivalent to adding an indexed non-keyword Lucene
 * {@link org.apache.lucene.document.Field} that is tokenized, not stored,
 * termVectorStored with positions (or termVectorStored with positions and offsets).
 *
 * @param fieldName
 *            a name to be associated with the text
 * @param text
 *            the text to tokenize and index.
 * @param analyzer
 *            the analyzer to use for tokenization
 */
public void addField(String fieldName, String text, Analyzer analyzer) {
    if (fieldName == null)
        throw new IllegalArgumentException("fieldName must not be null");
    if (text == null)
        throw new IllegalArgumentException("text must not be null");
    if (analyzer == null)
        throw new IllegalArgumentException("analyzer must not be null");

    TokenStream stream = analyzer.tokenStream(fieldName, text);
    addField(fieldName, stream, 1.0f, analyzer.getPositionIncrementGap(fieldName),
            analyzer.getOffsetGap(fieldName));
}
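// This String-based overload is the most common entry point. A minimal end-to-end
// sketch showing the typical one-index-per-document pattern with reset() to reuse
// internal buffers; the texts and query are invented for illustration.
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.memory.MemoryIndex;
import org.apache.lucene.search.TermQuery;

public class ReuseExample {
    public static void main(String[] args) {
        MemoryIndex mi = new MemoryIndex();
        StandardAnalyzer analyzer = new StandardAnalyzer();
        TermQuery query = new TermQuery(new Term("content", "fox"));
        for (String text : new String[] {"quick brown fox", "lazy dog"}) {
            mi.reset(); // clear the previous document but keep pooled buffers
            mi.addField("content", text, analyzer);
            System.out.println(text + " -> " + (mi.search(query) > 0.0f));
        }
    }
}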
public class MemoryIndexTest {

    private static final String DATE_FIELD = "date";

    MemoryIndex index = new MemoryIndex();
    NumericTokenStream numericTokenStream = new NumericTokenStream();

    @Before
    public void init() {
        index.addField(DATE_FIELD, numericTokenStream.setIntValue(20141116));
    }

    @Test
    public void testRange() {
        Query query = NumericRangeQuery.newIntRange(DATE_FIELD, 20141115, 20141118, true, true);
        assertTrue(index.search(query) > 0);
    }
}
private SingletonDocumentBatch(Collection<InputDocument> documents, Similarity similarity) {
    super(documents, similarity);
    assert documents.size() == 1;
    memoryindex.setSimilarity(similarity);
    for (InputDocument doc : documents) {
        for (IndexableField field : doc.getDocument()) {
            memoryindex.addField(field, doc.getAnalyzers());
        }
    }
    memoryindex.freeze();
    reader = (LeafReader) memoryindex.createSearcher().getIndexReader();
}
private static HashSet<String> performSearch(Analyzer a) throws IOException {
    HashSet<String> results = new HashSet<>();
    for (File file : getTestFiles()) {
        MemoryIndex memoryIndex = new MemoryIndex(true);
        final List<String> lines = Files.readAllLines(file.toPath(), Charset.forName("UTF-8"));
        memoryIndex.addField("title", lines.get(0), a);
        StringBuilder sb = new StringBuilder();
        for (String line : lines) {
            sb.append(line);
        }
        memoryIndex.addField("content", sb.toString(), a);
        IndexSearcher searcher = memoryIndex.createSearcher();
        ExistsCollector collector = new ExistsCollector();
        searcher.search(new TermQuery(new Term("content", "אני")), collector);
        if (collector.exists()) {
            results.add(file.getName());
        }
    }
    return results;
}
private boolean matchField(Object iLeft, Object iRight, OLuceneFullTextIndex index, MemoryIndex memoryIndex)
        throws IOException, ParseException {
    for (IndexableField field : index.buildDocument(iLeft).getFields()) {
        memoryIndex.addField(field, index.indexAnalyzer());
    }
    return memoryIndex.search(index.buildQuery(iRight)) > 0.0f;
}
/**
 * Adds a lucene {@link IndexableField} to the MemoryIndex using the provided analyzer
 * @param field the field to add
 * @param analyzer the analyzer to use for term analysis
 * @param boost a field boost
 * @throws IllegalArgumentException if the field is a DocValues or Point field, as these
 *                                  structures are not supported by MemoryIndex
 */
public void addField(IndexableField field, Analyzer analyzer, float boost) {
    if (field.fieldType().docValuesType() != DocValuesType.NONE)
        throw new IllegalArgumentException("MemoryIndex does not support DocValues fields");
    // also reject Point fields, per the contract documented above
    if (field.fieldType().pointDimensionCount() != 0)
        throw new IllegalArgumentException("MemoryIndex does not support Points");
    if (analyzer == null) {
        addField(field.name(), field.tokenStream(null, null), boost);
    } else {
        addField(field.name(), field.tokenStream(analyzer, null), boost,
                analyzer.getPositionIncrementGap(field.name()), analyzer.getOffsetGap(field.name()));
    }
}
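// A small sketch of the documented rejection behavior: a pure DocValues field
// carries no inverted-index terms, so addField refuses it. The field name and
// value are invented for illustration.
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.index.memory.MemoryIndex;

public class DocValuesRejectionExample {
    public static void main(String[] args) {
        MemoryIndex mi = new MemoryIndex();
        try {
            mi.addField(new NumericDocValuesField("rank", 42L), new StandardAnalyzer());
        } catch (IllegalArgumentException expected) {
            System.out.println(expected.getMessage()); // MemoryIndex does not support DocValues fields
        }
    }
}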
private Fields generateTermVectors(Collection<GetField> getFields, boolean withOffsets,
                                   @Nullable Map<String, String> perFieldAnalyzer, Set<String> fields)
        throws IOException {
    /* store document in memory index */
    MemoryIndex index = new MemoryIndex(withOffsets);
    for (GetField getField : getFields) {
        String field = getField.getName();
        if (fields.contains(field) == false) {
            // some fields are returned even when not asked for, eg. _timestamp
            continue;
        }
        Analyzer analyzer = getAnalyzerAtField(field, perFieldAnalyzer);
        for (Object text : getField.getValues()) {
            index.addField(field, text.toString(), analyzer);
        }
    }
    /* and read vectors from it */
    return MultiFields.getFields(index.createSearcher().getIndexReader());
}
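// For readers unfamiliar with the Fields view that the method above returns, a hedged
// sketch of consuming it with the standard TermsEnum iteration; the helper name and
// field name are assumptions, not part of the original code.
import java.io.IOException;

import org.apache.lucene.index.Fields;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;

public class TermVectorDump {
    // prints every term of one field from a Fields instance such as the
    // one returned by generateTermVectors above
    static void dumpTerms(Fields fields, String fieldName) throws IOException {
        Terms terms = fields.terms(fieldName);
        if (terms == null) {
            return; // field not present
        }
        TermsEnum termsEnum = terms.iterator();
        BytesRef term;
        while ((term = termsEnum.next()) != null) {
            System.out.println(term.utf8ToString() + " freq=" + termsEnum.totalTermFreq());
        }
    }
}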
public boolean isDeleted(Document document, Object key, OIdentifiable value) {
    boolean match = false;
    List<String> strings = deleted.get(value.getIdentity().toString());
    if (strings != null) {
        MemoryIndex memoryIndex = new MemoryIndex();
        for (String string : strings) {
            Query q = engine.deleteQuery(string, value);
            memoryIndex.reset();
            for (IndexableField field : document.getFields()) {
                memoryIndex.addField(field.name(), field.stringValue(), new KeywordAnalyzer());
            }
            match = match || (memoryIndex.search(q) > 0.0f);
        }
        return match;
    }
    return match;
}
MemoryIndex indexDoc(ParseContext.Document d, Analyzer analyzer, MemoryIndex memoryIndex) {
    for (IndexableField field : d.getFields()) {
        if (field.fieldType().indexOptions() == IndexOptions.NONE && field.name().equals(UidFieldMapper.NAME)) {
            continue;
        }
        try {
            // TODO: instead of passing null here, we can have a CTL<Map<String,TokenStream>> and pass previous,
            // like the indexer does
            try (TokenStream tokenStream = field.tokenStream(analyzer, null)) {
                if (tokenStream != null) {
                    memoryIndex.addField(field.name(), tokenStream, field.boost());
                }
            }
        } catch (IOException e) {
            throw new ElasticsearchException("Failed to create token stream", e);
        }
    }
    return memoryIndex;
}
@Override
public Object execute(Object iThis, OIdentifiable iCurrentRecord, Object iCurrentResult,
                      Object[] params, OCommandContext ctx) {
    OElement element = iThis instanceof OElement ? (OElement) iThis : ((OResult) iThis).toElement();
    String indexName = (String) params[0];
    OLuceneFullTextIndex index = searchForIndex(ctx, indexName);
    if (index == null)
        return false;

    String query = (String) params[1];
    MemoryIndex memoryIndex = getOrCreateMemoryIndex(ctx);
    List<Object> key = index.getDefinition().getFields().stream()
            .map(s -> element.getProperty(s))
            .collect(Collectors.toList());
    try {
        for (IndexableField field : index.buildDocument(key).getFields()) {
            memoryIndex.addField(field, index.indexAnalyzer());
        }
        ODocument metadata = getMetadata(params);
        OLuceneKeyAndMetadata keyAndMetadata = new OLuceneKeyAndMetadata(
                new OLuceneCompositeKey(Arrays.asList(query)).setContext(ctx), metadata);
        return memoryIndex.search(index.buildQuery(keyAndMetadata)) > 0.0f;
    } catch (ParseException e) {
        OLogManager.instance().error(this, "error occurred while building query", e);
    }
    return null;
}
@Override
public void prepare(PercolateContext context, ParsedDocument parsedDocument) {
    MemoryIndex memoryIndex = cache.get();
    for (IndexableField field : parsedDocument.rootDoc().getFields()) {
        if (field.fieldType().indexOptions() == IndexOptions.NONE && field.name().equals(UidFieldMapper.NAME)) {
            continue;
        }
        try {
            Analyzer analyzer = context.mapperService().documentMapper(parsedDocument.type()).mappers().indexAnalyzer();
            // TODO: instead of passing null here, we can have a CTL<Map<String,TokenStream>> and pass previous,
            // like the indexer does
            try (TokenStream tokenStream = field.tokenStream(analyzer, null)) {
                if (tokenStream != null) {
                    memoryIndex.addField(field.name(), tokenStream, field.boost());
                }
            }
        } catch (Exception e) {
            throw new ElasticsearchException("Failed to create token stream for [" + field.name() + "]", e);
        }
    }
    context.initialize(new DocEngineSearcher(memoryIndex), parsedDocument);
}