/**
 * Runs {@code luceneQuery} against the index stored in {@code indexDirectory} and returns the
 * stored documents of every hit (at most 10,000,000 results are requested).
 *
 * <p>The number of documents collected is added to the shared {@code hits} counter. If the
 * directory does not contain an index, an empty list is returned and the counter is not touched.
 *
 * @return the documents matched by the query; never {@code null}
 * @throws RuntimeException wrapping any other {@link IOException} raised while searching or reading
 */
@Override
public List<Document> call() {
    final List<Document> localScoreDocs = new ArrayList<>();

    // Both the directory and the reader are closeable resources. Declaring them separately makes
    // sure the directory is released as well (resources are closed in reverse order).
    try (final FSDirectory directory = FSDirectory.open(indexDirectory);
         final DirectoryReader directoryReader = DirectoryReader.open(directory)) {
        final IndexSearcher searcher = new IndexSearcher(directoryReader);
        final TopDocs topDocs = searcher.search(luceneQuery, 10000000);
        logger.info("For {}, Top Docs has {} hits; reading Lucene results", indexDirectory, topDocs.scoreDocs.length);

        if (topDocs.totalHits > 0) {
            for (final ScoreDoc scoreDoc : topDocs.scoreDocs) {
                final int docId = scoreDoc.doc;
                final Document d = directoryReader.document(docId);
                localScoreDocs.add(d);
            }
        }

        hits.addAndGet(localScoreDocs.size());
    } catch (final IndexNotFoundException ignored) {
        // Deliberately ignored: with no index in the directory there is nothing to read,
        // so the (empty) result list is returned as-is.
    } catch (final IOException ioe) {
        throw new RuntimeException(ioe);
    }

    return localScoreDocs;
}
};
/**
 * Debug helper: prints every document number from {@code 0} to {@code maxDoc() - 1} of a reader
 * opened on the given writer to standard output, followed by the reader's deleted-document count.
 *
 * @param writer the index writer whose current contents should be dumped
 * @throws IOException if the reader cannot be opened, read or closed
 */
public static void outputTheWholeThing(IndexWriter writer) throws IOException {
    // The reader is a closeable resource; try-with-resources releases it even if a read fails.
    try (DirectoryReader reader = DirectoryReader.open(writer, true)) {
        for (int i = 0; i < reader.maxDoc(); i++) {
            Document doc = reader.document(i);
            System.out.println(doc);
        }
        System.out.println("Pending deletions:" + reader.numDeletedDocs());
    }
}
public String convertLuceneDocidToDocid(int docid) throws IOException { Document d = reader.document(docid); IndexableField doc = d.getField(LuceneDocumentGenerator.FIELD_ID); if (doc == null) { // Really shouldn't happen! throw new RuntimeException(); } return doc.stringValue(); }
// Excerpt from the body of a loop over document numbers (the loop header and the condition that
// guards the `continue` are not visible here). When the iteration is not skipped, document i is
// read, converted with DictionaryUtils.categoryFromDocument, and stored in sharedMetadata under
// the category's id.
continue; Document doc = reader.document(i); DQCategory dqCat = DictionaryUtils.categoryFromDocument(doc); sharedMetadata.put(dqCat.getId(), dqCat);
/**
 * Fills the given CAS with the text of the next index document that has a {@code fieldName} field.
 *
 * <p>Documents without that field are skipped. Characters reported by
 * {@code XMLUtils.checkForNonXmlCharacters} are replaced with spaces, runs of two or more
 * underscores are collapsed to a single space, and a {@code DocumentID} annotation of the form
 * {@code "doc" + docNum} (using the already-advanced counter) is added to the indexes.
 * {@code wordNum} is increased by the sanitized text length (before underscore collapsing)
 * divided by {@code CHARS_PER_WORD}.
 *
 * @param cas the CAS to populate
 * @throws IOException if the JCas cannot be obtained (the {@code CASException} is the cause) or
 *     if reading from the index fails
 * @throws CollectionException declared by the overridden method; not thrown directly here
 */
@Override
public void getNext(CAS cas) throws IOException, CollectionException {
  JCas jcas = null;
  try {
    jcas = cas.getJCas();
  } catch (CASException e) {
    // The exception is rethrown with its cause attached, so printing the stack trace here as
    // well would only duplicate the report.
    throw new IOException(e);
  }

  // Advance until a document carrying the requested field is found.
  // NOTE(review): this scan has no upper bound on docNum; if no remaining document has the
  // field, ireader.document will be asked for an out-of-range id — confirm the caller's
  // hasNext() rules that out.
  Document doc = ireader.document(docNum++);
  IndexableField textField = doc.getField(fieldName);
  while (textField == null) {
    doc = ireader.document(docNum++);
    textField = doc.getField(fieldName);
  }

  // Purely local buffer, so the unsynchronized StringBuilder is sufficient.
  StringBuilder text = new StringBuilder(textField.stringValue());
  int pos;
  while ((pos = XMLUtils.checkForNonXmlCharacters(text.toString())) != -1) {
    text.setCharAt(pos, ' ');
  }
  jcas.setDocumentText(text.toString().replaceAll("__+", " "));

  DocumentID docId = new DocumentID(jcas);
  docId.setDocumentID("doc" + docNum);
  docId.addToIndexes();

  wordNum += text.length() / CHARS_PER_WORD;
}
/**
 * Fills the given CAS with the text of the next index document that has a {@code fieldName} field.
 *
 * <p>Documents without that field are skipped. Characters reported by
 * {@code XMLUtils.checkForNonXmlCharacters} are replaced with spaces, runs of two or more
 * underscores are collapsed to a single space, and a {@code DocumentID} annotation of the form
 * {@code "doc" + docNum} (using the already-advanced counter) is added to the indexes.
 * {@code wordNum} is increased by the sanitized text length (before underscore collapsing)
 * divided by {@code CHARS_PER_WORD}.
 *
 * @param cas the CAS to populate
 * @throws IOException if the JCas cannot be obtained (the {@code CASException} is the cause) or
 *     if reading from the index fails
 * @throws CollectionException declared by the overridden method; not thrown directly here
 */
@Override
public void getNext(CAS cas) throws IOException, CollectionException {
  JCas jcas = null;
  try {
    jcas = cas.getJCas();
  } catch (CASException e) {
    // The exception is rethrown with its cause attached, so printing the stack trace here as
    // well would only duplicate the report.
    throw new IOException(e);
  }

  // Advance until a document carrying the requested field is found.
  // NOTE(review): this scan has no upper bound on docNum; if no remaining document has the
  // field, ireader.document will be asked for an out-of-range id — confirm the caller's
  // hasNext() rules that out.
  Document doc = ireader.document(docNum++);
  IndexableField textField = doc.getField(fieldName);
  while (textField == null) {
    doc = ireader.document(docNum++);
    textField = doc.getField(fieldName);
  }

  // Purely local buffer, so the unsynchronized StringBuilder is sufficient.
  StringBuilder text = new StringBuilder(textField.stringValue());
  int pos;
  while ((pos = XMLUtils.checkForNonXmlCharacters(text.toString())) != -1) {
    text.setCharAt(pos, ' ');
  }
  jcas.setDocumentText(text.toString().replaceAll("__+", " "));

  DocumentID docId = new DocumentID(jcas);
  docId.setDocumentID("doc" + docNum);
  docId.addToIndexes();

  wordNum += text.length() / CHARS_PER_WORD;
}
/**
 * Fetches the value of the {@code LuceneDocumentGenerator.FIELD_RAW} field for a document.
 *
 * @param docid document identifier, translated via {@code convertDocidToLuceneDocid}
 * @return the string value of the raw field
 * @throws IOException propagated from reading the document
 * @throws NotStoredException if the document has no raw field
 */
public String getRawDocument(String docid) throws IOException, NotStoredException {
  final Document stored = reader.document(convertDocidToLuceneDocid(docid));
  final IndexableField rawField = stored.getField(LuceneDocumentGenerator.FIELD_RAW);

  if (rawField != null) {
    return rawField.stringValue();
  }
  throw new NotStoredException("Raw documents not stored!");
}
/**
 * Fetches the value of the {@code LuceneDocumentGenerator.FIELD_BODY} field for a document.
 *
 * @param docid document identifier, translated via {@code convertDocidToLuceneDocid}
 * @return the string value of the body field
 * @throws IOException propagated from reading the document
 * @throws NotStoredException if the document has no body field
 */
public String getTransformedDocument(String docid) throws IOException, NotStoredException {
  final Document stored = reader.document(convertDocidToLuceneDocid(docid));
  final IndexableField bodyField = stored.getField(LuceneDocumentGenerator.FIELD_BODY);

  if (bodyField != null) {
    return bodyField.stringValue();
  }
  throw new NotStoredException("Transformed documents not stored!");
}
/** * initialize a list of serializable BroadcastDocumentObject from existing lucene Directory */ static List<BroadcastDocumentObject> readDocumentsFromIndex(Directory indexDir) throws IOException { List<BroadcastDocumentObject> dictionaryObject = new ArrayList<>(); DirectoryReader reader = DirectoryReader.open(indexDir); Bits liveDocs = MultiFields.getLiveDocs(reader); for (int i = 0; i < reader.maxDoc(); i++) { if (liveDocs != null && !liveDocs.get(i)) { continue; } Document doc = reader.document(i); String catId = doc.getField(DictionarySearcher.F_CATID).stringValue(); Set<String> valueSet = new HashSet<String>(); // original values must be read from the F_RAW field for (IndexableField syntermField : doc.getFields(DictionarySearcher.F_RAW)) { valueSet.add(syntermField.stringValue()); } dictionaryObject.add(new BroadcastDocumentObject(catId, valueSet)); } return dictionaryObject; }
// Excerpt (surrounding loop and conditions are not visible here): takes the next index field name
// from iter, looks up its mapped name in fieldMap, and reads that field's string value from
// document i, using null when the document has no such field. The stderr message,
// phraseFieldEmpty = true and break belong to a branch whose condition is outside this excerpt.
// NOTE(review): reader.document(i) is called up to three times for the same i; presumably each
// call re-loads the stored document, so loading it once into a local would avoid repeated work —
// confirm against the full method.
String indexField = iter.next(); String acField = fieldMap.get(indexField); IndexableField field = reader.document(i).getField(indexField); String value = field != null ? reader.document(i).getField(indexField).stringValue() : null; System.err.println("Since AC phrase field would be null, this doc will not be created: " + reader.document(i)); phraseFieldEmpty = true; break;
// Excerpt made of non-contiguous statements (the enclosing loops and if/else conditions are not
// visible here, which is why braces and declarations do not line up). Visible behaviour: values of
// document i are read by field name and stored in copyFields; field names from
// ignoreFields.keySet() are iterated; and one branch looks a field up under fieldName with its
// first and last characters removed, while the else branch uses fieldName unchanged.
// NOTE(review): reader.document(i) is called repeatedly for the same i; presumably the stored
// document could be loaded once and reused — confirm against the full method.
String [] values = reader.document(i).getValues(oldFieldName); copyFields.put(fieldName, values); String excludeFieldValue = null; for (String fieldName : ignoreFields.keySet()) { String [] values = reader.document(i).getValues(fieldName); field = reader.document(i).getField(fieldName.substring(1, fieldName.length() - 1)); } else { field = reader.document(i).getField(fieldName);
@Override public FacetLabel getPath(int ordinal) throws IOException { ensureOpen(); // Since the cache is shared with DTR instances allocated from // doOpenIfChanged, we need to ensure that the ordinal is one that this DTR // instance recognizes. Therefore we do this check up front, before we hit // the cache. if (ordinal < 0 || ordinal >= indexReader.maxDoc()) { return null; } // TODO: can we use an int-based hash impl, such as IntToObjectMap, // wrapped as LRU? Integer catIDInteger = Integer.valueOf(ordinal); synchronized (categoryCache) { FacetLabel res = categoryCache.get(catIDInteger); if (res != null) { return res; } } Document doc = indexReader.document(ordinal); FacetLabel ret = new FacetLabel(FacetsConfig.stringToPath(doc.get(Consts.FULL))); synchronized (categoryCache) { categoryCache.put(catIDInteger, ret); } return ret; }
@Override public FacetLabel getPath(int ordinal) throws IOException { ensureOpen(); // Since the cache is shared with DTR instances allocated from // doOpenIfChanged, we need to ensure that the ordinal is one that this DTR // instance recognizes. Therefore we do this check up front, before we hit // the cache. if (ordinal < 0 || ordinal >= indexReader.maxDoc()) { return null; } // TODO: can we use an int-based hash impl, such as IntToObjectMap, // wrapped as LRU? Integer catIDInteger = Integer.valueOf(ordinal); synchronized (categoryCache) { FacetLabel res = categoryCache.get(catIDInteger); if (res != null) { return res; } } Document doc = indexReader.document(ordinal); FacetLabel ret = new FacetLabel(FacetsConfig.stringToPath(doc.get(Consts.FULL))); synchronized (categoryCache) { categoryCache.put(catIDInteger, ret); } return ret; }