/**
 * Records an empty document in the document index and the meta index.
 * Nothing is written unless <tt>IndexEmptyDocuments</tt> is enabled.
 *
 * @param docProperties properties of the empty document; its "docno" value
 *        is used in the warning message and the whole map is handed to the
 *        meta index builder.
 * @throws IOException declared by this method; presumably raised by the
 *         underlying builders when writing fails.
 */
protected void indexEmpty(Map<String,String> docProperties) throws IOException
{
    if (IndexEmptyDocuments)
    {
        /* the document has no content, but it still gets a (placeholder) entry */
        final String docno = docProperties.get("docno");
        logger.warn("Adding empty document "+docno);
        docIndexBuilder.addEntryToBuffer(emptyDocIndexEntry);
        metaBuilder.writeDocumentEntry(docProperties);
    }
}
/**
 * Finishes this structure once all collections have been processed.
 * When an index is attached, the structure and its input stream are
 * registered with it; the builder is then closed in every case.
 * <p>
 * The backing class is chosen by comparing the number of document index
 * entries with the property
 * <tt>indexing.max.encoded.&lt;structureName&gt;index.docs</tt> (default 5000):
 * above the threshold <tt>FSADocumentIndex</tt> is registered, otherwise
 * <tt>FSADocumentIndexInMem</tt>.
 */
public void finishedCollections()
{
    /* the threshold is read and parsed up front, whether or not an index is attached */
    final String thresholdKey = "indexing.max.encoded."+structureName+"index.docs";
    final int maxDocsEncodedDocid = Integer.parseInt(ApplicationSetup.getProperty(thresholdKey, "5000"));

    if (index != null)
    {
        /* only the structure named "document" records the document count */
        if (structureName.equals("document"))
        {
            index.setIndexProperty("num.Documents", String.valueOf(numberOfDocumentIndexEntries));
        }

        final String structureClass;
        if (numberOfDocumentIndexEntries > maxDocsEncodedDocid)
        {
            structureClass = "org.terrier.structures.FSADocumentIndex";
        }
        else
        {
            structureClass = "org.terrier.structures.FSADocumentIndexInMem";
        }

        final String constructorTypes = "org.terrier.structures.IndexOnDisk,java.lang.String";
        final String constructorValues = "index,structureName";
        index.addIndexStructure(structureName, structureClass, constructorTypes, constructorValues);
        index.addIndexStructureInputStream(
            structureName,
            "org.terrier.structures.FSADocumentIndex$FSADocumentIndexIterator",
            constructorTypes,
            constructorValues);
    }
    close();
}
}
try { final DocumentIndexBuilder docidOutput = new DocumentIndexBuilder(destIndex, "document"); final String[] metaTags = ArrayUtils.parseCommaDelimitedString(srcIndex1.getIndexProperty("index.meta.key-names", "docno")); final int[] metaTagLengths = ArrayUtils.parseCommaDelimitedInts(srcIndex1.getIndexProperty("index.meta.value-lengths", "20")); DocumentIndexEntry die = docidInput1.next(); DocumentIndexEntry dieNew = (fieldCount > 0) ? die : new SimpleDocumentIndexEntry(die); docidOutput.addEntryToBuffer(dieNew); metaBuilder.writeDocumentEntry(metaInput1.next()); DocumentIndexEntry die = docidInput2.next(); DocumentIndexEntry dieNew = (fieldCount > 0) ? die : new SimpleDocumentIndexEntry(die); docidOutput.addEntryToBuffer(dieNew); metaBuilder.writeDocumentEntry(metaInput2.next()); docidOutput.finishedCollections(); docidOutput.close(); metaBuilder.close(); IndexUtil.close(docidInput1);
final DocumentIndexBuilder dios = new DocumentIndexBuilder(index, "document-df"); final Iterator<DocumentIndexEntry> docidInput = (Iterator<DocumentIndexEntry>)index.getIndexStructureInputStream("document"); dios.addEntryToBuffer(die); dis.close(); Files.delete(offsetsFilename); dios.close(); IndexUtil.renameIndexStructure(index, "document-df", "document");
createMemoryPostings(); currentIndex = Index.createNewIndex(path, prefix); docIndexBuilder = new DocumentIndexBuilder(currentIndex, "document"); metaBuilder = createMetaIndexBuilder(); docIndexBuilder.finishedCollections(); if (FieldScore.FIELDS_COUNT > 0)
.getProperty("indexer.meta.reverse.keys", ""))); DocumentIndexBuilder docOut = new DocumentIndexBuilder(index, "document"); docOut.addEntryToBuffer(die); docOut.finishedCollections(); docOut.close(); metaOut.close();
.getProperty("indexer.meta.reverse.keys", ""))); DocumentIndexBuilder docOut = new DocumentIndexBuilder(newIndex, "document"); docOut.addEntryToBuffer(docIter.next()); docOut.close(); metaOut.close();
createMemoryPostings(); currentIndex = Index.createNewIndex(path, prefix); docIndexBuilder = new DocumentIndexBuilder(currentIndex, "document"); metaBuilder = createMetaIndexBuilder(); docIndexBuilder.finishedCollections(); if (FieldScore.FIELDS_COUNT > 0)
protected void mergeDirectFiles() { try { final DocumentIndexBuilder docidOutput = new DocumentIndexBuilder(destIndex, "document"); docidOutput.addEntryToBuffer(die); metaBuilder.writeDocumentEntry(metaInput1.getAllItems(sourceDocid)); sourceDocid++; docidOutput.addEntryToBuffer(die); metaBuilder.writeDocumentEntry(metaInput2.getAllItems(sourceDocid)); sourceDocid++; docidOutput.finishedCollections(); docidOutput.close();
docIndexBuilder = new DocumentIndexBuilder(currentIndex, "document"); metaBuilder = createMetaIndexBuilder(); emptyDocIndexEntry = (FieldScore.FIELDS_COUNT > 0) ? new FieldDocumentIndexEntry(FieldScore.FIELDS_COUNT) : new BasicDocumentIndexEntry(); docIndexBuilder.finishedCollections();
/**
 * Records an empty document in the document index and the meta index.
 * A docno that has already been seen is ignored. A new docno is remembered
 * as seen even when <tt>IndexEmptyDocuments</tt> is disabled, in which case
 * nothing is written.
 *
 * @param docProperties properties of the empty document; its "docno" value
 *        is used for duplicate detection and in the warning message, and the
 *        whole map is handed to the meta index builder.
 * @throws IOException declared by this method; presumably raised by the
 *         underlying builders when writing fails.
 */
protected void indexEmpty(Map<String,String> docProperties) throws IOException
{
    final String docno = docProperties.get("docno");
    if (seenDocnos.contains(docno))
    {
        return;
    }
    /* the docno is marked as seen before the IndexEmptyDocuments check */
    seenDocnos.add(docno);

    if (IndexEmptyDocuments)
    {
        /* the document has no content, but it still gets a (placeholder) entry */
        logger.warn("Adding empty document "+docno);
        docIndexBuilder.addEntryToBuffer(emptyDocIndexEntry);
        metaBuilder.writeDocumentEntry(docProperties);
    }
}
logger.error("Cannot make DirectInvertedOutputStream:", ioe); docIndexBuilder = new DocumentIndexBuilder(currentIndex, "document"); metaBuilder = createMetaIndexBuilder(); emptyDocIndexEntry = (FieldScore.FIELDS_COUNT > 0) ? new FieldDocumentIndexEntry(FieldScore.FIELDS_COUNT) : new BasicDocumentIndexEntry(); docIndexBuilder.finishedCollections();
/**
 * {@inheritDoc}.
 * This implementation only adds the document's terms to the in-memory
 * postings, which will eventually be flushed to disk. A document whose
 * docno has already been seen is skipped, and so is a document whose
 * length is not greater than zero (its docno is still remembered as seen).
 */
@Override
protected void indexDocument(Map<String,String> docProperties, DocumentPostingList termsInDocument) throws Exception
{
    final String docno = docProperties.get("docno");
    if (seenDocnos.contains(docno))
    {
        return;
    }
    seenDocnos.add(docno);

    /* documents without any content are not indexed here */
    if (! (termsInDocument.getDocumentLength() > 0))
    {
        return;
    }

    numberOfDocsSinceCheck++;
    numberOfDocsSinceFlush++;
    checkFlush();
    mp.addTerms(termsInDocument, currentId);

    /* with fields enabled the statistics entry is stored as is,
     * otherwise it is copied into a SimpleDocumentIndexEntry */
    final DocumentIndexEntry statistics = termsInDocument.getDocumentStatistics();
    final DocumentIndexEntry entryToStore;
    if (FieldScore.FIELDS_COUNT > 0)
    {
        entryToStore = statistics;
    }
    else
    {
        entryToStore = new SimpleDocumentIndexEntry(statistics);
    }
    docIndexBuilder.addEntryToBuffer(entryToStore);
    metaBuilder.writeDocumentEntry(docProperties);

    currentId++;
    numberOfDocuments++;
}
/**
 * Adds one document to the lexicon, the direct index, the document index
 * and the meta index, in that order.
 *
 * @param docProperties Map&lt;String,String&gt; properties of the document,
 *        written to the meta index.
 * @param _termsInDocument DocumentPostingList the terms in the document.
 * @throws Exception declared by this method; presumably propagated from the
 *         underlying builders.
 */
protected void indexDocument(Map<String,String> docProperties, DocumentPostingList _termsInDocument) throws Exception
{
    /* lexicon: register the terms of this document */
    lexiconBuilder.addDocumentTerms(_termsInDocument);

    /* direct index: write the postings and keep the pointer that is returned */
    final BitIndexPointer directPointer = directIndexBuilder.writePostings(_termsInDocument.getPostings2(termCodes));

    /* document index: the statistics entry carries the direct index pointer */
    final DocumentIndexEntry entry = _termsInDocument.getDocumentStatistics();
    entry.setBitIndexPointer(directPointer);
    docIndexBuilder.addEntryToBuffer(entry);

    /* meta index: document metadata */
    metaBuilder.writeDocumentEntry(docProperties);
}
/**
 * {@inheritDoc}.
 * This implementation only adds the document's terms to the in-memory
 * postings, which will eventually be flushed to disk. A document whose
 * length is not greater than zero is skipped entirely.
 */
@Override
protected void indexDocument(Map<String,String> docProperties, DocumentPostingList termsInDocument) throws Exception
{
    /* documents without any content are not indexed here */
    if (! (termsInDocument.getDocumentLength() > 0))
    {
        return;
    }

    numberOfDocsSinceCheck++;
    numberOfDocsSinceFlush++;
    checkFlush();
    mp.addTerms(termsInDocument, currentId);

    /* with fields enabled the statistics entry is stored as is,
     * otherwise it is copied into a SimpleDocumentIndexEntry */
    final DocumentIndexEntry statistics = termsInDocument.getDocumentStatistics();
    final DocumentIndexEntry entryToStore;
    if (FieldScore.FIELDS_COUNT > 0)
    {
        entryToStore = statistics;
    }
    else
    {
        entryToStore = new SimpleDocumentIndexEntry(statistics);
    }
    docIndexBuilder.addEntryToBuffer(entryToStore);
    metaBuilder.writeDocumentEntry(docProperties);

    currentId++;
    numberOfDocuments++;
}
/** * This adds a document to the direct and document indexes, as well * as it's terms to the lexicon. Handled internally by the methods * indexFieldDocument and indexNoFieldDocument. * @param docProperties Map<String,String> properties of the document * @param _termsInDocument DocumentPostingList the terms in the document. * */ protected void indexDocument(Map<String,String> docProperties, DocumentPostingList _termsInDocument) throws Exception { /* add words to lexicontree */ lexiconBuilder.addDocumentTerms(_termsInDocument); /* add doc postings to the direct index */ BitIndexPointer dirIndexPost = directIndexBuilder.writePostings(_termsInDocument.getPostings2(termCodes)); //.addDocument(termsInDocument.getPostings()); /* add doc to documentindex */ DocumentIndexEntry die = _termsInDocument.getDocumentStatistics(); die.setBitIndexPointer(dirIndexPost); docIndexBuilder.addEntryToBuffer(die); /** add doc metadata to index */ metaBuilder.writeDocumentEntry(docProperties); }