/**
 * Folds the counts of another statistics object into this one.
 * <p>
 * Document, pointer and token counts are summed. The unique-term count is
 * set to the larger of the two values, not their sum. Per-field token counts
 * are summed position by position for the first {@code numberOfFields}
 * entries, and {@code relcaluateAverageLengths()} is called last.
 *
 * @param cs the statistics whose counts are added to this object
 */
public void addStatistics(CollectionStatistics cs) {
    numberOfDocuments += cs.getNumberOfDocuments();
    numberOfPointers += cs.getNumberOfPointers();
    numberOfTokens += cs.getNumberOfTokens();

    // Unique terms are combined with max, not added.
    numberOfUniqueTerms = Math.max(numberOfUniqueTerms, cs.getNumberOfUniqueTerms());

    final long[] incomingFieldTokens = cs.getFieldTokens();
    int field = 0;
    while (field < numberOfFields) {
        fieldTokens[field] += incomingFieldTokens[field];
        field++;
    }

    relcaluateAverageLengths();
}
/**
 * Folds the counts of another statistics object into this one.
 * <p>
 * Document, pointer and token counts are summed. The unique-term count is
 * set to the larger of the two values, not their sum. Per-field token counts
 * are summed position by position for the first {@code numberOfFields}
 * entries, and {@code relcaluateAverageLengths()} is called last.
 *
 * @param cs the statistics whose counts are added to this object
 */
public void addStatistics(CollectionStatistics cs) {
    numberOfDocuments += cs.getNumberOfDocuments();
    numberOfPointers += cs.getNumberOfPointers();
    numberOfTokens += cs.getNumberOfTokens();

    // Unique terms are combined with max, not added.
    numberOfUniqueTerms = Math.max(numberOfUniqueTerms, cs.getNumberOfUniqueTerms());

    final long[] incomingFieldTokens = cs.getFieldTokens();
    int field = 0;
    while (field < numberOfFields) {
        fieldTokens[field] += incomingFieldTokens[field];
        field++;
    }

    relcaluateAverageLengths();
}
/**
 * Prints summary statistics of the configured index to standard output.
 * <p>
 * The index is obtained from {@code IndexFactory} using
 * {@code ApplicationSetup.TERRIER_INDEX_PATH} and
 * {@code ApplicationSetup.TERRIER_INDEX_PREFIX}. The document count,
 * vocabulary size, token count and pointer count are printed, one per line.
 * The index is closed even when reading or printing a statistic fails, and a
 * failure to close is reported on standard error instead of being discarded.
 *
 * @param args command-line arguments; not read by this method
 * @return always {@code 0}
 */
@SuppressWarnings("deprecation")
@Override
public int run(String[] args) {
    Index.setIndexLoadingProfileAsRetrieval(false);
    // NOTE(review): if IndexFactory.of can return null for a missing index,
    // the calls below throw NullPointerException -- confirm against IndexFactory.
    final Index index = IndexFactory.of(IndexRef.of(
            ApplicationSetup.TERRIER_INDEX_PATH,
            ApplicationSetup.TERRIER_INDEX_PREFIX));
    try {
        System.out.println("Collection statistics:");
        System.out.println("number of indexed documents: "
                + index.getCollectionStatistics().getNumberOfDocuments());
        System.out.println("size of vocabulary: "
                + index.getCollectionStatistics().getNumberOfUniqueTerms());
        System.out.println("number of tokens: "
                + index.getCollectionStatistics().getNumberOfTokens());
        System.out.println("number of pointers: "
                + index.getCollectionStatistics().getNumberOfPointers());
    } finally {
        // Closing is best-effort: the statistics have already been printed,
        // so a close failure is reported but does not change the result.
        try {
            index.close();
        } catch (IOException e) {
            System.err.println("Warning: failed to close index: " + e);
        }
    }
    return 0;
}
/**
 * Prints summary statistics of the configured index to standard output.
 * <p>
 * The index is obtained from {@code IndexFactory} using
 * {@code ApplicationSetup.TERRIER_INDEX_PATH} and
 * {@code ApplicationSetup.TERRIER_INDEX_PREFIX}. The document count,
 * vocabulary size, token count and pointer count are printed, one per line.
 * The index is closed even when reading or printing a statistic fails, and a
 * failure to close is reported on standard error instead of being discarded.
 *
 * @param args command-line arguments; not read by this method
 * @return always {@code 0}
 */
@SuppressWarnings("deprecation")
@Override
public int run(String[] args) {
    Index.setIndexLoadingProfileAsRetrieval(false);
    // NOTE(review): if IndexFactory.of can return null for a missing index,
    // the calls below throw NullPointerException -- confirm against IndexFactory.
    final Index index = IndexFactory.of(IndexRef.of(
            ApplicationSetup.TERRIER_INDEX_PATH,
            ApplicationSetup.TERRIER_INDEX_PREFIX));
    try {
        System.out.println("Collection statistics:");
        System.out.println("number of indexed documents: "
                + index.getCollectionStatistics().getNumberOfDocuments());
        System.out.println("size of vocabulary: "
                + index.getCollectionStatistics().getNumberOfUniqueTerms());
        System.out.println("number of tokens: "
                + index.getCollectionStatistics().getNumberOfTokens());
        System.out.println("number of pointers: "
                + index.getCollectionStatistics().getNumberOfPointers());
    } finally {
        // Closing is best-effort: the statistics have already been printed,
        // so a close failure is reported but does not change the result.
        try {
            index.close();
        } catch (IOException e) {
            System.err.println("Warning: failed to close index: " + e);
        }
    }
    return 0;
}
valueFactoryClass); TerrierTimer tt = new TerrierTimer("Recompressing inverted index", index.getCollectionStatistics().getNumberOfPointers()); tt.start(); try{
valueFactoryClass); TerrierTimer tt = new TerrierTimer("Recompressing inverted index", index.getCollectionStatistics().getNumberOfPointers()); tt.start(); try{
/**
 * Builds a {@code MultiStats} that summarises several collections.
 * <p>
 * Document, token and pointer counts are summed over every entry of
 * {@code stats}. The unique-term count is the largest value reported by any
 * single entry, not a sum. Per-field token counts of the inputs are not read:
 * the result is always constructed with a one-element array holding zero.
 *
 * @param stats the per-collection statistics to combine
 * @return a new {@code MultiStats} carrying the combined counts
 */
public static MultiStats factory(CollectionStatistics[] stats) {
    int documentTotal = 0;
    int largestVocabulary = 0;
    long tokenTotal = 0;
    long pointerTotal = 0;

    for (int idx = 0; idx < stats.length; idx++) {
        final CollectionStatistics current = stats[idx];
        documentTotal += current.getNumberOfDocuments();
        tokenTotal += current.getNumberOfTokens();
        pointerTotal += current.getNumberOfPointers();
        largestVocabulary = Math.max(largestVocabulary, current.getNumberOfUniqueTerms());
    }

    // A single zero-valued slot; field-level counts are not aggregated here.
    final long[] fieldTokens = new long[1];
    return new MultiStats(documentTotal, largestVocabulary, tokenTotal, pointerTotal, fieldTokens);
}
/**
 * Installs collection statistics restricted to the active fields.
 * <p>
 * After delegating to the superclass, the token count is recomputed as the
 * sum of the per-field token counts for the ids in {@code activeFieldIds}.
 * That sum replaces the inherited token count, drives the inherited average
 * document length, and is passed to {@code basicModel} in a fresh
 * {@code CollectionStatistics} built with an empty field-token array.
 *
 * @param _cs the statistics of the collection being searched
 * @throws IllegalStateException if {@code _cs} reports fewer than one field
 */
@Override
public void setCollectionStatistics(CollectionStatistics _cs) {
    super.setCollectionStatistics(_cs);

    if (_cs.getNumberOfFields() < 1) {
        throw new IllegalStateException("Fields must be 1 or more");
    }

    // Sum the tokens of the active fields only.
    final long[] perFieldTokens = _cs.getFieldTokens();
    long activeTokens = 0;
    for (int activeId : activeFieldIds) {
        activeTokens += perFieldTokens[activeId];
    }

    super.numberOfTokens = activeTokens;
    super.averageDocumentLength = activeTokens / (double) _cs.getNumberOfDocuments();

    final CollectionStatistics restricted = new CollectionStatistics(
            _cs.getNumberOfDocuments(),
            _cs.getNumberOfUniqueTerms(),
            activeTokens,
            _cs.getNumberOfPointers(),
            new long[0]);
    basicModel.setCollectionStatistics(restricted);
}
/**
 * Installs collection statistics restricted to the active fields.
 * <p>
 * After delegating to the superclass, the token count is recomputed as the
 * sum of the per-field token counts for the ids in {@code activeFieldIds}.
 * That sum replaces the inherited token count, drives the inherited average
 * document length, and is passed to {@code basicModel} in a fresh
 * {@code CollectionStatistics} built with an empty field-token array.
 *
 * @param _cs the statistics of the collection being searched
 * @throws IllegalStateException if {@code _cs} reports fewer than one field
 */
@Override
public void setCollectionStatistics(CollectionStatistics _cs) {
    super.setCollectionStatistics(_cs);

    if (_cs.getNumberOfFields() < 1) {
        throw new IllegalStateException("Fields must be 1 or more");
    }

    // Sum the tokens of the active fields only.
    final long[] perFieldTokens = _cs.getFieldTokens();
    long activeTokens = 0;
    for (int activeId : activeFieldIds) {
        activeTokens += perFieldTokens[activeId];
    }

    super.numberOfTokens = activeTokens;
    super.averageDocumentLength = activeTokens / (double) _cs.getNumberOfDocuments();

    final CollectionStatistics restricted = new CollectionStatistics(
            _cs.getNumberOfDocuments(),
            _cs.getNumberOfUniqueTerms(),
            activeTokens,
            _cs.getNumberOfPointers(),
            new long[0]);
    basicModel.setCollectionStatistics(restricted);
}
// Store the pointer, unique-term and token counts from this object's collection
// statistics in the index properties, as strings under "num.Pointers", "num.Terms" and "num.Tokens".
index.getProperties().put("num.Pointers", String.valueOf(this.getCollectionStatistics().getNumberOfPointers())); index.getProperties().put("num.Terms", String.valueOf(this.getCollectionStatistics().getNumberOfUniqueTerms())); index.getProperties().put("num.Tokens", String.valueOf(this.getCollectionStatistics().getNumberOfTokens()));
/**
 * Loads the statistics this object works with into its fields.
 * <p>
 * Collection-level values are read from {@code cs} and widened to
 * {@code double}; the document count is also forwarded to {@code i}.
 * Term-level values are read from {@code es} and passed through
 * {@code getOverflowed} before being stored.
 */
public void prepare() {
    averageDocumentLength = cs.getAverageDocumentLength();

    final double documentCount = cs.getNumberOfDocuments();
    numberOfDocuments = documentCount;
    i.setNumberOfDocuments(numberOfDocuments);

    final double tokenCount = cs.getNumberOfTokens();
    numberOfTokens = tokenCount;

    final double uniqueTermCount = cs.getNumberOfUniqueTerms();
    numberOfUniqueTerms = uniqueTermCount;

    final double pointerCount = cs.getNumberOfPointers();
    numberOfPointers = pointerCount;

    // Term-level frequencies go through getOverflowed before use.
    final double docFrequency = getOverflowed(es.getDocumentFrequency());
    documentFrequency = docFrequency;

    final double collectionFrequency = getOverflowed(es.getFrequency());
    termFrequency = collectionFrequency;
}
/**
 * Loads the statistics this object works with into its fields.
 * <p>
 * Collection-level values are read from {@code cs} and widened to
 * {@code double}; the document count is also forwarded to {@code i}.
 * Term-level values are read from {@code es} and passed through
 * {@code getOverflowed} before being stored.
 */
public void prepare() {
    averageDocumentLength = cs.getAverageDocumentLength();

    final double documentCount = cs.getNumberOfDocuments();
    numberOfDocuments = documentCount;
    i.setNumberOfDocuments(numberOfDocuments);

    final double tokenCount = cs.getNumberOfTokens();
    numberOfTokens = tokenCount;

    final double uniqueTermCount = cs.getNumberOfUniqueTerms();
    numberOfUniqueTerms = uniqueTermCount;

    final double pointerCount = cs.getNumberOfPointers();
    numberOfPointers = pointerCount;

    // Term-level frequencies go through getOverflowed before use.
    final double docFrequency = getOverflowed(es.getDocumentFrequency());
    documentFrequency = docFrequency;

    final double collectionFrequency = getOverflowed(es.getFrequency());
    termFrequency = collectionFrequency;
}
@Test public void testWritable() throws Exception { CollectionStatistics cs1 = new CollectionStatistics(5, 6, 7, 8, new long[]{2}); ByteArrayOutputStream baos = new ByteArrayOutputStream(); DataOutputStream dos = new DataOutputStream(baos); cs1.write(dos); dos.flush(); final byte[] bytes = baos.toByteArray(); assertTrue(bytes.length > 0); CollectionStatistics cs2 = new CollectionStatistics(); cs2.readFields(new DataInputStream(new ByteArrayInputStream(bytes))); assertEquals(cs1.getNumberOfDocuments(), cs2.getNumberOfDocuments()); assertEquals(cs1.getNumberOfUniqueTerms(), cs2.getNumberOfUniqueTerms()); assertEquals(cs1.getNumberOfPointers(), cs2.getNumberOfPointers()); assertEquals(cs1.getNumberOfTokens(), cs2.getNumberOfTokens()); assertEquals(cs1.getAverageDocumentLength(), cs2.getAverageDocumentLength(), 0.0d); //TODO: test fields }
/**
 * Verifies an index's collection statistics against the expected fixtures.
 * <p>
 * The document count is first echoed to standard error. The document count,
 * token count and average document length are then compared with values
 * derived from {@code DOCUMENT_LENGTHS}, and the pointer and unique-term
 * counts with {@code NUMBER_POINTERS} and {@code NUMBER_UNIQUE_TERMS}.
 *
 * @param index the index whose statistics are checked
 */
protected void checkCollectionStatistics(Index index) {
    final CollectionStatistics stats = index.getCollectionStatistics();

    System.err.println("num docs=" + stats.getNumberOfDocuments());

    assertEquals(
            "Number of documents doesn't match",
            DOCUMENT_LENGTHS.length,
            stats.getNumberOfDocuments());
    assertEquals(
            "Number of tokens doesn't match",
            StaTools.sum(DOCUMENT_LENGTHS),
            stats.getNumberOfTokens());
    // Exact comparison: the permitted delta is zero.
    assertEquals(
            "Average document length doesn't match",
            StaTools.mean(DOCUMENT_LENGTHS),
            stats.getAverageDocumentLength(),
            0.0d);
    assertEquals(
            "Number of pointers doesnt match",
            NUMBER_POINTERS,
            stats.getNumberOfPointers());
    assertEquals(
            "Number of unique terms doesn't match",
            NUMBER_UNIQUE_TERMS,
            stats.getNumberOfUniqueTerms());
}
int[] fieldFs = null; TerrierTimer tt = new TerrierTimer("Inverted index processing for this iteration", index.getCollectionStatistics().getNumberOfPointers()); tt.start(); try{
int[] fieldFs = null; TerrierTimer tt = new TerrierTimer("Inverted index processing for this iteration", index.getCollectionStatistics().getNumberOfPointers()); tt.start(); try{
// stats1 and stats2 must report the same token count, the same pointer count,
// and exactly the same average document length (delta 0.0).
assertEquals(stats1.getNumberOfTokens(), stats2.getNumberOfTokens()); assertEquals(stats1.getNumberOfPointers(), stats2.getNumberOfPointers()); assertEquals(stats1.getAverageDocumentLength(), stats2.getAverageDocumentLength(), 0.0d);
// The collection statistics obtained from rs must report 5 pointers, 5 tokens and 5 unique terms.
assertEquals(5, rs.getCollectionStatistics().getNumberOfPointers()); assertEquals(5, rs.getCollectionStatistics().getNumberOfTokens()); assertEquals(5, rs.getCollectionStatistics().getNumberOfUniqueTerms());
// The collection statistics obtained from frInput must report 1 pointer, 1 token and 1 unique term.
assertEquals(1, frInput.getCollectionStatistics().getNumberOfPointers()); assertEquals(1, frInput.getCollectionStatistics().getNumberOfTokens()); assertEquals(1, frInput.getCollectionStatistics().getNumberOfUniqueTerms());
// The collection statistics obtained from frInput must report 1 pointer, 1 token and 1 unique term.
assertEquals(1, frInput.getCollectionStatistics().getNumberOfPointers()); assertEquals(1, frInput.getCollectionStatistics().getNumberOfTokens()); assertEquals(1, frInput.getCollectionStatistics().getNumberOfUniqueTerms());