private int[] createCounts(BitSet bitsets[], BitSet neededCounts) { // TODO use the neededCounts bit set to avoid the creation of bit sets which are not needed // TODO Check the minimum frequency at this stage --> all BitSets with a lower cardinality can be set to null // and all following don't have to be created. BitSet[] combinations = new BitSet[(1 << bitsets.length)]; int pos, pos2; for (int i = 0; i < bitsets.length; ++i) { pos = (1 << i); combinations[pos] = bitsets[i]; pos2 = pos + 1; for (int j = 1; j < pos; ++j) { combinations[pos2] = ((BitSet) bitsets[i].clone()); combinations[pos2].intersect(combinations[j]); ++pos2; } } int cardinalities[] = new int[combinations.length]; for (int i = 1; i < combinations.length; ++i) { cardinalities[i] = (int) combinations[i].cardinality(); } return cardinalities; } }
protected void visit(int state, int cardinality, BitSet documents, IntStack path) { // Check minimum base cluster cardinality. assert cardinality >= minBaseClusterSize; /* * Consider certain special cases of internal suffix tree nodes. */ if (!checkAcceptablePhrase(path)) { return; } // Calculate "effective phrase length", which is the number of non-stopwords. final int effectivePhraseLen = effectivePhraseLength(path); if (effectivePhraseLen == 0) { return; } /* * Calculate base cluster's score as a function of effective phrase's length. * STC originally used a linear gradient, we modified it to penalize very long * phrases (which usually correspond to duplicated snippets anyway). */ final float score = baseClusterScore(effectivePhraseLen, cardinality); candidates.add( new ClusterCandidate(path.toArray(), (BitSet) documents.clone(), cardinality, score)); } }.visit();