/**
 * Builds a {@link Cluster} labelled with <code>phrase</code> and holding the given
 * <code>documents</code>.
 * <p>
 * The phrase is registered through {@link #addPhrases} first, and the documents
 * are then added through {@link #addDocuments}.
 *
 * @param phrase the phrase that forms the cluster's label
 * @param documents the documents the cluster contains
 */
public Cluster(String phrase, Document... documents) {
    // Label first, then content: same call order as before.
    this.addPhrases(phrase);
    this.addDocuments(documents);
}
/**
 * Adds <code>document</code> to the cluster registered in <code>clusters</code> under
 * <code>fieldValue</code>. If no cluster is registered under that key yet, a new one
 * is created, labelled with <code>buildClusterLabel(fieldValue)</code>, and stored in
 * the map before the document is added.
 * <p>
 * A <code>null</code> field value is ignored: the map is left untouched and the
 * document is not added anywhere.
 *
 * @param clusters clusters created so far, keyed by field value; may be modified
 * @param fieldValue key selecting the target cluster, may be <code>null</code>
 * @param document the document to add to the target cluster
 */
private void addToCluster(Map<Object, Cluster> clusters, Object fieldValue, Document document) {
    if (fieldValue != null) {
        Cluster target = clusters.get(fieldValue);
        if (target == null) {
            // First document for this value: create and label the cluster, then register it.
            target = new Cluster();
            target.addPhrases(buildClusterLabel(fieldValue));
            clusters.put(fieldValue, target);
        }
        target.addDocuments(document);
    }
}
/** * Create the junk (unassigned documents) cluster and create the final * set of clusters in Carrot2 format. */ private void postProcessing(List<ClusterCandidate> clusters) { // Adapt to Carrot2 classes, counting used documents on the way. final BitSet all = new BitSet(documents.size()); final ArrayList<Document> docs = Lists.newArrayListWithCapacity(documents.size()); final ArrayList<String> phrases = Lists.newArrayListWithCapacity(3); for (ClusterCandidate c : clusters) { final Cluster c2 = new Cluster(); c2.addPhrases(collectPhrases(phrases, c)); c2.addDocuments(collectDocuments(docs, c.documents)); c2.setScore((double) c.score); this.clusters.add(c2); all.or(c.documents); docs.clear(); phrases.clear(); } Collections.sort(this.clusters, Cluster.byReversedWeightedScoreAndSizeComparator(scoreWeight)); Cluster.appendOtherTopics(this.documents, this.clusters); }
/**
 * Produces a filtered copy of a cluster hierarchy.
 * <p>
 * For every cluster in <code>in</code> a new {@link Cluster} is created that receives
 * the original's phrases, only those of its documents accepted by
 * <code>docFilter</code>, and recursively processed copies of its subclusters. Only
 * phrases, documents and subclusters are carried over to the copies.
 *
 * @param in the clusters to copy
 * @param docFilter predicate selecting the documents to keep
 * @return a new list with one copied cluster per input cluster, in the same order
 */
private static List<Cluster> sanityCheck(List<Cluster> in, Predicate<Document> docFilter) {
    final List<Cluster> result = Lists.newArrayListWithCapacity(in.size());
    for (Cluster source : in) {
        result.add(sanityCheckCluster(source, docFilter));
    }
    return result;
}

/** Copies a single cluster, keeping only the documents accepted by the filter. */
private static Cluster sanityCheckCluster(Cluster source, Predicate<Document> docFilter) {
    final Cluster copy = new Cluster();
    copy.addPhrases(source.getPhrases());
    copy.addDocuments(Iterables.filter(source.getDocuments(), docFilter));
    // Subclusters go through the same filtering, recursively.
    copy.addSubclusters(sanityCheck(source.getSubclusters(), docFilter));
    return copy;
}
}
cluster.addPhrases(clusterLabel); clusters.add(cluster); documentsInClusters.addAll(indexes);
if (rawCluster.size() > 1) cluster.addPhrases(getLabels(rawCluster, vsmContext.termDocumentMatrix, rowToStemIndex, preprocessingContext.allStems.mostFrequentOriginalWordIndex,
cluster.addPhrases(labelFormatter.format(context, labelFeature)); cluster.setAttribute(Cluster.SCORE, clusterLabelScore[i]);