/**
 * Constructs a {@link Cluster} labelled with {@code phrase} and holding the given
 * {@code documents}.
 *
 * @param phrase the phrase that becomes the cluster's label
 * @param documents the documents the new cluster contains
 */
public Cluster(String phrase, Document... documents) {
    // Label first, then contents.
    this.addPhrases(phrase);
    this.addDocuments(documents);
}
/**
 * Resolves the document refids stored on {@code cluster} (and, recursively, on all of
 * its subclusters) into actual {@link Document} references after deserialization.
 *
 * @param cluster the cluster whose {@code documentIds} are to be resolved
 * @param documents lookup table from refid to the corresponding document
 */
private void documentIdToReference(Cluster cluster, Map<String, Document> documents) {
    if (cluster.documentIds != null) {
        for (Cluster.DocumentRefid ref : cluster.documentIds) {
            // NOTE(review): a refid that is absent from the map yields null here, and
            // that null is handed to addDocuments as-is -- confirm this is tolerated.
            final Document resolved = documents.get(ref.refid);
            cluster.addDocuments(resolved);
        }
    }

    // Apply the same resolution to the whole subtree.
    for (Cluster child : cluster.getSubclusters()) {
        documentIdToReference(child, documents);
    }
}
/**
 * Adds {@code document} to the cluster registered under {@code fieldValue}, creating
 * and registering that cluster first if the map has none yet. Does nothing when
 * {@code fieldValue} is {@code null}.
 *
 * @param clusters clusters created so far, keyed by field value
 * @param fieldValue the key selecting the target cluster; {@code null} is ignored
 * @param document the document to add
 */
private void addToCluster(Map<Object, Cluster> clusters, Object fieldValue, Document document) {
    if (fieldValue != null) {
        Cluster target = clusters.get(fieldValue);
        if (target == null) {
            // First document for this value: build a labelled cluster and register it.
            target = new Cluster();
            target.addPhrases(buildClusterLabel(fieldValue));
            clusters.put(fieldValue, target);
        }
        target.addDocuments(document);
    }
}
/** * Create the junk (unassigned documents) cluster and create the final * set of clusters in Carrot2 format. */ private void postProcessing(List<ClusterCandidate> clusters) { // Adapt to Carrot2 classes, counting used documents on the way. final BitSet all = new BitSet(documents.size()); final ArrayList<Document> docs = Lists.newArrayListWithCapacity(documents.size()); final ArrayList<String> phrases = Lists.newArrayListWithCapacity(3); for (ClusterCandidate c : clusters) { final Cluster c2 = new Cluster(); c2.addPhrases(collectPhrases(phrases, c)); c2.addDocuments(collectDocuments(docs, c.documents)); c2.setScore((double) c.score); this.clusters.add(c2); all.or(c.documents); docs.clear(); phrases.clear(); } Collections.sort(this.clusters, Cluster.byReversedWeightedScoreAndSizeComparator(scoreWeight)); Cluster.appendOtherTopics(this.documents, this.clusters); }
// Adds the subcluster's documents and its own subclusters to cluster, then adds the
// element of the documents array found at position documentIndex.
// NOTE(review): these statements look excerpted from a larger method -- confirm context.
cluster.addDocuments(subcluster.getDocuments()); cluster.addSubclusters(subcluster.getSubclusters()); cluster.addDocuments(documents[documentIndex.intValue()]);
/**
 * Builds an "Other Topics" cluster containing those documents from
 * {@code allDocuments} that are not referenced by any cluster in {@code clusters}.
 * The returned cluster is flagged through {@code setOtherTopics(true)}.
 *
 * @param allDocuments all documents to check against
 * @param clusters list of clusters with assigned documents
 * @param label label for the "Other Topics" group
 * @return the "Other Topics" cluster
 */
public static Cluster buildOtherTopics(List<Document> allDocuments, List<Cluster> clusters,
    String label) {
    // Start from every document (insertion order kept) and subtract the assigned ones.
    final Set<Document> remaining = Sets.newLinkedHashSet(allDocuments);

    final Set<Document> assigned = Sets.newHashSet();
    for (Cluster each : clusters) {
        collectAllDocuments(each, assigned);
    }
    remaining.removeAll(assigned);

    final Cluster result = new Cluster(label);
    result.addDocuments(remaining);
    result.setOtherTopics(true);
    return result;
}
/**
 * Rebuilds the given cluster list as fresh {@link Cluster} objects. Each new cluster
 * receives the source cluster's phrases, only those of its documents that
 * {@code docFilter} accepts, and recursively rebuilt subclusters. No other cluster
 * properties are carried over.
 *
 * @param in the clusters to rebuild
 * @param docFilter predicate selecting the documents to keep
 * @return a new list of rebuilt clusters, in the same order as {@code in}
 */
private static List<Cluster> sanityCheck(List<Cluster> in, Predicate<Document> docFilter) {
    final List<Cluster> rebuilt = Lists.newArrayListWithCapacity(in.size());
    for (Cluster source : in) {
        final Cluster copy = new Cluster();
        copy.addPhrases(source.getPhrases());
        copy.addDocuments(Iterables.filter(source.getDocuments(), docFilter));
        copy.addSubclusters(sanityCheck(source.getSubclusters(), docFilter));
        rebuilt.add(copy);
    }
    return rebuilt;
}
}
clustersForLanguage.size() == 1 && clustersForLanguage.get(0).isOtherTopics()) languageCluster.addDocuments(languageDocuments);
// Each entry of rawCluster is used as a key into documents; add the looked-up
// document to cluster.
for (int j = 0; j < rawCluster.size(); j++) {
    cluster.addDocuments(documents.get(rawCluster.get(j)));
}
// Walk the set bits of bs in ascending order; each set bit's position selects the
// document to add to cluster.
for (int bit = bs.nextSetBit(0); bit >= 0; bit = bs.nextSetBit(bit + 1)) {
    cluster.addDocuments(documents.get(bit));
}