/**
 * Reports how large this cluster is. The size is the number of unique documents
 * the cluster holds, with the documents of its subclusters counted as well.
 *
 * @return size of the cluster
 */
public int size() {
    final int uniqueDocumentCount = getAllDocuments().size();
    return uniqueDocumentCount;
}
/**
 * Produces a deep copy of a cluster hierarchy in which every cluster keeps its
 * phrases but only those documents accepted by <code>docFilter</code>. Subclusters
 * are copied recursively with the same filter.
 *
 * @param in clusters to copy
 * @param docFilter predicate selecting the documents to retain
 * @return newly created list of copied clusters, in the order of <code>in</code>
 */
private static List<Cluster> sanityCheck(List<Cluster> in, Predicate<Document> docFilter) {
    final List<Cluster> result = Lists.newArrayListWithCapacity(in.size());

    for (Cluster source : in) {
        final Cluster copy = new Cluster();

        final List<String> sourcePhrases = source.getPhrases();
        copy.addPhrases(sourcePhrases);

        final Iterable<Document> acceptedDocuments =
            Iterables.filter(source.getDocuments(), docFilter);
        copy.addDocuments(acceptedDocuments);

        final List<Cluster> copiedChildren = sanityCheck(source.getSubclusters(), docFilter);
        copy.addSubclusters(copiedChildren);

        result.add(copy);
    }

    return result;
}
}
/**
 * Builds a short, human-readable summary of this cluster: its label, its size and
 * the number of its direct subclusters.
 */
@Override
public String toString() {
    final StringBuilder text = new StringBuilder();
    text.append("[Cluster, label: ").append(getLabel());
    text.append(", docs: ").append(size());
    text.append(", subclusters: ").append(getSubclusters().size());
    text.append("]");
    return text.toString();
}
}
/**
 * Appends every cluster of a hierarchy to <code>flat</code>. Each cluster is added
 * before its subclusters, which are then visited recursively.
 *
 * @param hierarchical clusters (possibly with subclusters) to walk through
 * @param flat list receiving the visited clusters
 * @return the <code>flat</code> list passed in
 */
private static List<Cluster> flatten(Collection<Cluster> hierarchical, List<Cluster> flat) {
    for (Cluster current : hierarchical) {
        // Parent first, then everything below it.
        flat.add(current);
        final List<Cluster> children = current.getSubclusters();
        flatten(children, flat);
    }

    return flat;
}
/**
 * Recursively gathers the documents of <code>cluster</code> and of all clusters
 * below it into <code>docs</code>. A <code>null</code> cluster contributes nothing.
 *
 * @param cluster root of the hierarchy to collect from, may be <code>null</code>
 * @param docs set receiving the documents
 * @return the <code>docs</code> set passed in
 */
private static Set<Document> collectAllDocuments(Cluster cluster, Set<Document> docs) {
    if (cluster != null) {
        docs.addAll(cluster.getDocuments());

        for (final Cluster child : cluster.getSubclusters()) {
            collectAllDocuments(child, docs);
        }
    }

    return docs;
}
List<String> labels = outCluster.getPhrases(); if (labels.size() > maxLabels) { labels = labels.subList(0, maxLabels); final Double score = outCluster.getScore(); if (score != null) { cluster.add("score", score); if (outCluster.isOtherTopics()) { cluster.add("other-topics", outCluster.isOtherTopics()); List<Document> docs = outputSubClusters ? outCluster.getDocuments() : outCluster.getAllDocuments(); List<Object> docList = new ArrayList<>(); cluster.add("docs", docList); if (outputSubClusters && !outCluster.getSubclusters().isEmpty()) { List<NamedList<Object>> subclusters = new ArrayList<>(); cluster.add("clusters", subclusters); clustersToNamedList(outCluster.getSubclusters(), subclusters, outputSubClusters, maxLabels);
/**
 * Converts clusters into <code>NamedList</code> entries and appends one entry per
 * cluster to <code>parent</code>. Every entry carries "labels" (at most
 * <code>maxLabels</code> phrases) and "docs" (the "solrId" field of each document).
 * When <code>outputSubClusters</code> is set, only the cluster's own documents are
 * listed and a nested "clusters" list is produced recursively; otherwise all
 * documents of the cluster are listed and no nesting is emitted.
 *
 * @param outputClusters clusters to convert
 * @param parent list receiving the converted entries
 * @param outputSubClusters whether subclusters are emitted as nested entries
 * @param maxLabels maximum number of labels emitted per cluster
 */
private void clustersToNamedList(List<Cluster> outputClusters, List parent, boolean outputSubClusters, int maxLabels) {
    for (Cluster outCluster : outputClusters) {
        NamedList entry = new SimpleOrderedMap();
        parent.add(entry);

        // Cut the label list down when it is longer than allowed.
        List<String> phrases = outCluster.getPhrases();
        if (phrases.size() > maxLabels) {
            phrases = phrases.subList(0, maxLabels);
        }
        entry.add("labels", phrases);

        List<Document> members;
        if (outputSubClusters) {
            members = outCluster.getDocuments();
        } else {
            members = outCluster.getAllDocuments();
        }

        List ids = new ArrayList();
        entry.add("docs", ids);
        for (Document member : members) {
            ids.add(member.getField("solrId"));
        }

        if (outputSubClusters) {
            List children = new ArrayList();
            entry.add("clusters", children);
            clustersToNamedList(outCluster.getSubclusters(), children, outputSubClusters, maxLabels);
        }
    }
}
List<Document> docs = cluster.getAllDocuments(); int return_size=0; if (docs != null && docs.size() > 0) { builder.field("size",docs.size()); builder.field("name",cluster.getLabel()); builder.startArray("documents"); for (Document document : docs) { List<String> phrases = cluster.getPhrases(); if (phrases != null && phrases.size() > 0) { builder.startArray("phrases");
for (Cluster cluster : clusters) final List<Cluster> subclusters = cluster.getSubclusters(); for (Cluster subcluster : subclusters) if (!subcluster.isOtherTopics()) Cluster.appendOtherTopics(documents, flattenedClusters); return flattenedClusters; flattenedClusters.addAll(majorLanguageCluster.getSubclusters()); final Cluster otherLanguages = new Cluster("Other Languages"); otherLanguages.addSubclusters(clusters); flattenedClusters.add(otherLanguages); return flattenedClusters;
labels.add(c.getLabel()); for (Document doc : c.getDocuments()) { docs.add(doc.getTitle());
final List<Document> clusterDocuments = cluster.getAllDocuments(); if (cluster.isOtherTopics() || clusterDocuments.size() == 0) if (bestFMeasureCluster != null) bestFMeasureCluster.setAttribute(BEST_F_MEASURE_PARTITION, partition);
for (Cluster cluster : this.clusters) final int clusterSize = cluster.size(); for (Object partition : partitions) final List<Document> clusterDocuments = cluster.getAllDocuments(); if (cluster.isOtherTopics() || clusterDocuments.size() == 0)
/** * Create the junk (unassigned documents) cluster and create the final * set of clusters in Carrot2 format. */ private void postProcessing(List<ClusterCandidate> clusters) { // Adapt to Carrot2 classes, counting used documents on the way. final BitSet all = new BitSet(documents.size()); final ArrayList<Document> docs = Lists.newArrayListWithCapacity(documents.size()); final ArrayList<String> phrases = Lists.newArrayListWithCapacity(3); for (ClusterCandidate c : clusters) { final Cluster c2 = new Cluster(); c2.addPhrases(collectPhrases(phrases, c)); c2.addDocuments(collectDocuments(docs, c.documents)); c2.setScore((double) c.score); this.clusters.add(c2); all.or(c.documents); docs.clear(); phrases.clear(); } Collections.sort(this.clusters, Cluster.byReversedWeightedScoreAndSizeComparator(scoreWeight)); Cluster.appendOtherTopics(this.documents, this.clusters); }
for (int i = 0; i < clusterLabelIndex.length; i++) final Cluster cluster = new Cluster(); cluster.addPhrases(labelFormatter.format(context, labelFeature)); cluster.setAttribute(Cluster.SCORE, clusterLabelScore[i]); for (int bit = bs.nextSetBit(0); bit >= 0; bit = bs.nextSetBit(bit + 1)) cluster.addDocuments(documents.get(bit)); Cluster.byReversedWeightedScoreAndSizeComparator(scoreWeight)); Cluster.appendOtherTopics(documents, clusters);
/**
 * Builds the "Other Topics" group for the documents not referenced by any of the
 * <code>clusters</code> and, when that group is not empty, adds it at the end of
 * <code>clusters</code>.
 *
 * @param allDocuments all documents to check against
 * @param clusters list of clusters with assigned documents; may get one more element
 * @param label label for the "Other Topics" group
 * @return the "Other Topics" cluster, regardless of whether it was appended
 * @see #buildOtherTopics(List, List, String)
 */
public static Cluster appendOtherTopics(List<Document> allDocuments, List<Cluster> clusters, String label) {
    final Cluster leftovers = buildOtherTopics(allDocuments, clusters, label);

    final boolean hasUnclustered = !leftovers.getDocuments().isEmpty();
    if (hasUnclustered) {
        clusters.add(leftovers);
    }

    return leftovers;
}
final Cluster languageCluster = new Cluster( languageCode != null ? languageCode.toString() : "Unknown Language"); clustersForLanguage.size() == 1 && clustersForLanguage.get(0).isOtherTopics()) languageCluster.addDocuments(languageDocuments); languageCluster.addSubclusters(clustersForLanguage);
/**
 * Searches <code>clusters</code> and, depth-first, their subclusters for a cluster
 * whose identifier equals <code>id</code>. <code>null</code> entries and clusters
 * without an identifier are never matched.
 *
 * @param id identifier to look for
 * @param clusters clusters to search
 * @return the first matching cluster, or <code>null</code> if there is none
 */
public static Cluster find(int id, Collection<Cluster> clusters) {
    for (Cluster candidate : clusters) {
        if (candidate == null) {
            continue;
        }

        if (candidate.id != null && candidate.id == id) {
            return candidate;
        }

        if (candidate.getSubclusters().isEmpty()) {
            continue;
        }

        // Descend before moving on to the next sibling.
        final Cluster nested = find(id, candidate.getSubclusters());
        if (nested != null) {
            return nested;
        }
    }

    return null;
}
/**
 * Resolves the document refids stored in <code>cluster</code> to actual document
 * references upon deserialization, then does the same for every subcluster.
 *
 * @param cluster cluster whose <code>documentIds</code> are to be resolved
 * @param documents lookup from refid to document
 */
private void documentIdToReference(Cluster cluster, Map<String, Document> documents) {
    final boolean hasRefids = cluster.documentIds != null;
    if (hasRefids) {
        for (Cluster.DocumentRefid ref : cluster.documentIds) {
            final Document resolved = documents.get(ref.refid);
            cluster.addDocuments(resolved);
        }
    }

    for (Cluster child : cluster.getSubclusters()) {
        documentIdToReference(child, documents);
    }
}
/**
 * Computes the contamination of every cluster other than "Other Topics", stores it
 * on the cluster under {@code CONTAMINATION}, and records the size-weighted average
 * in {@code weightedAverageContamination}. Does nothing when the documents yield no
 * partitions.
 */
public void calculate() {
    final int partitionCount = getPartitionsCount(documents);
    if (partitionCount == 0) {
        return;
    }

    double weightedTotal = 0;
    int totalSize = 0;

    for (Cluster cluster : clusters) {
        if (!cluster.isOtherTopics()) {
            final double value = calculate(cluster, partitionCount);
            cluster.setAttribute(CONTAMINATION, value);

            // Each cluster counts in proportion to its size.
            final int size = cluster.size();
            weightedTotal += value * size;
            totalSize += size;
        }
    }

    // Evaluates to NaN when no cluster contributed any weight.
    weightedAverageContamination = weightedTotal / totalSize;
}
/**
 * Creates an "Other Topics" cluster containing those documents from
 * <code>allDocuments</code> that are not referenced by any cluster in
 * <code>clusters</code> or by any of their subclusters. The order of
 * <code>allDocuments</code> is retained.
 *
 * @param allDocuments all documents to check against
 * @param clusters list of clusters with assigned documents
 * @param label label for the "Other Topics" group
 * @return the "Other Topics" cluster
 */
public static Cluster buildOtherTopics(List<Document> allDocuments, List<Cluster> clusters, String label) {
    // Everything referenced somewhere in the hierarchy.
    final Set<Document> taken = Sets.newHashSet();
    for (Cluster top : clusters) {
        collectAllDocuments(top, taken);
    }

    // A linked set keeps the leftovers in their original order.
    final Set<Document> leftovers = Sets.newLinkedHashSet(allDocuments);
    leftovers.removeAll(taken);

    final Cluster result = new Cluster(label);
    result.addDocuments(leftovers);
    result.setOtherTopics(true);
    return result;
}