/**
 * Convenience overload: appends the "Other Topics" group to <code>clusters</code>
 * when some of <code>allDocuments</code> are left unclustered, labelling the group
 * with the default {@link #OTHER_TOPICS_LABEL}.
 * <p>
 * All work is delegated to the three-argument <code>appendOtherTopics</code>
 * overload.
 *
 * @param allDocuments all documents that were subject to clustering
 * @param clusters the clusters produced so far; the group is appended to this list
 * @return whatever the three-argument overload returns (presumably the appended
 *         group -- confirm against that overload)
 * @see #buildOtherTopics(List, List)
 */
public static Cluster appendOtherTopics(List<Document> allDocuments, List<Cluster> clusters) {
    final Cluster otherTopics = appendOtherTopics(allDocuments, clusters, OTHER_TOPICS_LABEL);
    return otherTopics;
}
/** * Performs by URL clustering. */ @Override public void process() throws ProcessingException { // Just in case we get a linked list, create an array of documents final Document [] documentArray = this.documents .toArray(new Document [this.documents.size()]); // Prepare an array of url parts final String [][] urlParts = buildUrlParts(documentArray); // Recursively build the cluster structure final List<Integer> documentIndexes = new ArrayList<Integer>(documentArray.length); for (int i = 0; i < documentArray.length; i++) { documentIndexes.add(i); } this.clusters = createClusters(documentArray, documentIndexes, urlParts, 0, ""); if (clusters.size() == 0) { Cluster.appendOtherTopics(documents, clusters, "Other Sites"); } }
/**
 * Clusters the input documents by the value of the field named
 * <code>fieldName</code>.
 * <p>
 * Each document is handed to <code>addToCluster</code> together with its field
 * value. If the field holds a {@link Collection}, the document is handed over
 * once per element of that collection instead. The resulting clusters are sorted
 * with {@link Cluster#BY_REVERSED_SIZE_AND_LABEL_COMPARATOR}, after which
 * {@link Cluster#appendOtherTopics(List, List)} is invoked for the documents
 * and the sorted clusters.
 *
 * @throws ProcessingException declared by the overridden method
 */
@Override
public void process() throws ProcessingException {
    final Map<Object, Cluster> clustersByValue = Maps.newHashMap();

    for (Document document : documents) {
        final Object fieldValue = document.getField(fieldName);

        // Treat a scalar field as a one-element collection so that both cases
        // share a single loop.
        final Collection<?> values;
        if (fieldValue instanceof Collection<?>) {
            values = (Collection<?>) fieldValue;
        } else {
            values = Collections.singletonList(fieldValue);
        }

        for (Object value : values) {
            addToCluster(clustersByValue, value, document);
        }
    }

    clusters = Lists.newArrayList(clustersByValue.values());
    Collections.sort(clusters, Cluster.BY_REVERSED_SIZE_AND_LABEL_COMPARATOR);
    Cluster.appendOtherTopics(documents, clusters);
}
/** * Create the junk (unassigned documents) cluster and create the final * set of clusters in Carrot2 format. */ private void postProcessing(List<ClusterCandidate> clusters) { // Adapt to Carrot2 classes, counting used documents on the way. final BitSet all = new BitSet(documents.size()); final ArrayList<Document> docs = Lists.newArrayListWithCapacity(documents.size()); final ArrayList<String> phrases = Lists.newArrayListWithCapacity(3); for (ClusterCandidate c : clusters) { final Cluster c2 = new Cluster(); c2.addPhrases(collectPhrases(phrases, c)); c2.addDocuments(collectDocuments(docs, c.documents)); c2.setScore((double) c.score); this.clusters.add(c2); all.or(c.documents); docs.clear(); phrases.clear(); } Collections.sort(this.clusters, Cluster.byReversedWeightedScoreAndSizeComparator(scoreWeight)); Cluster.appendOtherTopics(this.documents, this.clusters); }
Cluster.appendOtherTopics(documentsInCluster, clusters, "Other Sites");
Cluster.appendOtherTopics(documents, flattenedClusters); return flattenedClusters;
Cluster.appendOtherTopics(documents, clusters);
Cluster.appendOtherTopics(documents, clusters); return clusters;
Cluster.appendOtherTopics(documents, clusters);