org.carrot2.core.Cluster.addDocuments java code examples

/**
 * Constructs a {@link Cluster} whose label is formed from <code>phrase</code> and
 * which holds the given <code>documents</code>.
 * 
 * @param phrase the phrase that becomes the cluster's label
 * @param documents the documents to place in the cluster
 */
public Cluster(String phrase, Document... documents)
{
  this.addPhrases(phrase);
  this.addDocuments(documents);
}

/**
 * Upon deserialization, swaps the document refids stored on <code>cluster</code>
 * for the actual {@link Document} references looked up in <code>documents</code>,
 * then repeats the process for each subcluster.
 */
private void documentIdToReference(Cluster cluster, Map<String, Document> documents)
{
  if (cluster.documentIds != null)
  {
    for (Cluster.DocumentRefid ref : cluster.documentIds)
    {
      // NOTE(review): a refid that is absent from the map resolves to null, which
      // is handed to addDocuments as-is -- confirm callers guarantee a match.
      final Document resolved = documents.get(ref.refid);
      cluster.addDocuments(resolved);
    }
  }

  for (Cluster child : cluster.getSubclusters())
  {
    documentIdToReference(child, documents);
  }
}

/**
 * Adds <code>document</code> to the cluster stored in <code>clusters</code> under
 * <code>fieldValue</code>. When no cluster is stored for that value yet, a new one
 * is created, labelled via <code>buildClusterLabel</code> and put into the map
 * first. Does nothing when <code>fieldValue</code> is <code>null</code>.
 */
private void addToCluster(Map<Object, Cluster> clusters, Object fieldValue,
  Document document)
{
  if (fieldValue != null)
  {
    Cluster target = clusters.get(fieldValue);
    if (target == null)
    {
      target = new Cluster();
      target.addPhrases(buildClusterLabel(fieldValue));
      clusters.put(fieldValue, target);
    }

    target.addDocuments(document);
  }
}

/**
 * Converts the given cluster candidates to Carrot2 {@link Cluster}s, sorts the
 * resulting clusters and finally appends the junk ("Other Topics") cluster for
 * documents that were not assigned to any cluster.
 *
 * @param clusters the cluster candidates to convert, one {@link Cluster} each
 */
private void postProcessing(List<ClusterCandidate> clusters)
{
  // Scratch buffers passed to the collect* helpers; emptied after every candidate
  // so they can be reused by the next one.
  final ArrayList<Document> docs = Lists.newArrayListWithCapacity(documents.size());
  final ArrayList<String> phrases = Lists.newArrayListWithCapacity(3);
  for (ClusterCandidate c : clusters)
  {
    final Cluster c2 = new Cluster();
    c2.addPhrases(collectPhrases(phrases, c));
    c2.addDocuments(collectDocuments(docs, c.documents));
    c2.setScore((double) c.score);
    this.clusters.add(c2);

    docs.clear();
    phrases.clear();
  }

  Collections.sort(this.clusters,
    Cluster.byReversedWeightedScoreAndSizeComparator(scoreWeight));

  // Unassigned documents are worked out here from the final cluster list, so no
  // separate bookkeeping of used documents is needed in the loop above.
  Cluster.appendOtherTopics(this.documents, this.clusters);
}

// Excerpt: adds the subcluster's documents and the subcluster's own subclusters
// directly to cluster.
cluster.addDocuments(subcluster.getDocuments());
cluster.addSubclusters(subcluster.getSubclusters());
  // Excerpt (appears to come from a different snippet): adds the element of the
  // documents array selected by documentIndex.
  cluster.addDocuments(documents[documentIndex.intValue()]);

/**
 * Creates the "Other Topics" cluster: a cluster labelled <code>label</code> that
 * holds those documents from <code>allDocuments</code> which were not collected
 * from any cluster in <code>clusters</code>.
 * 
 * @param allDocuments all documents to check against
 * @param clusters list of clusters with assigned documents
 * @param label label for the "Other Topics" group
 * @return the "Other Topics" cluster
 */
public static Cluster buildOtherTopics(List<Document> allDocuments,
  List<Cluster> clusters, String label)
{
  final Set<Document> assigned = Sets.newHashSet();
  final Set<Document> remaining = Sets.newLinkedHashSet(allDocuments);

  for (Cluster candidate : clusters)
  {
    collectAllDocuments(candidate, assigned);
  }
  remaining.removeAll(assigned);

  final Cluster result = new Cluster(label);
  result.addDocuments(remaining);
  result.setOtherTopics(true);
  return result;
}

  /**
   * Builds a new list with one fresh {@link Cluster} per input cluster. Each new
   * cluster receives the phrases of its source, only those source documents that
   * pass <code>docFilter</code>, and subclusters produced by applying this method
   * recursively to the source's subclusters.
   */
  private static List<Cluster> sanityCheck(List<Cluster> in, Predicate<Document> docFilter)
  {
    final List<Cluster> result = Lists.newArrayListWithCapacity(in.size());
    for (Cluster source : in)
    {
      final Cluster copy = new Cluster();
      copy.addPhrases(source.getPhrases());
      copy.addDocuments(Iterables.filter(source.getDocuments(), docFilter));
      copy.addSubclusters(sanityCheck(source.getSubclusters(), docFilter));
      result.add(copy);
    }
    return result;
  }
}

clustersForLanguage.size() == 1 && clustersForLanguage.get(0).isOtherTopics())
languageCluster.addDocuments(languageDocuments);

// For every entry of rawCluster, look the entry up in documents and add the
// result to cluster.
for (int i = 0; i < rawCluster.size(); i++)
{
  cluster.addDocuments(documents.get(rawCluster.get(i)));
}

// Walk the indices reported by bs.nextSetBit until it returns a negative value,
// adding the document at each index to cluster.
for (int idx = bs.nextSetBit(0); idx >= 0; idx = bs.nextSetBit(idx + 1))
{
  cluster.addDocuments(documents.get(idx));
}

Javadoc

Adds document to this cluster.

Popular methods of Cluster

getAllDocuments
Returns all documents in this cluster ordered according to the provided comparator. See Document for
getDocuments
Returns all documents contained in this cluster. The returned list is unmodifiable.
getPhrases
Returns all phrases describing this cluster. The returned list is unmodifiable.
getLabel
Formats this cluster's label. If there is more than one phrase describing this cluster, phrases will
getSubclusters
Returns all subclusters of this cluster. The returned list is unmodifiable.
isOtherTopics
Returns true if this cluster is the #OTHER_TOPICS cluster.
<init>
Creates a Cluster with the provided phrase to be used as the cluster's label and documents containe
addPhrases
Adds phrases to the description of this cluster.
addSubclusters
Adds subclusters to this cluster
appendOtherTopics
If there are unclustered documents, appends the "Other Topics" group to the clusters.
assignClusterIds
Assigns sequential identifiers to the provided clusters (and their sub-clusters). If any cluster alr
buildOtherTopics
Builds an "Other Topics" cluster that groups those documents from allDocuments that were not referenc

Popular in Java

Running tasks concurrently on multiple threads
getApplicationContext (Context)
findViewById (Activity)
getSystemService (Context)
GregorianCalendar (java.util)
GregorianCalendar is a concrete subclass of Calendar and provides the standard calendar used by most
HashSet (java.util)
HashSet is an implementation of a Set. All optional operations (adding and removing) are supported.
HttpServlet (javax.servlet.http)
Provides an abstract class to be subclassed to create an HTTP servlet suitable for a Web site. A sub
LogFactory (org.apache.commons.logging)
Factory for creating Log instances, with discovery and configuration features similar to that employ
BufferedImage (java.awt.image)
The BufferedImage subclass describes a java.awt.Image with an accessible buffer of image data. All
JFrame (javax.swing)
Github Copilot alternatives

How to use the addDocuments method in org.carrot2.core.Cluster

Best Java code snippets using org.carrot2.core.Cluster.addDocuments (Showing top 10 results out of 315)

How to use
addDocuments
method
in
org.carrot2.core.Cluster