org.carrot2.core.Cluster java code examples

/**
 * Reports how many unique documents belong to this cluster, counting the documents
 * of its subclusters as well.
 *
 * @return the number of elements returned by {@link #getAllDocuments()}
 */
public int size()
{
  final int documentCount = getAllDocuments().size();
  return documentCount;
}

  /**
   * Builds a filtered copy of a cluster hierarchy. Every input cluster is mirrored by
   * a fresh {@link Cluster} that receives the original's phrases, only those documents
   * accepted by <code>docFilter</code>, and recursively filtered copies of its
   * subclusters. Nothing else is copied from the original clusters.
   *
   * @param in clusters to copy
   * @param docFilter decides which documents are carried over to the copies
   * @return a new list holding one copy per input cluster, in input order
   */
  private static List<Cluster> sanityCheck(List<Cluster> in, Predicate<Document> docFilter)
  {
    final List<Cluster> result = Lists.newArrayListWithCapacity(in.size());
    for (final Cluster source : in)
    {
      final Cluster copy = new Cluster();
      copy.addPhrases(source.getPhrases());
      copy.addDocuments(Iterables.filter(source.getDocuments(), docFilter));
      // Children go through the same filtering before being attached.
      final List<Cluster> filteredChildren = sanityCheck(source.getSubclusters(), docFilter);
      copy.addSubclusters(filteredChildren);
      result.add(copy);
    }
    return result;
  }
}

  /**
   * Produces a short diagnostic description of this cluster: its label, its size and
   * the number of direct subclusters.
   */
  @Override
  public String toString()
  {
    final StringBuilder text = new StringBuilder("[Cluster, label: ");
    text.append(getLabel());
    text.append(", docs: ").append(size());
    text.append(", subclusters: ").append(getSubclusters().size());
    text.append("]");
    return text.toString();
  }
}

/**
 * Appends every cluster of a hierarchy to <code>flat</code> in pre-order: each cluster
 * is added first, immediately followed by all of its descendants.
 *
 * @param hierarchical top-level clusters to walk
 * @param flat list receiving the visited clusters; modified in place
 * @return the same <code>flat</code> instance that was passed in
 */
private static List<Cluster> flatten(Collection<Cluster> hierarchical, List<Cluster> flat)
{
  for (final Cluster node : hierarchical)
  {
    flat.add(node);
    final List<Cluster> children = node.getSubclusters();
    flatten(children, flat);
  }
  return flat;
}

/**
 * Recursively gathers the documents of <code>cluster</code> and of all its
 * subclusters into <code>docs</code>. Because the target is a set, a document
 * referenced from several clusters is kept only once.
 *
 * @param cluster root of the subtree to scan; <code>null</code> leaves
 *        <code>docs</code> untouched
 * @param docs set receiving the documents; modified in place
 * @return the same <code>docs</code> instance that was passed in
 */
private static Set<Document> collectAllDocuments(Cluster cluster, Set<Document> docs)
{
  if (cluster != null)
  {
    docs.addAll(cluster.getDocuments());
    for (final Cluster child : cluster.getSubclusters())
    {
      collectAllDocuments(child, docs);
    }
  }
  return docs;
}

List<String> labels = outCluster.getPhrases();
if (labels.size() > maxLabels) {
 labels = labels.subList(0, maxLabels);
final Double score = outCluster.getScore();
if (score != null) {
 cluster.add("score", score);
if (outCluster.isOtherTopics()) {
 cluster.add("other-topics", outCluster.isOtherTopics());
List<Document> docs = outputSubClusters ? outCluster.getDocuments() : outCluster.getAllDocuments();
List<Object> docList = new ArrayList<>();
cluster.add("docs", docList);
if (outputSubClusters && !outCluster.getSubclusters().isEmpty()) {
 List<NamedList<Object>> subclusters = new ArrayList<>();
 cluster.add("clusters", subclusters);
 clustersToNamedList(outCluster.getSubclusters(), subclusters,
     outputSubClusters, maxLabels);

/**
 * Converts clusters into named-list entries and appends them to <code>parent</code>.
 * Each entry carries the cluster's "labels" (at most <code>maxLabels</code> phrases)
 * and "docs" (the "solrId" field of every document). When
 * <code>outputSubClusters</code> is set, only the cluster's own documents are listed
 * and a nested "clusters" list is produced recursively; otherwise the documents of
 * the whole subtree are listed and no nested list is emitted.
 *
 * @param outputClusters clusters to convert
 * @param parent list receiving one entry per cluster; kept as a raw type so that
 *        existing callers compile unchanged
 * @param outputSubClusters whether to emit subclusters as nested "clusters" entries
 * @param maxLabels maximum number of label phrases to output per cluster
 */
private void clustersToNamedList(List<Cluster> outputClusters,
                 List parent, boolean outputSubClusters, int maxLabels) {
 for (Cluster outCluster : outputClusters) {
  NamedList<Object> cluster = new SimpleOrderedMap<>();
  parent.add(cluster);

  // Trim the label list to the requested maximum.
  List<String> labels = outCluster.getPhrases();
  if (labels.size() > maxLabels) {
   labels = labels.subList(0, maxLabels);
  }
  cluster.add("labels", labels);

  // With subclusters in the output each level lists only its own documents,
  // otherwise the whole subtree is collapsed into this entry.
  List<Document> docs = outputSubClusters ? outCluster.getDocuments() : outCluster.getAllDocuments();
  List<Object> docList = new ArrayList<>();
  cluster.add("docs", docList);
  for (Document doc : docs) {
   docList.add(doc.getField("solrId"));
  }

  if (outputSubClusters) {
   List<Object> subclusters = new ArrayList<>();
   cluster.add("clusters", subclusters);
   clustersToNamedList(outCluster.getSubclusters(), subclusters,
       outputSubClusters, maxLabels);
  }
 }
}

List<Document> docs = cluster.getAllDocuments();
int return_size=0;
if (docs != null && docs.size() > 0) {
  builder.field("size",docs.size());
  builder.field("name",cluster.getLabel());
  builder.startArray("documents");
  for (Document document : docs) {
List<String> phrases = cluster.getPhrases();
if (phrases != null && phrases.size() > 0) {
  builder.startArray("phrases");

for (Cluster cluster : clusters)
  final List<Cluster> subclusters = cluster.getSubclusters();
  for (Cluster subcluster : subclusters)
    if (!subcluster.isOtherTopics())
Cluster.appendOtherTopics(documents, flattenedClusters);
return flattenedClusters;
    flattenedClusters.addAll(majorLanguageCluster.getSubclusters());
    final Cluster otherLanguages = new Cluster("Other Languages");
    otherLanguages.addSubclusters(clusters);
    flattenedClusters.add(otherLanguages);
    return flattenedClusters;

labels.add(c.getLabel());
for (Document doc : c.getDocuments()) {
  docs.add(doc.getTitle());

  final List<Document> clusterDocuments = cluster.getAllDocuments();
  if (cluster.isOtherTopics() || clusterDocuments.size() == 0)
if (bestFMeasureCluster != null)
  bestFMeasureCluster.setAttribute(BEST_F_MEASURE_PARTITION, partition);

for (Cluster cluster : this.clusters)
  final int clusterSize = cluster.size();
  for (Object partition : partitions)
    final List<Document> clusterDocuments = cluster.getAllDocuments();
    if (cluster.isOtherTopics() || clusterDocuments.size() == 0)

/**
 * Creates the final set of clusters in Carrot2 format: converts every cluster
 * candidate to a {@link Cluster}, sorts the result and lets
 * {@link Cluster#appendOtherTopics} add the junk (unassigned documents) cluster.
 *
 * @param clusters cluster candidates to convert
 */
private void postProcessing(List<ClusterCandidate> clusters)
{
  // Scratch buffers shared by all iterations; they are emptied after each candidate
  // has been converted.
  final ArrayList<Document> docs = Lists.newArrayListWithCapacity(documents.size());
  final ArrayList<String> phrases = Lists.newArrayListWithCapacity(3);

  // Adapt to Carrot2 classes.
  for (ClusterCandidate c : clusters)
  {
    final Cluster c2 = new Cluster();
    c2.addPhrases(collectPhrases(phrases, c));
    c2.addDocuments(collectDocuments(docs, c.documents));
    c2.setScore((double) c.score);
    this.clusters.add(c2);

    docs.clear();
    phrases.clear();
  }

  Collections.sort(this.clusters,
    Cluster.byReversedWeightedScoreAndSizeComparator(scoreWeight));

  // Unassigned documents are determined from the finished clusters here, so no
  // separate bookkeeping of used documents is needed in the loop above.
  Cluster.appendOtherTopics(this.documents, this.clusters);
}

  for (int i = 0; i < clusterLabelIndex.length; i++)
    final Cluster cluster = new Cluster();
    cluster.addPhrases(labelFormatter.format(context, labelFeature));
    cluster.setAttribute(Cluster.SCORE, clusterLabelScore[i]);
    for (int bit = bs.nextSetBit(0); bit >= 0; bit = bs.nextSetBit(bit + 1))
      cluster.addDocuments(documents.get(bit));
    Cluster.byReversedWeightedScoreAndSizeComparator(scoreWeight));
Cluster.appendOtherTopics(documents, clusters);

/**
 * Builds the "Other Topics" group for the documents not present in any of the
 * <code>clusters</code> and, if that group contains at least one document, adds it
 * at the end of <code>clusters</code>.
 *
 * @param allDocuments all documents to check against
 * @param clusters clusters with assigned documents; may be extended by this call
 * @param label label for the "Other Topics" group
 * @return the "Other Topics" cluster, whether or not it was added to
 *         <code>clusters</code>
 * @see #buildOtherTopics(List, List, String)
 */
public static Cluster appendOtherTopics(List<Document> allDocuments,
  List<Cluster> clusters, String label)
{
  final Cluster otherTopics = buildOtherTopics(allDocuments, clusters, label);
  final boolean nothingUnclustered = otherTopics.getDocuments().isEmpty();
  if (nothingUnclustered)
  {
    return otherTopics;
  }
  clusters.add(otherTopics);
  return otherTopics;
}

final Cluster languageCluster = new Cluster(
  languageCode != null ? languageCode.toString() : "Unknown Language");
  clustersForLanguage.size() == 1 && clustersForLanguage.get(0).isOtherTopics())
  languageCluster.addDocuments(languageDocuments);
  languageCluster.addSubclusters(clustersForLanguage);

/**
 * Searches the given clusters and, depth-first, their subclusters for a cluster
 * whose identifier equals <code>id</code>. <code>null</code> entries and clusters
 * without an identifier never match.
 *
 * @param id identifier to look for
 * @param clusters clusters to search
 * @return the first matching cluster in traversal order, or <code>null</code> if
 *         there is none
 */
public static Cluster find(int id, Collection<Cluster> clusters)
{
  for (final Cluster candidate : clusters)
  {
    if (candidate == null)
    {
      continue;
    }

    // The cluster itself is checked before any of its descendants.
    if (candidate.id != null && candidate.id == id)
    {
      return candidate;
    }

    if (candidate.getSubclusters().isEmpty())
    {
      continue;
    }

    final Cluster nested = find(id, candidate.getSubclusters());
    if (nested != null)
    {
      return nested;
    }
  }
  return null;
}

/**
 * Resolves document reference identifiers to actual documents upon deserialization.
 * For every entry in the cluster's <code>documentIds</code> the document looked up
 * in <code>documents</code> is added to the cluster; the same is then done for all
 * subclusters, recursively.
 *
 * @param cluster cluster whose references should be resolved
 * @param documents lookup table from reference identifier to document
 */
private void documentIdToReference(Cluster cluster, Map<String, Document> documents)
{
  if (cluster.documentIds != null)
  {
    for (final Cluster.DocumentRefid reference : cluster.documentIds)
    {
      final Document resolved = documents.get(reference.refid);
      cluster.addDocuments(resolved);
    }
  }

  for (final Cluster child : cluster.getSubclusters())
  {
    documentIdToReference(child, documents);
  }
}

/**
 * Computes contamination for every cluster except "Other Topics" ones, stores it in
 * the cluster's {@code CONTAMINATION} attribute and sets
 * {@code weightedAverageContamination} to the average of these values weighted by
 * cluster size. Does nothing when the documents define no partitions.
 * <p>
 * Note: when the contributing clusters have a total size of zero, the final division
 * is by zero and the average becomes NaN (assuming finite contamination values).
 */
public void calculate()
{
  final int partitionCount = getPartitionsCount(documents);
  if (partitionCount == 0)
  {
    return;
  }

  double weightedContamination = 0;
  int totalSize = 0;
  for (final Cluster cluster : clusters)
  {
    if (!cluster.isOtherTopics())
    {
      final double contamination = calculate(cluster, partitionCount);
      cluster.setAttribute(CONTAMINATION, contamination);

      final int clusterSize = cluster.size();
      weightedContamination += contamination * clusterSize;
      totalSize += clusterSize;
    }
  }

  weightedAverageContamination = weightedContamination / totalSize;
}

/**
 * Builds an "Other Topics" cluster that groups those documents from
 * <code>allDocuments</code> that were not referenced in any cluster in
 * <code>clusters</code>, subclusters included. The remaining documents keep the
 * order they had in <code>allDocuments</code>.
 *
 * @param allDocuments all documents to check against
 * @param clusters list of clusters with assigned documents
 * @param label label for the "Other Topics" group
 * @return the "Other Topics" cluster, possibly without any documents
 */
public static Cluster buildOtherTopics(List<Document> allDocuments,
  List<Cluster> clusters, String label)
{
  // Everything reachable from the provided clusters counts as assigned.
  final Set<Document> assigned = Sets.newHashSet();
  for (final Cluster cluster : clusters)
  {
    collectAllDocuments(cluster, assigned);
  }

  // A linked set preserves the input order of the leftover documents.
  final Set<Document> leftovers = Sets.newLinkedHashSet(allDocuments);
  leftovers.removeAll(assigned);

  final Cluster otherTopics = new Cluster(label);
  otherTopics.addDocuments(leftovers);
  otherTopics.setOtherTopics(true);
  return otherTopics;
}

Javadoc

A cluster (group) of Documents. Each cluster has a human-readable label consisting of one or more phrases, a list of documents it contains and a list of its subclusters. Optionally, additional attributes can be associated with a cluster, e.g. #OTHER_TOPICS. This class is not thread-safe.

Most used methods

getAllDocuments
Returns all documents in this cluster ordered according to the provided comparator. See Document for
getDocuments
Returns all documents contained in this cluster. The returned list is unmodifiable.
getPhrases
Returns all phrases describing this cluster. The returned list is unmodifiable.
getLabel
Formats this cluster's label. If there is more than one phrase describing this cluster, phrases will
getSubclusters
Returns all subclusters of this cluster. The returned list is unmodifiable.
isOtherTopics
Returns true if this cluster is the #OTHER_TOPICS cluster.
<init>
Creates a Cluster with the provided phrase to be used as the cluster's label and documents containe
addDocuments
Adds document to this cluster.
addPhrases
Adds phrases to the description of this cluster.
addSubclusters
Adds subclusters to this cluster
appendOtherTopics
If there are unclustered documents, appends the "Other Topics" group to the clusters.
assignClusterIds
Assigns sequential identifiers to the provided clusters (and their sub-clusters). If any cluster alr

Popular in Java

Running tasks concurrently on multiple threads
getApplicationContext (Context)
findViewById (Activity)
getSystemService (Context)
GregorianCalendar (java.util)
GregorianCalendar is a concrete subclass of Calendar and provides the standard calendar used by most
HashSet (java.util)
HashSet is an implementation of a Set. All optional operations (adding and removing) are supported.
HttpServlet (javax.servlet.http)
Provides an abstract class to be subclassed to create an HTTP servlet suitable for a Web site. A sub
LogFactory (org.apache.commons.logging)
Factory for creating Log instances, with discovery and configuration features similar to that employ
BufferedImage (java.awt.image)
The BufferedImage subclass describes an java.awt.Image with an accessible buffer of image data. All
JFrame (javax.swing)
Best IntelliJ plugins

How to use Cluster in org.carrot2.core

Best Java code snippets using org.carrot2.core.Cluster (Showing top 20 results out of 315)

How to use
Cluster
in
org.carrot2.core