weka.clusterers.SimpleKMeans java code examples

 public class demo {
  public demo() throws Exception {
    // TODO Auto-generated constructor stub
    BufferedReader breader = null;
    breader = new BufferedReader(new FileReader(
        "D:/logiciels/weka-3-7-12/weka-3-7-12/data/iris.arff"));
    Instances Train = new Instances(breader);
    //Train.setClassIndex(Train.numAttributes() - 1); // comment out this line
    SimpleKMeans kMeans = new SimpleKMeans();
    kMeans.setSeed(10);
    kMeans.setPreserveInstancesOrder(true);
    kMeans.setNumClusters(3);
    kMeans.buildClusterer(Train);
    int[] assignments = kMeans.getAssignments();
    int i = 0;
    for (int clusterNum : assignments) {
      System.out.printf("Instance %d -> Cluster %d", i, clusterNum);
      i++;
    }
    breader.close();
  }
  public static void main(String[] args) throws Exception {
    // TODO Auto-generated method stub
    new demo();
  }
}

 SimpleKMeans kmeans = ...
// your code
...
Instances instances = kmeans.getClusterCentroids();

 /**
  * Main method for executing this class.
  * 
  * @param args use -h to list all parameters
  */
 public static void main(String[] args) {
  runClusterer(new SimpleKMeans(), args);
 }
}

/**
 * Returns default capabilities of the clusterer (i.e., the ones of
 * SimpleKMeans).
 * 
 * @return the capabilities of this clusterer
 */
@Override
public Capabilities getCapabilities() {
 Capabilities result = new SimpleKMeans().getCapabilities();
 result.setOwner(this);
 return result;
}

double bestSqE = Double.MAX_VALUE;
for (i = 0; i < m_NumKMeansRuns; i++) {
 SimpleKMeans sk = new SimpleKMeans();
 sk.setSeed(m_rr.nextInt());
 sk.setNumClusters(m_num_clusters);
 sk.setNumExecutionSlots(m_executionSlots);
 sk.setDisplayStdDevs(true);
 sk.setDoNotCheckCapabilities(true);
 sk.setDontReplaceMissingValues(true);
 sk.buildClusterer(inst);
 if (sk.getSquaredError() < bestSqE) {
  bestSqE = sk.getSquaredError();
  bestK = sk;
m_num_clusters = bestK.numberOfClusters();
m_weights = new double[inst.numInstances()][m_num_clusters];
m_model = new DiscreteEstimator[m_num_clusters][m_num_attribs];
m_priorsPrev = new double[m_num_clusters];
Instances centers = bestK.getClusterCentroids();
Instances stdD = bestK.getClusterStandardDevs();
double[][][] nominalCounts = bestK.getClusterNominalCounts();
double[] clusterSizes = bestK.getClusterSizes();

  SimpleKMeans clusterer = new SimpleKMeans();
  clusterer.setInitializationMethod(new SelectedTag(SimpleKMeans.KMEANS_PLUS_PLUS,
      SimpleKMeans.TAGS_SELECTION));
  clusterer.setNumExecutionSlots(numSlots);
  clusterer.setNumClusters(numProductCentroids);
  clusterer.setMaxIterations(maxIterations);
  clusterer.setSeed(j + 1);
  clusterer.buildClusterer(datasets[i]);
  double SSE = clusterer.getSquaredError();
  if (SSE < minSSE) {
    minSSE = SSE;
System.out.println("Mininum SSE: " + minSSE + " Seed: " + bestClusterer.getSeed());
System.out.println("Saving best sub-quantizer in file..");
Instances clusterCentroids = bestClusterer.getClusterCentroids();
for (int j = 0; j < clusterCentroids.numInstances(); j++) {
  Instance centroid = clusterCentroids.instance(j);

long start = System.currentTimeMillis();
SimpleKMeans clusterer = new SimpleKMeans();
if (kMeansPlusPlus) {
  clusterer.setInitializationMethod(new SelectedTag(SimpleKMeans.KMEANS_PLUS_PLUS,
      SimpleKMeans.TAGS_SELECTION));
clusterer.setDebug(true);
clusterer.setSeed(seed);
clusterer.setNumClusters(numClusters);
clusterer.setMaxIterations(maxIterations);
clusterer.setNumExecutionSlots(numSlots);
clusterer.setFastDistanceCalc(true);
clusterer.buildClusterer(data);
Instances clusterCentroids = clusterer.getClusterCentroids();
for (int j = 0; j < clusterCentroids.numInstances(); j++) {
  Instance centroid = clusterCentroids.instance(j);

private void trainModel(Map<Long, Double> metricData) throws Exception {
  //Model has a single metric_value attribute
  Attribute value = new Attribute("metric_value");
  FastVector attributes = new FastVector();
  attributes.addElement(value);
  trainingData = new Instances("metric_value_data", attributes, 0);
  for (Double val : metricData.values()) {
    double[] valArray = new double[] { val };
    Instance instance = new Instance(1.0, valArray);
    trainingData.add(instance);
  }
  //Create and train the model
  model = new SimpleKMeans();
  model.setNumClusters(k);
  model.setMaxIterations(20);
  model.setPreserveInstancesOrder(true);
  model.buildClusterer(trainingData);
  clusterCentroids = model.getClusterCentroids();
  centroidAssignments = model.getAssignments();
  setMeanDistancesToCentroids();
}

getCapabilities().testWithFail(data);
m_FullMeansOrMediansOrModes = moveCentroid(0, instances, true, false);
Random RandomO = new Random(getSeed());
int instIndex;
HashMap<DecisionTableHashKey, Integer> initC =
 m_canopyClusters = new Canopy();
 m_canopyClusters.setNumClusters(m_NumClusters);
 m_canopyClusters.setSeed(getSeed());
 m_canopyClusters.setT2(getCanopyT2());
 m_canopyClusters.setT1(getCanopyT1());
 m_canopyClusters
  .setMaxNumCandidateCanopiesToHoldInMemory(getCanopyMaxNumCanopiesToHoldInMemory());
 m_canopyClusters.setPeriodicPruningRate(getCanopyPeriodicPruningRate());
 m_canopyClusters.setMinimumCanopyDensity(getCanopyMinimumCanopyDensity());
 m_canopyClusters.setDebug(getDebug());
 m_canopyClusters.buildClusterer(initInstances);
 kMeansPlusPlusInit(initInstances);
 canopyInit(initInstances);
 farthestFirstInit(initInstances);
m_ClusterNominalCounts = new double[m_NumClusters][instances.numAttributes()][0];
m_ClusterMissingCounts = new double[m_NumClusters][instances.numAttributes()];
startExecutorPool();

 setInitializationMethod(new SelectedTag(Integer.parseInt(initM),
  TAGS_SELECTION));
 setCanopyMaxNumCanopiesToHoldInMemory(Integer.parseInt(temp));
 setCanopyPeriodicPruningRate(Integer.parseInt(temp));
 setCanopyMinimumCanopyDensity(Double.parseDouble(temp));
 setCanopyT2(Double.parseDouble(temp));
 setCanopyT1(Double.parseDouble(temp));
 setNumClusters(Integer.parseInt(optionString));
 setMaxIterations(Integer.parseInt(optionString));
 distFunctionClassSpec[0] = "";
 setDistanceFunction((DistanceFunction) Utils.forName(
  DistanceFunction.class, className, distFunctionClassSpec));
} else {
 setDistanceFunction(new EuclideanDistance());
 setNumExecutionSlots(Integer.parseInt(slotsS));

SimpleKMeans localKMeans = new SimpleKMeans();
try {
 localKMeans.setNumClusters(numClusters);
 localKMeans.setInitializationMethod(new SelectedTag(
  SimpleKMeans.KMEANS_PLUS_PLUS, SimpleKMeans.TAGS_SELECTION));
 localKMeans.buildClusterer(sketchForRun);
 finalStartPointsForRuns.add(localKMeans.getClusterCentroids());
} catch (Exception ex) {
 throw new DistributedWekaException(ex);

 You can implement k-means algorithm as:
SimpleKMeans kmeans = new SimpleKMeans();

kmeans.setSeed(10);

// This is the important parameter to set
kmeans.setPreserveInstancesOrder(true);
kmeans.setNumClusters(numberOfClusters);
kmeans.buildClusterer(instances);

// This array returns the cluster number (starting with 0) for each instance
// The array has as many elements as the number of instances
int[] assignments = kmeans.getAssignments();

int i=0;
for(int clusterNum : assignments) {
System.out.printf("Instance %d -> Cluster %d", i, clusterNum);
i++;
}

 Instances instances = new Instances("iris.arff");
SimpleKMeans simpleKMeans = new SimpleKMeans();

// build clusterer
simpleKMeans.setPreservationOrder(true);
simpleKMeans.setNumClusters(2);
simpleKMeans.buildClusterer(instances);

ClusterEvaluation eval = new ClusterEvaluation();
eval.setClusterer(simpleKMeans);
eval.evaluateClusterer(instances);

System.out.println("Cluster Evaluation: "+eval.clusterResultsToString());

/**
 * Default constructor. 
 */
public ClusteringBased() {
  super(new LabelPowerset(new J48()));
  try {
    SimpleKMeans kmeans = new SimpleKMeans();
    kmeans.setNumClusters(5);
    kmeans.setDistanceFunction(new EuclideanDistance());
    clusterer = kmeans;
  } catch (Exception ex) {
    Logger.getLogger(ClusteringBased.class.getName()).log(Level.SEVERE, null, ex);
  }
}

/**
 * Initialize with the canopy centers of the Canopy clustering method
 * 
 * @param data the training data
 * @throws Exception if a problem occurs
 */
protected void canopyInit(Instances data) throws Exception {
 if (m_canopyClusters == null) {
  m_canopyClusters = new Canopy();
  m_canopyClusters.setNumClusters(m_NumClusters);
  m_canopyClusters.setSeed(getSeed());
  m_canopyClusters.setT2(getCanopyT2());
  m_canopyClusters.setT1(getCanopyT1());
  m_canopyClusters
   .setMaxNumCandidateCanopiesToHoldInMemory(getCanopyMaxNumCanopiesToHoldInMemory());
  m_canopyClusters.setPeriodicPruningRate(getCanopyPeriodicPruningRate());
  m_canopyClusters.setMinimumCanopyDensity(getCanopyMinimumCanopyDensity());
  m_canopyClusters.setDebug(getDebug());
  m_canopyClusters.buildClusterer(data);
 }
 m_ClusterCentroids = m_canopyClusters.getCanopies();
}

/**
 * the default constructor.
 */
public SimpleKMeans() {
 super();
 m_SeedDefault = 10;
 setSeed(m_SeedDefault);
}

Random randomO = new Random(getSeed());
HashMap<DecisionTableHashKey, String> initC =
 new HashMap<DecisionTableHashKey, String>();

double bestSqE = Double.MAX_VALUE;
for (i = 0; i < m_NumKMeansRuns; i++) {
 SimpleKMeans sk = new SimpleKMeans();
 sk.setSeed(m_rr.nextInt());
 sk.setNumClusters(m_num_clusters);
 sk.setNumExecutionSlots(m_executionSlots);
 sk.setDisplayStdDevs(true);
 sk.setDoNotCheckCapabilities(true);
 sk.setDontReplaceMissingValues(true);
 sk.buildClusterer(inst);
 if (sk.getSquaredError() < bestSqE) {
  bestSqE = sk.getSquaredError();
  bestK = sk;
m_num_clusters = bestK.numberOfClusters();
m_weights = new double[inst.numInstances()][m_num_clusters];
m_model = new DiscreteEstimator[m_num_clusters][m_num_attribs];
m_priorsPrev = new double[m_num_clusters];
Instances centers = bestK.getClusterCentroids();
Instances stdD = bestK.getClusterStandardDevs();
double[][][] nominalCounts = bestK.getClusterNominalCounts();
double[] clusterSizes = bestK.getClusterSizes();

private void trainModel(Map<Long, Double> metricData) throws Exception {
  //Model has a single metric_value attribute
  Attribute value = new Attribute("metric_value");
  FastVector attributes = new FastVector();
  attributes.addElement(value);
  trainingData = new Instances("metric_value_data", attributes, 0);
  for (Double val : metricData.values()) {
    double[] valArray = new double[] { val };
    Instance instance = new Instance(1.0, valArray);
    trainingData.add(instance);
  }
  //Create and train the model
  model = new SimpleKMeans();
  model.setNumClusters(k);
  model.setMaxIterations(20);
  model.setPreserveInstancesOrder(true);
  model.buildClusterer(trainingData);
  clusterCentroids = model.getClusterCentroids();
  centroidAssignments = model.getAssignments();
  setMeanDistancesToCentroids();
}

getCapabilities().testWithFail(data);
m_FullMeansOrMediansOrModes = moveCentroid(0, instances, true, false);
Random RandomO = new Random(getSeed());
int instIndex;
HashMap<DecisionTableHashKey, Integer> initC =
 m_canopyClusters = new Canopy();
 m_canopyClusters.setNumClusters(m_NumClusters);
 m_canopyClusters.setSeed(getSeed());
 m_canopyClusters.setT2(getCanopyT2());
 m_canopyClusters.setT1(getCanopyT1());
 m_canopyClusters
  .setMaxNumCandidateCanopiesToHoldInMemory(getCanopyMaxNumCanopiesToHoldInMemory());
 m_canopyClusters.setPeriodicPruningRate(getCanopyPeriodicPruningRate());
 m_canopyClusters.setMinimumCanopyDensity(getCanopyMinimumCanopyDensity());
 m_canopyClusters.setDebug(getDebug());
 m_canopyClusters.buildClusterer(initInstances);
 kMeansPlusPlusInit(initInstances);
 canopyInit(initInstances);
 farthestFirstInit(initInstances);
m_ClusterNominalCounts = new double[m_NumClusters][instances.numAttributes()][0];
m_ClusterMissingCounts = new double[m_NumClusters][instances.numAttributes()];
startExecutorPool();

Javadoc

Cluster data using the k means algorithm. Can use either the Euclidean distance (default) or the Manhattan distance. If the Manhattan distance is used, then centroids are computed as the component-wise median rather than mean. For more information see:

D. Arthur, S. Vassilvitskii: k-means++: the advantages of carefull seeding. In: Proceedings of the eighteenth annual ACM-SIAM symposium on Discrete algorithms, 1027-1035, 2007.

BibTeX:

 
@inproceedings{Arthur2007, 
author = {D. Arthur and S. Vassilvitskii}, 
booktitle = {Proceedings of the eighteenth annual ACM-SIAM symposium on Discrete algorithms}, 
pages = {1027-1035}, 
title = {k-means++: the advantages of carefull seeding}, 
year = {2007} 
}

Valid options are:

 
-N <num> 
Number of clusters. 
(default 2).

 
-init 
Initialization method to use. 
0 = random, 1 = k-means++, 2 = canopy, 3 = farthest first. 
(default = 0)

 
-C 
Use canopies to reduce the number of distance calculations.

 
-max-candidates <num> 
Maximum number of candidate canopies to retain in memory 
at any one time when using canopy clustering. 
T2 distance plus, data characteristics, 
will determine how many candidate canopies are formed before 
periodic and final pruning are performed, which might result 
in exceess memory consumption. This setting avoids large numbers 
of candidate canopies consuming memory. (default = 100)

 
-periodic-pruning <num> 
How often to prune low density canopies when using canopy clustering.  
(default = every 10,000 training instances)

 
-min-density 
Minimum canopy density, when using canopy clustering, below which 
a canopy will be pruned during periodic pruning. (default = 2 instances)

 
-t2 
The T2 distance to use when using canopy clustering. Values < 0 indicate that 
a heuristic based on attribute std. deviation should be used to set this. 
(default = -1.0)

 
-t1 
The T1 distance to use when using canopy clustering. A value < 0 is taken as a 
positive multiplier for T2. (default = -1.5)

 
-V 
Display std. deviations for centroids.

 
-M 
Don't replace missing values with mean/mode.

 
-A <classname and options> 
Distance function to use. 
(default: weka.core.EuclideanDistance)

 
-I <num> 
Maximum number of iterations.

 
-O 
Preserve order of instances.

 
-fast 
Enables faster distance calculations, using cut-off values. 
Disables the calculation/output of squared errors/distances.

 
-num-slots <num> 
Number of execution slots. 
(default 1 - i.e. no parallelism)

 
-S <num> 
Random number seed. 
(default 10)

 
-output-debug-info 
If set, clusterer is run in debug mode and 
may output additional info to the console

 
-do-not-check-capabilities 
If set, clusterer capabilities are not checked before clusterer is built 
(use with caution).

Most used methods

<init>
the default constructor.
setNumClusters
set the number of clusters to generate.
buildClusterer
Generates a clusterer. Has to initialize all fields of the clusterer that are not being set via opti
getClusterCentroids
Gets the the cluster centroids.
setInitializationMethod
Set the initialization method to use
setMaxIterations
set the maximum number of iterations to be executed.
setSeed
getSeed
getSquaredError
Gets the squared error for all clusters.
setDistanceFunction
sets the distance function to use for instance comparison.
setNumExecutionSlots
Set the degree of parallelism to use.
canopyInit
Initialize with the canopy centers of the Canopy clustering method

Popular in Java

Start an intent from android
putExtra (Intent)
compareTo (BigDecimal)
getOriginalFilename (MultipartFile)
Return the original filename in the client's filesystem.This may contain path information depending
PrintStream (java.io)
Fake signature of an existing Java class.
String (java.lang)
InetAddress (java.net)
An Internet Protocol (IP) address. This can be either an IPv4 address or an IPv6 address, and in pra
SortedMap (java.util)
A map that has its keys ordered. The sorting is according to either the natural ordering of its keys
AtomicInteger (java.util.concurrent.atomic)
An int value that may be updated atomically. See the java.util.concurrent.atomic package specificati
Option (scala)
Best plugins for Eclipse

How to useSimpleKMeans in weka.clusterers

Best Java code snippets using weka.clusterers.SimpleKMeans (Showing top 20 results out of 315)

How to use
SimpleKMeans
in
weka.clusterers