/**
 * Backward-compatible entry point that accepts only a single T1/T2 threshold pair.
 *
 * <p>Delegates to the twelve-argument {@code run} overload, passing {@code t1} and {@code t2}
 * a second time for the two arguments that follow them and {@code 0} for the argument after
 * those.
 *
 * @param conf
 *          the Configuration handed on to the delegate
 * @param input
 *          the Path handed on as the input location
 * @param output
 *          the Path handed on as the output location
 * @param measure
 *          the DistanceMeasure handed on to the delegate
 * @param t1
 *          the double T1 value, used for both the first and third threshold arguments
 * @param t2
 *          the double T2 value, used for both the second and fourth threshold arguments
 * @param runClustering
 *          forwarded unchanged
 * @param clusterClassificationThreshold
 *          forwarded unchanged
 * @param runSequential
 *          forwarded unchanged
 */
public static void run(Configuration conf, Path input, Path output, DistanceMeasure measure, double t1, double t2,
    boolean runClustering, double clusterClassificationThreshold, boolean runSequential)
    throws IOException, InterruptedException, ClassNotFoundException {
  // NOTE(review): presumably these fill the t3 / t4 / clusterFilter slots of the full
  // overload - confirm against its signature.
  double t3 = t1;
  double t4 = t2;
  int clusterFilter = 0;
  run(conf, input, output, measure, t1, t2, t3, t4, clusterFilter, runClustering,
      clusterClassificationThreshold, runSequential);
}
/**
 * Backward-compatible entry point that accepts only a single T1/T2 threshold pair.
 *
 * <p>Delegates to the twelve-argument {@code run} overload, passing {@code t1} and {@code t2}
 * a second time for the two arguments that follow them and {@code 0} for the argument after
 * those.
 *
 * @param conf
 *          the Configuration handed on to the delegate
 * @param input
 *          the Path handed on as the input location
 * @param output
 *          the Path handed on as the output location
 * @param measure
 *          the DistanceMeasure handed on to the delegate
 * @param t1
 *          the double T1 value, used for both the first and third threshold arguments
 * @param t2
 *          the double T2 value, used for both the second and fourth threshold arguments
 * @param runClustering
 *          forwarded unchanged
 * @param clusterClassificationThreshold
 *          forwarded unchanged
 * @param runSequential
 *          forwarded unchanged
 */
public static void run(Configuration conf, Path input, Path output, DistanceMeasure measure, double t1, double t2,
    boolean runClustering, double clusterClassificationThreshold, boolean runSequential)
    throws IOException, InterruptedException, ClassNotFoundException {
  // NOTE(review): presumably these fill the t3 / t4 / clusterFilter slots of the full
  // overload - confirm against its signature.
  double t3 = t1;
  double t4 = t2;
  int clusterFilter = 0;
  run(conf, input, output, measure, t1, t2, t3, t4, clusterFilter, runClustering,
      clusterClassificationThreshold, runSequential);
}
/**
 * Backward-compatible entry point that accepts only a single T1/T2 threshold pair.
 *
 * <p>Delegates to the twelve-argument {@code run} overload, passing {@code t1} and {@code t2}
 * a second time for the two arguments that follow them and {@code 0} for the argument after
 * those.
 *
 * @param conf
 *          the Configuration handed on to the delegate
 * @param input
 *          the Path handed on as the input location
 * @param output
 *          the Path handed on as the output location
 * @param measure
 *          the DistanceMeasure handed on to the delegate
 * @param t1
 *          the double T1 value, used for both the first and third threshold arguments
 * @param t2
 *          the double T2 value, used for both the second and fourth threshold arguments
 * @param runClustering
 *          forwarded unchanged
 * @param clusterClassificationThreshold
 *          forwarded unchanged
 * @param runSequential
 *          forwarded unchanged
 */
public static void run(Configuration conf, Path input, Path output, DistanceMeasure measure, double t1, double t2,
    boolean runClustering, double clusterClassificationThreshold, boolean runSequential)
    throws IOException, InterruptedException, ClassNotFoundException {
  // NOTE(review): presumably these fill the t3 / t4 / clusterFilter slots of the full
  // overload - confirm against its signature.
  double t3 = t1;
  double t4 = t2;
  int clusterFilter = 0;
  run(conf, input, output, measure, t1, t2, t3, t4, clusterFilter, runClustering,
      clusterClassificationThreshold, runSequential);
}
/**
 * Convenience overload that supplies a newly created {@code Configuration}.
 *
 * <p>Builds a directory of Canopy clusters from the input arguments and, if requested,
 * clusters the input vectors using these clusters.
 *
 * @param input
 *          the Path to the directory containing input vectors
 * @param output
 *          the Path for all output directories
 * @param measure
 *          the DistanceMeasure, forwarded unchanged
 * @param t1
 *          the double T1 distance metric
 * @param t2
 *          the double T2 distance metric
 * @param runClustering
 *          cluster the input vectors if true
 * @param clusterClassificationThreshold
 *          vectors having pdf below this value will not be clustered. Its value should be
 *          between 0 and 1.
 * @param runSequential
 *          execute sequentially if true
 */
public static void run(Path input, Path output, DistanceMeasure measure, double t1, double t2,
    boolean runClustering, double clusterClassificationThreshold, boolean runSequential)
    throws IOException, InterruptedException, ClassNotFoundException {
  Configuration freshConf = new Configuration();
  run(freshConf, input, output, measure, t1, t2, runClustering, clusterClassificationThreshold,
      runSequential);
}
/**
 * Convenience overload that supplies a newly created {@code Configuration}.
 *
 * <p>Builds a directory of Canopy clusters from the input arguments and, if requested,
 * clusters the input vectors using these clusters.
 *
 * @param input
 *          the Path to the directory containing input vectors
 * @param output
 *          the Path for all output directories
 * @param measure
 *          the DistanceMeasure, forwarded unchanged
 * @param t1
 *          the double T1 distance metric
 * @param t2
 *          the double T2 distance metric
 * @param runClustering
 *          cluster the input vectors if true
 * @param clusterClassificationThreshold
 *          vectors having pdf below this value will not be clustered. Its value should be
 *          between 0 and 1.
 * @param runSequential
 *          execute sequentially if true
 */
public static void run(Path input, Path output, DistanceMeasure measure, double t1, double t2,
    boolean runClustering, double clusterClassificationThreshold, boolean runSequential)
    throws IOException, InterruptedException, ClassNotFoundException {
  Configuration freshConf = new Configuration();
  run(freshConf, input, output, measure, t1, t2, runClustering, clusterClassificationThreshold,
      runSequential);
}
/**
 * Convenience overload that supplies a newly created {@code Configuration}.
 *
 * <p>Builds a directory of Canopy clusters from the input arguments and, if requested,
 * clusters the input vectors using these clusters.
 *
 * @param input
 *          the Path to the directory containing input vectors
 * @param output
 *          the Path for all output directories
 * @param measure
 *          the DistanceMeasure, forwarded unchanged
 * @param t1
 *          the double T1 distance metric
 * @param t2
 *          the double T2 distance metric
 * @param runClustering
 *          cluster the input vectors if true
 * @param clusterClassificationThreshold
 *          vectors having pdf below this value will not be clustered. Its value should be
 *          between 0 and 1.
 * @param runSequential
 *          execute sequentially if true
 */
public static void run(Path input, Path output, DistanceMeasure measure, double t1, double t2,
    boolean runClustering, double clusterClassificationThreshold, boolean runSequential)
    throws IOException, InterruptedException, ClassNotFoundException {
  Configuration freshConf = new Configuration();
  run(freshConf, input, output, measure, t1, t2, runClustering, clusterClassificationThreshold,
      runSequential);
}
/**
 * Runs the top-level Canopy pass over {@code pointsPath}, writing under {@code outputPath}.
 *
 * <p>Uses a {@code ManhattanDistanceMeasure} with thresholds 3.1 and 2.1.
 * NOTE(review): assuming the nine-argument {@code CanopyDriver.run} overload, the trailing
 * {@code true, 0.0, true} are runClustering, clusterClassificationThreshold and
 * runSequential - confirm.
 *
 * @param pointsPath
 *          passed to the driver as its input path
 * @param conf
 *          the Configuration passed to the driver
 */
private void topLevelClustering(Path pointsPath, Configuration conf)
    throws IOException, InterruptedException, ClassNotFoundException {
  ManhattanDistanceMeasure manhattan = new ManhattanDistanceMeasure();
  double t1 = 3.1;
  double t2 = 2.1;
  CanopyDriver.run(conf, pointsPath, outputPath, manhattan, t1, t2, true, 0.0, true);
}
/**
 * Runs the top-level clustering in two stages: a Canopy pass, then a KMeans pass that starts
 * from the Canopy result.
 *
 * @param pointsPath
 *          passed to both drivers as the input path
 * @param conf
 *          the Configuration passed to both drivers
 */
private void topLevelClustering(Path pointsPath, Configuration conf)
    throws IOException, InterruptedException, ClassNotFoundException {
  // Stage 1: Canopy with Manhattan distance and thresholds 4.0 / 3.0.
  DistanceMeasure manhattan = new ManhattanDistanceMeasure();
  CanopyDriver.run(conf, pointsPath, outputPathForCanopy, manhattan, 4.0, 3.0, true, 0.0, true);

  // Stage 2: KMeans, seeded from the iteration-0 final directory under the Canopy output.
  Path canopyFinalDir = new Path(Cluster.CLUSTERS_DIR + '0' + Cluster.FINAL_ITERATION_SUFFIX);
  Path initialClusters = new Path(outputPathForCanopy, canopyFinalDir);
  KMeansDriver.run(conf, pointsPath, initialClusters, outputPathForKMeans, 1, 1, true, 0.0, true);
}
Path clusterOutput = new Path(outputDir , "clusters"); CanopyDriver.run(vectorsFolder, canopyCentroids, new EuclideanDistanceMeasure(), 250, 120, false, false); KMeansDriver.run(conf, vectorsFolder, new Path(canopyCentroids, "clusters-0"),
// Builds the cluster output location as a plain string under outputDir, then runs CanopyDriver
// with a ManhattanDistanceMeasure and the values 3000.0 / 2000.0 (presumably T1 / T2).
// NOTE(review): vectorsFolder and canopyCentroids look like the input and output locations, and
// the two trailing `false` flags are positional - confirm against the CanopyDriver.run overload in use.
String clusterOutput = outputDir + "/clusters/"; CanopyDriver.run(conf, new Path(vectorsFolder), new Path(canopyCentroids), new ManhattanDistanceMeasure(), 3000.0, 2000.0, false, false);
/**
 * Runs a Canopy pass for every entry of {@code postProcessedClusterDirectories} and checks
 * each result with {@code assertBottomLevelCluster}.
 *
 * <p>For each entry the key is used as the cluster id when resolving the bottom-level output
 * path, and the value is used as the input path of the Canopy run.
 *
 * @param postProcessedClusterDirectories
 *          map from cluster id to the path holding that cluster's data
 */
private void bottomLevelClustering(Map<String,Path> postProcessedClusterDirectories)
    throws IOException, InterruptedException, ClassNotFoundException {
  for (Entry<String,Path> entry : postProcessedClusterDirectories.entrySet()) {
    String topLevelId = entry.getKey();
    Path topLevelDir = entry.getValue();
    Path bottomLevelDir = PathDirectory.getBottomLevelClusterPath(outputPath, topLevelId);

    // A new measure instance is created for every run, as before.
    ManhattanDistanceMeasure manhattan = new ManhattanDistanceMeasure();
    CanopyDriver.run(conf, topLevelDir, bottomLevelDir, manhattan, 2.1, 2.0, true, 0.0, true);

    assertBottomLevelCluster(bottomLevelDir);
  }
}
/**
 * Runs a Canopy pass over {@code pointsPath} into {@code clusteringOutputPath}, then writes a
 * {@code CanopyClusteringPolicy} into the {@code clusters-0-final} directory of that output.
 *
 * @param pointsPath
 *          passed to the driver as its input path
 * @param conf
 *          the Configuration passed to the driver
 * @param runSequential
 *          forwarded as the last driver argument; it is unboxed at the call
 */
private void runClustering(Path pointsPath, Configuration conf, Boolean runSequential)
    throws IOException, InterruptedException, ClassNotFoundException {
  ManhattanDistanceMeasure manhattan = new ManhattanDistanceMeasure();
  CanopyDriver.run(conf, pointsPath, clusteringOutputPath, manhattan, 3.1, 2.1, false, 0.0,
      runSequential);

  Path finalClusters = new Path(clusteringOutputPath, "clusters-0-final");
  CanopyClusteringPolicy policy = new CanopyClusteringPolicy();
  ClusterClassifier.writePolicy(policy, finalClusters);
}
/** Story: User can cluster points using sequential execution */ @Test public void testClusteringManhattanSeq() throws Exception { List<VectorWritable> points = getPointsWritable(); Configuration config = getConfiguration(); ClusteringTestUtils.writePointsToFile(points, getTestTempFilePath("testdata/file1"), fs, config); // now run the Canopy Driver in sequential mode Path output = getTestTempDirPath("output"); CanopyDriver.run(config, getTestTempDirPath("testdata"), output, manhattanDistanceMeasure, 3.1, 2.1, true, 0.0, true); // verify output from sequence file Path path = new Path(output, "clusters-0-final/part-r-00000"); int ix = 0; for (ClusterWritable clusterWritable : new SequenceFileValueIterable<ClusterWritable>(path, true, config)) { assertEquals("Center [" + ix + ']', manhattanCentroids.get(ix), clusterWritable.getValue() .getCenter()); ix++; } path = new Path(output, "clusteredPoints/part-m-0"); long count = HadoopUtil.countRecords(path, config); assertEquals("number of points", points.size(), count); }
/** * Story: User can produce final point clustering using a Hadoop map/reduce * job and a ManhattanDistanceMeasure. */ @Test public void testClusteringManhattanMR() throws Exception { List<VectorWritable> points = getPointsWritable(); Configuration conf = getConfiguration(); ClusteringTestUtils.writePointsToFile(points, true, getTestTempFilePath("testdata/file1"), fs, conf); ClusteringTestUtils.writePointsToFile(points, true, getTestTempFilePath("testdata/file2"), fs, conf); // now run the Job Path output = getTestTempDirPath("output"); CanopyDriver.run(conf, getTestTempDirPath("testdata"), output, manhattanDistanceMeasure, 3.1, 2.1, true, 0.0, false); Path path = new Path(output, "clusteredPoints/part-m-00000"); long count = HadoopUtil.countRecords(path, conf); assertEquals("number of points", points.size(), count); }
// Parses the OUTLIER_THRESHOLD option into clusterClassificationThreshold, hands every collected
// setting to the twelve-argument run(...), then returns 0.
// NOTE(review): 0 is presumably the success status of the enclosing method - confirm.
clusterClassificationThreshold = Double.parseDouble(getOption(DefaultOptionCreator.OUTLIER_THRESHOLD)); run(conf, input, output, measure, t1, t2, t3, t4, clusterFilter, runClustering, clusterClassificationThreshold, runSequential); return 0;
// Canopy pass over pointsPath into outputPath with a ManhattanDistanceMeasure and thresholds 3.1 / 2.1.
// NOTE(review): assuming the nine-argument overload, the trailing false, 0.0, false are
// runClustering, clusterClassificationThreshold and runSequential - confirm.
CanopyDriver.run(conf, pointsPath, outputPath, new ManhattanDistanceMeasure(), 3.1, 2.1, false, 0.0, false);
// Parses the OUTLIER_THRESHOLD option into clusterClassificationThreshold, hands every collected
// setting to the twelve-argument run(...), then returns 0.
// NOTE(review): 0 is presumably the success status of the enclosing method - confirm.
clusterClassificationThreshold = Double.parseDouble(getOption(DefaultOptionCreator.OUTLIER_THRESHOLD)); run(conf, input, output, measure, t1, t2, t3, t4, clusterFilter, runClustering, clusterClassificationThreshold, runSequential); return 0;
// Parses the OUTLIER_THRESHOLD option into clusterClassificationThreshold, hands every collected
// setting to the twelve-argument run(...), then returns 0.
// NOTE(review): 0 is presumably the success status of the enclosing method - confirm.
clusterClassificationThreshold = Double.parseDouble(getOption(DefaultOptionCreator.OUTLIER_THRESHOLD)); run(conf, input, output, measure, t1, t2, t3, t4, clusterFilter, runClustering, clusterClassificationThreshold, runSequential); return 0;
// Canopy pass over the "testdata" temp directory into output, using manhattanDistanceMeasure
// and thresholds 3.1 / 2.1.
// NOTE(review): assuming the nine-argument overload, the trailing false, 0.0, false are
// runClustering, clusterClassificationThreshold and runSequential - confirm.
CanopyDriver.run(config, getTestTempDirPath("testdata"), output, manhattanDistanceMeasure, 3.1, 2.1, false, 0.0, false);
// Canopy pass over the "testdata" temp directory into output, using euclideanDistanceMeasure
// and thresholds 3.1 / 2.1.
// NOTE(review): assuming the nine-argument overload, the trailing false, 0.0, false are
// runClustering, clusterClassificationThreshold and runSequential - confirm.
CanopyDriver.run(config, getTestTempDirPath("testdata"), output, euclideanDistanceMeasure, 3.1, 2.1, false, 0.0, false);