/**
 * Computes the total clustering cost for a set of centroids.
 *
 * <p>The centroids are loaded into a {@code ProjectionSearch} built with a
 * {@code EuclideanDistanceMeasure} and the constructor arguments {@code 3} and {@code 1}; the
 * summation itself is delegated to the searcher-based overload of this method.
 *
 * @param datapoints iterable of datapoints.
 * @param centroids iterable of centroids.
 * @return the cost reported by the searcher-based overload for these datapoints.
 */
public static double totalClusterCost(Iterable<? extends Vector> datapoints,
                                      Iterable<? extends Vector> centroids) {
  UpdatableSearcher centroidIndex = new ProjectionSearch(new EuclideanDistanceMeasure(), 3, 1);
  centroidIndex.addAll(centroids);
  return totalClusterCost(datapoints, centroidIndex);
}
/**
 * Removes {@code vector} from every projection set, provided the match returned by
 * {@code searchFirst(vector, false)} has a weight strictly below {@code epsilon}.
 *
 * @param vector the vector to remove.
 * @param epsilon exclusive upper bound on the weight of the match for the removal to proceed.
 * @return {@code true} if the vector was removed from all projection sets; {@code false} if the
 *         match's weight was not below {@code epsilon}.
 * @throws RuntimeException if any projection set does not contain the entry being removed.
 */
@Override
public boolean remove(Vector vector, double epsilon) {
  WeightedThing<Vector> closest = searchFirst(vector, false);
  // Expressed as a negated "<" so a NaN weight is rejected exactly as before.
  if (!(closest.getWeight() < epsilon)) {
    return false;
  }

  // NOTE(review): the entry removed is built from the query vector, not from the match found
  // above -- confirm this is intended when epsilon admits non-identical matches.
  Iterator<? extends Vector> basisIterator = basisMatrix.iterator();
  for (TreeMultiset<WeightedThing<Vector>> projectionSet : scalarProjections) {
    WeightedThing<Vector> entry =
        new WeightedThing<Vector>(vector, vector.dot(basisIterator.next()));
    boolean removed = projectionSet.remove(entry);
    if (!removed) {
      throw new RuntimeException("Internal inconsistency in ProjectionSearch");
    }
  }
  return true;
}
/** * Adds a WeightedVector into the set of projections for later searching. * @param vector The WeightedVector to add. */ @Override public void add(Vector vector) { initialize(vector.size()); Vector projection = basisMatrix.times(vector); // Add the the new vector and the projected distance to each set separately. int i = 0; for (TreeMultiset<WeightedThing<Vector>> s : scalarProjections) { s.add(new WeightedThing<Vector>(vector, projection.get(i++))); } int numVectors = scalarProjections.get(0).size(); for (TreeMultiset<WeightedThing<Vector>> s : scalarProjections) { Preconditions.checkArgument(s.size() == numVectors, "Number of vectors in projection sets " + "differ"); double firstWeight = s.firstEntry().getElement().getWeight(); for (WeightedThing<Vector> w : s) { Preconditions.checkArgument(firstWeight <= w.getWeight(), "Weights not in non-decreasing " + "order"); firstWeight = w.getWeight(); } } }
/** * Adds a WeightedVector into the set of projections for later searching. * @param vector The WeightedVector to add. */ @Override public void add(Vector vector) { initialize(vector.size()); Vector projection = basisMatrix.times(vector); // Add the the new vector and the projected distance to each set separately. int i = 0; for (TreeMultiset<WeightedThing<Vector>> s : scalarProjections) { s.add(new WeightedThing<>(vector, projection.get(i++))); } int numVectors = scalarProjections.get(0).size(); for (TreeMultiset<WeightedThing<Vector>> s : scalarProjections) { Preconditions.checkArgument(s.size() == numVectors, "Number of vectors in projection sets " + "differ"); double firstWeight = s.firstEntry().getElement().getWeight(); for (WeightedThing<Vector> w : s) { Preconditions.checkArgument(firstWeight <= w.getWeight(), "Weights not in non-decreasing " + "order"); firstWeight = w.getWeight(); } } }
/**
 * Computes the total clustering cost for a set of centroids.
 *
 * <p>The centroids are loaded into a {@code ProjectionSearch} built with a
 * {@code EuclideanDistanceMeasure} and the constructor arguments {@code 3} and {@code 1}; the
 * summation itself is delegated to the searcher-based overload of this method.
 *
 * @param datapoints iterable of datapoints.
 * @param centroids iterable of centroids.
 * @return the cost reported by the searcher-based overload for these datapoints.
 */
public static double totalClusterCost(Iterable<? extends Vector> datapoints,
                                      Iterable<? extends Vector> centroids) {
  UpdatableSearcher centroidIndex = new ProjectionSearch(new EuclideanDistanceMeasure(), 3, 1);
  centroidIndex.addAll(centroids);
  return totalClusterCost(datapoints, centroidIndex);
}
/**
 * Removes {@code vector} from every projection set, provided the match returned by
 * {@code searchFirst(vector, false)} has a weight strictly below {@code epsilon}.
 *
 * @param vector the vector to remove.
 * @param epsilon exclusive upper bound on the weight of the match for the removal to proceed.
 * @return {@code true} if the vector was removed from all projection sets; {@code false} if the
 *         match's weight was not below {@code epsilon}.
 * @throws RuntimeException if any projection set does not contain the entry being removed.
 */
@Override
public boolean remove(Vector vector, double epsilon) {
  WeightedThing<Vector> closest = searchFirst(vector, false);
  // Expressed as a negated "<" so a NaN weight is rejected exactly as before.
  if (!(closest.getWeight() < epsilon)) {
    return false;
  }

  // NOTE(review): the entry removed is built from the query vector, not from the match found
  // above -- confirm this is intended when epsilon admits non-identical matches.
  Iterator<? extends Vector> basisIterator = basisMatrix.iterator();
  for (TreeMultiset<WeightedThing<Vector>> projectionSet : scalarProjections) {
    WeightedThing<Vector> entry =
        new WeightedThing<Vector>(vector, vector.dot(basisIterator.next()));
    boolean removed = projectionSet.remove(entry);
    if (!removed) {
      throw new RuntimeException("Internal inconsistency in ProjectionSearch");
    }
  }
  return true;
}
/** * Adds a WeightedVector into the set of projections for later searching. * @param vector The WeightedVector to add. */ @Override public void add(Vector vector) { initialize(vector.size()); Vector projection = basisMatrix.times(vector); // Add the the new vector and the projected distance to each set separately. int i = 0; for (TreeMultiset<WeightedThing<Vector>> s : scalarProjections) { s.add(new WeightedThing<Vector>(vector, projection.get(i++))); } int numVectors = scalarProjections.get(0).size(); for (TreeMultiset<WeightedThing<Vector>> s : scalarProjections) { Preconditions.checkArgument(s.size() == numVectors, "Number of vectors in projection sets " + "differ"); double firstWeight = s.firstEntry().getElement().getWeight(); for (WeightedThing<Vector> w : s) { Preconditions.checkArgument(firstWeight <= w.getWeight(), "Weights not in non-decreasing " + "order"); firstWeight = w.getWeight(); } } }
/**
 * Computes the total clustering cost for a set of centroids.
 *
 * <p>The centroids are loaded into a {@code ProjectionSearch} built with a
 * {@code EuclideanDistanceMeasure} and the constructor arguments {@code 3} and {@code 1}; the
 * summation itself is delegated to the searcher-based overload of this method.
 *
 * @param datapoints iterable of datapoints.
 * @param centroids iterable of centroids.
 * @return the cost reported by the searcher-based overload for these datapoints.
 */
public static double totalClusterCost(Iterable<? extends Vector> datapoints,
                                      Iterable<? extends Vector> centroids) {
  UpdatableSearcher centroidIndex = new ProjectionSearch(new EuclideanDistanceMeasure(), 3, 1);
  centroidIndex.addAll(centroids);
  return totalClusterCost(datapoints, centroidIndex);
}
/**
 * Removes {@code vector} from every projection set, provided the match returned by
 * {@code searchFirst(vector, false)} has a weight strictly below {@code epsilon}.
 *
 * @param vector the vector to remove.
 * @param epsilon exclusive upper bound on the weight of the match for the removal to proceed.
 * @return {@code true} if the vector was removed from all projection sets; {@code false} if the
 *         match's weight was not below {@code epsilon}.
 * @throws RuntimeException if any projection set does not contain the entry being removed.
 */
@Override
public boolean remove(Vector vector, double epsilon) {
  WeightedThing<Vector> closest = searchFirst(vector, false);
  // Expressed as a negated "<" so a NaN weight is rejected exactly as before.
  if (!(closest.getWeight() < epsilon)) {
    return false;
  }

  // NOTE(review): the entry removed is built from the query vector, not from the match found
  // above -- confirm this is intended when epsilon admits non-identical matches.
  Iterator<? extends Vector> basisIterator = basisMatrix.iterator();
  for (TreeMultiset<WeightedThing<Vector>> projectionSet : scalarProjections) {
    WeightedThing<Vector> entry = new WeightedThing<>(vector, vector.dot(basisIterator.next()));
    boolean removed = projectionSet.remove(entry);
    if (!removed) {
      throw new RuntimeException("Internal inconsistency in ProjectionSearch");
    }
  }
  return true;
}
/**
 * Summarizes, per cluster, the distances between datapoints and their nearest centroid.
 *
 * <p>One {@code OnlineSummarizer} is created per centroid held by the searcher. Each datapoint
 * is matched to the single nearest result of the searcher; that result is cast to
 * {@code Centroid}, and its {@code getIndex()} selects the summarizer that receives the distance.
 *
 * @param datapoints iterable of datapoints.
 * @param centroids iterable of centroids; search results are cast to {@code Centroid}.
 * @param distanceMeasure the measure used to build the searcher and to compute each recorded
 *                        distance.
 * @return a list of OnlineSummarizers where the i-th element is the summarizer corresponding to
 *         the cluster whose index is i; empty when the searcher holds no centroids.
 */
public static List<OnlineSummarizer> summarizeClusterDistances(
    Iterable<? extends Vector> datapoints,
    Iterable<? extends Vector> centroids,
    DistanceMeasure distanceMeasure) {
  UpdatableSearcher centroidIndex = new ProjectionSearch(distanceMeasure, 3, 1);
  centroidIndex.addAll(centroids);

  List<OnlineSummarizer> perCluster = new ArrayList<>();
  int clusterCount = centroidIndex.size();
  if (clusterCount == 0) {
    // Nothing to search against, so no datapoint can be assigned.
    return perCluster;
  }
  while (perCluster.size() < clusterCount) {
    perCluster.add(new OnlineSummarizer());
  }

  for (Vector point : datapoints) {
    Centroid nearest = (Centroid) centroidIndex.search(point, 1).get(0).getValue();
    perCluster.get(nearest.getIndex()).add(distanceMeasure.distance(point, nearest));
  }
  return perCluster;
}
/**
 * Summarizes, per cluster, the distances between datapoints and their nearest centroid.
 *
 * <p>One {@code OnlineSummarizer} is created per centroid held by the searcher. Each datapoint
 * is matched to the single nearest result of the searcher; that result is cast to
 * {@code Centroid}, and its {@code getIndex()} selects the summarizer that receives the distance.
 *
 * @param datapoints iterable of datapoints.
 * @param centroids iterable of centroids; search results are cast to {@code Centroid}.
 * @param distanceMeasure the measure used to build the searcher and to compute each recorded
 *                        distance.
 * @return a list of OnlineSummarizers where the i-th element is the summarizer corresponding to
 *         the cluster whose index is i; empty when the searcher holds no centroids.
 */
public static List<OnlineSummarizer> summarizeClusterDistances(
    Iterable<? extends Vector> datapoints,
    Iterable<? extends Vector> centroids,
    DistanceMeasure distanceMeasure) {
  UpdatableSearcher centroidIndex = new ProjectionSearch(distanceMeasure, 3, 1);
  centroidIndex.addAll(centroids);

  List<OnlineSummarizer> perCluster = Lists.newArrayList();
  int clusterCount = centroidIndex.size();
  if (clusterCount == 0) {
    // Nothing to search against, so no datapoint can be assigned.
    return perCluster;
  }
  while (perCluster.size() < clusterCount) {
    perCluster.add(new OnlineSummarizer());
  }

  for (Vector point : datapoints) {
    Centroid nearest = (Centroid) centroidIndex.search(point, 1).get(0).getValue();
    perCluster.get(nearest.getIndex()).add(distanceMeasure.distance(point, nearest));
  }
  return perCluster;
}
/**
 * Summarizes, per cluster, the distances between datapoints and their nearest centroid.
 *
 * <p>One {@code OnlineSummarizer} is created per centroid held by the searcher. Each datapoint
 * is matched to the single nearest result of the searcher; that result is cast to
 * {@code Centroid}, and its {@code getIndex()} selects the summarizer that receives the distance.
 *
 * @param datapoints iterable of datapoints.
 * @param centroids iterable of centroids; search results are cast to {@code Centroid}.
 * @param distanceMeasure the measure used to build the searcher and to compute each recorded
 *                        distance.
 * @return a list of OnlineSummarizers where the i-th element is the summarizer corresponding to
 *         the cluster whose index is i; empty when the searcher holds no centroids.
 */
public static List<OnlineSummarizer> summarizeClusterDistances(
    Iterable<? extends Vector> datapoints,
    Iterable<? extends Vector> centroids,
    DistanceMeasure distanceMeasure) {
  UpdatableSearcher centroidIndex = new ProjectionSearch(distanceMeasure, 3, 1);
  centroidIndex.addAll(centroids);

  List<OnlineSummarizer> perCluster = Lists.newArrayList();
  int clusterCount = centroidIndex.size();
  if (clusterCount == 0) {
    // Nothing to search against, so no datapoint can be assigned.
    return perCluster;
  }
  while (perCluster.size() < clusterCount) {
    perCluster.add(new OnlineSummarizer());
  }

  for (Vector point : datapoints) {
    Centroid nearest = (Centroid) centroidIndex.search(point, 1).get(0).getValue();
    perCluster.get(nearest.getIndex()).add(distanceMeasure.distance(point, nearest));
  }
  return perCluster;
}
/**
 * Supplies the parameter sets for this parameterized test.
 *
 * <p>Each set pairs a freshly constructed searcher ({@code ProjectionSearch} or
 * {@code FastProjectionSearch}, both using a {@code SquaredEuclideanDistanceMeasure}) with a
 * boolean flag; both searcher types appear once with {@code true} and once with {@code false}.
 *
 * @return the parameter sets, in the order ProjectionSearch/true, FastProjectionSearch/true,
 *         ProjectionSearch/false, FastProjectionSearch/false.
 */
@Parameters
public static List<Object[]> generateData() {
  Object[][] parameterSets = {
      {new ProjectionSearch(new SquaredEuclideanDistanceMeasure(), NUM_PROJECTIONS, SEARCH_SIZE),
          true},
      {new FastProjectionSearch(new SquaredEuclideanDistanceMeasure(), NUM_PROJECTIONS,
          SEARCH_SIZE), true},
      {new ProjectionSearch(new SquaredEuclideanDistanceMeasure(), NUM_PROJECTIONS, SEARCH_SIZE),
          false},
      {new FastProjectionSearch(new SquaredEuclideanDistanceMeasure(), NUM_PROJECTIONS,
          SEARCH_SIZE), false},
  };
  return Arrays.asList(parameterSets);
}
@Parameterized.Parameters public static List<Object[]> generateData() { RandomUtils.useTestSeed(); Matrix dataPoints = LumpyData.lumpyRandomData(NUM_DATA_POINTS, NUM_DIMENSIONS); Matrix queries = LumpyData.lumpyRandomData(NUM_QUERIES, NUM_DIMENSIONS); DistanceMeasure distanceMeasure = new CosineDistanceMeasure(); Searcher bruteSearcher = new BruteSearch(distanceMeasure); bruteSearcher.addAll(dataPoints); Pair<List<List<WeightedThing<Vector>>>, Long> reference = getResultsAndRuntime(bruteSearcher, queries); Pair<List<WeightedThing<Vector>>, Long> referenceSearchFirst = getResultsAndRuntimeSearchFirst(bruteSearcher, queries); double bruteSearchAvgTime = reference.getSecond() / (queries.numRows() * 1.0); System.out.printf("BruteSearch: avg_time(1 query) %f[s]\n", bruteSearchAvgTime); return Arrays.asList(new Object[][]{ // NUM_PROJECTIONS = 3 // SEARCH_SIZE = 10 {new ProjectionSearch(distanceMeasure, 3, 10), dataPoints, queries, reference, referenceSearchFirst}, {new FastProjectionSearch(distanceMeasure, 3, 10), dataPoints, queries, reference, referenceSearchFirst}, // NUM_PROJECTIONS = 5 // SEARCH_SIZE = 5 {new ProjectionSearch(distanceMeasure, 5, 5), dataPoints, queries, reference, referenceSearchFirst}, {new FastProjectionSearch(distanceMeasure, 5, 5), dataPoints, queries, reference, referenceSearchFirst}, } ); }
/**
 * Supplies the parameter sets for this parameterized test.
 *
 * <p>After switching to the test seed, a single matrix of random data points is generated and
 * shared by every parameter set. Each set pairs that matrix with a different searcher built on
 * a {@code EuclideanDistanceMeasure}.
 *
 * @return one parameter set each for {@code ProjectionSearch}, {@code FastProjectionSearch} and
 *         {@code LocalitySensitiveHashSearch}, in that order.
 */
@Parameterized.Parameters
public static List<Object[]> generateData() {
  RandomUtils.useTestSeed();
  Matrix dataPoints = multiNormalRandomData(NUM_DATA_POINTS, NUM_DIMENSIONS);

  Object[][] parameterSets = {
      {new ProjectionSearch(new EuclideanDistanceMeasure(), NUM_PROJECTIONS, SEARCH_SIZE),
          dataPoints},
      {new FastProjectionSearch(new EuclideanDistanceMeasure(), NUM_PROJECTIONS, SEARCH_SIZE),
          dataPoints},
      {new LocalitySensitiveHashSearch(new EuclideanDistanceMeasure(), SEARCH_SIZE), dataPoints},
  };
  return Arrays.asList(parameterSets);
}