/**
 * @param evalData points to cluster for evaluation
 * @return cluster IDs as keys, and the points assigned to each cluster as values
 */
private JavaPairRDD<Integer,Iterable<double[]>> fetchClusteredPoints(JavaRDD<? extends Vector> evalData) {
  return evalData.mapToPair(vector -> {
    double closestDist = Double.POSITIVE_INFINITY;
    int minClusterID = Integer.MIN_VALUE;
    double[] vec = vector.toArray();
    DistanceFn<double[]> distanceFn = getDistanceFn();
    Map<Integer,ClusterInfo> clusters = getClustersByID();
    for (ClusterInfo cluster : clusters.values()) {
      double distance = distanceFn.applyAsDouble(cluster.getCenter(), vec);
      if (distance < closestDist) {
        closestDist = distance;
        minClusterID = cluster.getID();
      }
    }
    Preconditions.checkState(!Double.isInfinite(closestDist) && !Double.isNaN(closestDist));
    return new Tuple2<>(minClusterID, vec);
  }).groupByKey();
}
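The DistanceFn returned by getDistanceFn() is not shown in these excerpts. A minimal sketch of the shape the applyAsDouble call implies (a ToDoubleBiFunction over pairs of double[]), here as a hypothetical squared-Euclidean implementation:

// Hypothetical sketch, assuming DistanceFn<T> extends java.util.function.ToDoubleBiFunction<T,T>
DistanceFn<double[]> squaredEuclidean = (a, b) -> {
  double total = 0.0;
  for (int i = 0; i < a.length; i++) {
    double diff = a[i] - b[i];
    total += diff * diff;   // accumulate squared per-dimension differences
  }
  return total;
};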
@Test
@SuppressWarnings("unchecked")
public void testUniformVectorRDD() {
  long m = 100L;
  int n = 10;
  int p = 2;
  long seed = 1L;
  JavaRDD<Vector> rdd1 = uniformJavaVectorRDD(jsc, m, n);
  JavaRDD<Vector> rdd2 = uniformJavaVectorRDD(jsc, m, n, p);
  JavaRDD<Vector> rdd3 = uniformJavaVectorRDD(jsc, m, n, p, seed);
  for (JavaRDD<Vector> rdd : Arrays.asList(rdd1, rdd2, rdd3)) {
    Assert.assertEquals(m, rdd.count());
    Assert.assertEquals(n, rdd.first().size());
  }
}
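This test, and the testNormalVectorRDD variants below, presumably rely on static imports of the RandomRDDs factory methods, e.g.:

import static org.apache.spark.mllib.random.RandomRDDs.uniformJavaVectorRDD;
import static org.apache.spark.mllib.random.RandomRDDs.normalJavaVectorRDD;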
JavaRDD<String> data = jsc.textFile(path).cache();
// Parse each CSV line into a labeled point: first column is the label, the rest are features.
JavaRDD<LabeledPoint> samples = data.map(
    new Function<String, LabeledPoint>() {
      public LabeledPoint call(String line) {
        String[] parts = line.split(",");
        if (parts.length < 2) {
          return null;  // malformed line
        }
        double[] features = new double[parts.length - 1];
        for (int i = 1; i < parts.length; i++) {
          features[i - 1] = Double.parseDouble(parts[i]);
        }
        return new LabeledPoint(Double.parseDouble(parts[0]), Vectors.dense(features));
      }
    }).cache();
// Keep only the lines that parsed successfully
JavaRDD<LabeledPoint> parsedData = samples.filter(p -> p != null);
LassoWithSGD algorithm = new LassoWithSGD();
final LassoModel model = algorithm.run(JavaRDD.toRDD(parsedData));
System.out.println("Coefficients: " + Arrays.toString(model.weights().toArray()));
System.out.println("Intercept: " + model.intercept());
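The fitted model could then be evaluated against the training labels, for instance via mean squared error. A minimal sketch, assuming the parsedData RDD built above:

double mse = parsedData.mapToDouble(p -> {
  double err = model.predict(p.features()) - p.label();
  return err * err;                  // squared error per example
}).mean();
System.out.println("Training Mean Squared Error: " + mse);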
@Test
public void tfIdf() {
  // The tests are to check Java compatibility.
  HashingTF tf = new HashingTF();
  @SuppressWarnings("unchecked")
  JavaRDD<List<String>> documents = jsc.parallelize(Arrays.asList(
      Arrays.asList("this is a sentence".split(" ")),
      Arrays.asList("this is another sentence".split(" ")),
      Arrays.asList("this is still a sentence".split(" "))), 2);
  JavaRDD<Vector> termFreqs = tf.transform(documents);
  termFreqs.collect();
  IDF idf = new IDF();
  JavaRDD<Vector> tfIdfs = idf.fit(termFreqs).transform(termFreqs);
  List<Vector> localTfIdfs = tfIdfs.collect();
  int indexOfThis = tf.indexOf("this");
  for (Vector v : localTfIdfs) {
    Assert.assertEquals(0.0, v.apply(indexOfThis), 1e-15);
  }
}
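The final assertion follows from Spark's IDF formula: idf(t) = log((m + 1) / (df(t) + 1)), where m is the number of documents and df(t) the number of documents containing term t. Since "this" appears in all three documents, idf("this") = log(4/4) = 0, so its TF-IDF weight is zero in every document.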
private static List<IntLongHashMap> treeNodeExampleCounts(JavaRDD<? extends LabeledPoint> trainPointData,
                                                          RandomForestModel model) {
  return trainPointData.mapPartitions(data -> {
    DecisionTreeModel[] trees = model.trees();
    // One node-ID-to-count map per tree in the forest
    List<IntLongHashMap> treeNodeIDCounts = IntStream.range(0, trees.length).
        mapToObj(i -> new IntLongHashMap()).collect(Collectors.toList());
    data.forEachRemaining(datum -> {
      double[] featureVector = datum.features().toArray();
      for (int i = 0; i < trees.length; i++) {
        DecisionTreeModel tree = trees[i];
        IntLongHashMap nodeIDCount = treeNodeIDCounts.get(i);
        org.apache.spark.mllib.tree.model.Node node = tree.topNode();
        // This logic cloned from Node.predict:
        while (!node.isLeaf()) {
          nodeIDCount.addToValue(node.id(), 1);
          Split split = node.split().get();
          node = nextNode(featureVector, node, split, split.feature());
        }
        nodeIDCount.addToValue(node.id(), 1);  // count the leaf as well
      }
    });
    return Collections.singleton(treeNodeIDCounts).iterator();
  }).reduce((a, b) -> {
    Preconditions.checkArgument(a.size() == b.size());
    for (int i = 0; i < a.size(); i++) {
      a.set(i, merge(a.get(i), b.get(i)));  // merge per-tree counts element-wise
    }
    return a;
  });
}
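The merge helper used in the reduce above (and referenced as RDFUpdate::merge in the predictorExampleCounts snippet below) is not shown in these excerpts. A minimal sketch, assuming Eclipse Collections' IntLongHashMap as the addToValue calls suggest:

private static IntLongHashMap merge(IntLongHashMap a, IntLongHashMap b) {
  // Fold each of b's (key, count) entries into a
  b.forEachKeyValue(a::addToValue);
  return a;
}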
private ClusteringModel pmmlClusteringModel(KMeansModel model, Map<Integer,Long> clusterSizesMap) {
  Vector[] clusterCenters = model.clusterCenters();
  List<ClusteringField> clusteringFields = new ArrayList<>();
  for (int i = 0; i < inputSchema.getNumFeatures(); i++) {
    if (inputSchema.isActive(i)) {
      FieldName fieldName = FieldName.create(inputSchema.getFeatureNames().get(i));
      ClusteringField clusteringField =
          new ClusteringField(fieldName).setCenterField(ClusteringField.CenterField.TRUE);
      clusteringFields.add(clusteringField);
    }
  }
  List<Cluster> clusters = new ArrayList<>(clusterCenters.length);
  for (int i = 0; i < clusterCenters.length; i++) {
    clusters.add(new Cluster().setId(Integer.toString(i))
        .setSize(clusterSizesMap.get(i).intValue())
        .setArray(AppPMMLUtils.toArray(clusterCenters[i].toArray())));
  }
  return new ClusteringModel(
      MiningFunction.CLUSTERING,
      ClusteringModel.ModelClass.CENTER_BASED,
      clusters.size(),
      AppPMMLUtils.buildMiningSchema(inputSchema),
      new ComparisonMeasure(ComparisonMeasure.Kind.DISTANCE).setMeasure(new SquaredEuclidean()),
      clusteringFields,
      clusters);
}
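To serialize the result, the ClusteringModel still has to be embedded in a PMML document. A minimal sketch, assuming the JPMML-model API used above, with the version string and a dataDictionary built elsewhere both as assumptions:

PMML pmml = new PMML("4.3", new Header(), dataDictionary);
pmml.addModels(pmmlClusteringModel(model, clusterSizesMap));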
@Test
public void testConvertVectorColumnsToAndFromML() {
  Vector x = Vectors.dense(2.0);
  Dataset<Row> dataset = spark.createDataFrame(
      Collections.singletonList(new LabeledPoint(1.0, x)), LabeledPoint.class
  ).select("label", "features");
  Dataset<Row> newDataset1 = MLUtils.convertVectorColumnsToML(dataset);
  Row new1 = newDataset1.first();
  Assert.assertEquals(RowFactory.create(1.0, x.asML()), new1);
  Row new2 = MLUtils.convertVectorColumnsToML(dataset, "features").first();
  Assert.assertEquals(new1, new2);
  Row old1 = MLUtils.convertVectorColumnsFromML(newDataset1).first();
  Assert.assertEquals(RowFactory.create(1.0, x), old1);
}
@Test
@SuppressWarnings("unchecked")
public void testNormalVectorRDD() {
  long m = 100L;
  int n = 10;
  int p = 2;
  long seed = 1L;
  JavaRDD<Vector> rdd1 = normalJavaVectorRDD(jsc, m, n);
  JavaRDD<Vector> rdd2 = normalJavaVectorRDD(jsc, m, n, p);
  JavaRDD<Vector> rdd3 = normalJavaVectorRDD(jsc, m, n, p, seed);
  for (JavaRDD<Vector> rdd : Arrays.asList(rdd1, rdd2, rdd3)) {
    Assert.assertEquals(m, rdd.count());
    Assert.assertEquals(n, rdd.first().size());
  }
}
/**
 * @param trainPointData data to run down trees
 * @param model random decision forest model to count on
 * @return map of predictor index to the number of training examples that reached a
 *  node whose decision is based on that feature. The index is among predictors, not all
 *  features, since there are fewer predictors than features. That is, the index will
 *  match the one used in the {@link RandomForestModel}.
 */
private static IntLongHashMap predictorExampleCounts(JavaRDD<? extends LabeledPoint> trainPointData,
                                                     RandomForestModel model) {
  return trainPointData.mapPartitions(data -> {
    IntLongHashMap featureIndexCount = new IntLongHashMap();
    data.forEachRemaining(datum -> {
      double[] featureVector = datum.features().toArray();
      for (DecisionTreeModel tree : model.trees()) {
        org.apache.spark.mllib.tree.model.Node node = tree.topNode();
        // This logic cloned from Node.predict:
        while (!node.isLeaf()) {
          Split split = node.split().get();
          int featureIndex = split.feature();
          // Count feature
          featureIndexCount.addToValue(featureIndex, 1);
          node = nextNode(featureVector, node, split, featureIndex);
        }
      }
    });
    return Collections.singleton(featureIndexCount).iterator();
  }).reduce(RDFUpdate::merge);
}
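The nextNode helper is likewise not shown. Since the traversal is "cloned from Node.predict", a sketch following that logic (continuous splits compare the feature value against the threshold, categorical splits test category membership) might be:

private static org.apache.spark.mllib.tree.model.Node nextNode(
    double[] featureVector,
    org.apache.spark.mllib.tree.model.Node node,
    Split split,
    int featureIndex) {
  double featureValue = featureVector[featureIndex];
  if (split.featureType().equals(FeatureType.Continuous())) {
    // Continuous feature: go left iff the value is at most the split threshold
    return featureValue <= split.threshold() ? node.leftNode().get() : node.rightNode().get();
  }
  // Categorical feature: go left iff the category is in the split's category set
  return split.categories().contains(featureValue) ? node.leftNode().get() : node.rightNode().get();
}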
static void debug(String record, Vector v) {
  THE_LOGGER.info("DEBUG started:");
  double[] d = v.toArray();
  StringBuilder builder = new StringBuilder();
  builder.append("DEBUG[record=");
  builder.append(record);
  builder.append("]:");
  for (int i = 0; i < d.length; i++) {
    builder.append("\t");
    builder.append(d[i]);
  }
  THE_LOGGER.info(builder.toString());
}
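For example, a call like the following (with a hypothetical record ID) logs the vector's components tab-separated on one line:

debug("record-1", Vectors.dense(1.0, 2.0, 3.0));
// logs: DEBUG[record=record-1]:	1.0	2.0	3.0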
@Test
public void tfIdfMinimumDocumentFrequency() {
  // The tests are to check Java compatibility.
  HashingTF tf = new HashingTF();
  @SuppressWarnings("unchecked")
  JavaRDD<List<String>> documents = jsc.parallelize(Arrays.asList(
      Arrays.asList("this is a sentence".split(" ")),
      Arrays.asList("this is another sentence".split(" ")),
      Arrays.asList("this is still a sentence".split(" "))), 2);
  JavaRDD<Vector> termFreqs = tf.transform(documents);
  termFreqs.collect();
  IDF idf = new IDF(2);
  JavaRDD<Vector> tfIdfs = idf.fit(termFreqs).transform(termFreqs);
  List<Vector> localTfIdfs = tfIdfs.collect();
  int indexOfThis = tf.indexOf("this");
  for (Vector v : localTfIdfs) {
    Assert.assertEquals(0.0, v.apply(indexOfThis), 1e-15);
  }
}
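Here IDF(2) sets minDocFreq = 2, zeroing the IDF of any term that appears in fewer than two documents. "this" appears in all three documents, so it passes that filter, and its IDF is still log(4/4) = 0, which is why the assertion is unchanged from the previous test.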
/**
 * @param evalData points to cluster for evaluation
 * @return cluster IDs as keys, and metrics for each cluster like the count, sum of distances
 *  to centroid, and sum of squared distances
 */
JavaPairRDD<Integer,ClusterMetric> fetchClusterMetrics(JavaRDD<Vector> evalData) {
  return evalData.mapToPair(vector -> {
    double closestDist = Double.POSITIVE_INFINITY;
    int minClusterID = Integer.MIN_VALUE;
    double[] vec = vector.toArray();
    for (ClusterInfo cluster : clusters.values()) {
      double distance = distanceFn.applyAsDouble(cluster.getCenter(), vec);
      if (distance < closestDist) {
        closestDist = distance;
        minClusterID = cluster.getID();
      }
    }
    Preconditions.checkState(!Double.isInfinite(closestDist) && !Double.isNaN(closestDist));
    return new Tuple2<>(minClusterID, new ClusterMetric(1L, closestDist, closestDist * closestDist));
  }).reduceByKey(ClusterMetric::add);
}
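ClusterMetric itself is not shown in these excerpts. A minimal value class of the shape the constructor and reduceByKey calls imply (count, sum of distances, sum of squared distances, with an associative add) could look like:

final class ClusterMetric implements java.io.Serializable {
  private final long count;
  private final double sumDist;
  private final double sumSquaredDist;
  ClusterMetric(long count, double sumDist, double sumSquaredDist) {
    this.count = count;
    this.sumDist = sumDist;
    this.sumSquaredDist = sumSquaredDist;
  }
  ClusterMetric add(ClusterMetric other) {
    // Associative and commutative, so safe to use with reduceByKey
    return new ClusterMetric(count + other.count,
                             sumDist + other.sumDist,
                             sumSquaredDist + other.sumSquaredDist);
  }
}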