private JavaRDD<Vector> parsedToVectorRDD(JavaRDD<String[]> parsedRDD) {
  return parsedRDD.map(data -> {
    try {
      return Vectors.dense(KMeansUtils.featuresFromTokens(data, inputSchema));
    } catch (NumberFormatException | ArrayIndexOutOfBoundsException e) {
      log.warn("Bad input: {}", Arrays.toString(data));
      throw e;
    }
  });
}
static Vector average(Vector vec, Integer numVectors) {
  double[] avg = new double[vec.size()];
  for (int i = 0; i < avg.length; i++) {
    avg[i] = vec.apply(i) / numVectors;
  }
  return new DenseVector(avg);
}
private JavaPairRDD<Integer, Iterable<double[]>> fetchClusteredPoints(JavaRDD<? extends Vector> evalData) {
  return evalData.mapToPair(vector -> {
    double closestDist = Double.POSITIVE_INFINITY;
    int minClusterID = Integer.MIN_VALUE;
    double[] vec = vector.toArray();
    DistanceFn<double[]> distanceFn = getDistanceFn();
    Map<Integer,ClusterInfo> clusters = getClustersByID();
    for (ClusterInfo cluster : clusters.values()) {
      double distance = distanceFn.applyAsDouble(cluster.getCenter(), vec);
      if (distance < closestDist) {
        closestDist = distance;
        minClusterID = cluster.getID();
      }
    }
    Preconditions.checkState(!Double.isInfinite(closestDist) && !Double.isNaN(closestDist));
    return new Tuple2<>(minClusterID, vec);
  }).groupByKey();
}
static Vector getMalignant() {
  // Original record (ID and "M" diagnosis label are dropped from the feature string below):
  // 842302,M,17.99,10.38,122.8,1001,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,1.095,0.9053,8.589,153.4,0.006399,0.04904,0.05373,0.01587,0.03003,0.006193,25.38,17.33,184.6,2019,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
  String data = "17.99,10.38,122.8,1001,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,1.095,0.9053,8.589,153.4,0.006399,0.04904,0.05373,0.01587,0.03003,0.006193,25.38,17.33,184.6,2019,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189";
  double[] features = new double[30];
  String[] tokens = StringUtils.split(data, ","); // 30 tokens
  for (int i = 0; i < features.length; i++) {
    features[i] = Double.parseDouble(tokens[i]);
  }
  return new DenseVector(features);
}
static Vector add(Vector a, Vector b) {
  double[] sum = new double[a.size()];
  for (int i = 0; i < sum.length; i++) {
    sum[i] = a.apply(i) + b.apply(i);
  }
  return new DenseVector(sum);
}
/**
 * @param evalData points to cluster for evaluation
 * @return cluster IDs as keys, and metrics for each cluster like the count, sum of distances
 *  to centroid, and sum of squared distances
 */
JavaPairRDD<Integer,ClusterMetric> fetchClusterMetrics(JavaRDD<Vector> evalData) {
  return evalData.mapToPair(vector -> {
    double closestDist = Double.POSITIVE_INFINITY;
    int minClusterID = Integer.MIN_VALUE;
    double[] vec = vector.toArray();
    for (ClusterInfo cluster : clusters.values()) {
      double distance = distanceFn.applyAsDouble(cluster.getCenter(), vec);
      if (distance < closestDist) {
        closestDist = distance;
        minClusterID = cluster.getID();
      }
    }
    Preconditions.checkState(!Double.isInfinite(closestDist) && !Double.isNaN(closestDist));
    return new Tuple2<>(minClusterID, new ClusterMetric(1L, closestDist, closestDist * closestDist));
  }).reduceByKey(ClusterMetric::add);
}
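// ClusterMetric is referenced above but not shown in these snippets. A minimal,
// hypothetical sketch of what fetchClusterMetrics assumes: an immutable value
// holding a count, a sum of distances, and a sum of squared distances, whose
// add() is associative so it can serve as the reduceByKey combiner.
final class ClusterMetric implements java.io.Serializable {
  private final long count;
  private final double sumDist;
  private final double sumSquaredDist;
  ClusterMetric(long count, double sumDist, double sumSquaredDist) {
    this.count = count;
    this.sumDist = sumDist;
    this.sumSquaredDist = sumSquaredDist;
  }
  long getCount() { return count; }
  double getMeanDist() { return sumDist / count; }
  double getSumSquaredDist() { return sumSquaredDist; }
  ClusterMetric add(ClusterMetric other) {
    return new ClusterMetric(count + other.count,
                             sumDist + other.sumDist,
                             sumSquaredDist + other.sumSquaredDist);
  }
}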
      // Fragment of a parse function analogous to parsedToVectorRDD above:
      // build a LabeledPoint from the parsed tokens, and log and rethrow on malformed input.
      return new LabeledPoint(target, Vectors.dense(features));
    } catch (NumberFormatException | ArrayIndexOutOfBoundsException e) {
      log.warn("Bad input: {}", Arrays.toString(data));
      throw e;
    }
@Override
public Vector call(String record) {
  // 8 features
  String[] tokens = StringUtils.split(record, ",");
  double[] features = new double[8];
  for (int i = 0; i < features.length; i++) {
    features[i] = Double.parseDouble(tokens[i]);
  }
  return new DenseVector(features);
}
});
static Vector average(List<Vector> list) {
  // sum all vectors element-wise
  double[] sum = new double[list.get(0).size()];
  for (Vector v : list) {
    for (int i = 0; i < sum.length; i++) {
      sum[i] += v.apply(i);
    }
  }
  // divide by the number of vectors to get the average
  int numOfVectors = list.size();
  for (int i = 0; i < sum.length; i++) {
    sum[i] = sum[i] / numOfVectors;
  }
  return new DenseVector(sum);
}
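// A quick, hypothetical usage of the three vector helpers above (add and the two
// average overloads). Assumes org.apache.spark.mllib.linalg.Vectors and
// java.util.Arrays are imported and that the helpers are in scope.
Vector a = Vectors.dense(1.0, 2.0, 3.0);
Vector b = Vectors.dense(3.0, 4.0, 5.0);
Vector sum = add(a, b);                      // [4.0, 6.0, 8.0]
Vector mean1 = average(sum, 2);              // [2.0, 3.0, 4.0]
Vector mean2 = average(Arrays.asList(a, b)); // also [2.0, 3.0, 4.0]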
// Fragment: allocate one IntLongHashMap of counts per tree, then run each
// training example down every tree in the forest.
    mapToObj(i -> new IntLongHashMap()).collect(Collectors.toList());
data.forEachRemaining(datum -> {
  double[] featureVector = datum.features().toArray();
  for (int i = 0; i < trees.length; i++) {
    DecisionTreeModel tree = trees[i];
@Override
public LabeledPoint call(String record) {
  // record: <Price><,><Age><,><KM><,><FuelType1><,><FuelType2><,><HP><,><MetColor><,><Automatic><,><CC><,><Doors><,><Weight>
  // tokens[0] = <Price>
  String[] tokens = StringUtils.split(record, ",");
  double[] features = new double[tokens.length - 1];
  for (int i = 0; i < features.length; i++) {
    features[i] = Double.parseDouble(tokens[i + 1]);
  }
  // the first token is the regression target (price)
  double price = Double.parseDouble(tokens[0]);
  return new LabeledPoint(price, Vectors.dense(features));
}
});
static Vector getBenign() {
  // Original record (ID and "B" diagnosis label are dropped from the feature string below):
  // 8510653,B,13.08,15.71,85.63,520,0.1075,0.127,0.04568,0.0311,0.1967,0.06811,0.1852,0.7477,1.383,14.67,0.004097,0.01898,0.01698,0.00649,0.01678,0.002425,14.5,20.49,96.09,630.5,0.1312,0.2776,0.189,0.07283,0.3184,0.08183
  String data = "13.08,15.71,85.63,520,0.1075,0.127,0.04568,0.0311,0.1967,0.06811,0.1852,0.7477,1.383,14.67,0.004097,0.01898,0.01698,0.00649,0.01678,0.002425,14.5,20.49,96.09,630.5,0.1312,0.2776,0.189,0.07283,0.3184,0.08183";
  double[] features = new double[30];
  String[] tokens = StringUtils.split(data, ","); // 30 tokens
  for (int i = 0; i < features.length; i++) {
    features[i] = Double.parseDouble(tokens[i]);
  }
  return new DenseVector(features);
}
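// Hypothetical usage of the two reference records above, assuming a trained
// org.apache.spark.mllib.clustering.KMeansModel named "model" is in scope:
// predict() returns the index of the closest cluster center for each vector.
int malignantCluster = model.predict(getMalignant());
int benignCluster = model.predict(getBenign());
System.out.println("malignant -> cluster " + malignantCluster
    + ", benign -> cluster " + benignCluster);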
/**
 * @param trainPointData data to run down trees
 * @param model random decision forest model to count on
 * @return map of predictor index to the number of training examples that reached a
 *  node whose decision is based on that feature. The index is among predictors, not all
 *  features, since there are fewer predictors than features. That is, the index will
 *  match the one used in the {@link RandomForestModel}.
 */
private static IntLongHashMap predictorExampleCounts(JavaRDD<? extends LabeledPoint> trainPointData,
                                                     RandomForestModel model) {
  return trainPointData.mapPartitions(data -> {
    IntLongHashMap featureIndexCount = new IntLongHashMap();
    data.forEachRemaining(datum -> {
      double[] featureVector = datum.features().toArray();
      for (DecisionTreeModel tree : model.trees()) {
        org.apache.spark.mllib.tree.model.Node node = tree.topNode();
        // This logic cloned from Node.predict:
        while (!node.isLeaf()) {
          Split split = node.split().get();
          int featureIndex = split.feature();
          // Count feature
          featureIndexCount.addToValue(featureIndex, 1);
          node = nextNode(featureVector, node, split, featureIndex);
        }
      }
    });
    return Collections.singleton(featureIndexCount).iterator();
  }).reduce(RDFUpdate::merge);
}
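// nextNode(...) is referenced above but not defined in these snippets. A plausible
// sketch, assuming it mirrors the traversal in Node.predict: continuous splits
// compare the feature value against the threshold, categorical splits test whether
// the value is among the split's left-branch categories. Uses
// org.apache.spark.mllib.tree.model.{Node,Split} and
// org.apache.spark.mllib.tree.configuration.FeatureType.
private static org.apache.spark.mllib.tree.model.Node nextNode(double[] featureVector,
                                                               org.apache.spark.mllib.tree.model.Node node,
                                                               Split split,
                                                               int featureIndex) {
  double featureValue = featureVector[featureIndex];
  if (split.featureType().equals(FeatureType.Continuous())) {
    // Continuous feature: left child if at or below the threshold, else right child
    return featureValue <= split.threshold() ? node.leftNode().get() : node.rightNode().get();
  } else {
    // Categorical feature: left child if the value is in the split's category set
    return split.categories().contains(featureValue)
        ? node.leftNode().get()
        : node.rightNode().get();
  }
}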
/**
 * Build a Vector from feature values given as a delimited string.
 *
 * @param features features in the format:
 *  <feature_1><,><feature_2><,>...<,><feature_N>
 * @param delimiter such as ",", "\t", ";", ...
 * @return a Vector of features
 */
static Vector buildVector(final String features, final String delimiter) {
  String[] tokens = StringUtils.split(features, delimiter);
  double[] d = new double[tokens.length];
  for (int i = 0; i < d.length; i++) {
    d[i] = Double.parseDouble(tokens[i]);
  }
  return new DenseVector(d);
}
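// For illustration, a small hypothetical usage of buildVector with a comma-delimited
// and a tab-delimited record:
Vector v1 = buildVector("1.0,2.5,3.75", ",");   // DenseVector [1.0, 2.5, 3.75]
Vector v2 = buildVector("4.0\t5.0\t6.0", "\t"); // DenseVector [4.0, 5.0, 6.0]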
private ClusteringModel pmmlClusteringModel(KMeansModel model, Map<Integer,Long> clusterSizesMap) {
  Vector[] clusterCenters = model.clusterCenters();
  List<ClusteringField> clusteringFields = new ArrayList<>();
  for (int i = 0; i < inputSchema.getNumFeatures(); i++) {
    if (inputSchema.isActive(i)) {
      FieldName fieldName = FieldName.create(inputSchema.getFeatureNames().get(i));
      ClusteringField clusteringField =
          new ClusteringField(fieldName).setCenterField(ClusteringField.CenterField.TRUE);
      clusteringFields.add(clusteringField);
    }
  }
  List<Cluster> clusters = new ArrayList<>(clusterCenters.length);
  for (int i = 0; i < clusterCenters.length; i++) {
    clusters.add(new Cluster().setId(Integer.toString(i))
                              .setSize(clusterSizesMap.get(i).intValue())
                              .setArray(AppPMMLUtils.toArray(clusterCenters[i].toArray())));
  }
  return new ClusteringModel(
      MiningFunction.CLUSTERING,
      ClusteringModel.ModelClass.CENTER_BASED,
      clusters.size(),
      AppPMMLUtils.buildMiningSchema(inputSchema),
      new ComparisonMeasure(ComparisonMeasure.Kind.DISTANCE).setMeasure(new SquaredEuclidean()),
      clusteringFields,
      clusters);
}
@Override
public Vector call(String record) {
  String[] tokens = StringUtils.split(record, " "); // 4 tokens
  double[] features = new double[4];
  features[0] = getOutlook(tokens[0]);     // outlook
  features[1] = getTemperature(tokens[1]); // temperature
  features[2] = getHumidity(tokens[2]);    // humidity
  features[3] = getWind(tokens[3]);        // windy
  return new DenseVector(features);
}
});
static void debug(String record, Vector v) {
  THE_LOGGER.info("DEBUG started:");
  double[] d = v.toArray();
  StringBuilder builder = new StringBuilder();
  builder.append("DEBUG[record=");
  builder.append(record);
  builder.append("]:");
  for (int i = 0; i < d.length; i++) {
    builder.append("\t");
    builder.append(d[i]);
  }
  THE_LOGGER.info(builder.toString());
}
@Override
public LabeledPoint call(String record) {
  String[] tokens = StringUtils.split(record, " "); // 5 tokens
  double[] features = new double[4];
  features[0] = getOutlook(tokens[0]);     // outlook
  features[1] = getTemperature(tokens[1]); // temperature
  features[2] = getHumidity(tokens[2]);    // humidity
  features[3] = getWind(tokens[3]);        // windy
  // tokens[4] => classification: play=0 or not-play=1
  double classification = getPlay(tokens[4]);
  Vector v = new DenseVector(features);
  debug(record, v);
  // add a classification (label) for the training data set
  return new LabeledPoint(classification, v);
}
});
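// The encoders getOutlook, getTemperature, getHumidity, getWind, and getPlay are
// referenced above but not shown. A hypothetical sketch of one plausible encoding,
// assuming the classic categorical "play" weather attributes; only the label
// convention (play=0, not-play=1) comes from the comment in call() above.
static double getOutlook(String s) {
  switch (s.toLowerCase()) {
    case "sunny":    return 1.0;
    case "overcast": return 2.0;
    case "rain":
    case "rainy":    return 3.0;
    default: throw new IllegalArgumentException("unknown outlook: " + s);
  }
}
static double getTemperature(String s) {
  switch (s.toLowerCase()) {
    case "hot":  return 1.0;
    case "mild": return 2.0;
    case "cool": return 3.0;
    default: throw new IllegalArgumentException("unknown temperature: " + s);
  }
}
static double getHumidity(String s) {
  return "high".equalsIgnoreCase(s) ? 1.0 : 2.0; // high vs. normal
}
static double getWind(String s) {
  return "true".equalsIgnoreCase(s) || "strong".equalsIgnoreCase(s) ? 1.0 : 0.0;
}
static double getPlay(String s) {
  return "yes".equalsIgnoreCase(s) ? 0.0 : 1.0; // play=0, not-play=1
}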