/**
 * Picks the child of {@code node} that a feature vector descends into.
 *
 * @param featureVector feature values of the example being routed
 * @param node current internal node; the chosen child is unwrapped with {@code get()}
 * @param split split attached to {@code node}
 * @param featureIndex index into {@code featureVector} of the feature tested by {@code split}
 * @return the left child when the value is at or below the split threshold (continuous
 *  split) or is contained in the split's categories (any other split type); the right
 *  child otherwise
 */
private static org.apache.spark.mllib.tree.model.Node nextNode(
    double[] featureVector,
    org.apache.spark.mllib.tree.model.Node node,
    Split split,
    int featureIndex) {
  double value = featureVector[featureIndex];

  // Decide the direction first, then unwrap only the child that was chosen
  boolean goLeft;
  if (split.featureType().equals(FeatureType.Continuous())) {
    goLeft = value <= split.threshold();
  } else {
    goLeft = split.categories().contains(value);
  }

  return goLeft ? node.leftNode().get() : node.rightNode().get();
}
/** * @param trainPointData data to run down trees * @param model random decision forest model to count on * @return map of predictor index to the number of training examples that reached a * node whose decision is based on that feature. The index is among predictors, not all * features, since there are fewer predictors than features. That is, the index will * match the one used in the {@link RandomForestModel}. */ private static IntLongHashMap predictorExampleCounts(JavaRDD<? extends LabeledPoint> trainPointData, RandomForestModel model) { return trainPointData.mapPartitions(data -> { IntLongHashMap featureIndexCount = new IntLongHashMap(); data.forEachRemaining(datum -> { double[] featureVector = datum.features().toArray(); for (DecisionTreeModel tree : model.trees()) { org.apache.spark.mllib.tree.model.Node node = tree.topNode(); // This logic cloned from Node.predict: while (!node.isLeaf()) { Split split = node.split().get(); int featureIndex = split.feature(); // Count feature featureIndexCount.addToValue(featureIndex, 1); node = nextNode(featureVector, node, split, featureIndex); } } }); return Collections.singleton(featureIndexCount).iterator(); }).reduce(RDFUpdate::merge); }
IntLongMap nodeIDCounts) { boolean classificationTask = dtModel.algo().equals(Algo.Classification()); Preconditions.checkState(classificationTask == inputSchema.isClassification()); treeNodes.add(new Pair<>(dtModel.topNode(), null)); long nodeCount = nodeIDCounts.get(treeNode.id()); modelNode.setRecordCount((double) nodeCount); if (treeNode.isLeaf()) { Predict prediction = treeNode.predict(); int targetEncodedValue = (int) prediction.predict(); if (classificationTask) { Map<Integer,String> targetEncodingToValue = categoricalValueEncodings.getEncodingValueMap(inputSchema.getTargetFeatureIndex()); double predictedProbability = prediction.prob(); Preconditions.checkState(predictedProbability >= 0.0 && predictedProbability <= 1.0); Split split = treeNode.split().get(); org.apache.spark.mllib.tree.model.Node rightTreeNode = treeNode.rightNode().get(); org.apache.spark.mllib.tree.model.Node leftTreeNode = treeNode.leftNode().get(); boolean defaultRight = nodeIDCounts.get(rightTreeNode.id()) > nodeIDCounts.get(leftTreeNode.id()); modelNode.setDefaultChild(defaultRight ? positiveModelNode.getId() : negativeModelNode.getId());
/**
 * Predicts for a whole RDD of vectors by delegating directly to {@code model.predict}.
 *
 * @param testData RDD of vectors to pass to the underlying model
 * @return the result of {@code model.predict(testData)}, unchanged
 */
@Override public RDD<Object> predict(RDD<Vector> testData) { return model.predict(testData); }
private Predicate buildPredicate(Split split, CategoricalValueEncodings categoricalValueEncodings) { if (split == null) { // Left child always applies, but is evaluated second return new True(); } int featureIndex = inputSchema.predictorToFeatureIndex(split.feature()); FieldName fieldName = FieldName.create(inputSchema.getFeatureNames().get(featureIndex)); if (split.featureType().equals(FeatureType.Categorical())) { // Note that categories in MLlib model select the *left* child but the // convention here will be that the predicate selects the *right* child // So the predicate will evaluate "not in" this set // More ugly casting @SuppressWarnings("unchecked") Collection<Double> javaCategories = (Collection<Double>) (Collection<?>) JavaConversions.seqAsJavaList(split.categories()); Set<Integer> negativeEncodings = javaCategories.stream().map(Double::intValue).collect(Collectors.toSet()); Map<Integer,String> encodingToValue = categoricalValueEncodings.getEncodingValueMap(featureIndex); List<String> negativeValues = negativeEncodings.stream().map(encodingToValue::get).collect(Collectors.toList()); String joinedValues = TextUtils.joinPMMLDelimited(negativeValues); return new SimpleSetPredicate(fieldName, SimpleSetPredicate.BooleanOperator.IS_NOT_IN, new Array(Array.Type.STRING, joinedValues)); } else { // For MLlib, left means <= threshold, so right means > return new SimplePredicate(fieldName, SimplePredicate.Operator.GREATER_THAN) .setValue(Double.toString(split.threshold())); } }
// Just a stub of how the search for a specific node might work (this is not the real implementation): Node currentNode = ... if(comparator.compare(currentNode.content , toSearch) < 0) currentNode = currentNode.leftNode(); else ...
/**
 * Predicts for a single vector by delegating directly to {@code model.predict}.
 *
 * @param testData vector to pass to the underlying model
 * @return the value returned by {@code model.predict(testData)}
 */
@Override public double predict(Vector testData) { return model.predict(testData); }
RandomForestModel model) { return trainPointData.mapPartitions(data -> { DecisionTreeModel[] trees = model.trees(); List<IntLongHashMap> treeNodeIDCounts = IntStream.range(0, trees.length). mapToObj(i -> new IntLongHashMap()).collect(Collectors.toList()); DecisionTreeModel tree = trees[i]; IntLongHashMap nodeIDCount = treeNodeIDCounts.get(i); org.apache.spark.mllib.tree.model.Node node = tree.topNode(); while (!node.isLeaf()) { nodeIDCount.addToValue(node.id(), 1); Split split = node.split().get(); int featureIndex = split.feature(); node = nextNode(featureVector, node, split, featureIndex); nodeIDCount.addToValue(node.id(), 1);
/**
 * Counts how many labeled points the model predicts exactly as labeled.
 *
 * @param validationData labeled points to score
 * @param model decision tree model whose predictions are checked
 * @return number of points whose predicted value equals the point's label
 */
private static int validatePrediction(
    List<LabeledPoint> validationData, DecisionTreeModel model) {
  int numCorrect = 0;
  for (LabeledPoint point : validationData) {
    // Keep the prediction primitive: with a boxed Double, == only compares numerically
    // because the other operand is primitive, and would silently turn into an identity
    // check if that operand were ever boxed as well.
    double prediction = model.predict(point.features());
    if (prediction == point.label()) {
      numCorrect++;
    }
  }
  return numCorrect;
}
/**
 * Counts how many labeled points the model predicts exactly as labeled.
 *
 * @param validationData labeled points to score
 * @param model decision tree model whose predictions are checked
 * @return number of points whose predicted value equals the point's label
 */
private static int validatePrediction(
    List<LabeledPoint> validationData, DecisionTreeModel model) {
  int numCorrect = 0;
  for (LabeledPoint point : validationData) {
    // Keep the prediction primitive: with a boxed Double, == only compares numerically
    // because the other operand is primitive, and would silently turn into an identity
    // check if that operand were ever boxed as well.
    double prediction = model.predict(point.features());
    if (prediction == point.label()) {
      numCorrect++;
    }
  }
  return numCorrect;
}
/**
 * Counts how many labeled points the model predicts exactly as labeled.
 *
 * @param validationData labeled points to score
 * @param model decision tree model whose predictions are checked
 * @return number of points whose predicted value equals the point's label
 */
private static int validatePrediction(
    List<LabeledPoint> validationData, DecisionTreeModel model) {
  int numCorrect = 0;
  for (LabeledPoint point : validationData) {
    // Keep the prediction primitive: with a boxed Double, == only compares numerically
    // because the other operand is primitive, and would silently turn into an identity
    // check if that operand were ever boxed as well.
    double prediction = model.predict(point.features());
    if (prediction == point.label()) {
      numCorrect++;
    }
  }
  return numCorrect;
}
/**
 * Java-friendly batch prediction: unwraps {@code testData} to its underlying RDD, passes it to
 * {@code model.predict}, converts the result back with {@code toJavaRDD()} and maps every
 * element through a new {@code DoubleValueMapper}.
 *
 * NOTE(review): DoubleValueMapper presumably converts each element to a Double - confirm
 * against its definition.
 *
 * @param testData vectors to pass to the underlying model
 * @return the mapped predictions as a {@code JavaRDD<Double>}
 */
@Override public JavaRDD<Double> predict(JavaRDD<Vector> testData) { return model.predict(testData.rdd()).toJavaRDD().map(new DoubleValueMapper()); } }
@Test public void runDTUsingStaticMethods() { List<LabeledPoint> arr = DecisionTreeSuite.generateCategoricalDataPointsAsJavaList(); JavaRDD<LabeledPoint> rdd = jsc.parallelize(arr); HashMap<Integer, Integer> categoricalFeaturesInfo = new HashMap<>(); categoricalFeaturesInfo.put(1, 2); // feature 1 has 2 categories int maxDepth = 4; int numClasses = 2; int maxBins = 100; Strategy strategy = new Strategy(Algo.Classification(), Gini.instance(), maxDepth, numClasses, maxBins, categoricalFeaturesInfo); DecisionTreeModel model = DecisionTree$.MODULE$.train(rdd.rdd(), strategy); // java compatibility test JavaRDD<Double> predictions = model.predict(rdd.map(LabeledPoint::features)); int numCorrect = validatePrediction(arr, model); Assert.assertEquals(numCorrect, rdd.count()); }
@Test public void runDTUsingStaticMethods() { List<LabeledPoint> arr = DecisionTreeSuite.generateCategoricalDataPointsAsJavaList(); JavaRDD<LabeledPoint> rdd = jsc.parallelize(arr); HashMap<Integer, Integer> categoricalFeaturesInfo = new HashMap<>(); categoricalFeaturesInfo.put(1, 2); // feature 1 has 2 categories int maxDepth = 4; int numClasses = 2; int maxBins = 100; Strategy strategy = new Strategy(Algo.Classification(), Gini.instance(), maxDepth, numClasses, maxBins, categoricalFeaturesInfo); DecisionTreeModel model = DecisionTree$.MODULE$.train(rdd.rdd(), strategy); // java compatibility test JavaRDD<Double> predictions = model.predict(rdd.map(LabeledPoint::features)); int numCorrect = validatePrediction(arr, model); Assert.assertEquals(numCorrect, rdd.count()); }
@Test public void runDTUsingStaticMethods() { List<LabeledPoint> arr = DecisionTreeSuite.generateCategoricalDataPointsAsJavaList(); JavaRDD<LabeledPoint> rdd = jsc.parallelize(arr); HashMap<Integer, Integer> categoricalFeaturesInfo = new HashMap<>(); categoricalFeaturesInfo.put(1, 2); // feature 1 has 2 categories int maxDepth = 4; int numClasses = 2; int maxBins = 100; Strategy strategy = new Strategy(Algo.Classification(), Gini.instance(), maxDepth, numClasses, maxBins, categoricalFeaturesInfo); DecisionTreeModel model = DecisionTree$.MODULE$.train(rdd.rdd(), strategy); // java compatibility test JavaRDD<Double> predictions = model.predict(rdd.map(LabeledPoint::features)); int numCorrect = validatePrediction(arr, model); Assert.assertEquals(numCorrect, rdd.count()); }