org.apache.spark.mllib.tree.model Java code examples

/**
 * Chooses which child of {@code node} an example descends to under {@code split}.
 * For a continuous split the left child is taken when the feature value is at most the
 * split threshold; for any other split type the left child is taken when the feature
 * value is among the split's categories. In all remaining cases the right child is taken.
 *
 * @param featureVector feature values of the example being routed
 * @param node node whose children are the candidates; the chosen child is unwrapped
 *  with {@code get()}, so it is expected to be present
 * @param split split to evaluate against the feature value
 * @param featureIndex index into {@code featureVector} of the value to test
 * @return the left or right child of {@code node}
 */
private static org.apache.spark.mllib.tree.model.Node nextNode(
  double[] featureVector,
  org.apache.spark.mllib.tree.model.Node node,
  Split split,
  int featureIndex) {
 double value = featureVector[featureIndex];
 boolean continuous = split.featureType().equals(FeatureType.Continuous());
 // Only the test relevant to the split type is evaluated
 boolean goLeft = continuous
   ? value <= split.threshold()
   : split.categories().contains(value);
 return goLeft ? node.leftNode().get() : node.rightNode().get();
}

/**
 * Runs every training example down every tree of the forest and tallies, per predictor,
 * how many times an example reached a decision node that splits on that predictor.
 * Counts are accumulated per partition and then combined with {@code RDFUpdate::merge}.
 *
 * @param trainPointData data to run down trees
 * @param model random decision forest model to count on
 * @return map of predictor index to the number of training examples that reached a
 *  node whose decision is based on that feature. The index is among predictors, not all
 *  features, since there are fewer predictors than features. That is, the index will
 *  match the one used in the {@link RandomForestModel}.
 */
private static IntLongHashMap predictorExampleCounts(JavaRDD<? extends LabeledPoint> trainPointData,
                           RandomForestModel model) {
 return trainPointData.mapPartitions(data -> {
   IntLongHashMap countsByPredictor = new IntLongHashMap();
   while (data.hasNext()) {
    double[] featureVector = data.next().features().toArray();
    for (DecisionTreeModel tree : model.trees()) {
     // Same root-to-leaf walk as Node.predict, but tallying the feature of each split passed
     org.apache.spark.mllib.tree.model.Node current = tree.topNode();
     while (!current.isLeaf()) {
      Split split = current.split().get();
      int predictorIndex = split.feature();
      countsByPredictor.addToValue(predictorIndex, 1);
      current = nextNode(featureVector, current, split, predictorIndex);
     }
    }
   }
   // One partial count map per partition; the reduce step merges them
   return Collections.singleton(countsByPredictor).iterator();
 }).reduce(RDFUpdate::merge);
}

              IntLongMap nodeIDCounts) {
boolean classificationTask = dtModel.algo().equals(Algo.Classification());
Preconditions.checkState(classificationTask == inputSchema.isClassification());
treeNodes.add(new Pair<>(dtModel.topNode(), null));
 long nodeCount = nodeIDCounts.get(treeNode.id());
 modelNode.setRecordCount((double) nodeCount);
 if (treeNode.isLeaf()) {
  Predict prediction = treeNode.predict();
  int targetEncodedValue = (int) prediction.predict();
  if (classificationTask) {
   Map<Integer,String> targetEncodingToValue =
     categoricalValueEncodings.getEncodingValueMap(inputSchema.getTargetFeatureIndex());
   double predictedProbability = prediction.prob();
   Preconditions.checkState(predictedProbability >= 0.0 && predictedProbability <= 1.0);
  Split split = treeNode.split().get();
  org.apache.spark.mllib.tree.model.Node rightTreeNode = treeNode.rightNode().get();
  org.apache.spark.mllib.tree.model.Node leftTreeNode = treeNode.leftNode().get();
  boolean defaultRight = nodeIDCounts.get(rightTreeNode.id()) > nodeIDCounts.get(leftTreeNode.id());
  modelNode.setDefaultChild(defaultRight ? positiveModelNode.getId() : negativeModelNode.getId());

/**
 * Predicts for a whole RDD of vectors by delegating to the wrapped {@code model}.
 *
 * @param testData vectors to predict for
 * @return the result of {@code model.predict(testData)}, unchanged
 */
@Override
public RDD<Object> predict(RDD<Vector> testData) {
 RDD<Object> predictions = model.predict(testData);
 return predictions;
}

/**
 * Builds the PMML predicate corresponding to an MLlib split. By the convention used here
 * the predicate selects the *right* child, whereas MLlib's categories and threshold
 * describe the *left* child, so the condition is expressed in negated form.
 *
 * @param split split to translate; {@code null} yields an always-true predicate
 * @param categoricalValueEncodings source of the encoding-to-value map for the split's
 *  feature, used only for categorical splits
 * @return {@code True} for a null split, a "not in" set predicate for a categorical split,
 *  otherwise a "greater than threshold" predicate
 */
private Predicate buildPredicate(Split split,
                 CategoricalValueEncodings categoricalValueEncodings) {
 if (split == null) {
  // Left child always applies, but is evaluated second
  return new True();
 }

 int featureIndex = inputSchema.predictorToFeatureIndex(split.feature());
 FieldName fieldName = FieldName.create(inputSchema.getFeatureNames().get(featureIndex));

 if (!split.featureType().equals(FeatureType.Categorical())) {
  // For MLlib, left means <= threshold, so right means >
  SimplePredicate greaterThan =
    new SimplePredicate(fieldName, SimplePredicate.Operator.GREATER_THAN);
  return greaterThan.setValue(Double.toString(split.threshold()));
 }

 // Note that categories in MLlib model select the *left* child but the
 // convention here will be that the predicate selects the *right* child
 // So the predicate will evaluate "not in" this set
 // The double cast views the Scala category list as a Java collection of Double
 @SuppressWarnings("unchecked")
 Collection<Double> leftCategories = (Collection<Double>) (Collection<?>)
   JavaConversions.seqAsJavaList(split.categories());
 Set<Integer> leftEncodings =
   leftCategories.stream().map(category -> category.intValue()).collect(Collectors.toSet());
 Map<Integer,String> encodingToValue =
   categoricalValueEncodings.getEncodingValueMap(featureIndex);
 List<String> leftValues =
   leftEncodings.stream().map(encoding -> encodingToValue.get(encoding)).collect(Collectors.toList());
 return new SimpleSetPredicate(fieldName,
                SimpleSetPredicate.BooleanOperator.IS_NOT_IN,
                new Array(Array.Type.STRING, TextUtils.joinPMMLDelimited(leftValues)));
}

             IntLongMap predictorIndexCounts) {
boolean classificationTask = rfModel.algo().equals(Algo.Classification());
Preconditions.checkState(classificationTask == inputSchema.isClassification());
DecisionTreeModel[] trees = rfModel.trees();

 // Just a stub of how the search for a specific node might work (this is not the real implementation)
Node currentNode = ...
if(comparator.compare(currentNode.content , toSearch) < 0)
  currentNode = currentNode.leftNode();
else
  ...

/**
 * Predicts for a single vector by delegating to the wrapped {@code model}.
 *
 * @param testData vector to predict for
 * @return the value returned by {@code model.predict(testData)}
 */
@Override
public double predict(Vector testData) {
 double prediction = model.predict(testData);
 return prediction;
}

                            RandomForestModel model) {
return trainPointData.mapPartitions(data -> {
  DecisionTreeModel[] trees = model.trees();
  List<IntLongHashMap> treeNodeIDCounts = IntStream.range(0, trees.length).
    mapToObj(i -> new IntLongHashMap()).collect(Collectors.toList());
    DecisionTreeModel tree = trees[i];
    IntLongHashMap nodeIDCount = treeNodeIDCounts.get(i);
    org.apache.spark.mllib.tree.model.Node node = tree.topNode();
    while (!node.isLeaf()) {
     nodeIDCount.addToValue(node.id(), 1);
     Split split = node.split().get();
     int featureIndex = split.feature();
     node = nextNode(featureVector, node, split, featureIndex);
    nodeIDCount.addToValue(node.id(), 1);

/**
 * Counts how many validation points the model predicts exactly as labeled.
 *
 * @param validationData labeled points to check
 * @param model trained tree used to predict from each point's features
 * @return number of points whose prediction equals the point's label
 */
private static int validatePrediction(
  List<LabeledPoint> validationData, DecisionTreeModel model) {
 int numCorrect = 0;
 for (LabeledPoint point : validationData) {
  // Kept primitive: a boxed Double compared with == is only a numeric comparison
  // when the other operand is primitive, which is easy to break by accident
  double prediction = model.predict(point.features());
  if (prediction == point.label()) {
   numCorrect++;
  }
 }
 return numCorrect;
}

/**
 * Counts how many validation points the model predicts exactly as labeled.
 *
 * @param validationData labeled points to check
 * @param model trained tree used to predict from each point's features
 * @return number of points whose prediction equals the point's label
 */
private static int validatePrediction(
  List<LabeledPoint> validationData, DecisionTreeModel model) {
 int numCorrect = 0;
 for (LabeledPoint point : validationData) {
  // Kept primitive: a boxed Double compared with == is only a numeric comparison
  // when the other operand is primitive, which is easy to break by accident
  double prediction = model.predict(point.features());
  if (prediction == point.label()) {
   numCorrect++;
  }
 }
 return numCorrect;
}

/**
 * Counts how many validation points the model predicts exactly as labeled.
 *
 * @param validationData labeled points to check
 * @param model trained tree used to predict from each point's features
 * @return number of points whose prediction equals the point's label
 */
private static int validatePrediction(
  List<LabeledPoint> validationData, DecisionTreeModel model) {
 int numCorrect = 0;
 for (LabeledPoint point : validationData) {
  // Kept primitive: a boxed Double compared with == is only a numeric comparison
  // when the other operand is primitive, which is easy to break by accident
  double prediction = model.predict(point.features());
  if (prediction == point.label()) {
   numCorrect++;
  }
 }
 return numCorrect;
}

 /**
  * Predicts for a Java RDD of vectors: unwraps it to the underlying RDD, delegates to the
  * wrapped {@code model}, and maps each result through a {@code DoubleValueMapper}
  * (presumably converting each value to {@code Double}; confirm against that class).
  *
  * @param testData vectors to predict for
  * @return predictions as a {@code JavaRDD<Double>}
  */
 @Override
 public JavaRDD<Double> predict(JavaRDD<Vector> testData) {
  DoubleValueMapper toDouble = new DoubleValueMapper();
  return model.predict(testData.rdd()).toJavaRDD().map(toDouble);
 }
}

/**
 * Trains a classification tree through the static {@code DecisionTree$.MODULE$.train}
 * entry point on generated categorical data, then checks that the model predicts every
 * training point correctly.
 */
@Test
public void runDTUsingStaticMethods() {
 List<LabeledPoint> arr = DecisionTreeSuite.generateCategoricalDataPointsAsJavaList();
 JavaRDD<LabeledPoint> rdd = jsc.parallelize(arr);
 HashMap<Integer, Integer> categoricalFeaturesInfo = new HashMap<>();
 categoricalFeaturesInfo.put(1, 2); // feature 1 has 2 categories
 int maxDepth = 4;
 int numClasses = 2;
 int maxBins = 100;
 Strategy strategy = new Strategy(Algo.Classification(), Gini.instance(), maxDepth, numClasses,
  maxBins, categoricalFeaturesInfo);
 DecisionTreeModel model = DecisionTree$.MODULE$.train(rdd.rdd(), strategy);
 // java compatibility test: the result is unused, the call only has to type-check and run
 JavaRDD<Double> predictions = model.predict(rdd.map(LabeledPoint::features));
 int numCorrect = validatePrediction(arr, model);
 // assertEquals takes (expected, actual): every point is expected to be predicted correctly
 Assert.assertEquals(rdd.count(), numCorrect);
}

/**
 * Trains a classification tree through the static {@code DecisionTree$.MODULE$.train}
 * entry point on generated categorical data, then checks that the model predicts every
 * training point correctly.
 */
@Test
public void runDTUsingStaticMethods() {
 List<LabeledPoint> arr = DecisionTreeSuite.generateCategoricalDataPointsAsJavaList();
 JavaRDD<LabeledPoint> rdd = jsc.parallelize(arr);
 HashMap<Integer, Integer> categoricalFeaturesInfo = new HashMap<>();
 categoricalFeaturesInfo.put(1, 2); // feature 1 has 2 categories
 int maxDepth = 4;
 int numClasses = 2;
 int maxBins = 100;
 Strategy strategy = new Strategy(Algo.Classification(), Gini.instance(), maxDepth, numClasses,
  maxBins, categoricalFeaturesInfo);
 DecisionTreeModel model = DecisionTree$.MODULE$.train(rdd.rdd(), strategy);
 // java compatibility test: the result is unused, the call only has to type-check and run
 JavaRDD<Double> predictions = model.predict(rdd.map(LabeledPoint::features));
 int numCorrect = validatePrediction(arr, model);
 // assertEquals takes (expected, actual): every point is expected to be predicted correctly
 Assert.assertEquals(rdd.count(), numCorrect);
}

/**
 * Trains a classification tree through the static {@code DecisionTree$.MODULE$.train}
 * entry point on generated categorical data, then checks that the model predicts every
 * training point correctly.
 */
@Test
public void runDTUsingStaticMethods() {
 List<LabeledPoint> arr = DecisionTreeSuite.generateCategoricalDataPointsAsJavaList();
 JavaRDD<LabeledPoint> rdd = jsc.parallelize(arr);
 HashMap<Integer, Integer> categoricalFeaturesInfo = new HashMap<>();
 categoricalFeaturesInfo.put(1, 2); // feature 1 has 2 categories
 int maxDepth = 4;
 int numClasses = 2;
 int maxBins = 100;
 Strategy strategy = new Strategy(Algo.Classification(), Gini.instance(), maxDepth, numClasses,
  maxBins, categoricalFeaturesInfo);
 DecisionTreeModel model = DecisionTree$.MODULE$.train(rdd.rdd(), strategy);
 // java compatibility test: the result is unused, the call only has to type-check and run
 JavaRDD<Double> predictions = model.predict(rdd.map(LabeledPoint::features));
 int numCorrect = validatePrediction(arr, model);
 // assertEquals takes (expected, actual): every point is expected to be predicted correctly
 Assert.assertEquals(rdd.count(), numCorrect);
}

How to use org.apache.spark.mllib.tree.model

Best Java code snippets using org.apache.spark.mllib.tree.model (Showing top 16 results out of 315)