dataType = null; DataField field = new DataField(FieldName.create(featureName), opType, dataType); if (schema.isCategorical(featureName)) { Objects.requireNonNull(categoricalValueEncodings);
private Predicate buildPredicate(Split split, CategoricalValueEncodings categoricalValueEncodings) { if (split == null) { // Left child always applies, but is evaluated second return new True(); } int featureIndex = inputSchema.predictorToFeatureIndex(split.feature()); FieldName fieldName = FieldName.create(inputSchema.getFeatureNames().get(featureIndex)); if (split.featureType().equals(FeatureType.Categorical())) { // Note that categories in MLlib model select the *left* child but the // convention here will be that the predicate selects the *right* child // So the predicate will evaluate "not in" this set // More ugly casting @SuppressWarnings("unchecked") Collection<Double> javaCategories = (Collection<Double>) (Collection<?>) JavaConversions.seqAsJavaList(split.categories()); Set<Integer> negativeEncodings = javaCategories.stream().map(Double::intValue).collect(Collectors.toSet()); Map<Integer,String> encodingToValue = categoricalValueEncodings.getEncodingValueMap(featureIndex); List<String> negativeValues = negativeEncodings.stream().map(encodingToValue::get).collect(Collectors.toList()); String joinedValues = TextUtils.joinPMMLDelimited(negativeValues); return new SimpleSetPredicate(fieldName, SimpleSetPredicate.BooleanOperator.IS_NOT_IN, new Array(Array.Type.STRING, joinedValues)); } else { // For MLlib, left means <= threshold, so right means > return new SimplePredicate(fieldName, SimplePredicate.Operator.GREATER_THAN) .setValue(Double.toString(split.threshold())); } }
private ClusteringModel pmmlClusteringModel(KMeansModel model, Map<Integer,Long> clusterSizesMap) { Vector[] clusterCenters = model.clusterCenters(); List<ClusteringField> clusteringFields = new ArrayList<>(); for (int i = 0; i < inputSchema.getNumFeatures(); i++) { if (inputSchema.isActive(i)) { FieldName fieldName = FieldName.create(inputSchema.getFeatureNames().get(i)); ClusteringField clusteringField = new ClusteringField(fieldName).setCenterField(ClusteringField.CenterField.TRUE); clusteringFields.add(clusteringField); } } List<Cluster> clusters = new ArrayList<>(clusterCenters.length); for (int i = 0; i < clusterCenters.length; i++) { clusters.add(new Cluster().setId(Integer.toString(i)) .setSize(clusterSizesMap.get(i).intValue()) .setArray(AppPMMLUtils.toArray(clusterCenters[i].toArray()))); } return new ClusteringModel( MiningFunction.CLUSTERING, ClusteringModel.ModelClass.CENTER_BASED, clusters.size(), AppPMMLUtils.buildMiningSchema(inputSchema), new ComparisonMeasure(ComparisonMeasure.Kind.DISTANCE).setMeasure(new SquaredEuclidean()), clusteringFields, clusters); }
for (int featureIndex = 0; featureIndex < featureNames.size(); featureIndex++) { String featureName = featureNames.get(featureIndex); MiningField field = new MiningField(FieldName.create(featureName)); if (schema.isNumeric(featureName)) { field.setOpType(OpType.CONTINUOUS);
@Test public void testBuildCategoricalEncoding() { List<DataField> dataFields = new ArrayList<>(); dataFields.add(new DataField(FieldName.create("foo"), OpType.CONTINUOUS, DataType.DOUBLE)); DataField barField = new DataField(FieldName.create("bar"), OpType.CATEGORICAL, DataType.STRING); barField.addValues(new Value("b"), new Value("a")); dataFields.add(barField); DataDictionary dictionary = new DataDictionary(dataFields).setNumberOfFields(dataFields.size()); CategoricalValueEncodings encodings = AppPMMLUtils.buildCategoricalValueEncodings(dictionary); assertEquals(2, encodings.getValueCount(1)); assertEquals(0, encodings.getValueEncodingMap(1).get("b").intValue()); assertEquals(1, encodings.getValueEncodingMap(1).get("a").intValue()); assertEquals("b", encodings.getEncodingValueMap(1).get(0)); assertEquals("a", encodings.getEncodingValueMap(1).get(1)); assertEquals(Collections.singletonMap(1, 2), encodings.getCategoryCounts()); }
dataFields.add(new DataField(FieldName.create("x"), OpType.CONTINUOUS, DataType.DOUBLE)); dataFields.add(new DataField(FieldName.create("y"), OpType.CONTINUOUS, DataType.DOUBLE)); DataDictionary dataDictionary = new DataDictionary(dataFields).setNumberOfFields(dataFields.size()); MiningField xMF = new MiningField(FieldName.create("x")) .setOpType(OpType.CONTINUOUS).setUsageType(MiningField.UsageType.ACTIVE); miningFields.add(xMF); MiningField yMF = new MiningField(FieldName.create("y")) .setOpType(OpType.CONTINUOUS).setUsageType(MiningField.UsageType.ACTIVE); miningFields.add(yMF); FieldName.create("x")).setCenterField(ClusteringField.CenterField.TRUE)); clusteringFields.add(new ClusteringField( FieldName.create("y")).setCenterField(ClusteringField.CenterField.TRUE));
dataFields.add(new DataField(FieldName.create("foo"), OpType.CONTINUOUS, DataType.DOUBLE)); dataFields.add(new DataField(FieldName.create("bar"), OpType.CONTINUOUS, DataType.DOUBLE)); DataDictionary dataDictionary = new DataDictionary(dataFields).setNumberOfFields(dataFields.size()); MiningField predictorMF = new MiningField(FieldName.create("foo")) .setOpType(OpType.CONTINUOUS) .setUsageType(MiningField.UsageType.ACTIVE) .setImportance(0.5); miningFields.add(predictorMF); MiningField targetMF = new MiningField(FieldName.create("bar")) .setOpType(OpType.CONTINUOUS) .setUsageType(MiningField.UsageType.PREDICTED); .setScore("-2.0"); Node right = new Node().setId("r+").setRecordCount(halfCount) .setPredicate(new SimplePredicate(FieldName.create("foo"), SimplePredicate.Operator.GREATER_THAN).setValue("3.14")) .setScore("2.0");
new DataField(FieldName.create("color"), OpType.CATEGORICAL, DataType.STRING); predictor.addValues(new Value("yellow"), new Value("red")); dataFields.add(predictor); DataField target = new DataField(FieldName.create("fruit"), OpType.CATEGORICAL, DataType.STRING); target.addValues(new Value("banana"), new Value("apple")); dataFields.add(target); MiningField predictorMF = new MiningField(FieldName.create("color")) .setOpType(OpType.CATEGORICAL) .setUsageType(MiningField.UsageType.ACTIVE) .setImportance(0.5); miningFields.add(predictorMF); MiningField targetMF = new MiningField(FieldName.create("fruit")) .setOpType(OpType.CATEGORICAL) .setUsageType(MiningField.UsageType.PREDICTED); left.addScoreDistributions(new ScoreDistribution("apple", halfCount)); Node right = new Node().setId("r+").setRecordCount(halfCount) .setPredicate(new SimpleSetPredicate(FieldName.create("color"), SimpleSetPredicate.BooleanOperator.IS_NOT_IN, new Array(Array.Type.STRING, "red")));
@Override public FieldName convert(String value){ FieldName name = FieldName.create(value); return name; } }
static public <T extends Transformer & HasOutputCol> FieldName formatName(T transformer, int index){ return FieldName.create(transformer.getOutputCol() + "[" + index + "]"); } }
private Expression filterConstant(Constant constant){ if(("FMTWIDTH").equals(constant.getValue())){ FieldRef fieldRef = new FieldRef(FieldName.create("FMTWIDTH")); return fieldRef; } return constant; } }
final public FieldRef FieldInvocationExpression() throws ParseException {Token name; name = jj_consume_token(IDENTIFIER); FieldRef fieldRef = new FieldRef() .setField(FieldName.create(name.image)); return fieldRef; }
@Override public List<Feature> encodeFeatures(List<Feature> features, SkLearnEncoder encoder){ Transformer transformer = getTransformer(); String name = getName(); List<Feature> result = transformer.encodeFeatures(features, encoder); ClassDictUtil.checkSize(1, result); encoder.renameFeature(result.get(0), FieldName.create(name)); return result; }
@Override public List<OutputField> registerOutputFields(Label label, SparkMLEncoder encoder){ T model = getTransformer(); String predictionCol = model.getPredictionCol(); OutputField predictedField = ModelUtil.createPredictedField(FieldName.create(predictionCol), label.getDataType(), OpType.CONTINUOUS); encoder.putOnlyFeature(predictionCol, new ContinuousFeature(encoder, predictedField)); return Collections.singletonList(predictedField); } }
@Override public MiningModel encodeMiningModel(List<RegTree> regTrees, float base_score, Integer ntreeLimit, Schema schema){ Schema segmentSchema = schema.toAnonymousSchema(); MiningModel miningModel = createMiningModel(regTrees, base_score, ntreeLimit, segmentSchema) .setOutput(ModelUtil.createPredictedOutput(FieldName.create("xgbValue"), OpType.CONTINUOUS, DataType.FLOAT)); return MiningModelUtil.createRegression(miningModel, RegressionModel.NormalizationMethod.LOGIT, schema); } }
public TermFeature(PMMLEncoder encoder, DefineFunction defineFunction, Feature feature, String value){ super(encoder, FieldName.create(defineFunction.getName() + "(" + value + ")"), defineFunction.getDataType()); setDefineFunction(defineFunction); setFeature(feature); setValue(value); }
@Override public MiningModel encodeMiningModel(List<Tree> trees, Integer numIteration, Schema schema){ Schema segmentSchema = schema.toAnonymousSchema(); MiningModel miningModel = super.encodeMiningModel(trees, numIteration, segmentSchema) .setOutput(ModelUtil.createPredictedOutput(FieldName.create("lgbmValue"), OpType.CONTINUOUS, DataType.DOUBLE)); return MiningModelUtil.createRegression(miningModel, RegressionModel.NormalizationMethod.EXP, schema); } }
public PMML encodePMML(FieldName targetField, List<String> targetCategories, FeatureMap featureMap, Map<String, ?> options){ XGBoostEncoder encoder = new XGBoostEncoder(); if(targetField == null){ targetField = FieldName.create("_target"); } Label label = this.obj.encodeLabel(targetField, targetCategories, encoder); List<Feature> features = featureMap.encodeFeatures(encoder); Schema schema = new Schema(label, features); MiningModel miningModel = encodeMiningModel(options, schema); PMML pmml = encoder.encodePMML(miningModel); return pmml; }
private DataField convertColumnToDataField(ColumnConfig columnConfig) { DataField field = new DataField(); field.setName(FieldName.create(CommonUtils.getSimpleColumnName(columnConfig.getColumnName()))); field.setOptype(getOptype(columnConfig)); field.setDataType(getDataType(field.getOptype())); return field; }
@Override public MiningModel encodeMiningModel(List<Tree> trees, Integer numIteration, Schema schema){ Schema segmentSchema = new Schema(new ContinuousLabel(null, DataType.DOUBLE), schema.getFeatures()); MiningModel miningModel = createMiningModel(trees, numIteration, segmentSchema) .setOutput(ModelUtil.createPredictedOutput(FieldName.create("lgbmValue"), OpType.CONTINUOUS, DataType.DOUBLE, new SigmoidTransformation(-1d * BinomialLogisticRegression.this.sigmoid_))); return MiningModelUtil.createBinaryLogisticClassification(miningModel, 1d, 0d, RegressionModel.NormalizationMethod.NONE, true, schema); } }