// NOTE(review): three identical snippets follow. Each is an incomplete fragment —
// the leading "complexMap2});" closes an array/argument list that begins before
// this view, so the start of the statement is not visible here.
// What the visible part does: builds a StructType with two non-nullable primitive
// fields (a: boolean, b: int), a binary field, three array fields, a simple map,
// and a nested map keyed by array<long>; then creates a Dataset<SimpleJavaBean>
// from two rows with that schema via a bean encoder.
// Presumably row1/row2 are Row fixtures defined above this fragment — TODO confirm.
complexMap2}); StructType schema = new StructType() .add("a", BooleanType, false) .add("b", IntegerType, false) .add("c", BinaryType) .add("d", createArrayType(StringType)) .add("e", createArrayType(StringType)) .add("f", createArrayType(LongType)) .add("g", createMapType(IntegerType, StringType)) .add("h",createMapType(createArrayType(LongType), createMapType(StringType, StringType))); Dataset<SimpleJavaBean> ds3 = spark.createDataFrame(Arrays.asList(row1, row2), schema) .as(Encoders.bean(SimpleJavaBean.class));
complexMap2}); StructType schema = new StructType() .add("a", BooleanType, false) .add("b", IntegerType, false) .add("c", BinaryType) .add("d", createArrayType(StringType)) .add("e", createArrayType(StringType)) .add("f", createArrayType(LongType)) .add("g", createMapType(IntegerType, StringType)) .add("h",createMapType(createArrayType(LongType), createMapType(StringType, StringType))); Dataset<SimpleJavaBean> ds3 = spark.createDataFrame(Arrays.asList(row1, row2), schema) .as(Encoders.bean(SimpleJavaBean.class));
complexMap2}); StructType schema = new StructType() .add("a", BooleanType, false) .add("b", IntegerType, false) .add("c", BinaryType) .add("d", createArrayType(StringType)) .add("e", createArrayType(StringType)) .add("f", createArrayType(LongType)) .add("g", createMapType(IntegerType, StringType)) .add("h",createMapType(createArrayType(LongType), createMapType(StringType, StringType))); Dataset<SimpleJavaBean> ds3 = spark.createDataFrame(Arrays.asList(row1, row2), schema) .as(Encoders.bean(SimpleJavaBean.class));
/**
 * Appends a new field to this struct, resolving the data type from its textual name.
 *
 * @param name name of the field to append
 * @param dataType name of the field's data type, resolved via {@code DataType.fromName}
 * @param metadata metadata attached to the new field
 * @return a struct type extended with the new field
 */
public StructType add(String name, String dataType, Metadata metadata) {
    // Resolve the type name first, then delegate to the DataType-based overload.
    final DataType resolvedType = DataType.fromName(dataType);
    return add(name, resolvedType, metadata);
}
/**
 * Extends the input schema with this transformer's output column.
 * The appended column is a nullable vector field.
 */
@Override
public StructType transformSchema(StructType structType) {
    final String outputColumn = getOutputColumn();
    return structType.add(outputColumn, new VectorUDT(), true);
}
/**
 * Returns a new struct type consisting of all fields of {@code from}
 * followed by the given extra fields. The input schema is not modified;
 * {@code StructType.add} returns an extended copy each time.
 *
 * @param from base schema whose fields come first
 * @param fields additional fields to append, in order
 * @return the combined schema
 */
public static StructType appendFields(StructType from, List<StructField> fields) {
    StructType result = DataTypes.createStructType(from.fields());
    for (StructField extra : fields) {
        result = result.add(extra);
    }
    return result;
}
/**
 * Flattens the nested struct located by {@code getStructSchema} into the
 * top-level schema: every field of the nested struct is appended to the
 * incoming schema, which itself is left unmodified.
 */
@Override
public StructType transformSchema(StructType schema) {
    StructType merged = schema;
    for (StructField nestedField : getStructSchema(schema).fields()) {
        merged = merged.add(nestedField);
    }
    return merged;
}
// NOTE(review): two identical statement fragments; the enclosing loop/method is
// outside this view. Each appends field `f` to the accumulated schema — the
// reassignment is required because add(...) returns the extended schema rather
// than mutating in place.
resultSchema = resultSchema.add(f);
resultSchema = resultSchema.add(f);
/**
 * Returns a copy of the given row widened by one field.
 *
 * @param row source row (must carry a schema)
 * @param fieldName name of the field to append
 * @param fieldType data type of the appended field
 * @param value value stored in the appended field
 * @return a new row whose schema and values end with the extra field
 */
public static Row append(Row row, String fieldName, DataType fieldType, Object value) {
    // Widen the schema, then rebuild the value array with the extra value at the end.
    StructType widenedSchema = row.schema().add(fieldName, fieldType);
    Object[] widenedValues = ObjectArrays.concat(valuesFor(row), value);
    return new RowWithSchema(widenedSchema, widenedValues);
}
/**
 * Nests rows of the "from" dependency under matching rows of the "into" dependency.
 * Both datasets are keyed by the configured key fields, cogrouped, and each
 * "into" row gains an array column (nestedFieldName) holding its matching
 * "from" rows. The result schema is the "into" schema plus that array field.
 *
 * Throws RuntimeException when either configured dependency name is absent
 * from the supplied map.
 *
 * NOTE(review): NestFunction presumably attaches the grouped "from" rows to the
 * single "into" row of each cogroup — its implementation is not visible here;
 * confirm the one-into-row-per-key assumption against NestFunction.
 */
@Override public Dataset<Row> derive(Map<String, Dataset<Row>> dependencies) throws Exception { if (!dependencies.containsKey(intoDependency)) { throw new RuntimeException("Nest deriver points to non-existent nest-into dependency"); } Dataset<Row> into = dependencies.get(intoDependency); if (!dependencies.containsKey(fromDependency)) { throw new RuntimeException("Nest deriver points to non-existent nest-from dependency"); } Dataset<Row> from = dependencies.get(fromDependency); ExtractFieldsFunction extractFieldsFunction = new ExtractFieldsFunction(keyFieldNames); JavaPairRDD<List<Object>, Row> keyedIntoRDD = into.javaRDD().keyBy(extractFieldsFunction); JavaPairRDD<List<Object>, Row> keyedFromRDD = from.javaRDD().keyBy(extractFieldsFunction); NestFunction nestFunction = new NestFunction(); JavaRDD<Row> nestedRDD = keyedIntoRDD.cogroup(keyedFromRDD).values().map(nestFunction); StructType nestedSchema = into.schema().add(nestedFieldName, DataTypes.createArrayType(from.schema())); Dataset<Row> nested = into.sqlContext().createDataFrame(nestedRDD, nestedSchema); return nested; }
/** Builds the schema fixtures and the two configured dummy time models used by the tests. */
@Before
public void before() {
    // Base schema: a single nullable string field.
    schemaWithoutMT = DataTypes.createStructType(Lists.newArrayList(
        DataTypes.createStructField("other", DataTypes.StringType, true)));
    // Same schema extended with the mutation-type marker field.
    schemaWithMT = schemaWithoutMT.add(
        DataTypes.createStructField(MutationType.MUTATION_TYPE_FIELD_NAME, DataTypes.StringType, true));
    // Two-column schema, one column per time model below.
    schemaWithTMs = DataTypes.createStructType(Lists.newArrayList(
        DataTypes.createStructField("first", DataTypes.StringType, true),
        DataTypes.createStructField("second", DataTypes.StringType, true)));
    firstTM = new DummyTimeModel();
    secondTM = new DummyTimeModel();
    firstTM.configure(ConfigFactory.empty(), Lists.newArrayList("first"));
    secondTM.configure(ConfigFactory.empty(), Lists.newArrayList("second"));
}
/**
 * Verifies that VectorSlicer keeps exactly the requested features:
 * slicing by index {1} plus name {"f3"} must yield 2-element output vectors.
 */
@Test
public void vectorSlice() {
    Attribute[] attrs = new Attribute[]{
        NumericAttribute.defaultAttr().withName("f1"),
        NumericAttribute.defaultAttr().withName("f2"),
        NumericAttribute.defaultAttr().withName("f3")
    };
    AttributeGroup group = new AttributeGroup("userFeatures", attrs);
    List<Row> data = Arrays.asList(
        RowFactory.create(Vectors.sparse(3, new int[]{0, 1}, new double[]{-2.0, 2.3})),
        RowFactory.create(Vectors.dense(-2.0, 2.3, 0.0))
    );
    Dataset<Row> dataset = spark.createDataFrame(data, (new StructType()).add(group.toStructField()));
    VectorSlicer vectorSlicer = new VectorSlicer()
        .setInputCol("userFeatures").setOutputCol("features");
    vectorSlicer.setIndices(new int[]{1}).setNames(new String[]{"f3"});
    Dataset<Row> output = vectorSlicer.transform(dataset);
    for (Row r : output.select("userFeatures", "features").takeAsList(2)) {
        Vector features = r.getAs(1);
        // Fixed argument order: JUnit's assertEquals takes (expected, actual);
        // the original had them swapped, which garbles failure messages.
        Assert.assertEquals(2, features.size());
    }
}
}
/**
 * Verifies that VectorSlicer keeps exactly the requested features:
 * slicing by index {1} plus name {"f3"} must yield 2-element output vectors.
 */
@Test
public void vectorSlice() {
    Attribute[] attrs = new Attribute[]{
        NumericAttribute.defaultAttr().withName("f1"),
        NumericAttribute.defaultAttr().withName("f2"),
        NumericAttribute.defaultAttr().withName("f3")
    };
    AttributeGroup group = new AttributeGroup("userFeatures", attrs);
    List<Row> data = Arrays.asList(
        RowFactory.create(Vectors.sparse(3, new int[]{0, 1}, new double[]{-2.0, 2.3})),
        RowFactory.create(Vectors.dense(-2.0, 2.3, 0.0))
    );
    Dataset<Row> dataset = spark.createDataFrame(data, (new StructType()).add(group.toStructField()));
    VectorSlicer vectorSlicer = new VectorSlicer()
        .setInputCol("userFeatures").setOutputCol("features");
    vectorSlicer.setIndices(new int[]{1}).setNames(new String[]{"f3"});
    Dataset<Row> output = vectorSlicer.transform(dataset);
    for (Row r : output.select("userFeatures", "features").takeAsList(2)) {
        Vector features = r.getAs(1);
        // Fixed argument order: JUnit's assertEquals takes (expected, actual);
        // the original had them swapped, which garbles failure messages.
        Assert.assertEquals(2, features.size());
    }
}
}