// Builds a single-column DataFrame of integers 0..999.
private static Dataset<Row> generateData_numbers_1k(SparkSession spark) {
  StructField[] structFields = new StructField[1];
  org.apache.spark.sql.types.DataType dataType = DataTypes.IntegerType;
  String column = "number";
  StructField structField = new StructField(column, dataType, true, Metadata.empty());
  structFields[0] = structField;
  StructType structType = new StructType(structFields);
  List<Row> rows = new ArrayList<>();
  for (int i = 0; i < 1000; i++) {  // strict bound, so the method yields exactly 1000 rows
    Object[] objects = new Object[structFields.length];
    objects[0] = i;
    Row row = RowFactory.create(objects);
    rows.add(row);
  }
  Dataset<Row> df = spark.createDataFrame(rows, structType);
  return df;
}
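A minimal usage sketch for the generator above, assuming an active SparkSession named spark and a static import of org.apache.spark.sql.functions.col:

Dataset<Row> numbers = generateData_numbers_1k(spark);
System.out.println(numbers.count());            // 1000
numbers.filter(col("number").gt(500)).show(5);  // first five rows above 500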
@Test
public void testJavaWord2Vec() {
  StructType schema = new StructType(new StructField[]{
    new StructField("text", new ArrayType(DataTypes.StringType, true), false, Metadata.empty())
  });
  Dataset<Row> documentDF = spark.createDataFrame(
    Arrays.asList(
      RowFactory.create(Arrays.asList("Hi I heard about Spark".split(" "))),
      RowFactory.create(Arrays.asList("I wish Java could use case classes".split(" "))),
      RowFactory.create(Arrays.asList("Logistic regression models are neat".split(" ")))),
    schema);
  Word2Vec word2Vec = new Word2Vec()
    .setInputCol("text")
    .setOutputCol("result")
    .setVectorSize(3)
    .setMinCount(0);
  Word2VecModel model = word2Vec.fit(documentDF);
  Dataset<Row> result = model.transform(documentDF);
  for (Row r : result.select("result").collectAsList()) {
    // Each document is mapped to a vector of the configured size.
    double[] wordVector = ((Vector) r.get(0)).toArray();
    Assert.assertEquals(3, wordVector.length);
  }
}
@Test
public void bucketizerTest() {
  double[] splits = {-0.5, 0.0, 0.5};
  StructType schema = new StructType(new StructField[]{
    new StructField("feature", DataTypes.DoubleType, false, Metadata.empty())
  });
  Dataset<Row> dataset = spark.createDataFrame(
    Arrays.asList(
      RowFactory.create(-0.5),
      RowFactory.create(-0.3),
      RowFactory.create(0.0),
      RowFactory.create(0.2)),
    schema);
  Bucketizer bucketizer = new Bucketizer()
    .setInputCol("feature")
    .setOutputCol("result")
    .setSplits(splits);
  List<Row> result = bucketizer.transform(dataset).select("result").collectAsList();
  for (Row r : result) {
    double index = r.getDouble(0);
    Assert.assertTrue((index >= 0) && (index <= 1));
  }
}
@Test
public void javaCompatibilityTest() {
  double[] input = new double[]{1D, 2D, 3D, 4D};
  Dataset<Row> dataset = spark.createDataFrame(
    Arrays.asList(RowFactory.create(Vectors.dense(input))),
    new StructType(new StructField[]{
      new StructField("vec", new VectorUDT(), false, Metadata.empty())
    }));
  double[] expectedResult = input.clone();
  new DoubleDCT_1D(input.length).forward(expectedResult, true);
  DCT dct = new DCT()
    .setInputCol("vec")
    .setOutputCol("resultVec");
  List<Row> result = dct.transform(dataset).select("resultVec").collectAsList();
  Vector resultVec = result.get(0).getAs("resultVec");
  Assert.assertArrayEquals(expectedResult, resultVec.toArray(), 1e-6);
}
@Test
public void hashingTF() {
  List<Row> data = Arrays.asList(
    RowFactory.create(0.0, "Hi I heard about Spark"),
    RowFactory.create(0.0, "I wish Java could use case classes"),
    RowFactory.create(1.0, "Logistic regression models are neat")
  );
  StructType schema = new StructType(new StructField[]{
    new StructField("label", DataTypes.DoubleType, false, Metadata.empty()),
    new StructField("sentence", DataTypes.StringType, false, Metadata.empty())
  });
  Dataset<Row> sentenceData = spark.createDataFrame(data, schema);
  Tokenizer tokenizer = new Tokenizer()
    .setInputCol("sentence")
    .setOutputCol("words");
  Dataset<Row> wordsData = tokenizer.transform(sentenceData);
  int numFeatures = 20;
  HashingTF hashingTF = new HashingTF()
    .setInputCol("words")
    .setOutputCol("rawFeatures")
    .setNumFeatures(numFeatures);
  Dataset<Row> featurizedData = hashingTF.transform(wordsData);
  IDF idf = new IDF().setInputCol("rawFeatures").setOutputCol("features");
  IDFModel idfModel = idf.fit(featurizedData);
  Dataset<Row> rescaledData = idfModel.transform(featurizedData);
  for (Row r : rescaledData.select("features", "label").takeAsList(3)) {
    Vector features = r.getAs(0);
    Assert.assertEquals(numFeatures, features.size());
  }
}
@Test
public void constructSimpleRow() {
  // Example values; the original suite defines these earlier. Positions in
  // the Row mirror the assertions below.
  byte byteValue = (byte) 127;
  short shortValue = (short) 32767;
  Row simpleRow = RowFactory.create(
    byteValue,                // ByteType at position 0
    Byte.valueOf(byteValue),  // boxed ByteType at position 1
    shortValue);              // ShortType at position 2
  Assert.assertEquals(byteValue, simpleRow.getByte(0));
  Assert.assertEquals(byteValue, simpleRow.get(0));
  Assert.assertEquals(byteValue, simpleRow.getByte(1));
  Assert.assertEquals(byteValue, simpleRow.get(1));
  Assert.assertEquals(shortValue, simpleRow.getShort(2));
}
@Test
public void dataFrameRDDOperations() {
  List<Person> personList = new ArrayList<>(2);
  Person person1 = new Person();
  person1.setName("Michael");
  person1.setAge(29);
  personList.add(person1);
  Person person2 = new Person();
  person2.setName("Yin");
  person2.setAge(28);
  personList.add(person2);
  JavaRDD<Row> rowRDD = jsc.parallelize(personList).map(
    person -> RowFactory.create(person.getName(), person.getAge()));
  List<StructField> fields = new ArrayList<>(2);
  fields.add(DataTypes.createStructField("name", DataTypes.StringType, false));
  fields.add(DataTypes.createStructField("age", DataTypes.IntegerType, false));
  StructType schema = DataTypes.createStructType(fields);
  Dataset<Row> df = spark.createDataFrame(rowRDD, schema);
  df.createOrReplaceTempView("people");
  List<String> actual = spark.sql("SELECT * FROM people").toJavaRDD()
    .map(row -> row.getString(0) + "_" + row.get(1)).collect();
  List<String> expected = new ArrayList<>(2);
  expected.add("Michael_29");
  expected.add("Yin_28");
  Assert.assertEquals(expected, actual);
}
@Test
public void vectorSlice() {
  Attribute[] attrs = new Attribute[]{
    NumericAttribute.defaultAttr().withName("f1"),
    NumericAttribute.defaultAttr().withName("f2"),
    NumericAttribute.defaultAttr().withName("f3")
  };
  AttributeGroup group = new AttributeGroup("userFeatures", attrs);
  List<Row> data = Arrays.asList(
    RowFactory.create(Vectors.sparse(3, new int[]{0, 1}, new double[]{-2.0, 2.3})),
    RowFactory.create(Vectors.dense(-2.0, 2.3, 0.0))
  );
  Dataset<Row> dataset = spark.createDataFrame(data, (new StructType()).add(group.toStructField()));
  VectorSlicer vectorSlicer = new VectorSlicer()
    .setInputCol("userFeatures")
    .setOutputCol("features");
  // Selects index 1 ("f2") by position and "f3" by name, so the slice has two entries.
  vectorSlicer.setIndices(new int[]{1}).setNames(new String[]{"f3"});
  Dataset<Row> output = vectorSlicer.transform(dataset);
  for (Row r : output.select("userFeatures", "features").takeAsList(2)) {
    Vector features = r.getAs(1);
    Assert.assertEquals(2, features.size());
  }
}
@Test
public void testConvertVectorColumnsToAndFromML() {
  Vector x = Vectors.dense(2.0);
  Dataset<Row> dataset = spark.createDataFrame(
    Collections.singletonList(new LabeledPoint(1.0, x)), LabeledPoint.class
  ).select("label", "features");
  Dataset<Row> newDataset1 = MLUtils.convertVectorColumnsToML(dataset);
  Row new1 = newDataset1.first();
  Assert.assertEquals(RowFactory.create(1.0, x.asML()), new1);
  Row new2 = MLUtils.convertVectorColumnsToML(dataset, "features").first();
  Assert.assertEquals(new1, new2);
  Row old1 = MLUtils.convertVectorColumnsFromML(newDataset1).first();
  Assert.assertEquals(RowFactory.create(1.0, x), old1);
}
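The test above exercises MLUtils' column-level converters; per vector they come down to the conversions between the legacy mllib and newer ml linalg types. A minimal sketch (fully qualified names keep the two packages apart):

org.apache.spark.mllib.linalg.Vector oldVec =
  org.apache.spark.mllib.linalg.Vectors.dense(2.0);
org.apache.spark.ml.linalg.Vector newVec = oldVec.asML();   // mllib -> ml
org.apache.spark.mllib.linalg.Vector roundTrip =
  org.apache.spark.mllib.linalg.Vectors.fromML(newVec);     // ml -> mllib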
@Test
public void applySchemaToJSON() {
  Dataset<String> jsonDS = spark.createDataset(Arrays.asList(
    "{\"string\":\"this is a simple string.\", \"integer\":10, \"long\":21474836470, " +
      "\"bigInteger\":92233720368547758070, \"double\":1.7976931348623157E308, " +
      "\"boolean\":true, \"null\":null}",
    "{\"string\":\"this is another simple string.\", \"integer\":11, \"long\":21474836469, " +
      "\"bigInteger\":92233720368547758069, \"double\":1.7976931348623157E305, " +
      "\"boolean\":false, \"null\":null}"),
    Encoders.STRING());
  // Spark lists inferred JSON fields alphabetically.
  StructType expectedSchema = DataTypes.createStructType(Arrays.asList(
    DataTypes.createStructField("bigInteger", DataTypes.createDecimalType(20, 0), true),
    DataTypes.createStructField("boolean", DataTypes.BooleanType, true),
    DataTypes.createStructField("double", DataTypes.DoubleType, true),
    DataTypes.createStructField("integer", DataTypes.LongType, true),
    DataTypes.createStructField("long", DataTypes.LongType, true),
    DataTypes.createStructField("null", DataTypes.StringType, true),
    DataTypes.createStructField("string", DataTypes.StringType, true)));
  List<Row> expectedResult = new ArrayList<>(2);
  expectedResult.add(RowFactory.create(
    new BigDecimal("92233720368547758070"), true, 1.7976931348623157E308,
    10L, 21474836470L, null, "this is a simple string."));
  expectedResult.add(RowFactory.create(
    new BigDecimal("92233720368547758069"), false, 1.7976931348623157E305,
    11L, 21474836469L, null, "this is another simple string."));
  Dataset<Row> df1 = spark.read().json(jsonDS);
  StructType actualSchema1 = df1.schema();
  Assert.assertEquals(expectedSchema, actualSchema1);
  df1.createOrReplaceTempView("jsonTable1");
  List<Row> actual1 = spark.sql("select * from jsonTable1").collectAsList();
  Assert.assertEquals(expectedResult, actual1);
}
// Converts an input item to a Row matching the given StructType: Rows are
// copied field by field (their schema is required), Maps are matched by
// field name, and anything else is rejected.
if (item instanceof Row) {
  Row row = (Row) item;
  StructType schema = row.schema();
  if (null == schema) {
    throw new RuntimeException(String.format(
      "Type[%s] - Invalid Row format, no schema found: Row[%s]", type, row));
  }
  String[] fieldNames = schema.fieldNames();
  List<Object> valueList = new ArrayList<>(fieldNames.length);
  for (int i = 0; i < fieldNames.length; i++) {
    Object value = row.get(i);
    valueList.add(value);
  }
  return RowFactory.create(valueList.toArray());
} else if (item instanceof Map) {
  Map<Object, Object> input = (Map<Object, Object>) item;
  StructField[] fields = ((StructType) type).fields();
  List<Object> valueList = new ArrayList<>(fields.length);
  for (StructField f : fields) {
    Object value = input.containsKey(f.name()) ? input.get(f.name()) : null;
    valueList.add(value);
  }
  return RowFactory.create(valueList.toArray());
} else {
  throw new RuntimeException(String.format(
    "Type[%s] - Invalid or unrecognized input format: %s", type, item));
}
// Elsewhere in the same converter, decimal columns are matched by type name:
// default: if (type.typeName().startsWith("decimal")) { ... }
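A hypothetical call into the conversion logic above, assuming it lives in a method shaped like Object toRowValue(Object value, DataType type) (the signature used by the Record-to-Row snippet below); the field names and values are illustrative only:

StructType type = new StructType()
  .add("name", DataTypes.StringType)
  .add("age", DataTypes.IntegerType);
Map<Object, Object> input = new HashMap<>();
input.put("name", "Michael");
input.put("age", 29);
Row converted = (Row) toRowValue(input, type);  // hypothetical method name -> [Michael,29]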
// Fragments from an AgePredict pipeline; "document", "context", "normalizer",
// and "eventDF" are defined by the surrounding code.
List<Row> data = new ArrayList<>();
SparkSession spark = SparkSession.builder().appName("AgePredict").getOrCreate();
data.add(RowFactory.create(document, context.toArray()));
StructType schema = new StructType(new StructField[]{
  new StructField("document", DataTypes.StringType, false, Metadata.empty()),
  new StructField("text", new ArrayType(DataTypes.StringType, true), false, Metadata.empty())
});
Dataset<Row> df = spark.createDataFrame(data, schema);
JavaRDD<Row> normEventDF = normalizer.transform(eventDF).javaRDD();
Row event = normEventDF.first();  // assumed here: take one normalized row to read its vector
SparseVector sp = (SparseVector) event.getAs("normFeature");
spark.stop();
// Converts a Record into a Spark Row by mapping each schema field; the
// lookup of "recordValue" from the record's active fields is reconstructed.
ListMultimap<String, Object> activeFields = record.getFields();
List<Object> values = new ArrayList<>(schema.fields().length);
for (StructField field : schema.fields()) {
  String fieldName = field.name();
  DataType fieldDataType = field.dataType();
  try {
    Object recordValue = activeFields.get(fieldName).get(0);
    values.add(RowUtils.toRowValue(recordValue, fieldDataType));
  } catch (Exception e) {
    throw new RuntimeException("Could not convert field '" + fieldName + "' to a Row value", e);
  }
}
Row result = RowFactory.create(values.toArray());
LOG.trace("Converted Record to Row: {}", result);
@Test
public void applySchema() {
  List<Person> personList = new ArrayList<>(2);
  Person person1 = new Person();
  person1.setName("Michael");
  person1.setAge(29);
  personList.add(person1);
  Person person2 = new Person();
  person2.setName("Yin");
  person2.setAge(28);
  personList.add(person2);
  JavaRDD<Row> rowRDD = jsc.parallelize(personList).map(
    person -> RowFactory.create(person.getName(), person.getAge()));
  List<StructField> fields = new ArrayList<>(2);
  fields.add(DataTypes.createStructField("name", DataTypes.StringType, false));
  fields.add(DataTypes.createStructField("age", DataTypes.IntegerType, false));
  StructType schema = DataTypes.createStructType(fields);
  Dataset<Row> df = spark.createDataFrame(rowRDD, schema);
  df.createOrReplaceTempView("people");
  List<Row> actual = spark.sql("SELECT * FROM people").collectAsList();
  List<Row> expected = new ArrayList<>(2);
  expected.add(RowFactory.create("Michael", 29));
  expected.add(RowFactory.create("Yin", 28));
  Assert.assertEquals(expected, actual);
}
public static void main(String[] args) {
  SparkSession spark = SparkSession
    .builder()
    .appName("DataFrame-FromRowsAndSchema")
    .master("local[4]")
    .getOrCreate();
  // Schema reconstructed from the rows below; apart from "state", which the
  // filter relies on, the field names are assumptions.
  StructType customerSchema = DataTypes.createStructType(new StructField[]{
    DataTypes.createStructField("id", DataTypes.IntegerType, false),
    DataTypes.createStructField("name", DataTypes.StringType, false),
    DataTypes.createStructField("sales", DataTypes.DoubleType, false),
    DataTypes.createStructField("discount", DataTypes.DoubleType, false),
    DataTypes.createStructField("state", DataTypes.StringType, false)
  });
  List<Row> customerRows = Arrays.asList(
    RowFactory.create(1, "Widget Co", 120000.00, 0.00, "AZ"),
    RowFactory.create(2, "Acme Widgets", 410500.00, 500.00, "CA"),
    RowFactory.create(3, "Widgetry", 410500.00, 200.00, "CA"),
    RowFactory.create(4, "Widgets R Us", 410500.00, 0.0, "CA"),
    RowFactory.create(5, "Ye Olde Widgete", 500.00, 0.0, "MA")
  );
  Dataset<Row> customerDF = spark.createDataFrame(customerRows, customerSchema);
  customerDF.printSchema();
  customerDF.show();
  customerDF.filter(col("state").equalTo("CA")).show();
  spark.stop();
}
private void start() {
  SparkSession spark = SparkSession.builder()
    .appName("Build a DataFrame from Scratch")
    .master("local[*]")
    .getOrCreate();
  List<String> stringAsList = new ArrayList<>();
  stringAsList.add("bar");
  JavaSparkContext sparkContext = new JavaSparkContext(spark.sparkContext());
  JavaRDD<Row> rowRDD = sparkContext.parallelize(stringAsList)
    .map((String row) -> RowFactory.create(row));
  // Creates schema
  StructType schema = DataTypes.createStructType(
    new StructField[]{DataTypes.createStructField("foe", DataTypes.StringType, false)});
  Dataset<Row> df = spark.sqlContext().createDataFrame(rowRDD, schema).toDF();
  log.debug("** Schema: ");
  df.printSchema();
  log.debug("** Data: ");
  df.show();
  sparkContext.close();
}
@Test
public void testCreateDataFrameFromList() {
  StructType schema = createStructType(Arrays.asList(createStructField("i", IntegerType, true)));
  List<Row> rows = Arrays.asList(RowFactory.create(0));
  Dataset<Row> df = spark.createDataFrame(rows, schema);
  List<Row> result = df.collectAsList();
  Assert.assertEquals(1, result.size());
}
@Override
public Iterator<Row> call(String line) throws Exception {
  List<Row> list = new ArrayList<>();
  String[] tokens = line.split("\\s");
  for (int i = 0; i < tokens.length; i++) {
    // Clamp the co-occurrence window to the bounds of the token array.
    int start = (i - broadcastWindow.value() < 0) ? 0 : i - broadcastWindow.value();
    int end = (i + broadcastWindow.value() >= tokens.length)
      ? tokens.length - 1 : i + broadcastWindow.value();
    for (int j = start; j <= end; j++) {
      if (j != i) {
        // Emit one (word, neighbor, 1) row per co-occurring pair.
        list.add(RowFactory.create(tokens[i], tokens[j], 1));
      }
    }
  }
  return list.iterator();
}
});
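A minimal sketch of consuming the rows emitted above, assuming the flatMap result is a JavaRDD<Row> named pairs, an active SparkSession named spark, and the org.apache.spark.sql.functions class imported as functions; the column names are illustrative:

StructType pairSchema = DataTypes.createStructType(new StructField[]{
  DataTypes.createStructField("word", DataTypes.StringType, false),
  DataTypes.createStructField("neighbor", DataTypes.StringType, false),
  DataTypes.createStructField("count", DataTypes.IntegerType, false)
});
Dataset<Row> coOccurrence = spark.createDataFrame(pairs, pairSchema)
  .groupBy("word", "neighbor")
  .agg(functions.sum("count").alias("total"));
coOccurrence.show();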
// "doubleValue", "stringValue", "timestampValue", the arrays, and the maps
// are defined earlier in the test; complexStruct's arguments follow the
// assertions below.
Row simpleStruct = RowFactory.create(doubleValue, stringValue, timestampValue, null);
Row complexStruct = RowFactory.create(
  simpleStringArray, simpleMap, simpleStruct, arrayOfMaps, arrayOfRows, null);
Assert.assertEquals(simpleStringArray, complexStruct.get(0));
Assert.assertEquals(simpleMap, complexStruct.get(1));
Assert.assertEquals(simpleStruct, complexStruct.get(2));
Assert.assertEquals(arrayOfMaps, complexStruct.get(3));
Assert.assertEquals(arrayOfRows, complexStruct.get(4));
Row complexRow = RowFactory.create(arrayOfMaps, arrayOfRows, complexMap, complexStruct);
Assert.assertEquals(arrayOfMaps, complexRow.get(0));
Assert.assertEquals(arrayOfRows, complexRow.get(1));