@Test
public void pivot() {
  // Pivot courseSales by course into per-year earnings columns
  // ("dotNET", "Java"), then verify the summed earnings for both years.
  Dataset<Row> courseSales = spark.table("courseSales");
  List<Row> rows = courseSales
      .groupBy("year")
      .pivot("course", Arrays.asList("dotNET", "Java"))
      .agg(sum("earnings"))
      .orderBy("year")
      .collectAsList();

  Row year2012 = rows.get(0);
  Assert.assertEquals(2012, year2012.getInt(0));
  Assert.assertEquals(15000.0, year2012.getDouble(1), 0.01);
  Assert.assertEquals(20000.0, year2012.getDouble(2), 0.01);

  Row year2013 = rows.get(1);
  Assert.assertEquals(2013, year2013.getInt(0));
  Assert.assertEquals(48000.0, year2013.getDouble(1), 0.01);
  Assert.assertEquals(30000.0, year2013.getDouble(2), 0.01);
}
@Test
public void testSampleBy() {
  // Stratified sampling: ids 0..99 yield keys 0..2 via mod 3; sample key 0
  // at 10% and key 1 at 20% with seed 0, then sanity-check the group counts.
  Dataset<Row> keyed = spark.range(0, 100, 1, 2).select(col("id").mod(3).as("key"));
  Dataset<Row> sample = keyed.stat().sampleBy("key", ImmutableMap.of(0, 0.1, 1, 0.2), 0L);
  List<Row> counts = sample.groupBy("key").count().orderBy("key").collectAsList();

  Row keyZero = counts.get(0);
  Assert.assertEquals(0, keyZero.getLong(0));
  long zeroCount = keyZero.getLong(1);
  Assert.assertTrue(0 <= zeroCount && zeroCount <= 8);

  Row keyOne = counts.get(1);
  Assert.assertEquals(1, keyOne.getLong(0));
  long oneCount = keyOne.getLong(1);
  Assert.assertTrue(2 <= oneCount && oneCount <= 13);
}
@Test
public void testSampleBy() {
  // Build 100 rows (two partitions) whose key cycles 0,1,2; draw a stratified
  // sample (key 0 -> 10%, key 1 -> 20%) at a fixed seed and check that the
  // resulting per-key counts fall in their expected ranges.
  Dataset<Row> source = spark.range(0, 100, 1, 2).select(col("id").mod(3).as("key"));
  Dataset<Row> stratified = source.stat().sampleBy("key", ImmutableMap.of(0, 0.1, 1, 0.2), 0L);
  List<Row> grouped = stratified.groupBy("key").count().orderBy("key").collectAsList();

  Row first = grouped.get(0);
  Row second = grouped.get(1);

  Assert.assertEquals(0, first.getLong(0));
  Assert.assertTrue(0 <= first.getLong(1) && first.getLong(1) <= 8);

  Assert.assertEquals(1, second.getLong(0));
  Assert.assertTrue(2 <= second.getLong(1) && second.getLong(1) <= 13);
}
@Test
public void pivot() {
  // Verify pivoting courseSales on course produces one earnings column per
  // listed value ("dotNET", "Java"), summed per year.
  List<Row> perYear = spark.table("courseSales")
      .groupBy("year")
      .pivot("course", Arrays.asList("dotNET", "Java"))
      .agg(sum("earnings"))
      .orderBy("year")
      .collectAsList();

  Row first = perYear.get(0);
  Row second = perYear.get(1);

  Assert.assertEquals(2012, first.getInt(0));
  Assert.assertEquals(15000.0, first.getDouble(1), 0.01);
  Assert.assertEquals(20000.0, first.getDouble(2), 0.01);

  Assert.assertEquals(2013, second.getInt(0));
  Assert.assertEquals(48000.0, second.getDouble(1), 0.01);
  Assert.assertEquals(30000.0, second.getDouble(2), 0.01);
}
@Test
public void pivot() {
  // Pivot test: earnings per course become columns keyed by year; check the
  // totals row-by-row for 2012 and 2013.
  Dataset<Row> df = spark.table("courseSales");
  Dataset<Row> pivoted = df.groupBy("year")
      .pivot("course", Arrays.asList("dotNET", "Java"))
      .agg(sum("earnings"))
      .orderBy("year");
  List<Row> actual = pivoted.collectAsList();

  Assert.assertEquals(2012, actual.get(0).getInt(0));
  Assert.assertEquals(15000.0, actual.get(0).getDouble(1), 0.01);
  Assert.assertEquals(20000.0, actual.get(0).getDouble(2), 0.01);

  Assert.assertEquals(2013, actual.get(1).getInt(0));
  Assert.assertEquals(48000.0, actual.get(1).getDouble(1), 0.01);
  Assert.assertEquals(30000.0, actual.get(1).getDouble(2), 0.01);
}
@Test
public void testSampleBy() {
  // sampleBy with fixed seed: keys derived from id mod 3; fractions 10% / 20%
  // for keys 0 and 1. Count bounds are loose to tolerate sampling variance.
  Dataset<Row> df = spark.range(0, 100, 1, 2).select(col("id").mod(3).as("key"));
  Dataset<Row> picked = df.stat().sampleBy("key", ImmutableMap.of(0, 0.1, 1, 0.2), 0L);
  List<Row> tallies = picked.groupBy("key").count().orderBy("key").collectAsList();

  long firstKey = tallies.get(0).getLong(0);
  long firstCount = tallies.get(0).getLong(1);
  Assert.assertEquals(0, firstKey);
  Assert.assertTrue(0 <= firstCount && firstCount <= 8);

  long secondKey = tallies.get(1).getLong(0);
  long secondCount = tallies.get(1).getLong(1);
  Assert.assertEquals(1, secondKey);
  Assert.assertTrue(2 <= secondCount && secondCount <= 13);
}
// Instrumented orderBy(varargs) override: snapshot the user-triggered state
// via initializeFunction (presumably distinguishes user calls from internal
// ones — TODO confirm against its definition), delegate to the parent
// implementation, wrap the result with from(), then restore the flag.
// NOTE(review): the statement order (snapshot -> super -> restore) appears
// deliberate; do not reorder.
@Override public Dataset<T> orderBy(final String sortCol, final String... sortCols) { final boolean userTriggered = initializeFunction(sortCol, sortCols); final Dataset<T> result = from(super.orderBy(sortCol, sortCols)); this.setIsUserTriggered(userTriggered); return result; }
// Instrumented orderBy(Column...) override: same snapshot/delegate/restore
// pattern as the other orderBy overloads — capture the user-triggered state,
// call super, wrap via from(), restore the flag, return the wrapped Dataset.
// NOTE(review): semantics of initializeFunction/setIsUserTriggered are not
// visible here; statement order looks intentional — do not reorder.
@Override public Dataset<T> orderBy(final Column... sortExprs) { final boolean userTriggered = initializeFunction(sortExprs); final Dataset<T> result = from(super.orderBy(sortExprs)); this.setIsUserTriggered(userTriggered); return result; }
// Instrumented orderBy(Seq[Column]) override (Scala-facing overload): capture
// the user-triggered state, delegate to super, wrap the result with from(),
// then restore the flag before returning.
// NOTE(review): mirrors the varargs overloads; keep statement order intact.
@Override public Dataset<T> orderBy(final scala.collection.Seq<Column> sortExprs) { final boolean userTriggered = initializeFunction(sortExprs); final Dataset<T> result = from(super.orderBy(sortExprs)); this.setIsUserTriggered(userTriggered); return result; }
// Instrumented orderBy(String, Seq[String]) override (Scala-facing overload):
// capture the user-triggered state, delegate to super, wrap via from(), then
// restore the flag before returning the wrapped Dataset.
// NOTE(review): same snapshot/delegate/restore pattern as the sibling
// overloads; statement order appears intentional — do not reorder.
@Override public Dataset<T> orderBy(final String sortCol, final scala.collection.Seq<String> sortCols) { final boolean userTriggered = initializeFunction(sortCol, sortCols); final Dataset<T> result = from(super.orderBy(sortCol, sortCols)); this.setIsUserTriggered(userTriggered); return result; }
/**
 * Demo entry point: reads the {@code actor} table from a local MySQL
 * (sakila) database over JDBC into a Dataset, sorts it by last name, and
 * prints it to stdout.
 *
 * <p>NOTE(review): credentials are hard-coded in source and SSL is disabled —
 * acceptable only for a local demo; move to configuration/environment before
 * any other use.
 */
private void start() {
  SparkSession spark = SparkSession.builder()
      .appName("Dataset from MySQL JDBC Connection")
      .master("local")
      .getOrCreate();

  // Unqualified Properties matches the `new Properties()` already used here;
  // the redundant java.util.Properties qualification was inconsistent.
  Properties props = new Properties();
  props.put("user", "root");
  props.put("password", "password");
  props.put("useSSL", "false");

  // serverTimezone avoids MySQL driver timezone-detection failures.
  Dataset<Row> df = spark.read().jdbc(
      "jdbc:mysql://localhost:3306/sakila?serverTimezone=EST", "actor", props);
  df = df.orderBy(df.col("last_name"));
  df.show();
}
}
// Load the JSON file, project the nested fields.district column, then count
// and print rows per district (up to 150 rows, untruncated).
// FIX: the dateFormat pattern used "yyyy-mm-dd", where `mm` means
// minute-of-hour in Java date patterns; month is `MM`, so dates would be
// parsed with a minutes field in the month position. Corrected to yyyy-MM-dd.
df = spark.read().option("dateFormat", "yyyy-MM-dd").json(fileToAnalyze);
df = df.withColumn("district", df.col("fields.district"));
df = df.groupBy("district").count().orderBy(df.col("district"));
df.show(150, false);
// (continuation of a DataFrameReader chain begun on an earlier, unseen line)
.load(location.toString());
// Read back the rows ordered by id, decoded as SimpleRecord beans, for
// comparison against the expected records.
List<SimpleRecord> actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList();
// (continuation of a DataFrameReader chain begun on an earlier, unseen line)
.load(location.toString());
// Collect the rows ordered by id as SimpleRecord beans for verification.
List<SimpleRecord> actual = result.orderBy("id").as( Encoders.bean(SimpleRecord.class)).collectAsList();
@Test
public void testStringIndexer() {
  // Fit a StringIndexer on six labeled rows and check the produced indices
  // against the expected mapping (per the expected rows: "a"->0.0, "c"->1.0,
  // "b"->2.0).
  StructType schema = createStructType(new StructField[]{
      createStructField("id", IntegerType, false),
      createStructField("label", StringType, false)
  });
  List<Row> input = Arrays.asList(
      cr(0, "a"), cr(1, "b"), cr(2, "c"), cr(3, "a"), cr(4, "a"), cr(5, "c"));
  Dataset<Row> dataset = spark.createDataFrame(input, schema);

  StringIndexer indexer = new StringIndexer()
      .setInputCol("label")
      .setOutputCol("labelIndex");
  Dataset<Row> indexed = indexer.fit(dataset).transform(dataset);

  List<Row> expected = Arrays.asList(
      cr(0, 0.0), cr(1, 2.0), cr(2, 1.0), cr(3, 0.0), cr(4, 0.0), cr(5, 1.0));
  Assert.assertEquals(
      expected,
      indexed.orderBy("id").select("id", "labelIndex").collectAsList());
}
@Test
public void testStringIndexer() {
  // StringIndexer round-trip: build a small (id, label) frame, fit an
  // indexer mapping labels to doubles, and compare the transformed output
  // row-for-row with the expected indices.
  StructField[] fields = new StructField[]{
      createStructField("id", IntegerType, false),
      createStructField("label", StringType, false)
  };
  StructType schema = createStructType(fields);
  Dataset<Row> dataset = spark.createDataFrame(
      Arrays.asList(cr(0, "a"), cr(1, "b"), cr(2, "c"), cr(3, "a"), cr(4, "a"), cr(5, "c")),
      schema);

  StringIndexer indexer = new StringIndexer().setInputCol("label").setOutputCol("labelIndex");
  Dataset<Row> transformed = indexer.fit(dataset).transform(dataset);

  Assert.assertEquals(
      Arrays.asList(cr(0, 0.0), cr(1, 2.0), cr(2, 1.0), cr(3, 0.0), cr(4, 0.0), cr(5, 1.0)),
      transformed.orderBy("id").select("id", "labelIndex").collectAsList());
}
@Test
public void testStringIndexer() {
  // Verify StringIndexer assigns the expected double index per label when
  // fitted and applied to the same six-row dataset.
  StructType rowSchema = createStructType(new StructField[]{
      createStructField("id", IntegerType, false),
      createStructField("label", StringType, false)
  });
  List<Row> labeled = Arrays.asList(
      cr(0, "a"), cr(1, "b"), cr(2, "c"), cr(3, "a"), cr(4, "a"), cr(5, "c"));
  Dataset<Row> frame = spark.createDataFrame(labeled, rowSchema);

  Dataset<Row> result = new StringIndexer()
      .setInputCol("label")
      .setOutputCol("labelIndex")
      .fit(frame)
      .transform(frame);

  List<Row> want = Arrays.asList(
      cr(0, 0.0), cr(1, 2.0), cr(2, 1.0), cr(3, 0.0), cr(4, 0.0), cr(5, 1.0));
  Assert.assertEquals(want, result.orderBy("id").select("id", "labelIndex").collectAsList());
}