@Override
public Dataset<String> read(SparkSession spark, Properties profilerProps, Properties readerProps) {
  String inputPath = TELEMETRY_INPUT_PATH.get(profilerProps, String.class);
  if (inputFormat == null) {
    inputFormat = TELEMETRY_INPUT_FORMAT.get(profilerProps, String.class);
  }
  LOG.debug("Loading telemetry; inputPath={}, inputFormat={}", inputPath, inputFormat);
  return spark
      .read()
      .options(Maps.fromProperties(readerProps))
      .format(inputFormat)
      .load(inputPath)
      .as(Encoders.STRING());
}
}
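// Hypothetical standalone sketch (not from the source): what read() boils down to
// once the properties are resolved, assuming local execution, "text" as the input
// format, and a made-up /path/to/telemetry input path.
SparkSession sparkSession = SparkSession.builder()
    .master("local[*]")
    .appName("telemetry-read-sketch")
    .getOrCreate();

// With the "text" source, each input line becomes one String element,
// matching the Encoders.STRING() conversion used above.
Dataset<String> telemetry = sparkSession
    .read()
    .format("text")                // stand-in for TELEMETRY_INPUT_FORMAT
    .load("/path/to/telemetry")    // stand-in for TELEMETRY_INPUT_PATH
    .as(Encoders.STRING());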
@Test
public void testJoin() {
  List<Integer> data = Arrays.asList(1, 2, 3);
  Dataset<Integer> ds = spark.createDataset(data, Encoders.INT()).as("a");
  List<Integer> data2 = Arrays.asList(2, 3, 4);
  Dataset<Integer> ds2 = spark.createDataset(data2, Encoders.INT()).as("b");

  Dataset<Tuple2<Integer, Integer>> joined =
      ds.joinWith(ds2, col("a.value").equalTo(col("b.value")));

  Assert.assertEquals(
      Arrays.asList(tuple2(2, 2), tuple2(3, 3)),
      joined.collectAsList());
}
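// Hypothetical follow-on to testJoin (not part of the original test): joinWith
// returns a typed Dataset of Tuple2, so matched pairs can be consumed without
// casting, e.g. by mapping out the left element.
// Requires org.apache.spark.api.java.function.MapFunction.
List<Integer> leftValues = joined
    .map((MapFunction<Tuple2<Integer, Integer>, Integer>) pair -> pair._1(), Encoders.INT())
    .collectAsList();   // expected: [2, 3]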
@Test
public void testTypedAggregationAnonClass() {
  KeyValueGroupedDataset<String, Tuple2<String, Integer>> grouped = generateGroupedDataset();

  Dataset<Tuple2<String, Integer>> agged = grouped.agg(new IntSumOf().toColumn());
  Assert.assertEquals(
      Arrays.asList(new Tuple2<>("a", 3), new Tuple2<>("b", 3)),
      agged.collectAsList());

  Dataset<Tuple2<String, Integer>> agged2 = grouped.agg(new IntSumOf().toColumn())
      .as(Encoders.tuple(Encoders.STRING(), Encoders.INT()));
  Assert.assertEquals(
      Arrays.asList(new Tuple2<>("a", 3), new Tuple2<>("b", 3)),
      agged2.collectAsList());
}
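// The IntSumOf aggregator and the generateGroupedDataset() helper used above are
// defined elsewhere in the suite and not shown here. Plausible reconstructions
// follow; class shape, data, and names are assumptions, kept consistent with the
// expected output of ("a", 3) and ("b", 3).
import java.util.Arrays;
import java.util.List;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoder;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.KeyValueGroupedDataset;
import org.apache.spark.sql.expressions.Aggregator;
import scala.Tuple2;

// Sums the Integer half of each grouped (String, Integer) tuple.
static class IntSumOf extends Aggregator<Tuple2<String, Integer>, Integer, Integer> {
  @Override public Integer zero() { return 0; }
  @Override public Integer reduce(Integer sum, Tuple2<String, Integer> value) { return sum + value._2(); }
  @Override public Integer merge(Integer left, Integer right) { return left + right; }
  @Override public Integer finish(Integer reduction) { return reduction; }
  @Override public Encoder<Integer> bufferEncoder() { return Encoders.INT(); }
  @Override public Encoder<Integer> outputEncoder() { return Encoders.INT(); }
}

// Groups (String, Integer) tuples by their String key.
private KeyValueGroupedDataset<String, Tuple2<String, Integer>> generateGroupedDataset() {
  Encoder<Tuple2<String, Integer>> encoder = Encoders.tuple(Encoders.STRING(), Encoders.INT());
  List<Tuple2<String, Integer>> data = Arrays.asList(
      new Tuple2<>("a", 1), new Tuple2<>("a", 2), new Tuple2<>("b", 3));
  return spark.createDataset(data, encoder)
      .groupByKey((MapFunction<Tuple2<String, Integer>, String>) t -> t._1(), Encoders.STRING());
}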
@Test
public void testSelect() {
  List<Integer> data = Arrays.asList(2, 6);
  Dataset<Integer> ds = spark.createDataset(data, Encoders.INT());

  Dataset<Tuple2<Integer, String>> selected = ds.select(
      expr("value + 1"),
      col("value").cast("string")).as(Encoders.tuple(Encoders.INT(), Encoders.STRING()));

  Assert.assertEquals(
      Arrays.asList(tuple2(3, "2"), tuple2(7, "6")),
      selected.collectAsList());
}
@Test
public void testBeanWithArrayFieldDeserialization() {
  Encoder<Record> encoder = Encoders.bean(Record.class);

  Dataset<Record> dataset = spark
      .read()
      .format("json")
      .schema("id int, intervals array<struct<startTime: bigint, endTime: bigint>>")
      .load("src/test/resources/test-data/with-array-fields.json")
      .as(encoder);

  List<Record> records = dataset.collectAsList();
  Assert.assertEquals(records, RECORDS);
}
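// The Record bean and the RECORDS fixture asserted against above are defined
// elsewhere. A plausible sketch of the bean shape implied by the schema
// (an int id plus a list of start/end intervals), assuming standard JavaBean
// getters/setters as required by Encoders.bean(); equals()/hashCode() are
// omitted here but the real classes would need them for assertEquals to pass.
import java.util.List;

public static class Interval {
  private long startTime;
  private long endTime;
  public long getStartTime() { return startTime; }
  public void setStartTime(long startTime) { this.startTime = startTime; }
  public long getEndTime() { return endTime; }
  public void setEndTime(long endTime) { this.endTime = endTime; }
}

public static class Record {
  private int id;
  private List<Interval> intervals;
  public int getId() { return id; }
  public void setId(int id) { this.id = id; }
  public List<Interval> getIntervals() { return intervals; }
  public void setIntervals(List<Interval> intervals) { this.intervals = intervals; }
}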
.add("h",createMapType(createArrayType(LongType), createMapType(StringType, StringType))); Dataset<SimpleJavaBean> ds3 = spark.createDataFrame(Arrays.asList(row1, row2), schema) .as(Encoders.bean(SimpleJavaBean.class)); Assert.assertEquals(data, ds3.collectAsList());
.add("h",createMapType(createArrayType(LongType), createMapType(StringType, StringType))); Dataset<SimpleJavaBean> ds3 = spark.createDataFrame(Arrays.asList(row1, row2), schema) .as(Encoders.bean(SimpleJavaBean.class)); Assert.assertEquals(data, ds3.collectAsList());
@Test
public void testBeanWithArrayFieldDeserialization() {
  Encoder<Record> encoder = Encoders.bean(Record.class);

  Dataset<Record> dataset = spark
      .read()
      .format("json")
      .schema(createSchema())
      .load("src/test/resources/test-data/with-array-fields.json")
      .as(encoder);

  List<Record> records = dataset.collectAsList();
  Assert.assertEquals(records, RECORDS);
}
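// A plausible sketch of the createSchema() helper referenced above (an
// assumption), built to mirror the DDL string used in the earlier variant of
// this test: "id int, intervals array<struct<startTime: bigint, endTime: bigint>>".
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;

private static StructType createSchema() {
  StructType intervalType = new StructType()
      .add("startTime", DataTypes.LongType)
      .add("endTime", DataTypes.LongType);
  return new StructType()
      .add("id", DataTypes.IntegerType)
      .add("intervals", DataTypes.createArrayType(intervalType));
}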
.add("h",createMapType(createArrayType(LongType), createMapType(StringType, StringType))); Dataset<SimpleJavaBean> ds3 = spark.createDataFrame(Arrays.asList(row1, row2), schema) .as(Encoders.bean(SimpleJavaBean.class)); Assert.assertEquals(data, ds3.collectAsList());
/**
 * Returns a dataset of distinct URL and version tuples.
 */
protected Dataset<UrlAndVersion> getUrlAndVersions(Dataset<T> valueSets) {
  return valueSets.select("url", "version")
      .distinct()
      .as(URL_AND_VERSION_ENCODER);
}
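// URL_AND_VERSION_ENCODER is declared elsewhere in this class; a minimal sketch
// of how such an encoder is typically defined, assuming UrlAndVersion is a
// JavaBean whose url and version properties match the selected column names
// (the names must line up for the .as(...) conversion to succeed).
private static final Encoder<UrlAndVersion> URL_AND_VERSION_ENCODER =
    Encoders.bean(UrlAndVersion.class);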