@Test
public void isInCollectionCheckExceptionMessage() {
  List<Row> rows = Arrays.asList(
    RowFactory.create(1, Arrays.asList(1)),
    RowFactory.create(2, Arrays.asList(2)),
    RowFactory.create(3, Arrays.asList(3)));
  StructType schema = createStructType(Arrays.asList(
    createStructField("a", IntegerType, false),
    createStructField("b", createArrayType(IntegerType, false), false)));
  Dataset<Row> df = spark.createDataFrame(rows, schema);
  try {
    df.filter(df.col("a").isInCollection(Arrays.asList(new Column("b"))));
    Assert.fail("Expected org.apache.spark.sql.AnalysisException");
  } catch (Exception e) {
    Arrays.asList("cannot resolve",
      "due to data type mismatch: Arguments must be same type but were")
      .forEach(s -> Assert.assertTrue(
        e.getMessage().toLowerCase(Locale.ROOT).contains(s.toLowerCase(Locale.ROOT))));
  }
}
@Test
public void testExecution() {
  Dataset<Row> df = spark.table("testData").filter("key = 1");
  Assert.assertEquals(1, df.select("key").collectAsList().get(0).get(0));
}
@Test
public void isInCollectionWorksCorrectlyOnJava() {
  List<Row> rows = Arrays.asList(
    RowFactory.create(1, "x"),
    RowFactory.create(2, "y"),
    RowFactory.create(3, "z"));
  StructType schema = createStructType(Arrays.asList(
    createStructField("a", IntegerType, false),
    createStructField("b", StringType, false)));
  Dataset<Row> df = spark.createDataFrame(rows, schema);
  // Test with different types of collections
  Assert.assertTrue(Arrays.equals(
    (Row[]) df.filter(df.col("a").isInCollection(Arrays.asList(1, 2))).collect(),
    (Row[]) df.filter((FilterFunction<Row>) r -> r.getInt(0) == 1 || r.getInt(0) == 2).collect()));
  Assert.assertTrue(Arrays.equals(
    (Row[]) df.filter(df.col("a").isInCollection(new HashSet<>(Arrays.asList(1, 2)))).collect(),
    (Row[]) df.filter((FilterFunction<Row>) r -> r.getInt(0) == 1 || r.getInt(0) == 2).collect()));
  Assert.assertTrue(Arrays.equals(
    (Row[]) df.filter(df.col("a").isInCollection(new ArrayList<>(Arrays.asList(3, 1)))).collect(),
    (Row[]) df.filter((FilterFunction<Row>) r -> r.getInt(0) == 3 || r.getInt(0) == 1).collect()));
}
@Test
public void testCommonOperation() {
  List<String> data = Arrays.asList("hello", "world");
  Dataset<String> ds = spark.createDataset(data, Encoders.STRING());
  Assert.assertEquals("hello", ds.first());

  Dataset<String> filtered = ds.filter((FilterFunction<String>) v -> v.startsWith("h"));
  Assert.assertEquals(Arrays.asList("hello"), filtered.collectAsList());

  Dataset<Integer> mapped = ds.map((MapFunction<String, Integer>) String::length, Encoders.INT());
  Assert.assertEquals(Arrays.asList(5, 5), mapped.collectAsList());

  Dataset<String> parMapped = ds.mapPartitions((MapPartitionsFunction<String, String>) it -> {
    List<String> ls = new LinkedList<>();
    while (it.hasNext()) {
      ls.add(it.next().toUpperCase(Locale.ROOT));
    }
    return ls.iterator();
  }, Encoders.STRING());
  Assert.assertEquals(Arrays.asList("HELLO", "WORLD"), parMapped.collectAsList());

  Dataset<String> flatMapped = ds.flatMap((FlatMapFunction<String, String>) s -> {
    List<String> ls = new LinkedList<>();
    for (char c : s.toCharArray()) {
      ls.add(String.valueOf(c));
    }
    return ls.iterator();
  }, Encoders.STRING());
  Assert.assertEquals(
    Arrays.asList("h", "e", "l", "l", "o", "w", "o", "r", "l", "d"),
    flatMapped.collectAsList());
}
@Test
public void testCollectAndTake() {
  Dataset<Row> df = spark.table("testData").filter("key = 1 or key = 2 or key = 3");
  Assert.assertEquals(3, df.select("key").collectAsList().size());
  Assert.assertEquals(2, df.select("key").takeAsList(2).size());
}
@Test
public void testTypedFilterPreservingSchema() {
  Dataset<Long> ds = spark.range(10);
  Dataset<Long> ds2 = ds.filter((FilterFunction<Long>) value -> value > 3);
  Assert.assertEquals(ds.schema(), ds2.schema());
}
@Override
public Dataset<Row> derive(Map<String, Dataset<Row>> dependencies) throws Exception {
  LOGGER.debug("Derive: Validating dependencies map " + dependencies);
  validate(dependencies);

  String step = getStepName(dependencies);
  String field = getFieldName(dependencies);
  Object[] inList = getInList(dependencies);

  LOGGER.debug("Derive: Filtering dataset " + step + " by field " + field +
      " being IN " + Arrays.asList(inList));

  // Keep only rows of the named dependency whose field value appears in the configured list.
  return dependencies.get(step).filter(dependencies.get(step).col(field).isin(inList));
}
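The step above ultimately delegates to Spark's varargs Column.isin API. Below is a minimal, self-contained sketch of that call; the DataFrame, column name, and chosen values are assumptions for illustration, not taken from the surrounding code.

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class IsinSketch {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .master("local[*]")
        .appName("isin-sketch")
        .getOrCreate();
    // Hypothetical input: a ten-row DataFrame with a single "id" column (0..9).
    Dataset<Row> df = spark.range(10).toDF("id");
    // Keep only rows whose "id" is in the given list, mirroring the isin() call in derive() above.
    Dataset<Row> kept = df.filter(df.col("id").isin(1, 3, 5));
    kept.show();
    spark.stop();
  }
}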
@Override
public Dataset<T> filter(final Column condition) {
  final boolean userTriggered = initializeFunction(condition);
  final Dataset<T> result = from(super.filter(condition));
  this.setIsUserTriggered(userTriggered);
  return result;
}
@Override
public Dataset<T> filter(final String conditionExpr) {
  final boolean userTriggered = initializeFunction(conditionExpr);
  final Dataset<T> result = from(super.filter(conditionExpr));
  this.setIsUserTriggered(userTriggered);
  return result;
}
private static List<Row> read(String table, String expr, String select0, String... selectN) {
  Dataset<Row> dataset = spark.read().format("iceberg").load(table)
      .filter(expr)
      .select(select0, selectN);
  return dataset.collectAsList();
}