@Test
public void isInCollectionWorksCorrectlyOnJava() {
    List<Row> rows = Arrays.asList(
        RowFactory.create(1, "x"),
        RowFactory.create(2, "y"),
        RowFactory.create(3, "z"));
    StructType schema = createStructType(Arrays.asList(
        createStructField("a", IntegerType, false),
        createStructField("b", StringType, false)));
    Dataset<Row> df = spark.createDataFrame(rows, schema);

    // Test with different types of collections
    Assert.assertTrue(Arrays.equals(
        (Row[]) df.filter(df.col("a").isInCollection(Arrays.asList(1, 2))).collect(),
        (Row[]) df.filter((FilterFunction<Row>) r -> r.getInt(0) == 1 || r.getInt(0) == 2).collect()
    ));
    Assert.assertTrue(Arrays.equals(
        (Row[]) df.filter(df.col("a").isInCollection(new HashSet<>(Arrays.asList(1, 2)))).collect(),
        (Row[]) df.filter((FilterFunction<Row>) r -> r.getInt(0) == 1 || r.getInt(0) == 2).collect()
    ));
    Assert.assertTrue(Arrays.equals(
        (Row[]) df.filter(df.col("a").isInCollection(new ArrayList<>(Arrays.asList(3, 1)))).collect(),
        (Row[]) df.filter((FilterFunction<Row>) r -> r.getInt(0) == 3 || r.getInt(0) == 1).collect()
    ));
}
@Test
public void isInCollectionCheckExceptionMessage() {
    List<Row> rows = Arrays.asList(
        RowFactory.create(1, Arrays.asList(1)),
        RowFactory.create(2, Arrays.asList(2)),
        RowFactory.create(3, Arrays.asList(3)));
    StructType schema = createStructType(Arrays.asList(
        createStructField("a", IntegerType, false),
        createStructField("b", createArrayType(IntegerType, false), false)));
    Dataset<Row> df = spark.createDataFrame(rows, schema);
    try {
        df.filter(df.col("a").isInCollection(Arrays.asList(new Column("b"))));
        Assert.fail("Expected org.apache.spark.sql.AnalysisException");
    } catch (Exception e) {
        Arrays.asList("cannot resolve", "due to data type mismatch: Arguments must be same type but were")
            .forEach(s -> Assert.assertTrue(
                e.getMessage().toLowerCase(Locale.ROOT).contains(s.toLowerCase(Locale.ROOT))));
    }
}
/**
 * Normalize by zero mean and unit variance.
 *
 * @param frame       the data to normalize
 * @param skipColumns the columns to leave untouched
 * @return the zero mean, unit variance centered data
 */
public static DataRowsFacade zeromeanUnitVariance(DataRowsFacade frame, List<String> skipColumns) {
    List<String> columnsList = DataFrames.toList(frame.get().columns());
    columnsList.removeAll(skipColumns);
    String[] columnNames = DataFrames.toArray(columnsList);
    // first row is the std dev, second row is the mean; each column in a row is for a particular column
    List<Row> stdDevMean = stdDevMeanColumns(frame, columnNames);
    for (int i = 0; i < columnNames.length; i++) {
        String columnName = columnNames[i];
        double std = ((Number) stdDevMean.get(0).get(i)).doubleValue();
        double mean = ((Number) stdDevMean.get(1).get(i)).doubleValue();
        if (std == 0.0)
            std = 1; // All same value -> (x - x) / 1 = 0
        frame = dataRows(frame.get().withColumn(columnName,
                frame.get().col(columnName).minus(mean).divide(std)));
    }
    return frame;
}
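A minimal usage sketch for the normalizer above, reusing the dataRows(...) factory seen in the method body; the SparkSession, CSV path and empty skip-column list are illustrative assumptions, not part of the original code:

// Hypothetical usage of zeromeanUnitVariance; path and skip list are placeholders.
Dataset<Row> raw = spark.read().format("csv")
        .option("inferSchema", "true")
        .option("header", "false")
        .load("data/features.csv");
DataRowsFacade facade = dataRows(raw);                      // wrap the Dataset in the facade
DataRowsFacade normalized = zeromeanUnitVariance(facade, Collections.<String>emptyList());
normalized.get().show();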
private void start() { SparkSession spark = SparkSession.builder() .appName("CSV to Dataset") .master("local") .getOrCreate(); String filename = "data/tuple-data-file.csv"; Dataset<Row> df = spark.read().format("csv") .option("inferSchema", "true") .option("header", "false") .load(filename); df.show(); // To ensure compatibility between Spark 2.0.0 and Spark 1.6.x int count = df.columns().length; for (int i = 0; i < count; i++) { String oldColName = "_c" + i; String newColName = "C" + i; df = df.withColumn(newColName, df.col(oldColName)).drop(oldColName); } df.show(); } }
@Override
public Dataset<Row> derive(Map<String, Dataset<Row>> dependencies) throws Exception {
    LOGGER.debug("Derive: Validating dependencies map " + dependencies.toString());
    validate(dependencies);

    String step = getStepName(dependencies);
    String field = getFieldName(dependencies);
    Object[] inList = getInList(dependencies);

    LOGGER.debug("Derive: Filtering dataset " + step + " by field " + field
            + " being IN " + Arrays.asList(inList).toString());

    return dependencies.get(step).filter(dependencies.get(step).col(field).isin(inList));
}
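For reference, Column.isin accepts a varargs Object list, so the same filter can be written directly against a DataFrame; the DataFrame df and column "a" below are illustrative:

// Keep only the rows whose column "a" matches one of the supplied values.
Object[] wanted = new Object[] { 1, 2 };
Dataset<Row> filtered = df.filter(df.col("a").isin(wanted));
filtered.show();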
/**
 * Min for a column
 *
 * @param dataFrame  the dataframe to get the column from
 * @param columnName the name of the column to get the min for
 * @return the column that represents the min
 */
public static Column min(DataRowsFacade dataFrame, String columnName) {
    return dataFrame.get().groupBy(columnName).agg(functions.min(columnName)).col(columnName);
}
/**
 * Mean for a column
 *
 * @param dataFrame  the dataframe to get the column from
 * @param columnName the name of the column to get the mean for
 * @return the column that represents the mean
 */
public static Column mean(DataRowsFacade dataFrame, String columnName) {
    return dataFrame.get().groupBy(columnName).agg(avg(columnName)).col(columnName);
}
/**
 * Max for a column
 *
 * @param dataFrame  the dataframe to get the column from
 * @param columnName the name of the column to get the max for
 * @return the column that represents the max
 */
public static Column max(DataRowsFacade dataFrame, String columnName) {
    return dataFrame.get().groupBy(columnName).agg(functions.max(columnName)).col(columnName);
}
/**
 * Variance for a column
 *
 * @param dataFrame  the dataframe to get the column from
 * @param columnName the name of the column to get the variance for
 * @return the column that represents the variance
 */
public static Column var(DataRowsFacade dataFrame, String columnName) {
    return dataFrame.get().groupBy(columnName).agg(functions.variance(columnName)).col(columnName);
}
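When several of these statistics are needed at once, a single ungrouped agg computes them in one pass over the data; a sketch using Spark's built-in functions, with an illustrative DataFrame df and column name:

// One job instead of four: min, max, mean and variance of column "value".
Row stats = df.agg(
        functions.min("value"),
        functions.max("value"),
        functions.avg("value"),
        functions.variance("value")).first();
System.out.println("min=" + stats.get(0) + ", max=" + stats.get(1)
        + ", mean=" + stats.get(2) + ", var=" + stats.get(3));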
private void start() { SparkSession spark = SparkSession.builder() .appName("Dataset from MySQL JDBC Connection") .master("local") .getOrCreate(); java.util.Properties props = new Properties(); props.put("user", "root"); props.put("password", "password"); props.put("useSSL", "false"); Dataset<Row> df = spark.read().jdbc( "jdbc:mysql://localhost:3306/sakila?serverTimezone=EST", "actor", props); df = df.orderBy(df.col("last_name")); df.show(); } }
private void start() { SparkSession spark = SparkSession.builder().appName("JSON to Dataset") .master("local").getOrCreate(); String filename = "data/north-carolina-school-performance-data.json"; long start = System.currentTimeMillis(); Dataset<Row> df = spark.read().json(filename); long stop = System.currentTimeMillis(); System.out.println("Processing took " + (stop - start) + " ms"); df.show(); df.printSchema(); // Flatenization df = df.withColumn("district", df.col("fields.district")); df = df.drop(df.col("fields.district")); // this does not work as the column // stays here (Spark 2.0.0) df.show(); df.printSchema(); } }
private void start() { SparkSession spark = SparkSession.builder().appName("JSON array to Dataset") .master("local").getOrCreate(); String filename = "data/array.json"; long start = System.currentTimeMillis(); Dataset<Row> df = spark.read().json(filename); long stop = System.currentTimeMillis(); System.out.println("Processing took " + (stop - start) + " ms"); df.show(); df.printSchema(); // Turns the "one liner" into a real column df = df.select(explode(df.col("valsInArrays"))).toDF("vals"); df.show(); df.printSchema(); } }
private void start() { SparkSession spark = SparkSession.builder().appName("CSV to Dataset") .master("local").getOrCreate(); spark.udf().register("x2Multiplier", new Multiplier2(), DataTypes.IntegerType); String filename = "data/tuple-data-file.csv"; Dataset<Row> df = spark.read().format("csv").option("inferSchema", "true") .option("header", "false").load(filename); df = df.withColumn("label", df.col("_c0")).drop("_c0"); df = df.withColumn("value", df.col("_c1")).drop("_c1"); df = df.withColumn("x2", callUDF("x2Multiplier", df.col("value").cast( DataTypes.IntegerType))); df.show(); } }
private void start() { SparkSession spark = SparkSession.builder().appName("CSV to Dataset") .master("local").getOrCreate(); // registers a new internal UDF spark.udf().register("x2Multiplier", new UDF1<Integer, Integer>() { private static final long serialVersionUID = -5372447039252716846L; @Override public Integer call(Integer x) { return x * 2; } }, DataTypes.IntegerType); String filename = "data/tuple-data-file.csv"; Dataset<Row> df = spark.read().format("csv").option("inferSchema", "true") .option("header", "false").load(filename); df = df.withColumn("label", df.col("_c0")).drop("_c0"); df = df.withColumn("value", df.col("_c1")).drop("_c1"); df = df.withColumn("x2", callUDF("x2Multiplier", df.col("value").cast( DataTypes.IntegerType))); df.show(); } }