@Test
public void isInCollectionWorksCorrectlyOnJava() {
    List<Row> rows = Arrays.asList(
        RowFactory.create(1, "x"),
        RowFactory.create(2, "y"),
        RowFactory.create(3, "z"));
    StructType schema = createStructType(Arrays.asList(
        createStructField("a", IntegerType, false),
        createStructField("b", StringType, false)));
    Dataset<Row> df = spark.createDataFrame(rows, schema);

    // Test with different types of collections
    Assert.assertTrue(Arrays.equals(
        (Row[]) df.filter(df.col("a").isInCollection(Arrays.asList(1, 2))).collect(),
        (Row[]) df.filter((FilterFunction<Row>) r -> r.getInt(0) == 1 || r.getInt(0) == 2).collect()
    ));
    Assert.assertTrue(Arrays.equals(
        (Row[]) df.filter(df.col("a").isInCollection(new HashSet<>(Arrays.asList(1, 2)))).collect(),
        (Row[]) df.filter((FilterFunction<Row>) r -> r.getInt(0) == 1 || r.getInt(0) == 2).collect()
    ));
    Assert.assertTrue(Arrays.equals(
        (Row[]) df.filter(df.col("a").isInCollection(new ArrayList<>(Arrays.asList(3, 1)))).collect(),
        (Row[]) df.filter((FilterFunction<Row>) r -> r.getInt(0) == 3 || r.getInt(0) == 1).collect()
    ));
}
@Test
public void isInCollectionCheckExceptionMessage() {
    List<Row> rows = Arrays.asList(
        RowFactory.create(1, Arrays.asList(1)),
        RowFactory.create(2, Arrays.asList(2)),
        RowFactory.create(3, Arrays.asList(3)));
    StructType schema = createStructType(Arrays.asList(
        createStructField("a", IntegerType, false),
        createStructField("b", createArrayType(IntegerType, false), false)));
    Dataset<Row> df = spark.createDataFrame(rows, schema);
    try {
        df.filter(df.col("a").isInCollection(Arrays.asList(new Column("b"))));
        Assert.fail("Expected org.apache.spark.sql.AnalysisException");
    } catch (Exception e) {
        Arrays.asList("cannot resolve", "due to data type mismatch: Arguments must be same type but were")
            .forEach(s -> Assert.assertTrue(
                e.getMessage().toLowerCase(Locale.ROOT).contains(s.toLowerCase(Locale.ROOT))));
    }
}
/**
 * Normalize by zero mean and unit variance.
 *
 * @param frame       the data to normalize
 * @param skipColumns the columns to leave untouched
 * @return the zero mean, unit variance centered data
 */
public static DataRowsFacade zeromeanUnitVariance(DataRowsFacade frame, List<String> skipColumns) {
    List<String> columnsList = DataFrames.toList(frame.get().columns());
    columnsList.removeAll(skipColumns);
    String[] columnNames = DataFrames.toArray(columnsList);
    // first row is the std dev, second row is the mean; each column in a row is for a particular column
    List<Row> stdDevMean = stdDevMeanColumns(frame, columnNames);
    for (int i = 0; i < columnNames.length; i++) {
        String columnName = columnNames[i];
        double std = ((Number) stdDevMean.get(0).get(i)).doubleValue();
        double mean = ((Number) stdDevMean.get(1).get(i)).doubleValue();
        if (std == 0.0)
            std = 1; // All same value -> (x - x) / 1 = 0
        frame = dataRows(frame.get().withColumn(columnName,
                frame.get().col(columnName).minus(mean).divide(std)));
    }
    return frame;
}
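A minimal usage sketch for the normalizer above, reusing the dataRows(...) factory seen in the method body; the SparkSession, CSV path and empty skip-column list are illustrative assumptions, not part of the original code:

// Hypothetical usage of zeromeanUnitVariance; path and skip list are placeholders.
Dataset<Row> raw = spark.read().format("csv")
        .option("inferSchema", "true")
        .option("header", "false")
        .load("data/features.csv");
DataRowsFacade facade = dataRows(raw);                      // wrap the Dataset in the facade
DataRowsFacade normalized = zeromeanUnitVariance(facade, Collections.<String>emptyList());
normalized.get().show();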
private void start() { SparkSession spark = SparkSession.builder() .appName("CSV to Dataset") .master("local") .getOrCreate(); String filename = "data/tuple-data-file.csv"; Dataset<Row> df = spark.read().format("csv") .option("inferSchema", "true") .option("header", "false") .load(filename); df.show(); // To ensure compatibility between Spark 2.0.0 and Spark 1.6.x int count = df.columns().length; for (int i = 0; i < count; i++) { String oldColName = "_c" + i; String newColName = "C" + i; df = df.withColumn(newColName, df.col(oldColName)).drop(oldColName); } df.show(); } }
@Override
public Dataset<Row> derive(Map<String, Dataset<Row>> dependencies) throws Exception {
    LOGGER.debug("Derive: Validating dependencies map " + dependencies.toString());
    validate(dependencies);

    String step = getStepName(dependencies);
    String field = getFieldName(dependencies);
    Object[] inList = getInList(dependencies);

    LOGGER.debug("Derive: Filtering dataset " + step + " by field " + field
            + " being IN " + Arrays.asList(inList).toString());

    return dependencies.get(step).filter(dependencies.get(step).col(field).isin(inList));
}
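For reference, Column.isin accepts a varargs Object list, so the same filter can be written directly against a DataFrame; the DataFrame df and column "a" below are illustrative:

// Keep only the rows whose column "a" matches one of the supplied values.
Object[] wanted = new Object[] { 1, 2 };
Dataset<Row> filtered = df.filter(df.col("a").isin(wanted));
filtered.show();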
/**
 * Min for a column
 *
 * @param dataFrame  the dataframe to get the column from
 * @param columnName the name of the column to get the min for
 * @return the column that represents the min
 */
public static Column min(DataRowsFacade dataFrame, String columnName) {
    return dataFrame.get().groupBy(columnName).agg(functions.min(columnName)).col(columnName);
}
/**
 * Mean for a column
 *
 * @param dataFrame  the dataframe to get the column from
 * @param columnName the name of the column to get the mean for
 * @return the column that represents the mean
 */
public static Column mean(DataRowsFacade dataFrame, String columnName) {
    return dataFrame.get().groupBy(columnName).agg(avg(columnName)).col(columnName);
}
/**
 * Max for a column
 *
 * @param dataFrame  the dataframe to get the column from
 * @param columnName the name of the column to get the max for
 * @return the column that represents the max
 */
public static Column max(DataRowsFacade dataFrame, String columnName) {
    return dataFrame.get().groupBy(columnName).agg(functions.max(columnName)).col(columnName);
}
/**
 * Variance for a column
 *
 * @param dataFrame  the dataframe to get the column from
 * @param columnName the name of the column to get the variance for
 * @return the column that represents the variance
 */
public static Column var(DataRowsFacade dataFrame, String columnName) {
    return dataFrame.get().groupBy(columnName).agg(functions.variance(columnName)).col(columnName);
}
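When several of these statistics are needed at once, a single ungrouped agg computes them in one pass over the data; a sketch using Spark's built-in functions, with an illustrative DataFrame df and column name:

// One job instead of four: min, max, mean and variance of column "value".
Row stats = df.agg(
        functions.min("value"),
        functions.max("value"),
        functions.avg("value"),
        functions.variance("value")).first();
System.out.println("min=" + stats.get(0) + ", max=" + stats.get(1)
        + ", mean=" + stats.get(2) + ", var=" + stats.get(3));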
private void start() { SparkSession spark = SparkSession.builder() .appName("Dataset from MySQL JDBC Connection") .master("local") .getOrCreate(); java.util.Properties props = new Properties(); props.put("user", "root"); props.put("password", "password"); props.put("useSSL", "false"); Dataset<Row> df = spark.read().jdbc( "jdbc:mysql://localhost:3306/sakila?serverTimezone=EST", "actor", props); df = df.orderBy(df.col("last_name")); df.show(); } }
private void start() { SparkSession spark = SparkSession.builder().appName("JSON to Dataset") .master("local").getOrCreate(); String filename = "data/north-carolina-school-performance-data.json"; long start = System.currentTimeMillis(); Dataset<Row> df = spark.read().json(filename); long stop = System.currentTimeMillis(); System.out.println("Processing took " + (stop - start) + " ms"); df.show(); df.printSchema(); // Flatenization df = df.withColumn("district", df.col("fields.district")); df = df.drop(df.col("fields.district")); // this does not work as the column // stays here (Spark 2.0.0) df.show(); df.printSchema(); } }
private void start() { SparkSession spark = SparkSession.builder().appName("JSON array to Dataset") .master("local").getOrCreate(); String filename = "data/array.json"; long start = System.currentTimeMillis(); Dataset<Row> df = spark.read().json(filename); long stop = System.currentTimeMillis(); System.out.println("Processing took " + (stop - start) + " ms"); df.show(); df.printSchema(); // Turns the "one liner" into a real column df = df.select(explode(df.col("valsInArrays"))).toDF("vals"); df.show(); df.printSchema(); } }
private void start() { SparkSession spark = SparkSession.builder().appName("CSV to Dataset") .master("local").getOrCreate(); spark.udf().register("x2Multiplier", new Multiplier2(), DataTypes.IntegerType); String filename = "data/tuple-data-file.csv"; Dataset<Row> df = spark.read().format("csv").option("inferSchema", "true") .option("header", "false").load(filename); df = df.withColumn("label", df.col("_c0")).drop("_c0"); df = df.withColumn("value", df.col("_c1")).drop("_c1"); df = df.withColumn("x2", callUDF("x2Multiplier", df.col("value").cast( DataTypes.IntegerType))); df.show(); } }
private void start() { SparkSession spark = SparkSession.builder().appName("CSV to Dataset") .master("local").getOrCreate(); // registers a new internal UDF spark.udf().register("x2Multiplier", new UDF1<Integer, Integer>() { private static final long serialVersionUID = -5372447039252716846L; @Override public Integer call(Integer x) { return x * 2; } }, DataTypes.IntegerType); String filename = "data/tuple-data-file.csv"; Dataset<Row> df = spark.read().format("csv").option("inferSchema", "true") .option("header", "false").load(filename); df = df.withColumn("label", df.col("_c0")).drop("_c0"); df = df.withColumn("value", df.col("_c1")).drop("_c1"); df = df.withColumn("x2", callUDF("x2Multiplier", df.col("value").cast( DataTypes.IntegerType))); df.show(); } }