@Test
public void pivot() {
  // Pivot courseSales by course into per-year earnings columns
  // ("dotNET", "Java"), then verify the summed earnings for both years.
  Dataset<Row> courseSales = spark.table("courseSales");
  List<Row> rows = courseSales
      .groupBy("year")
      .pivot("course", Arrays.asList("dotNET", "Java"))
      .agg(sum("earnings"))
      .orderBy("year")
      .collectAsList();

  Row year2012 = rows.get(0);
  Assert.assertEquals(2012, year2012.getInt(0));
  Assert.assertEquals(15000.0, year2012.getDouble(1), 0.01);
  Assert.assertEquals(20000.0, year2012.getDouble(2), 0.01);

  Row year2013 = rows.get(1);
  Assert.assertEquals(2013, year2013.getInt(0));
  Assert.assertEquals(48000.0, year2013.getDouble(1), 0.01);
  Assert.assertEquals(30000.0, year2013.getDouble(2), 0.01);
}
@Test
public void testSampleBy() {
  // Stratified sampling: ids 0..99 yield keys 0..2 via mod 3; sample key 0
  // at 10% and key 1 at 20% with seed 0, then sanity-check the group counts.
  Dataset<Row> keyed = spark.range(0, 100, 1, 2).select(col("id").mod(3).as("key"));
  Dataset<Row> sample = keyed.stat().sampleBy("key", ImmutableMap.of(0, 0.1, 1, 0.2), 0L);
  List<Row> counts = sample.groupBy("key").count().orderBy("key").collectAsList();

  Row keyZero = counts.get(0);
  Assert.assertEquals(0, keyZero.getLong(0));
  long zeroCount = keyZero.getLong(1);
  Assert.assertTrue(0 <= zeroCount && zeroCount <= 8);

  Row keyOne = counts.get(1);
  Assert.assertEquals(1, keyOne.getLong(0));
  long oneCount = keyOne.getLong(1);
  Assert.assertTrue(2 <= oneCount && oneCount <= 13);
}
@Test
public void testSampleBy() {
  // Build 100 rows (two partitions) whose key cycles 0,1,2; draw a stratified
  // sample (key 0 -> 10%, key 1 -> 20%) at a fixed seed and check that the
  // resulting per-key counts fall in their expected ranges.
  Dataset<Row> source = spark.range(0, 100, 1, 2).select(col("id").mod(3).as("key"));
  Dataset<Row> stratified = source.stat().sampleBy("key", ImmutableMap.of(0, 0.1, 1, 0.2), 0L);
  List<Row> grouped = stratified.groupBy("key").count().orderBy("key").collectAsList();

  Row first = grouped.get(0);
  Row second = grouped.get(1);

  Assert.assertEquals(0, first.getLong(0));
  Assert.assertTrue(0 <= first.getLong(1) && first.getLong(1) <= 8);

  Assert.assertEquals(1, second.getLong(0));
  Assert.assertTrue(2 <= second.getLong(1) && second.getLong(1) <= 13);
}
@Test
public void pivot() {
  // Verify pivoting courseSales on course produces one earnings column per
  // listed value ("dotNET", "Java"), summed per year.
  List<Row> perYear = spark.table("courseSales")
      .groupBy("year")
      .pivot("course", Arrays.asList("dotNET", "Java"))
      .agg(sum("earnings"))
      .orderBy("year")
      .collectAsList();

  Row first = perYear.get(0);
  Row second = perYear.get(1);

  Assert.assertEquals(2012, first.getInt(0));
  Assert.assertEquals(15000.0, first.getDouble(1), 0.01);
  Assert.assertEquals(20000.0, first.getDouble(2), 0.01);

  Assert.assertEquals(2013, second.getInt(0));
  Assert.assertEquals(48000.0, second.getDouble(1), 0.01);
  Assert.assertEquals(30000.0, second.getDouble(2), 0.01);
}
@Test
public void pivot() {
  // Pivot test: earnings per course become columns keyed by year; check the
  // totals row-by-row for 2012 and 2013.
  Dataset<Row> df = spark.table("courseSales");
  Dataset<Row> pivoted = df.groupBy("year")
      .pivot("course", Arrays.asList("dotNET", "Java"))
      .agg(sum("earnings"))
      .orderBy("year");
  List<Row> actual = pivoted.collectAsList();

  Assert.assertEquals(2012, actual.get(0).getInt(0));
  Assert.assertEquals(15000.0, actual.get(0).getDouble(1), 0.01);
  Assert.assertEquals(20000.0, actual.get(0).getDouble(2), 0.01);

  Assert.assertEquals(2013, actual.get(1).getInt(0));
  Assert.assertEquals(48000.0, actual.get(1).getDouble(1), 0.01);
  Assert.assertEquals(30000.0, actual.get(1).getDouble(2), 0.01);
}
@Test
public void testSampleBy() {
  // sampleBy with fixed seed: keys derived from id mod 3; fractions 10% / 20%
  // for keys 0 and 1. Count bounds are loose to tolerate sampling variance.
  Dataset<Row> df = spark.range(0, 100, 1, 2).select(col("id").mod(3).as("key"));
  Dataset<Row> picked = df.stat().sampleBy("key", ImmutableMap.of(0, 0.1, 1, 0.2), 0L);
  List<Row> tallies = picked.groupBy("key").count().orderBy("key").collectAsList();

  long firstKey = tallies.get(0).getLong(0);
  long firstCount = tallies.get(0).getLong(1);
  Assert.assertEquals(0, firstKey);
  Assert.assertTrue(0 <= firstCount && firstCount <= 8);

  long secondKey = tallies.get(1).getLong(0);
  long secondCount = tallies.get(1).getLong(1);
  Assert.assertEquals(1, secondKey);
  Assert.assertTrue(2 <= secondCount && secondCount <= 13);
}
// Instrumented orderBy(varargs) override: snapshot the user-triggered state
// via initializeFunction (presumably distinguishes user calls from internal
// ones — TODO confirm against its definition), delegate to the parent
// implementation, wrap the result with from(), then restore the flag.
// NOTE(review): the statement order (snapshot -> super -> restore) appears
// deliberate; do not reorder.
@Override public Dataset<T> orderBy(final String sortCol, final String... sortCols) { final boolean userTriggered = initializeFunction(sortCol, sortCols); final Dataset<T> result = from(super.orderBy(sortCol, sortCols)); this.setIsUserTriggered(userTriggered); return result; }
// Instrumented orderBy(Column...) override: same snapshot/delegate/restore
// pattern as the other orderBy overloads — capture the user-triggered state,
// call super, wrap via from(), restore the flag, return the wrapped Dataset.
// NOTE(review): semantics of initializeFunction/setIsUserTriggered are not
// visible here; statement order looks intentional — do not reorder.
@Override public Dataset<T> orderBy(final Column... sortExprs) { final boolean userTriggered = initializeFunction(sortExprs); final Dataset<T> result = from(super.orderBy(sortExprs)); this.setIsUserTriggered(userTriggered); return result; }
// Instrumented orderBy(Seq[Column]) override (Scala-facing overload): capture
// the user-triggered state, delegate to super, wrap the result with from(),
// then restore the flag before returning.
// NOTE(review): mirrors the varargs overloads; keep statement order intact.
@Override public Dataset<T> orderBy(final scala.collection.Seq<Column> sortExprs) { final boolean userTriggered = initializeFunction(sortExprs); final Dataset<T> result = from(super.orderBy(sortExprs)); this.setIsUserTriggered(userTriggered); return result; }
// Instrumented orderBy(String, Seq[String]) override (Scala-facing overload):
// capture the user-triggered state, delegate to super, wrap via from(), then
// restore the flag before returning the wrapped Dataset.
// NOTE(review): same snapshot/delegate/restore pattern as the sibling
// overloads; statement order appears intentional — do not reorder.
@Override public Dataset<T> orderBy(final String sortCol, final scala.collection.Seq<String> sortCols) { final boolean userTriggered = initializeFunction(sortCol, sortCols); final Dataset<T> result = from(super.orderBy(sortCol, sortCols)); this.setIsUserTriggered(userTriggered); return result; }
/**
 * Demo entry point: reads the {@code actor} table from a local MySQL
 * (sakila) database over JDBC into a Dataset, sorts it by last name, and
 * prints it to stdout.
 *
 * <p>NOTE(review): credentials are hard-coded in source and SSL is disabled —
 * acceptable only for a local demo; move to configuration/environment before
 * any other use.
 */
private void start() {
  SparkSession spark = SparkSession.builder()
      .appName("Dataset from MySQL JDBC Connection")
      .master("local")
      .getOrCreate();

  // Unqualified Properties matches the `new Properties()` already used here;
  // the redundant java.util.Properties qualification was inconsistent.
  Properties props = new Properties();
  props.put("user", "root");
  props.put("password", "password");
  props.put("useSSL", "false");

  // serverTimezone avoids MySQL driver timezone-detection failures.
  Dataset<Row> df = spark.read().jdbc(
      "jdbc:mysql://localhost:3306/sakila?serverTimezone=EST", "actor", props);
  df = df.orderBy(df.col("last_name"));
  df.show();
}
}
// Load the JSON file, project the nested fields.district column, then count
// and print rows per district (up to 150 rows, untruncated).
// FIX: the dateFormat pattern used "yyyy-mm-dd", where `mm` means
// minute-of-hour in Java date patterns; month is `MM`, so dates would be
// parsed with a minutes field in the month position. Corrected to yyyy-MM-dd.
df = spark.read().option("dateFormat", "yyyy-MM-dd").json(fileToAnalyze);
df = df.withColumn("district", df.col("fields.district"));
df = df.groupBy("district").count().orderBy(df.col("district"));
df.show(150, false);
// (continuation of a DataFrameReader chain begun on an earlier, unseen line)
.load(location.toString());
// Read back the rows ordered by id, decoded as SimpleRecord beans, for
// comparison against the expected records.
List<SimpleRecord> actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList();
// (continuation of a DataFrameReader chain begun on an earlier, unseen line)
.load(location.toString());
// Collect the rows ordered by id as SimpleRecord beans for verification.
List<SimpleRecord> actual = result.orderBy("id").as( Encoders.bean(SimpleRecord.class)).collectAsList();
@Test
public void testStringIndexer() {
  // Fit a StringIndexer on six labeled rows and check the produced indices
  // against the expected mapping (per the expected rows: "a"->0.0, "c"->1.0,
  // "b"->2.0).
  StructType schema = createStructType(new StructField[]{
      createStructField("id", IntegerType, false),
      createStructField("label", StringType, false)
  });
  List<Row> input = Arrays.asList(
      cr(0, "a"), cr(1, "b"), cr(2, "c"), cr(3, "a"), cr(4, "a"), cr(5, "c"));
  Dataset<Row> dataset = spark.createDataFrame(input, schema);

  StringIndexer indexer = new StringIndexer()
      .setInputCol("label")
      .setOutputCol("labelIndex");
  Dataset<Row> indexed = indexer.fit(dataset).transform(dataset);

  List<Row> expected = Arrays.asList(
      cr(0, 0.0), cr(1, 2.0), cr(2, 1.0), cr(3, 0.0), cr(4, 0.0), cr(5, 1.0));
  Assert.assertEquals(
      expected,
      indexed.orderBy("id").select("id", "labelIndex").collectAsList());
}
@Test
public void testStringIndexer() {
  // StringIndexer round-trip: build a small (id, label) frame, fit an
  // indexer mapping labels to doubles, and compare the transformed output
  // row-for-row with the expected indices.
  StructField[] fields = new StructField[]{
      createStructField("id", IntegerType, false),
      createStructField("label", StringType, false)
  };
  StructType schema = createStructType(fields);
  Dataset<Row> dataset = spark.createDataFrame(
      Arrays.asList(cr(0, "a"), cr(1, "b"), cr(2, "c"), cr(3, "a"), cr(4, "a"), cr(5, "c")),
      schema);

  StringIndexer indexer = new StringIndexer().setInputCol("label").setOutputCol("labelIndex");
  Dataset<Row> transformed = indexer.fit(dataset).transform(dataset);

  Assert.assertEquals(
      Arrays.asList(cr(0, 0.0), cr(1, 2.0), cr(2, 1.0), cr(3, 0.0), cr(4, 0.0), cr(5, 1.0)),
      transformed.orderBy("id").select("id", "labelIndex").collectAsList());
}
@Test
public void testStringIndexer() {
  // Verify StringIndexer assigns the expected double index per label when
  // fitted and applied to the same six-row dataset.
  StructType rowSchema = createStructType(new StructField[]{
      createStructField("id", IntegerType, false),
      createStructField("label", StringType, false)
  });
  List<Row> labeled = Arrays.asList(
      cr(0, "a"), cr(1, "b"), cr(2, "c"), cr(3, "a"), cr(4, "a"), cr(5, "c"));
  Dataset<Row> frame = spark.createDataFrame(labeled, rowSchema);

  Dataset<Row> result = new StringIndexer()
      .setInputCol("label")
      .setOutputCol("labelIndex")
      .fit(frame)
      .transform(frame);

  List<Row> want = Arrays.asList(
      cr(0, 0.0), cr(1, 2.0), cr(2, 1.0), cr(3, 0.0), cr(4, 0.0), cr(5, 1.0));
  Assert.assertEquals(want, result.orderBy("id").select("id", "labelIndex").collectAsList());
}