@Test
public void testCommonOperation() {
  List<String> data = Arrays.asList("hello", "world");
  Dataset<String> ds = spark.createDataset(data, Encoders.STRING());
  Assert.assertEquals("hello", ds.first());

  Dataset<String> filtered = ds.filter((FilterFunction<String>) v -> v.startsWith("h"));
  Assert.assertEquals(Arrays.asList("hello"), filtered.collectAsList());

  Dataset<Integer> mapped = ds.map((MapFunction<String, Integer>) String::length, Encoders.INT());
  Assert.assertEquals(Arrays.asList(5, 5), mapped.collectAsList());

  Dataset<String> parMapped = ds.mapPartitions((MapPartitionsFunction<String, String>) it -> {
    List<String> ls = new LinkedList<>();
    while (it.hasNext()) {
      ls.add(it.next().toUpperCase(Locale.ROOT));
    }
    return ls.iterator();
  }, Encoders.STRING());
  Assert.assertEquals(Arrays.asList("HELLO", "WORLD"), parMapped.collectAsList());

  Dataset<String> flatMapped = ds.flatMap((FlatMapFunction<String, String>) s -> {
    List<String> ls = new LinkedList<>();
    for (char c : s.toCharArray()) {
      ls.add(String.valueOf(c));
    }
    return ls.iterator();
  }, Encoders.STRING());
  Assert.assertEquals(
      Arrays.asList("h", "e", "l", "l", "o", "w", "o", "r", "l", "d"),
      flatMapped.collectAsList());
}
@Test
public void testSerializeNull() {
  NestedSmallBean bean = new NestedSmallBean();
  Encoder<NestedSmallBean> encoder = Encoders.bean(NestedSmallBean.class);
  List<NestedSmallBean> beans = Arrays.asList(bean);
  Dataset<NestedSmallBean> ds1 = spark.createDataset(beans, encoder);
  Assert.assertEquals(beans, ds1.collectAsList());

  Dataset<NestedSmallBean> ds2 =
      ds1.map((MapFunction<NestedSmallBean, NestedSmallBean>) b -> b, encoder);
  Assert.assertEquals(beans, ds2.collectAsList());
}
@Override
public <U> Dataset<U> map(final scala.Function1<T, U> func, final Encoder<U> evidence) {
  final boolean userTriggered = initializeFunction(func, evidence);
  final Dataset<U> result = from(super.map(func, evidence));
  this.setIsUserTriggered(userTriggered);
  return result;
}
@Override
public <U> Dataset<U> map(final MapFunction<T, U> func, final Encoder<U> encoder) {
  final boolean userTriggered = initializeFunction(func, encoder);
  final Dataset<U> result = from(super.map(func, encoder));
  this.setIsUserTriggered(userTriggered);
  return result;
}
@Override
public Dataset<Row> check(Dataset<Row> dataset, Map<String, Dataset<Row>> stepDependencies) {
  return dataset
      .map(new CheckRule(rowRule, name), RowEncoder.apply(SCHEMA))
      .select(new BooleanAggregator(name).toColumn());
}
@Override
public Dataset<Row> check(Dataset<Row> dataset, Map<String, Dataset<Row>> stepDependencies) {
  if (isDependency()) {
    Dataset<Row> expectedDependency = stepDependencies.get(dependency);
    if (expectedDependency.count() == 1
        && expectedDependency.schema().fields().length == 1
        && expectedDependency.schema().apply(0).dataType() == DataTypes.LongType) {
      expected = expectedDependency.collectAsList().get(0).getLong(0);
    } else {
      throw new RuntimeException(
          "Step dependency for count rule must have one row with a single field of long type");
    }
  }
  if (expected < 0) {
    throw new RuntimeException(
        "Failed to determine expected count: must be specified either as literal or step dependency");
  }
  return dataset.groupBy().count().map(new CheckCount(expected, name), RowEncoder.apply(SCHEMA));
}
/**
 * Converts a set of FHIR resources to JSON.
 *
 * @param dataset a dataset containing FHIR resources
 * @param resourceType the FHIR resource type
 * @return a dataset of JSON strings for the FHIR resources
 */
public static Dataset<String> toJson(Dataset<?> dataset, String resourceType) {
  Dataset<IBaseResource> resourceDataset =
      dataset.as(FhirEncoders.forR4()
          .getOrCreate()
          .of(resourceType));
  return resourceDataset.map(new ToJson(), Encoders.STRING());
}
/**
 * Converts a set of FHIR resources to JSON.
 *
 * @param dataset a dataset containing FHIR resources
 * @param resourceType the FHIR resource type
 * @return a dataset of JSON strings for the FHIR resources
 */
public static Dataset<String> toJson(Dataset<?> dataset, String resourceType) {
  Dataset<IBaseResource> resourceDataset =
      dataset.as(FhirEncoders.forStu3()
          .getOrCreate()
          .of(resourceType));
  return resourceDataset.map(new ToJson(), Encoders.STRING());
}
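// Hedged usage sketch (not from the source): how one of the toJson helpers above might be called.
// The "patients" dataset, the "Patient" resource type, and the output path are illustrative
// assumptions only.
Dataset<String> patientJson = toJson(patients, "Patient");
patientJson.show(5, false);                     // inspect a few serialized resources
patientJson.write().text("/tmp/patient-json");  // or persist the JSON strings as text files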
private Dataset<Row> readText(String path) throws Exception {
  Dataset<Row> lines = Contexts.getSparkSession().read().text(path);

  if (translatorConfig != null) {
    Dataset<Tuple2<String, String>> keyedLines = lines.map(
        new PrepareLineForTranslationFunction(),
        Encoders.tuple(Encoders.STRING(), Encoders.STRING()));
    TranslateFunction<String, String> translateFunction = getTranslateFunction(translatorConfig);
    return keyedLines.flatMap(translateFunction, RowEncoder.apply(translateFunction.getSchema()));
  } else {
    return lines;
  }
}
@Override
public Dataset<Row> read() throws Exception {
  Dataset<Row> df = Contexts.getSparkSession()
      .range(numPartitions * 10)
      .repartition(numPartitions)
      .map(new LongToRowFunction(),
          RowEncoder.apply(DataTypes.createStructType(Lists.newArrayList(
              DataTypes.createStructField("value", DataTypes.LongType, true),
              DataTypes.createStructField("modulo", DataTypes.LongType, true)))));
  return df;
}
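// The LongToRowFunction referenced above is not shown in the snippet. A minimal sketch of what
// such a MapFunction could look like, matching the two-column ("value", "modulo") schema declared
// in read(); the modulo divisor of 10 is an illustrative assumption, not the project's actual code.
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;

public class LongToRowFunction implements MapFunction<Long, Row> {
  @Override
  public Row call(Long value) {
    // Emit one Row per input long: the raw value plus its remainder modulo 10.
    return RowFactory.create(value, value % 10);
  }
}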
private void start() { SparkSession spark = SparkSession.builder().appName("Book URL Builder") .master("local").getOrCreate(); String filename = "data/books.csv"; Dataset<Row> df = spark.read().format("csv").option("inferSchema", "true") .option("header", "true") .load(filename); df.show(); Dataset<String> ds = df.map(new BookUrlBuilder(), Encoders.STRING()); ds.printSchema(); ds.show(20, 80); } }
private void start() { SparkSession spark = SparkSession.builder().appName("CSV to Dataset<Book>") .master("local").getOrCreate(); String filename = "data/books.csv"; Dataset<Row> df = spark.read().format("csv") .option("inferSchema", "true") .option("header", "true") .load(filename); df.show(); Dataset<Book> bookDs = df.map(new BookMapper(), Encoders.bean(Book.class)); bookDs.show(); bookDs.printSchema(); } }
private void start() {
  SparkSession spark = SparkSession.builder()
      .appName("CSV to Dataset<Book> as JSON")
      .master("local")
      .getOrCreate();

  String filename = "data/books.csv";
  Dataset<Row> df = spark.read().format("csv")
      .option("inferSchema", "true")
      .option("header", "true")
      .load(filename);
  df.show();

  Dataset<String> bookDf = df.map(new BookMapper(), Encoders.STRING());
  bookDf.show(20, 132);

  Dataset<Row> bookAsJsonDf = spark.read().json(bookDf);
  bookAsJsonDf.show();
}
private void start() { SparkSession spark = SparkSession.builder().appName("CSV to Dataset<Book>") .master("local").getOrCreate(); String filename = "data/books.csv"; Dataset<Row> df = spark.read().format("csv") .option("inferSchema", "true") .option("header", "true") .load(filename); df.show(); Dataset<Book> bookDs = df.map(new BookMapper(), Encoders.bean(Book.class)); bookDs.show(); bookDs.printSchema(); Dataset<Row> df2 = bookDs.toDF(); df2.show(); df2.printSchema(); } }
/**
 * Reads the LOINC multiaxial hierarchy file and converts it to a {@link HierarchicalElement}
 * dataset.
 *
 * @param spark the Spark session
 * @param loincHierarchyPath path to the multiaxial hierarchy CSV
 * @return a dataset of {@link HierarchicalElement} representing the hierarchical relationship.
 */
public static Dataset<HierarchicalElement> readMultiaxialHierarchyFile(SparkSession spark,
    String loincHierarchyPath) {
  return spark.read()
      .option("header", true)
      .csv(loincHierarchyPath)
      .select(col("IMMEDIATE_PARENT"), col("CODE"))
      .where(col("IMMEDIATE_PARENT").isNotNull()
          .and(col("IMMEDIATE_PARENT").notEqual(lit(""))))
      .where(col("CODE").isNotNull()
          .and(col("CODE").notEqual(lit(""))))
      .map((MapFunction<Row, HierarchicalElement>) row -> {
        HierarchicalElement element = new HierarchicalElement();
        element.setAncestorSystem(LOINC_CODE_SYSTEM_URI);
        element.setAncestorValue(row.getString(0));
        element.setDescendantSystem(LOINC_CODE_SYSTEM_URI);
        element.setDescendantValue(row.getString(1));
        return element;
      }, Hierarchies.getHierarchicalElementEncoder());
}
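// The HierarchicalElement bean is not shown here. A hypothetical sketch of a compatible Java bean,
// inferred only from the setters used above and the bean-style encoder; the real class in the
// source project may differ (extra fields, annotations, a non-default encoder).
public class HierarchicalElement implements java.io.Serializable {
  private String ancestorSystem;
  private String ancestorValue;
  private String descendantSystem;
  private String descendantValue;

  public String getAncestorSystem() { return ancestorSystem; }
  public void setAncestorSystem(String ancestorSystem) { this.ancestorSystem = ancestorSystem; }
  public String getAncestorValue() { return ancestorValue; }
  public void setAncestorValue(String ancestorValue) { this.ancestorValue = ancestorValue; }
  public String getDescendantSystem() { return descendantSystem; }
  public void setDescendantSystem(String descendantSystem) { this.descendantSystem = descendantSystem; }
  public String getDescendantValue() { return descendantValue; }
  public void setDescendantValue(String descendantValue) { this.descendantValue = descendantValue; }
}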