List<String> teenagerNames = teenagers.toJavaRDD().map(new Function<Row, String>() { @Override public String call(Row row) { teenagerNames = teenagers2.toJavaRDD().map(new Function<Row, String>() { @Override public String call(Row row) { teenagerNames = teenagers3.toJavaRDD().map(new Function<Row, String>() { @Override public String call(Row row) { return "Name: " + row.getString(0); } List<String> nameAndCity = peopleWithCity.toJavaRDD().map(new Function<Row, String>() { @Override public String call(Row row) {
// Map each teenager Row to a "Name: <first column>" string and collect the
// results to the driver; the same list variable is reused for all three
// datasets, so only the final assignment's contents survive.
List<String> teenagerNames =
    teenagers.toJavaRDD().map(r -> "Name: " + r.getString(0)).collect();
teenagerNames =
    teenagers2.toJavaRDD().map(r -> "Name: " + r.getString(0)).collect();
teenagerNames =
    teenagers3.toJavaRDD().map(r -> "Name: " + r.getString(0)).collect();
// For the city dataset, append the second column as well.
List<String> nameAndCity =
    peopleWithCity.toJavaRDD()
        .map(r -> "Name: " + r.getString(0) + ", City: " + r.getString(1))
        .collect();
/**
 * Verifies DataFrame/RDD round-tripping: builds a DataFrame from a Row RDD
 * with an explicit schema, registers it as a temp view, queries it back with
 * SQL, and checks the string-flattened results.
 */
@Test
public void dataFrameRDDOperations() {
  List<Person> personList = new ArrayList<>(2);
  Person person1 = new Person();
  person1.setName("Michael");
  person1.setAge(29);
  personList.add(person1);
  Person person2 = new Person();
  person2.setName("Yin");
  person2.setAge(28);
  personList.add(person2);

  // Convert the beans to generic Rows so an explicit schema can be attached.
  JavaRDD<Row> rowRDD = jsc.parallelize(personList).map(
      person -> RowFactory.create(person.getName(), person.getAge()));

  List<StructField> fields = new ArrayList<>(2);
  // Fix: the first field was declared with an empty name (""); name it "name"
  // to match the data it holds. SELECT * reads columns positionally, so the
  // asserted results are unchanged.
  fields.add(DataTypes.createStructField("name", DataTypes.StringType, false));
  fields.add(DataTypes.createStructField("age", DataTypes.IntegerType, false));
  StructType schema = DataTypes.createStructType(fields);

  Dataset<Row> df = spark.createDataFrame(rowRDD, schema);
  df.createOrReplaceTempView("people");

  // Query everything back and flatten each row to "name_age".
  List<String> actual = spark.sql("SELECT * FROM people").toJavaRDD()
      .map(row -> row.getString(0) + "_" + row.get(1)).collect();

  List<String> expected = new ArrayList<>(2);
  expected.add("Michael_29");
  expected.add("Yin_28");
  Assert.assertEquals(expected, actual);
}
/**
 * Round-trips two Person beans through a Row RDD, a schema'd DataFrame, a
 * temp view, and a SQL query, asserting the flattened "name_age" strings.
 */
@Test
public void dataFrameRDDOperations() {
  Person michael = new Person();
  michael.setName("Michael");
  michael.setAge(29);
  Person yin = new Person();
  yin.setName("Yin");
  yin.setAge(28);
  List<Person> people = new ArrayList<>(2);
  people.add(michael);
  people.add(yin);

  // Beans become generic Rows so we can supply the schema explicitly.
  JavaRDD<Row> rows = jsc.parallelize(people)
      .map(p -> RowFactory.create(p.getName(), p.getAge()));

  List<StructField> schemaFields = new ArrayList<>(2);
  schemaFields.add(DataTypes.createStructField("", DataTypes.StringType, false));
  schemaFields.add(DataTypes.createStructField("age", DataTypes.IntegerType, false));
  StructType peopleSchema = DataTypes.createStructType(schemaFields);

  Dataset<Row> peopleDf = spark.createDataFrame(rows, peopleSchema);
  peopleDf.createOrReplaceTempView("people");

  // SELECT * reads positionally, then each row is flattened to "name_age".
  List<String> actual = spark.sql("SELECT * FROM people").toJavaRDD()
      .map(r -> r.getString(0) + "_" + r.get(1)).collect();

  List<String> expected = new ArrayList<>(2);
  expected.add("Michael_29");
  expected.add("Yin_28");
  Assert.assertEquals(expected, actual);
}
/**
 * Verifies a DataFrame built from a Row RDD with an explicit schema can be
 * registered as a temp view, queried via SQL, and mapped back to strings.
 */
@Test public void dataFrameRDDOperations() {
  List<Person> personList = new ArrayList<>(2);
  Person person1 = new Person();
  person1.setName("Michael");
  person1.setAge(29);
  personList.add(person1);
  Person person2 = new Person();
  person2.setName("Yin");
  person2.setAge(28);
  personList.add(person2);
  // Convert the beans to generic Rows so an explicit schema can be attached.
  JavaRDD<Row> rowRDD = jsc.parallelize(personList).map(
      person -> RowFactory.create(person.getName(), person.getAge()));
  List<StructField> fields = new ArrayList<>(2);
  // NOTE(review): the first field is declared with an empty name (""); the
  // test still passes because SELECT * reads columns positionally, but this
  // looks like it was meant to be "name" — confirm.
  fields.add(DataTypes.createStructField("", DataTypes.StringType, false));
  fields.add(DataTypes.createStructField("age", DataTypes.IntegerType, false));
  StructType schema = DataTypes.createStructType(fields);
  Dataset<Row> df = spark.createDataFrame(rowRDD, schema);
  df.createOrReplaceTempView("people");
  // Query everything back and flatten each row to "name_age".
  List<String> actual = spark.sql("SELECT * FROM people").toJavaRDD()
      .map(row -> row.getString(0) + "_" + row.get(1)).collect();
  List<String> expected = new ArrayList<>(2);
  expected.add("Michael_29");
  expected.add("Yin_28");
  Assert.assertEquals(expected, actual);
}
/**
 * Convert given H2O frame into a RDD of Rows.
 *
 * <p>Delegates to the H2O context's {@code asDataFrame} conversion and
 * exposes the result as a JavaRDD. NOTE(review): the hard-coded {@code true}
 * flag is forwarded to {@code asDataFrame}; its exact meaning is not visible
 * here — confirm against the H2O Sparkling Water API.
 *
 * @param fr the frame to be used
 * @return a new RDD backed by the frame's data
 */
public JavaRDD<Row> asRDD(H2OFrame fr) { return hc.asDataFrame(fr, true).toJavaRDD(); }
/**
 * Returns an RDD of bundles loaded from the given dataset of XML-encoded
 * bundles.
 *
 * <p>Each string is parsed with a {@link StringToBundle} configured for XML
 * input ({@code true}) and the ambient FHIR version.
 *
 * @param xmlBundles a dataset of XML-encoded bundles
 * @return an RDD of FHIR Bundles
 */
public JavaRDD<BundleContainer> fromXml(Dataset<String> xmlBundles) {
  StringToBundle xmlParser = new StringToBundle(true, fhirVersion);
  return xmlBundles.toJavaRDD().map(xmlParser);
}
/**
 * Returns an RDD of bundles loaded from the given dataset of JSON-encoded
 * bundles.
 *
 * <p>Each string is parsed with a {@link StringToBundle} configured for JSON
 * input ({@code false}) and the ambient FHIR version.
 *
 * @param jsonBundles a dataset of JSON-encoded bundles
 * @return an RDD of FHIR Bundles
 */
public JavaRDD<BundleContainer> fromJson(Dataset<String> jsonBundles) {
  StringToBundle jsonParser = new StringToBundle(false, fhirVersion);
  return jsonBundles.toJavaRDD().map(jsonParser);
}
/**
 * Returns an RDD of bundles loaded from the given dataset of XML-encoded
 * bundles.
 *
 * <p>Each string is parsed by {@link StringToBundle}; the {@code true}
 * argument selects XML parsing (compare {@code fromJson}, which passes
 * {@code false}).
 *
 * @param xmlBundles a dataset of XML-encoded bundles
 * @return an RDD of FHIR Bundles
 */
public JavaRDD<BundleContainer> fromXml(Dataset<String> xmlBundles) {
  return xmlBundles.toJavaRDD().map(new StringToBundle(true, fhirVersion));
}
/**
 * Returns an RDD of bundles loaded from the given dataset of JSON-encoded
 * bundles.
 *
 * <p>Each string is parsed by {@link StringToBundle}; the {@code false}
 * argument selects JSON parsing (compare {@code fromXml}, which passes
 * {@code true}).
 *
 * @param jsonBundles a dataset of JSON-encoded bundles
 * @return an RDD of FHIR Bundles
 */
public JavaRDD<BundleContainer> fromJson(Dataset<String> jsonBundles) {
  return jsonBundles.toJavaRDD().map(new StringToBundle(false, fhirVersion));
}
@Override public Dataset<Row> derive(Map<String, Dataset<Row>> dependencies) throws Exception { if (!dependencies.containsKey(stepName)) { throw new RuntimeException("Step not found in the dependencies list"); } Dataset<Row> sourceStep = dependencies.get(stepName); // For each partition in the DataFrame / RDD JavaRDD<Row> outputRDD = sourceStep.toJavaRDD().flatMap( MorphlineUtils.morphlineMapper(this.morphlineFile, this.morphlineId, getSchema(), errorOnEmpty)); // Convert all the Rows into a new DataFrame return Contexts.getSparkSession().createDataFrame(outputRDD, getSchema()); }
/**
 * Returns latest versions of the given hierarchies.
 *
 * @param uris a set of URIs for which to retrieve the latest versions, or null to load them all
 * @return a map of value set URIs to the latest versions for them.
 */
public Map<String,String> getLatestVersions(final Set<String> uris) {

  // Reduce by URL so only the greatest version string per URL survives;
  // Spark's built-in max aggregation is numeric-only, hence the RDD detour.
  JavaRDD<UrlAndVersion> latest = this.members.toJavaRDD()
      .filter(member -> (uris == null || uris.contains(member.getUrl())))
      .mapToPair(member -> new Tuple2<>(member.getUrl(), member.getVersion()))
      .reduceByKey((left, right) -> left.compareTo(right) > 0 ? left : right)
      .map(pair -> new UrlAndVersion(pair._1, pair._2));

  // Materialize on the driver and build the URL -> version map.
  return spark.createDataset(latest.rdd(), URI_AND_VERSION_ENCODER)
      .collectAsList()
      .stream()
      .collect(Collectors.toMap(UrlAndVersion::getUrl, UrlAndVersion::getVersion));
}
/**
 * Returns latest versions of the given hierarchies.
 *
 * @param uris a set of URIs for which to retrieve the latest versions, or null to load them all
 * @return a map of value set URIs to the latest versions for them.
 */
public Map<String,String> getLatestVersions(final Set<String> uris) {
  // Keep only the lexicographically greatest version string per URL.
  // NOTE(review): versions are compared with String.compareTo, so e.g. "10"
  // sorts before "9" — confirm version strings are ordering-safe.
  JavaRDD<UrlAndVersion> members = this.members.toJavaRDD()
      .filter(uriAndVersion -> (uris == null || uris.contains(uriAndVersion.getUrl())))
      .mapToPair(uriAndVersion -> new Tuple2<>(uriAndVersion.getUrl(), uriAndVersion.getVersion()))
      .reduceByKey((leftVersion, rightVersion) ->
          leftVersion.compareTo(rightVersion) > 0 ? leftVersion : rightVersion)
      .map(tuple -> new UrlAndVersion(tuple._1, tuple._2));
  // Collect to the driver and flatten into a URL -> version map.
  return spark.createDataset(members.rdd(), URI_AND_VERSION_ENCODER)
      .collectAsList()
      .stream()
      .collect(Collectors.toMap(UrlAndVersion::getUrl, UrlAndVersion::getVersion));
}
/**
 * Returns the latest versions of a given set of value sets.
 *
 * @param uris a set of URIs for which to retrieve the latest versions, or null to load them all
 * @param includeExperimental whether to include value sets marked as experimental
 * @return a map of value set URIs to the latest versions for them.
 */
public Map<String,String> getLatestVersions(final Set<String> uris, boolean includeExperimental) {
  // Reduce by the value set URL to return only the latest version per value
  // set. (The original comment said "concept map" — this is the value set
  // variant.) Spark's provided max aggregation function only works on numeric
  // types, so we jump into RDDs and perform the reduce by hand. A null
  // "experimental" column is treated as non-experimental.
  JavaRDD<UrlAndVersion> members = this.valueSets.select("url", "version", "experimental")
      .toJavaRDD()
      .filter(row -> (uris == null || uris.contains(row.getString(0)))
          && (includeExperimental || row.isNullAt(2) || !row.getBoolean(2)))
      .mapToPair(row -> new Tuple2<>(row.getString(0), row.getString(1)))
      .reduceByKey((leftVersion, rightVersion) ->
          leftVersion.compareTo(rightVersion) > 0 ? leftVersion : rightVersion)
      .map(tuple -> new UrlAndVersion(tuple._1, tuple._2));
  // Collect to the driver and flatten into a URL -> version map.
  return spark.createDataset(members.rdd(), URL_AND_VERSION_ENCODER)
      .collectAsList()
      .stream()
      .collect(Collectors.toMap(UrlAndVersion::getUrl, UrlAndVersion::getVersion));
}
/** * Returns the latest versions of a given set of value sets. * * @param uris a set of URIs for which to retrieve the latest versions, or null to load them all * @param includeExperimental whether to include value sets marked as experimental * @return a map of value set URIs to the latest versions for them. */ public Map<String,String> getLatestVersions(final Set<String> uris, boolean includeExperimental) { // Reduce by the concept map URI to return only the latest version // per concept map. Spark's provided max aggregation function // only works on numeric types, so we jump into RDDs and perform // the reduce by hand. JavaRDD<UrlAndVersion> members = this.valueSets.select("url", "version", "experimental") .toJavaRDD() .filter(row -> (uris == null || uris.contains(row.getString(0))) && (includeExperimental || row.isNullAt(2) || !row.getBoolean(2))) .mapToPair(row -> new Tuple2<>(row.getString(0), row.getString(1))) .reduceByKey((leftVersion, rightVersion) -> leftVersion.compareTo(rightVersion) > 0 ? leftVersion : rightVersion) .map(tuple -> new UrlAndVersion(tuple._1, tuple._2)); return spark.createDataset(members.rdd(), URL_AND_VERSION_ENCODER) .collectAsList() .stream() .collect(Collectors.toMap(UrlAndVersion::getUrl, UrlAndVersion::getVersion)); }
/** * Returns the latest versions of a given set of concept maps. * * @param urls a set of URLs to retrieve the latest version for, or null to load them all. * @param includeExperimental flag to include concept maps marked as experimental * * @return a map of concept map URLs to the latest version for them. */ public Map<String,String> getLatestVersions(final Set<String> urls, boolean includeExperimental) { // Reduce by the concept map URI to return only the latest version // per concept map. Spark's provided max aggregation function // only works on numeric types, so we jump into RDDs and perform // the reduce by hand. JavaRDD<UrlAndVersion> changes = this.conceptMaps.select(col("url"), col("version"), col("experimental")) .toJavaRDD() .filter(row -> (urls == null || urls.contains(row.getString(0))) && (includeExperimental || row.isNullAt(2) || !row.getBoolean(2))) .mapToPair(row -> new Tuple2<>(row.getString(0), row.getString(1))) .reduceByKey((leftVersion, rightVersion) -> leftVersion.compareTo(rightVersion) > 0 ? leftVersion : rightVersion) .map(tuple -> new UrlAndVersion(tuple._1, tuple._2)); return this.spark.createDataset(changes.rdd(), URL_AND_VERSION_ENCODER) .collectAsList() .stream() .collect(Collectors.toMap(UrlAndVersion::getUrl, UrlAndVersion::getVersion)); }
/**
 * Returns the latest versions of a given set of concept maps.
 *
 * @param urls a set of URLs to retrieve the latest version for, or null to load them all.
 * @param includeExperimental flag to include concept maps marked as experimental
 *
 * @return a map of concept map URLs to the latest version for them.
 */
public Map<String,String> getLatestVersions(final Set<String> urls, boolean includeExperimental) {
  // Reduce by the concept map URI to return only the latest version
  // per concept map. Spark's provided max aggregation function
  // only works on numeric types, so we jump into RDDs and perform
  // the reduce by hand. NOTE(review): versions compare via String.compareTo,
  // so "10" sorts before "9" — confirm version strings are ordering-safe.
  // A null "experimental" column is treated as non-experimental.
  JavaRDD<UrlAndVersion> changes = this.conceptMaps.select(col("url"), col("version"), col("experimental"))
      .toJavaRDD()
      .filter(row -> (urls == null || urls.contains(row.getString(0)))
          && (includeExperimental || row.isNullAt(2) || !row.getBoolean(2)))
      .mapToPair(row -> new Tuple2<>(row.getString(0), row.getString(1)))
      .reduceByKey((leftVersion, rightVersion) ->
          leftVersion.compareTo(rightVersion) > 0 ? leftVersion : rightVersion)
      .map(tuple -> new UrlAndVersion(tuple._1, tuple._2));
  // Collect to the driver and flatten into a URL -> version map.
  return this.spark.createDataset(changes.rdd(), URL_AND_VERSION_ENCODER)
      .collectAsList()
      .stream()
      .collect(Collectors.toMap(UrlAndVersion::getUrl, UrlAndVersion::getVersion));
}
// NOTE(review): this snippet is truncated on both ends — it continues a
// builder expression started above (presumably a PCA estimator) and leaves
// the for-loop below unclosed; consult the surrounding file for full context.
    .setK(3)   // keep the top 3 principal components
    .fit(df);
// Collect the transformed features alongside the "expected" column for checking.
List<Row> result = pca.transform(df).select("pca_features", "expected").toJavaRDD().collect();
for (Row r : result) {
  Vector calculatedVector = (Vector) r.get(0);
// NOTE(review): truncated fragment — the builder chain this continues and the
// body/close of the for-loop are outside this snippet; do not review in isolation.
    .setK(3)   // keep the top 3 principal components
    .fit(df);
// Pull transformed rows ("pca_features" plus "expected") back to the driver.
List<Row> result = pca.transform(df).select("pca_features", "expected").toJavaRDD().collect();
for (Row r : result) {
  Vector calculatedVector = (Vector) r.get(0);
// NOTE(review): truncated fragment — starts mid builder-expression and ends
// inside an open for-loop; the enclosing method is not visible here.
    .setK(3)   // keep the top 3 principal components
    .fit(df);
// Collect each row's computed PCA vector together with its expected value.
List<Row> result = pca.transform(df).select("pca_features", "expected").toJavaRDD().collect();
for (Row r : result) {
  Vector calculatedVector = (Vector) r.get(0);