/**
 * Collects the distinct values of a single column of a data frame.
 *
 * <p>The column is selected, de-duplicated with {@code distinct()} and pulled to the
 * driver with {@code collect()}; every resulting row is read with
 * {@code Row.getString(0)}.
 *
 * @param data the data frame to read from
 * @param name the name of the column whose distinct values are wanted
 * @return a new, mutable list holding one entry per distinct row, in the order
 *         returned by {@code collect()}
 */
private static ArrayList<String> getColumnStates(DataFrame data, String name) {
    final Row[] statesRow = data.select(name).distinct().collect();

    // Parameterized (not raw) list, presized to the number of collected rows.
    ArrayList<String> states = new ArrayList<String>(statesRow.length);
    for (Row r : statesRow) {
        states.add(r.getString(0));
    }
    return states;
}
import org.apache.spark.api.java.*;
import org.apache.spark.SparkConf;
import org.apache.spark.sql.SQLContext;
import static org.apache.spark.sql.functions.*;
import org.apache.spark.sql.DataFrame;

/**
 * Minimal local Spark SQL example: builds a one-row data frame with two DATE
 * columns ("first" and "second") and shows {@code datediff(first, second)}.
 */
public class App {

    /**
     * Runs the example against a local master and prints the result with {@code show()}.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        // SparkContext refuses to start without an application name; only supply a
        // default so that a name passed in by the launcher is not overridden.
        SparkConf conf = new SparkConf()
                .setMaster("local")
                .setIfMissing("spark.app.name", "App");
        JavaSparkContext sc = new JavaSparkContext(conf);
        try {
            SQLContext sqlContext = new SQLContext(sc);
            DataFrame df = sqlContext.sql(
                    "SELECT CAST('2012-01-01' AS DATE), CAST('2013-08-02' AS DATE)").toDF("first", "second");
            // datediff(end, start): "first" is passed as the end date, so the value
            // shown is negative because "first" precedes "second".
            df.select(datediff(df.col("first"), df.col("second"))).show();
        } finally {
            // Always release the context, even when the query fails.
            sc.stop();
        }
    }
}
// Step 1 (personPositions): pair each person's "fullName" (aliased "personName") with the
// result of explode() applied to the "positions" column (aliased "pos").
// Step 2 (test): project personName plus the "companyName" and "title" fields of "pos",
// the latter renamed to "positionTitle".
// NOTE(review): presumably "positions" is an array of structs carrying companyName/title - confirm against the schema.
DataFrame personPositions = persons.select(persons.col("fullName").as("personName"), org.apache.spark.sql.functions.explode(persons.col("positions")).as("pos")); DataFrame test = personPositions.select(personPositions.col("personName"), personPositions.col("pos").getField("companyName").as("companyName"), personPositions.col("pos").getField("title").as("positionTitle"));
// Collect the four selected columns of `results` and print one line per row in the form
// "(<features>, <label>) -> prob=<myProbability>, prediction=<prediction>".
// NOTE(review): collect() brings every row back to the driver; presumably `results` is small - confirm.
for (Row r: results.select("features", "label", "myProbability", "prediction").collect()) { System.out.println("(" + r.get(0) + ", " + r.get(1) + ") -> prob=" + r.get(2) + ", prediction=" + r.get(3));
// Resolve the "data" child of `path`, load it as a Parquet data frame, and take the first
// row of the "markovOrder", "weights" and "tagDictionary" columns.
// NOTE(review): presumably the data set holds exactly one row of saved parameters - confirm with the code that writes it.
String dataPath = new Path(path, "data").toString(); DataFrame df = sqlContext().read().format("parquet").load(dataPath); Row row = df.select("markovOrder", "weights", "tagDictionary").head();
// Build a data frame from `jrdd` using WhitespaceContext as the bean class, run it through
// `model`, then collect the "prediction" column and broadcast it via `jsc`.
// The map over `s` is only set up when df1 contains at least one row.
// NOTE(review): df1 is evaluated by both collect() and count(); if model.transform is
// expensive, caching df1 may be worthwhile - confirm.
DataFrame df0 = (new SQLContext(jsc)).createDataFrame(jrdd, WhitespaceContext.class); DataFrame df1 = model.transform(df0); prediction = jsc.broadcast(df1.select("prediction").collect()); if (df1.count() > 0) { output = s.map(new WhitespaceClassificationFunction());
// Keep only the "prediction" and "label" columns of `result` and create a multiclass
// evaluator configured with the "precision" metric; the code guarded by `verbose`
// follows outside this view.
DataFrame predictionAndLabel = result.select("prediction", "label"); MulticlassClassificationEvaluator evaluator = new MulticlassClassificationEvaluator().setMetricName("precision"); if (verbose) {
// Project `df` to its "word" and "label" columns as an RDD of rows, then map each row to a
// (String, Set<Integer>) pair through an anonymous PairFunction whose body continues
// outside this view.
JavaRDD<Row> wt = df.select("word", "label").javaRDD(); JavaPairRDD<String, Set<Integer>> tagDictionary = wt.mapToPair(new PairFunction<Row, String, Set<Integer>>(){ private static final long serialVersionUID = 5865372074294028547L;
// First branch: write only the "dependency" column to outputFileName as text.
// Else branch: repartition the whole frame to a single partition and write it as JSON
// (presumably to obtain a single output file - confirm). The branch condition is outside this view.
df.select("dependency").write().text(outputFileName); else df.repartition(1).write().json(outputFileName);