/**
 * Materializes the given DataFrame on the driver and converts each row into a
 * {@code Record} bound to the supplied tenant and table.
 *
 * <p>NOTE(review): {@code collect()} pulls the whole DataFrame into driver
 * memory — callers are expected to pass bounded result sets.
 *
 * @param tenantId  tenant the records belong to
 * @param tableName destination table name stored on each record
 * @param dataFrame source rows; its schema is used to map values by column
 * @return one {@code Record} per row, in collection order
 */
public static List<Record> dataFrameToRecordsList(int tenantId, String tableName, DataFrame dataFrame) {
    StructType schema = dataFrame.schema();
    List<Record> records = new ArrayList<>();
    for (Row row : dataFrame.collect()) {
        records.add(new Record(tenantId, tableName, convertRowAndSchemaToValuesMap(row, schema)));
    }
    return records;
}
/**
 * Collects the distinct values of the named column as strings.
 *
 * <p>NOTE(review): assumes the column is of string type — {@code getString(0)}
 * will throw a ClassCastException otherwise; confirm against callers.
 *
 * @param data source DataFrame
 * @param name column whose distinct values are wanted
 * @return distinct values of {@code name}, in the order returned by Spark
 */
private static ArrayList<String> getColumnStates(DataFrame data, String name) {
    // Fix: the original used the raw type `new ArrayList()` (unchecked warning);
    // use the diamond operator for a properly typed list.
    ArrayList<String> states = new ArrayList<>();
    final Row[] statesRow = data.select(name).distinct().collect();
    for (Row r : statesRow) {
        states.add(r.getString(0));
    }
    return states;
}
@Override public List<String> call(JobContext jc) throws Exception { InputStream source = getClass().getResourceAsStream("/testweet.json"); // Save the resource as a file in HDFS (or the local tmp dir when using a local filesystem). URI input; File local = File.createTempFile("tweets", ".json", jc.getLocalTmpDir()); Files.copy(source, local.toPath(), StandardCopyOption.REPLACE_EXISTING); FileSystem fs = FileSystem.get(jc.sc().sc().hadoopConfiguration()); if ("file".equals(fs.getUri().getScheme())) { input = local.toURI(); } else { String uuid = UUID.randomUUID().toString(); Path target = new Path("/tmp/" + uuid + "-tweets.json"); fs.copyFromLocalFile(new Path(local.toURI()), target); input = target.toUri(); } SQLContext sqlctx = useHiveContext ? jc.hivectx() : jc.sqlctx(); sqlctx.jsonFile(input.toString()).registerTempTable("tweets"); List<String> tweetList = new ArrayList<>(); Row[] result = (Row[])(sqlctx.sql("SELECT text, retweetCount FROM tweets ORDER BY retweetCount LIMIT 10") .collect()); for (Row r : result) { tweetList.add(r.toString()); } return tweetList; }
@Override public SqlTypedResult sqlTyped(String command, Integer maxRows, DataSourceDescriptor dataSource) throws DDFException { DataFrame rdd = ((SparkDDFManager) this.getManager()).getHiveContext().sql(command); Schema schema = SparkUtils.schemaFromDataFrame(rdd); int columnSize = schema.getNumColumns(); Row[] rddRows = rdd.collect(); List<List<SqlTypedCell>> sqlTypedResult = new ArrayList<List<SqlTypedCell>>(); // Scan every cell and add the type information. for (int rowIdx = 0; rowIdx < rddRows.length; ++rowIdx) { List<SqlTypedCell> row = new ArrayList<SqlTypedCell>(); for (int colIdx = 0; colIdx < columnSize; ++ colIdx) { // TODO: Optimize by reducing getType(). row.add(new SqlTypedCell(schema.getColumn(colIdx).getType(), rddRows[rowIdx].get(colIdx).toString())); } sqlTypedResult.add(row); } return new SqlTypedResult(schema, sqlTypedResult); }
/**
 * Converts a DataFrame into an {@code AnalyticsQueryResult}, honoring the
 * optional {@code carbon.spark.results.limit} setting (-1 means unlimited).
 *
 * @param dataFrame query result to convert
 * @return field names plus the collected rows as objects
 * @throws AnalyticsExecutionException propagated from row conversion
 */
private AnalyticsQueryResult toResult(DataFrame dataFrame) throws AnalyticsExecutionException {
    int resultsLimit = this.sparkConf.getInt("carbon.spark.results.limit", -1);
    // limit() never changes the schema, so the field names are taken from the
    // (possibly truncated) frame we actually collect.
    DataFrame bounded = (resultsLimit == -1) ? dataFrame : dataFrame.limit(resultsLimit);
    return new AnalyticsQueryResult(bounded.schema().fieldNames(),
        convertRowsToObjects(bounded.collect()));
}
// NOTE(review): mid-method fragment — the enclosing signature and the origin of
// `sql` are outside this chunk, so the code is left untouched. It first collects
// the DISTINCT non-null values of `column`, then runs a second projection query
// and keeps its first row. The inner `Row[] rows` re-declares the outer one —
// presumably they live in separate scopes in the full file; TODO confirm this
// compiles in context.
String sqlCmd = String.format("select distinct(%s) from %s where %s is not null", column.getName(), this.getDDF().getTableName(), column.getName()); DataFrame sqlresult = sqlContext.sql(sqlCmd); Row[] rows = sqlresult.collect(); List<String> values = new ArrayList<>(); for(Row row: rows) { sql = String.format("select %s from %s", sql, this.getDDF().getTableName()); DataFrame sqlResult = sqlContext.sql(sql); Row[] rows = sqlResult.collect(); Row result = rows[0]; int i = 0;
// NOTE(review): mid-method fragment — `jsc`, `jrdd`, `model`, `prediction`,
// `output` and `s` are declared outside this chunk, so the code is left
// untouched. It builds a DataFrame from the JavaRDD, applies the trained model,
// broadcasts the collected "prediction" column, and (when the transformed frame
// is non-empty) maps the stream `s` through the classification function.
// collect()-then-broadcast pulls all predictions to the driver — acceptable only
// for small batches; verify against callers.
DataFrame df0 = (new SQLContext(jsc)).createDataFrame(jrdd, WhitespaceContext.class); DataFrame df1 = model.transform(df0); prediction = jsc.broadcast(df1.select("prediction").collect()); if (df1.count() > 0) { output = s.map(new WhitespaceClassificationFunction());