public void registerTable(EntityMetadata m, SparkClient sparkClient) {
    final Class clazz = m.getEntityClazz();
    SparkContext sc = sparkClient.sparkContext;

    // Point the mongo-hadoop input format at the collection backing this entity.
    Configuration config = new Configuration();
    config.set("mongo.input.uri",
            buildMongoURIPath(sc.getConf().get("hostname"), sc.getConf().get("portname"),
                    m.getSchema(), m.getTableName()));

    JavaRDD<Tuple2<Object, BSONObject>> mongoJavaRDD =
            sc.newAPIHadoopRDD(config, MongoInputFormat.class, Object.class, BSONObject.class).toJavaRDD();

    // Convert each BSON document into an instance of the mapped entity class.
    JavaRDD<Object> mongoRDD = mongoJavaRDD.flatMap(new FlatMapFunction<Tuple2<Object, BSONObject>, Object>() {
        @Override
        public Iterable<Object> call(Tuple2<Object, BSONObject> arg) {
            BSONObject obj = arg._2();
            Object javaObject = generateJavaObjectFromBSON(obj, clazz);
            return Arrays.asList(javaObject);
        }
    });

    sparkClient.sqlContext.createDataFrame(mongoRDD, m.getEntityClazz()).registerTempTable(m.getTableName());
}
GraphXInputFormat.setScanAuthorizations(job, authorizations);

return sc.newAPIHadoopRDD(job.getConfiguration(), GraphXInputFormat.class, Object.class, RyaTypeWritable.class);
String tableName = RdfCloudTripleStoreUtils.layoutPrefixToTable(TABLE_LAYOUT.SPO, tablePrefix);
InputFormatBase.setInputTableName(job, tableName);

return sc.newAPIHadoopRDD(job.getConfiguration(), GraphXEdgeInputFormat.class, Object.class, Edge.class);
private RDD<Element> doOperationUsingElementInputFormat(final GetRDDOfAllElements operation,
                                                        final Context context,
                                                        final AccumuloStore accumuloStore)
        throws OperationException {
    final Configuration conf = getConfiguration(operation);
    addIterators(accumuloStore, conf, context.getUser(), operation);

    // Optionally read through Accumulo's batch scanner if the operation asks for it.
    final String useBatchScannerRDD = operation.getOption(USE_BATCH_SCANNER_RDD);
    if (Boolean.parseBoolean(useBatchScannerRDD)) {
        InputConfigurator.setBatchScan(AccumuloInputFormat.class, conf, true);
    }

    final RDD<Tuple2<Element, NullWritable>> pairRDD = SparkContextUtil
            .getSparkSession(context, accumuloStore.getProperties())
            .sparkContext()
            .newAPIHadoopRDD(conf, ElementInputFormat.class, Element.class, NullWritable.class);

    // Keep only the Element from each (Element, NullWritable) pair.
    return pairRDD.map(new FirstElement(), ELEMENT_CLASS_TAG);
}
private RDD<Element> doOperation(final GetRDDOfElements operation,
                                 final Context context,
                                 final AccumuloStore accumuloStore)
        throws OperationException {
    final Configuration conf = getConfiguration(operation);
    final SparkContext sparkContext = SparkContextUtil
            .getSparkSession(context, accumuloStore.getProperties())
            .sparkContext();
    sparkContext.hadoopConfiguration().addResource(conf);

    // Use batch scan option when performing seeded operation
    InputConfigurator.setBatchScan(AccumuloInputFormat.class, conf, true);
    addIterators(accumuloStore, conf, context.getUser(), operation);
    addRanges(accumuloStore, conf, operation);

    final RDD<Tuple2<Element, NullWritable>> pairRDD =
            sparkContext.newAPIHadoopRDD(conf, ElementInputFormat.class, Element.class, NullWritable.class);
    return pairRDD.map(new FirstElement(), ClassTagConstants.ELEMENT_CLASS_TAG);
}
private RDD<Element> doOperation(final GetRDDOfElementsInRanges operation,
                                 final Context context,
                                 final AccumuloStore accumuloStore)
        throws OperationException {
    final Configuration conf = getConfiguration(operation);
    final SparkContext sparkContext = SparkContextUtil
            .getSparkSession(context, accumuloStore.getProperties())
            .sparkContext();
    sparkContext.hadoopConfiguration().addResource(conf);

    // Use batch scan option when performing seeded operation
    InputConfigurator.setBatchScan(AccumuloInputFormat.class, conf, true);
    addIterators(accumuloStore, conf, context.getUser(), operation);
    addRangesFromPairs(accumuloStore, conf, operation);

    final RDD<Tuple2<Element, NullWritable>> pairRDD =
            sparkContext.newAPIHadoopRDD(conf, ElementInputFormat.class, Element.class, NullWritable.class);
    return pairRDD.map(new FirstElement(), ClassTagConstants.ELEMENT_CLASS_TAG);
}
sc.newAPIHadoopRDD(
        conf,
        GeoWaveInputFormat.class,
RDD<Tuple2<Text, Tuple>> hadoopRDD = sparkContext.newAPIHadoopRDD(
        jobConf, PigInputFormatSpark.class, Text.class, Tuple.class);
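Each of the snippets above follows the same pattern: put InputFormat-specific settings on a Hadoop Configuration, then call newAPIHadoopRDD with the InputFormat class and the key/value classes it emits. Below is a minimal, self-contained sketch of that pattern against a plain text file; it is not taken from any of the projects quoted above, and the app name and input path are placeholders.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class NewAPIHadoopRDDExample {
    public static void main(String[] args) {
        JavaSparkContext sc = new JavaSparkContext(
                new SparkConf().setAppName("newAPIHadoopRDD-sketch").setMaster("local[*]"));

        Configuration conf = new Configuration();
        // Standard "new API" input-path property; InputFormat-specific settings
        // (e.g. mongo.input.uri above) go on this Configuration in the same way.
        conf.set("mapreduce.input.fileinputformat.inputdir", "/tmp/input.txt"); // placeholder path

        // Key/value classes must match what the InputFormat produces:
        // TextInputFormat emits (LongWritable byteOffset, Text line).
        JavaPairRDD<LongWritable, Text> lines =
                sc.newAPIHadoopRDD(conf, TextInputFormat.class, LongWritable.class, Text.class);

        System.out.println("records: " + lines.count());
        sc.stop();
    }
}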