@Override
public boolean cp(final String sourceLocation, final String targetLocation) {
    final List<String> rdds = Spark.getRDDs().stream()
            .filter(r -> r.name().startsWith(sourceLocation))
            .map(RDD::name)
            .collect(Collectors.toList());
    if (rdds.size() == 0)
        return false;
    for (final String rdd : rdds) {
        Spark.getRDD(rdd).toJavaRDD()
                .filter(a -> true)
                .setName(rdd.equals(sourceLocation) ? targetLocation : rdd.replace(sourceLocation, targetLocation))
                .cache()
                .count(); // TODO: this should use the original storage level
    }
    return true;
}
@Override
public Iterator<String> head(final String location, final int totalLines) {
    return IteratorUtils.map(Spark.getRDD(location).toJavaRDD().take(totalLines).iterator(), Object::toString);
}
@Override
public <K, V> JavaPairRDD<K, V> readMemoryRDD(final Configuration configuration, final String memoryKey, final JavaSparkContext sparkContext) {
    if (!configuration.containsKey(Constants.GREMLIN_HADOOP_INPUT_LOCATION))
        throw new IllegalArgumentException("There is no provided " + Constants.GREMLIN_HADOOP_INPUT_LOCATION + " to read the persisted RDD from");
    return JavaPairRDD.fromJavaRDD((JavaRDD) Spark.getRDD(
            Constants.getMemoryLocation(configuration.getString(Constants.GREMLIN_HADOOP_INPUT_LOCATION), memoryKey)).toJavaRDD());
}
@Override
public JavaPairRDD<Object, VertexWritable> readGraphRDD(final Configuration configuration, final JavaSparkContext sparkContext) {
    if (!configuration.containsKey(Constants.GREMLIN_HADOOP_INPUT_LOCATION))
        throw new IllegalArgumentException("There is no provided " + Constants.GREMLIN_HADOOP_INPUT_LOCATION + " to read the persisted RDD from");
    Spark.create(sparkContext.sc());
    final Optional<String> graphLocation = Constants.getSearchGraphLocation(configuration.getString(Constants.GREMLIN_HADOOP_INPUT_LOCATION), SparkContextStorage.open());
    return graphLocation.isPresent()
            ? JavaPairRDD.fromJavaRDD((JavaRDD) Spark.getRDD(graphLocation.get()).toJavaRDD())
            : JavaPairRDD.fromJavaRDD(sparkContext.emptyRDD());
}
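// Hedged sketch (illustration only, not TinkerPop code): the read methods above assume an RDD was
// previously pinned in the SparkContext under a location name. The idiom cp() itself uses for that
// -- derive a fresh RDD, name it, cache it, and force materialization -- looks like this in
// isolation. "sc" and "location" are assumptions made for this example.
final JavaSparkContext jsc = new JavaSparkContext(sc);    // "sc" is an assumed, already-running SparkContext
final String location = "target/persisted-demo";          // hypothetical location name for the example
jsc.parallelize(Arrays.asList("v1", "v2"))
   .filter(x -> true)    // identity filter derives a fresh RDD that can carry the new name
   .setName(location)    // the name later resolved by lookups such as Spark.getRDD(location)
   .cache()              // keep it among the context's persisted RDDs
   .count();             // force evaluation so the cached copy actually exists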
@Override
public boolean persist(List listEntity, EntityMetadata m, SparkClient sparkClient) {
    Seq s = scala.collection.JavaConversions.asScalaBuffer(listEntity).toList();
    ClassTag tag = scala.reflect.ClassTag$.MODULE$.apply(m.getEntityClazz());
    JavaRDD personRDD = sparkClient.sparkContext.parallelize(s, 1, tag).toJavaRDD();
    DataFrame df = sparkClient.sqlContext.createDataFrame(personRDD, m.getEntityClazz());
    String outputFilePath = getOutputFilePath(sparkClient.properties);
    String ext = (String) sparkClient.properties.get("format");
    FileType fileType = FileFormatConstants.extension.get(ext);
    switch (fileType) {
    case CSV:
        return writeDataInCsvFile(df, outputFilePath);
    case JSON:
        return writeDataInJsonFile(df, outputFilePath);
    default:
        throw new UnsupportedOperationException("Files of type " + ext + " are not yet supported.");
    }
}
public void registerTable(EntityMetadata m, SparkClient sparkClient) {
    final Class clazz = m.getEntityClazz();
    SparkContext sc = sparkClient.sparkContext;
    Configuration config = new Configuration();
    config.set("mongo.input.uri",
            buildMongoURIPath(sc.getConf().get("hostname"), sc.getConf().get("portname"), m.getSchema(), m.getTableName()));
    JavaRDD<Tuple2<Object, BSONObject>> mongoJavaRDD =
            sc.newAPIHadoopRDD(config, MongoInputFormat.class, Object.class, BSONObject.class).toJavaRDD();
    JavaRDD<Object> mongoRDD = mongoJavaRDD.flatMap(new FlatMapFunction<Tuple2<Object, BSONObject>, Object>() {
        @Override
        public Iterable<Object> call(Tuple2<Object, BSONObject> arg) {
            BSONObject obj = arg._2();
            Object javaObject = generateJavaObjectFromBSON(obj, clazz);
            return Arrays.asList(javaObject);
        }
    });
    sparkClient.sqlContext.createDataFrame(mongoRDD, m.getEntityClazz()).registerTempTable(m.getTableName());
}
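// Hedged usage sketch (not Kundera code): once registerTable() has mapped the MongoDB collection to
// a temp table, the data can be queried through the same SQLContext. The query below is purely an
// assumption made for illustration; it reuses the "sparkClient" and "m" names from the method above.
DataFrame rows = sparkClient.sqlContext.sql("SELECT * FROM " + m.getTableName());
rows.show();  // print a sample of the registered table's contents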
@Override
public boolean persist(List listEntity, EntityMetadata m, SparkClient sparkClient) {
    try {
        Seq s = scala.collection.JavaConversions.asScalaBuffer(listEntity).toList();
        ClassTag tag = scala.reflect.ClassTag$.MODULE$.apply(m.getEntityClazz());
        JavaRDD personRDD = sparkClient.sparkContext.parallelize(s, 1, tag).toJavaRDD();
        DataFrame df = sparkClient.sqlContext.createDataFrame(personRDD, m.getEntityClazz());
        sparkClient.sqlContext.sql("use " + m.getSchema());
        if (logger.isDebugEnabled()) {
            logger.debug("Below are the tables registered with the hive context: ");
            sparkClient.sqlContext.sql("show tables").show();
        }
        df.write().insertInto(m.getTableName());
        return true;
    } catch (Exception e) {
        throw new KunderaException("Cannot persist object(s)", e);
    }
}
@Override
public boolean persist(List listEntity, EntityMetadata m, SparkClient sparkClient) {
    try {
        Seq s = scala.collection.JavaConversions.asScalaBuffer(listEntity).toList();
        ClassTag tag = scala.reflect.ClassTag$.MODULE$.apply(m.getEntityClazz());
        JavaRDD personRDD = sparkClient.sparkContext.parallelize(s, 1, tag).toJavaRDD();
        CassandraJavaUtil.javaFunctions(personRDD)
                .writerBuilder(m.getSchema(), m.getTableName(), CassandraJavaUtil.mapToRow(m.getEntityClazz()))
                .saveToCassandra();
        return true;
    } catch (Exception e) {
        throw new KunderaException("Cannot persist object(s)", e);
    }
}
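// Hedged sketch (illustrative only, not Kundera code): the persist() methods above all share the
// same first step -- turning a java.util.List into a Spark RDD by way of a Scala Seq and an
// explicit ClassTag, since SparkContext.parallelize(...) is a Scala API. Shown standalone here
// with a plain String list; "sparkContext" is assumed to be an existing org.apache.spark.SparkContext.
final List<String> entities = Arrays.asList("a", "b", "c");
final Seq<String> seq = scala.collection.JavaConversions.asScalaBuffer(entities).toList();
final ClassTag<String> tag = scala.reflect.ClassTag$.MODULE$.apply(String.class);
final JavaRDD<String> javaRDD = sparkContext.parallelize(seq, 1, tag).toJavaRDD();  // numSlices = 1, as in the methods above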
@Override
public RDD<Tuple> convert(List<RDD<Tuple>> predecessors, POStream poStream) throws IOException {
    SparkUtil.assertPredecessorSize(predecessors, poStream, 1);
    RDD<Tuple> rdd = predecessors.get(0);
    StreamFunction streamFunction = new StreamFunction(poStream);
    return rdd.toJavaRDD().mapPartitions(streamFunction, true).rdd();
}
@Override
public RDD<Tuple> convert(List<RDD<Tuple>> predecessors, POPoissonSampleSpark po) throws IOException {
    SparkUtil.assertPredecessorSize(predecessors, po, 1);
    RDD<Tuple> rdd = predecessors.get(0);
    PoissionSampleFunction poissionSampleFunction = new PoissionSampleFunction(po);
    return rdd.toJavaRDD().mapPartitions(poissionSampleFunction, false).rdd();
}
@Override
public RDD<Tuple> convert(List<RDD<Tuple>> predecessors, POCollectedGroup physicalOperator) throws IOException {
    SparkUtil.assertPredecessorSize(predecessors, physicalOperator, 1);
    RDD<Tuple> rdd = predecessors.get(0);
    CollectedGroupFunction collectedGroupFunction = new CollectedGroupFunction(physicalOperator);
    return rdd.toJavaRDD().mapPartitions(collectedGroupFunction, true).rdd();
}
@Override
public RDD<Tuple> convert(List<RDD<Tuple>> predecessors, POMergeCogroup physicalOperator) {
    SparkUtil.assertPredecessorSize(predecessors, physicalOperator, 1);
    RDD<Tuple> rdd = predecessors.get(0);
    MergeCogroupFunction mergeCogroupFunction = new MergeCogroupFunction(physicalOperator);
    return rdd.toJavaRDD().mapPartitions(mergeCogroupFunction, true).rdd();
}
@Override
public RDD<Tuple> convert(List<RDD<Tuple>> predecessors, POCounter poCounter) throws IOException {
    SparkUtil.assertPredecessorSize(predecessors, poCounter, 1);
    RDD<Tuple> rdd = predecessors.get(0);
    CounterConverterFunction f = new CounterConverterFunction(poCounter);
    JavaRDD<Tuple> jRdd = rdd.toJavaRDD().mapPartitionsWithIndex(f, true);
    // jRdd = jRdd.cache();
    return jRdd.rdd();
}
@Override
public RDD<Tuple> convert(List<RDD<Tuple>> predecessors, POMergeJoin poMergeJoin) throws IOException {
    SparkUtil.assertPredecessorSize(predecessors, poMergeJoin, 1);
    RDD<Tuple> rdd = predecessors.get(0);
    MergeJoinFunction mergeJoinFunction = new MergeJoinFunction(poMergeJoin);
    return rdd.toJavaRDD().mapPartitions(mergeJoinFunction, true).rdd();
}
public RDD<Tuple> convert(List<RDD<Tuple>> predecessors, POFRJoin poFRJoin) throws IOException {
    SparkUtil.assertPredecessorSizeGreaterThan(predecessors, poFRJoin, 1);
    RDD<Tuple> rdd = predecessors.get(0);
    attachReplicatedInputs((POFRJoinSpark) poFRJoin);
    FRJoinFunction frJoinFunction = new FRJoinFunction(poFRJoin);
    return rdd.toJavaRDD().mapPartitions(frJoinFunction, true).rdd();
}
@Override
public RDD<Tuple> convert(List<RDD<Tuple>> predecessors, POLimit poLimit) throws IOException {
    SparkUtil.assertPredecessorSize(predecessors, poLimit, 1);
    RDD<Tuple> rdd = predecessors.get(0);
    LimitFunction limitFunction = new LimitFunction(poLimit);
    RDD<Tuple> rdd2 = rdd.coalesce(1, false, null);
    return rdd2.toJavaRDD().mapPartitions(limitFunction, false).rdd();
}
@Override
public RDD<Tuple> convert(List<RDD<Tuple>> predecessors, POForEach physicalOperator) {
    byte[] confBytes = KryoSerializer.serializeJobConf(jobConf);
    SparkUtil.assertPredecessorSize(predecessors, physicalOperator, 1);
    RDD<Tuple> rdd = predecessors.get(0);
    ForEachFunction forEachFunction = new ForEachFunction(physicalOperator, confBytes);
    return rdd.toJavaRDD().mapPartitions(forEachFunction, true).rdd();
}
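// Hedged sketch (not Pig code): each converter above follows the same shape -- take the single
// predecessor RDD<Tuple>, hop into the Java API with toJavaRDD(), apply an operator-specific
// function, and hand a Scala RDD back with rdd(). The identity map() below stands in for the
// operator logic purely to keep that round trip visible; the real converters use mapPartitions
// with functions such as ForEachFunction or MergeJoinFunction.
private static RDD<Tuple> convertPassThrough(final List<RDD<Tuple>> predecessors) {
    final RDD<Tuple> rdd = predecessors.get(0);                                  // single predecessor, as asserted above
    final JavaRDD<Tuple> transformed = rdd.toJavaRDD().map(tuple -> tuple);      // operator-specific logic would go here
    return transformed.rdd();                                                    // back to the Scala RDD the plan expects
}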
@SuppressWarnings({"unchecked"}) public JavaRDD<SubdocLookupResult> couchbaseSubdocLookup(List<String> get, List<String> exists, String bucket) { return new RDDFunctions<T>(source.rdd()).couchbaseSubdocLookup( SparkUtil.listToSeq(get), SparkUtil.listToSeq(exists), bucket, scala.Option.<Duration>apply(null), LCLIdentity.INSTANCE ).toJavaRDD(); }
@SuppressWarnings({"unchecked"}) public JavaRDD<SubdocLookupResult> couchbaseSubdocLookup(List<String> get, List<String> exists, String bucket, long timeout) { return new RDDFunctions<T>(source.rdd()).couchbaseSubdocLookup( SparkUtil.listToSeq(get), SparkUtil.listToSeq(exists), bucket, scala.Option.<Duration>apply(Duration.create(timeout, TimeUnit.MILLISECONDS)), LCLIdentity.INSTANCE ).toJavaRDD(); }