/**
 * Constructor.
 *
 * @param sparkContext  the spark context.
 * @param inputPath     the path of the target text file.
 * @param numPartitions the number of partitions.
 */
public SparkTextFileBoundedSourceVertex(final SparkContext sparkContext,
                                        final String inputPath,
                                        final int numPartitions) {
  super();
  this.readables = new ArrayList<>();
  final Partition[] partitions = sparkContext.textFile(inputPath, numPartitions).getPartitions();
  for (int i = 0; i < partitions.length; i++) {
    readables.add(new SparkTextFileBoundedSourceReadable(
        partitions[i], sparkContext.getConf(), i, inputPath, numPartitions));
  }
}
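As a minimal sketch of where that partition array comes from: the constructor creates one readable per partition that textFile() reports. The app name, master, and input path below are placeholders, not from the source; note that the partition count argument is only a lower bound, so Spark may split the file further.

// Hedged sketch: counting the partitions textFile() produces, as the constructor above does.
import org.apache.spark.Partition;
import org.apache.spark.SparkConf;
import org.apache.spark.SparkContext;

public final class PartitionCountSketch {
  public static void main(final String[] args) {
    final SparkConf conf = new SparkConf().setAppName("partition-count").setMaster("local[*]");
    final SparkContext sparkContext = SparkContext.getOrCreate(conf);
    // The second argument is a minimum; the resulting split count may be larger.
    final Partition[] partitions = sparkContext.textFile("input.txt", 4).getPartitions();
    System.out.println("readables to create: " + partitions.length);
    sparkContext.stop();
  }
}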
@Override
public JavaRDD<AvroPayload> getData(@NonNull final FileWorkUnitCalculator.FileWorkUnitCalculatorResult result) {
  Preconditions.checkState(result.hasWorkUnits(), "no work to do: " + this.conf.getDirectory());
  // todo: support more types
  Preconditions.checkState(this.conf.getType().equals("json"), "only json files supported");
  try {
    final FileSystem fs = this.conf.getFileSystem();
    final String filesToRead = result.getWorkUnits().stream()
        .map(LocatedFileStatus::getPath)
        .map(Path::toString)
        .collect(Collectors.joining(","));
    final RDD<String> fileRows = this.jsc.sc().textFile(filesToRead, 1);
    return this.converter.map(fileRows.toJavaRDD()).getData();
  } catch (IOException e) {
    throw new JobRuntimeException("Error getting files", e);
  }
}
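One detail worth calling out: the snippet joins the work-unit paths with commas because Spark's textFile() accepts a comma-separated list of paths and reads them all into a single RDD. A self-contained sketch of that behavior (file names, app name, and master are placeholders):

// Hedged sketch: two files read as one RDD of lines via a comma-separated path string.
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;

public final class CommaSeparatedPathsSketch {
  public static void main(final String[] args) {
    final JavaSparkContext jsc =
        new JavaSparkContext(new SparkConf().setAppName("comma-paths").setMaster("local[*]"));
    // Both files end up in a single RDD of lines.
    final long lineCount = jsc.sc().textFile("part-0000.json,part-0001.json", 1)
        .toJavaRDD().count();
    System.out.println("lines read: " + lineCount);
    jsc.stop();
  }
}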
private RDD<Cells> createRDDFromFilePath(String filePath, TextFileDataTable textFileDataTable) {
  RDD<String> result = this.sc().textFile(filePath, 1);
  JavaRDD<Cells> resultCells = result.toJavaRDD().map(new MapSchemaFromLines(textFileDataTable));
  return resultCells.rdd();
}
@Override
protected Iterator<String> initializeIterator() {
  // For setting up the same environment in the executors.
  final SparkContext sparkContext = SparkContext.getOrCreate(sparkConf);
  // Spark evaluates lazily: it loads only the partition it is asked for, not the full data in the RDD.
  final RDD<String> rdd = sparkContext.textFile(inputPath, numPartitions);
  final Iterable<String> iterable = () -> JavaConverters.asJavaIteratorConverter(
      rdd.iterator(rdd.getPartitions()[partitionIndex], TaskContext$.MODULE$.empty())).asJava();
  return iterable.iterator();
}
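The same lazy, single-partition read can be exercised on its own. A minimal sketch mirroring the calls above (path, partition index, app name, and master are placeholders); TaskContext$.MODULE$ is the standard Java-interop handle to the Scala companion object:

// Hedged sketch: compute only partition 0 of the RDD; the remaining splits are never read.
import java.util.Iterator;
import org.apache.spark.SparkConf;
import org.apache.spark.SparkContext;
import org.apache.spark.TaskContext$;
import org.apache.spark.rdd.RDD;
import scala.collection.JavaConverters;

public final class SinglePartitionReadSketch {
  public static void main(final String[] args) {
    final SparkConf sparkConf = new SparkConf().setAppName("single-partition").setMaster("local[*]");
    final SparkContext sparkContext = SparkContext.getOrCreate(sparkConf);
    final RDD<String> rdd = sparkContext.textFile("input.txt", 4);
    final Iterator<String> lines = JavaConverters.asJavaIteratorConverter(
        rdd.iterator(rdd.getPartitions()[0], TaskContext$.MODULE$.empty())).asJava();
    lines.forEachRemaining(System.out::println);
    sparkContext.stop();
  }
}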
    tokenizer, featureGenerators);

JavaRDD<String> data = spark.sparkContext().textFile(dataIn, 48).toJavaRDD()
    .cache();
@Test
public void createHDFSRDDTest() throws Exception {
  deepSparkContext = createDeepSparkContext();
  DeepSparkContext deepSparkContextSpy = PowerMockito.spy(deepSparkContext);
  SQLContext sqlContext = mock(SQLContext.class);
  Whitebox.setInternalState(deepSparkContextSpy, "sc", sparkContext);
  Whitebox.setInternalState(deepSparkContextSpy, "sqlContext", sqlContext);
  RDD<String> rdd = mock(RDD.class);
  JavaRDD<String> javaRdd = mock(JavaRDD.class);
  when(deepSparkContextSpy.sc().textFile(anyString(), anyInt())).thenReturn(rdd);
  doReturn(javaRdd).when(deepSparkContextSpy).textFile(anyString());
  when(rdd.toJavaRDD()).thenReturn(javaRdd);
  when(rdd.toJavaRDD().map(any(Function.class))).thenReturn(singleRdd);

  ExtractorConfig<Cells> config = createHDFSDeepJobConfig();
  RDD rddReturn = deepSparkContextSpy.createHDFSRDD(config);

  verify(deepSparkContextSpy.sc(), times(1)).textFile(anyString(), anyInt());
  verify(javaRdd, times(1)).map(any(Function.class));
}
@Test
public void createS3RDDTest() throws Exception {
  deepSparkContext = createDeepSparkContext();
  Configuration hadoopConf = mock(Configuration.class);
  when(sparkContext.hadoopConfiguration()).thenReturn(hadoopConf);
  DeepSparkContext deepSparkContextSpy = PowerMockito.spy(deepSparkContext);
  SQLContext sqlContext = mock(SQLContext.class);
  Whitebox.setInternalState(deepSparkContextSpy, "sc", sparkContext);
  Whitebox.setInternalState(deepSparkContextSpy, "sqlContext", sqlContext);
  RDD<String> rdd = mock(RDD.class);
  JavaRDD<String> javaRDD = mock(JavaRDD.class);
  when(deepSparkContextSpy.sc().textFile(anyString(), anyInt())).thenReturn(rdd);
  doReturn(javaRDD).when(deepSparkContextSpy).textFile(anyString());
  when(rdd.toJavaRDD()).thenReturn(javaRDD);
  when(rdd.toJavaRDD().map(any(Function.class))).thenReturn(singleRdd);

  ExtractorConfig<Cells> config = createS3DeepJobConfig();
  deepSparkContextSpy.createS3RDD(config);

  verify(hadoopConf, times(1)).set("fs.s3n.awsAccessKeyId", config.getString(ExtractorConstants.S3_ACCESS_KEY_ID));
  verify(hadoopConf, times(1)).set("fs.s3n.awsSecretAccessKey", config.getString(ExtractorConstants.S3_SECRET_ACCESS_KEY));
  verify(deepSparkContextSpy.sc(), times(1)).textFile(anyString(), anyInt());
  verify(javaRDD, times(1)).map(any(Function.class));
}
/**
 * Static method to create a JavaRDD object from a text file.
 *
 * @param sparkContext  the spark context containing configurations.
 * @param minPartitions the minimum number of partitions.
 * @param inputPath     the path of the input text file.
 * @return the new JavaRDD object.
 */
public static JavaRDD<String> of(final SparkContext sparkContext,
                                 final int minPartitions,
                                 final String inputPath) {
  final DAGBuilder<IRVertex, IREdge> builder = new DAGBuilder<>();

  final org.apache.spark.rdd.RDD<String> textRdd = sparkContext.textFile(inputPath, minPartitions);
  final int numPartitions = textRdd.getNumPartitions();
  final IRVertex textSourceVertex = new SparkTextFileBoundedSourceVertex(sparkContext, inputPath, numPartitions);
  textSourceVertex.setProperty(ParallelismProperty.of(numPartitions));
  builder.addVertex(textSourceVertex);

  return new JavaRDD<>(textRdd, sparkContext, builder.buildWithoutSourceSinkCheck(), textSourceVertex);
}
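Assuming this factory lives on the wrapper JavaRDD class seen in the return statement, a call might look like the following; the path and partition count are placeholders, not from the source.

// Hypothetical usage of the factory above: builds the IR DAG with a single
// SparkTextFileBoundedSourceVertex whose parallelism matches the text RDD.
final JavaRDD<String> lines = JavaRDD.of(sparkContext, 4, "input.txt");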
    null : new AgeClassifyModelWrapper(classifyModel);

JavaRDD<String> data = spark.sparkContext().textFile(dataIn, 8).toJavaRDD().cache();
int iterations = getIterations(params);

JavaRDD<String> data = spark.sparkContext().textFile(eventDir, 24).toJavaRDD()
    .cache();