/**
 * Constructor.
 *
 * @param sparkContext  the spark context.
 * @param inputPath     the path of the target text file.
 * @param numPartitions the number of partitions.
 */
public SparkTextFileBoundedSourceVertex(final SparkContext sparkContext,
                                        final String inputPath,
                                        final int numPartitions) {
  super();
  this.readables = new ArrayList<>();
  final Partition[] partitions = sparkContext.textFile(inputPath, numPartitions).getPartitions();
  for (int i = 0; i < partitions.length; i++) {
    readables.add(new SparkTextFileBoundedSourceReadable(
        partitions[i], sparkContext.getConf(), i, inputPath, numPartitions));
  }
}
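As a minimal sketch of where that partition array comes from: the constructor creates one readable per partition that textFile() reports. The app name, master, and input path below are placeholders, not from the source; note that the partition count argument is only a lower bound, so Spark may split the file further.

// Hedged sketch: counting the partitions textFile() produces, as the constructor above does.
import org.apache.spark.Partition;
import org.apache.spark.SparkConf;
import org.apache.spark.SparkContext;

public final class PartitionCountSketch {
  public static void main(final String[] args) {
    final SparkConf conf = new SparkConf().setAppName("partition-count").setMaster("local[*]");
    final SparkContext sparkContext = SparkContext.getOrCreate(conf);
    // The second argument is a minimum; the resulting split count may be larger.
    final Partition[] partitions = sparkContext.textFile("input.txt", 4).getPartitions();
    System.out.println("readables to create: " + partitions.length);
    sparkContext.stop();
  }
}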
@Override
public JavaRDD<AvroPayload> getData(@NonNull final FileWorkUnitCalculator.FileWorkUnitCalculatorResult result) {
  Preconditions.checkState(result.hasWorkUnits(), "no work to do: " + this.conf.getDirectory());
  // todo: support more types
  Preconditions.checkState(this.conf.getType().equals("json"), "only json files supported");
  try {
    final FileSystem fs = this.conf.getFileSystem();
    final String filesToRead = result.getWorkUnits().stream()
        .map(LocatedFileStatus::getPath)
        .map(Path::toString)
        .collect(Collectors.joining(","));
    final RDD<String> fileRows = this.jsc.sc().textFile(filesToRead, 1);
    return this.converter.map(fileRows.toJavaRDD()).getData();
  } catch (IOException e) {
    throw new JobRuntimeException("Error getting files", e);
  }
}
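One detail worth calling out: the snippet joins the work-unit paths with commas because Spark's textFile() accepts a comma-separated list of paths and reads them all into a single RDD. A self-contained sketch of that behavior (file names, app name, and master are placeholders):

// Hedged sketch: two files read as one RDD of lines via a comma-separated path string.
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;

public final class CommaSeparatedPathsSketch {
  public static void main(final String[] args) {
    final JavaSparkContext jsc =
        new JavaSparkContext(new SparkConf().setAppName("comma-paths").setMaster("local[*]"));
    // Both files end up in a single RDD of lines.
    final long lineCount = jsc.sc().textFile("part-0000.json,part-0001.json", 1)
        .toJavaRDD().count();
    System.out.println("lines read: " + lineCount);
    jsc.stop();
  }
}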
private RDD<Cells> createRDDFromFilePath(String filePath, TextFileDataTable textFileDataTable) {
  RDD<String> result = this.sc().textFile(filePath, 1);
  JavaRDD<Cells> resultCells = result.toJavaRDD().map(new MapSchemaFromLines(textFileDataTable));
  return resultCells.rdd();
}
@Override
protected Iterator<String> initializeIterator() {
  // For setting up the same environment in the executors.
  final SparkContext sparkContext = SparkContext.getOrCreate(sparkConf);
  // Spark evaluates lazily: it loads only the partition it is asked for, not the full data in the RDD.
  final RDD<String> rdd = sparkContext.textFile(inputPath, numPartitions);
  final Iterable<String> iterable = () -> JavaConverters.asJavaIteratorConverter(
      rdd.iterator(rdd.getPartitions()[partitionIndex], TaskContext$.MODULE$.empty())).asJava();
  return iterable.iterator();
}
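The same lazy, single-partition read can be exercised on its own. A minimal sketch mirroring the calls above (path, partition index, app name, and master are placeholders); TaskContext$.MODULE$ is the standard Java-interop handle to the Scala companion object:

// Hedged sketch: compute only partition 0 of the RDD; the remaining splits are never read.
import java.util.Iterator;
import org.apache.spark.SparkConf;
import org.apache.spark.SparkContext;
import org.apache.spark.TaskContext$;
import org.apache.spark.rdd.RDD;
import scala.collection.JavaConverters;

public final class SinglePartitionReadSketch {
  public static void main(final String[] args) {
    final SparkConf sparkConf = new SparkConf().setAppName("single-partition").setMaster("local[*]");
    final SparkContext sparkContext = SparkContext.getOrCreate(sparkConf);
    final RDD<String> rdd = sparkContext.textFile("input.txt", 4);
    final Iterator<String> lines = JavaConverters.asJavaIteratorConverter(
        rdd.iterator(rdd.getPartitions()[0], TaskContext$.MODULE$.empty())).asJava();
    lines.forEachRemaining(System.out::println);
    sparkContext.stop();
  }
}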
    tokenizer, featureGenerators);

JavaRDD<String> data = spark.sparkContext().textFile(dataIn, 48).toJavaRDD()
    .cache();
@Test
public void createHDFSRDDTest() throws Exception {
  deepSparkContext = createDeepSparkContext();
  DeepSparkContext deepSparkContextSpy = PowerMockito.spy(deepSparkContext);
  SQLContext sqlContext = mock(SQLContext.class);
  Whitebox.setInternalState(deepSparkContextSpy, "sc", sparkContext);
  Whitebox.setInternalState(deepSparkContextSpy, "sqlContext", sqlContext);
  RDD<String> rdd = mock(RDD.class);
  JavaRDD<String> javaRdd = mock(JavaRDD.class);
  when(deepSparkContextSpy.sc().textFile(anyString(), anyInt())).thenReturn(rdd);
  doReturn(javaRdd).when(deepSparkContextSpy).textFile(anyString());
  when(rdd.toJavaRDD()).thenReturn(javaRdd);
  when(rdd.toJavaRDD().map(any(Function.class))).thenReturn(singleRdd);

  ExtractorConfig<Cells> config = createHDFSDeepJobConfig();
  RDD rddReturn = deepSparkContextSpy.createHDFSRDD(config);

  verify(deepSparkContextSpy.sc(), times(1)).textFile(anyString(), anyInt());
  verify(javaRdd, times(1)).map(any(Function.class));
}
@Test
public void createS3RDDTest() throws Exception {
  deepSparkContext = createDeepSparkContext();
  Configuration hadoopConf = mock(Configuration.class);
  when(sparkContext.hadoopConfiguration()).thenReturn(hadoopConf);
  DeepSparkContext deepSparkContextSpy = PowerMockito.spy(deepSparkContext);
  SQLContext sqlContext = mock(SQLContext.class);
  Whitebox.setInternalState(deepSparkContextSpy, "sc", sparkContext);
  Whitebox.setInternalState(deepSparkContextSpy, "sqlContext", sqlContext);
  RDD<String> rdd = mock(RDD.class);
  JavaRDD<String> javaRDD = mock(JavaRDD.class);
  when(deepSparkContextSpy.sc().textFile(anyString(), anyInt())).thenReturn(rdd);
  doReturn(javaRDD).when(deepSparkContextSpy).textFile(anyString());
  when(rdd.toJavaRDD()).thenReturn(javaRDD);
  when(rdd.toJavaRDD().map(any(Function.class))).thenReturn(singleRdd);

  ExtractorConfig<Cells> config = createS3DeepJobConfig();
  deepSparkContextSpy.createS3RDD(config);

  verify(hadoopConf, times(1)).set("fs.s3n.awsAccessKeyId", config.getString(ExtractorConstants.S3_ACCESS_KEY_ID));
  verify(hadoopConf, times(1)).set("fs.s3n.awsSecretAccessKey", config.getString(ExtractorConstants.S3_SECRET_ACCESS_KEY));
  verify(deepSparkContextSpy.sc(), times(1)).textFile(anyString(), anyInt());
  verify(javaRDD, times(1)).map(any(Function.class));
}
/**
 * Static method to create a JavaRDD object from a text file.
 *
 * @param sparkContext  the spark context containing configurations.
 * @param minPartitions the minimum number of partitions.
 * @param inputPath     the path of the input text file.
 * @return the new JavaRDD object.
 */
public static JavaRDD<String> of(final SparkContext sparkContext,
                                 final int minPartitions,
                                 final String inputPath) {
  final DAGBuilder<IRVertex, IREdge> builder = new DAGBuilder<>();

  final org.apache.spark.rdd.RDD<String> textRdd = sparkContext.textFile(inputPath, minPartitions);
  final int numPartitions = textRdd.getNumPartitions();
  final IRVertex textSourceVertex = new SparkTextFileBoundedSourceVertex(sparkContext, inputPath, numPartitions);
  textSourceVertex.setProperty(ParallelismProperty.of(numPartitions));
  builder.addVertex(textSourceVertex);

  return new JavaRDD<>(textRdd, sparkContext, builder.buildWithoutSourceSinkCheck(), textSourceVertex);
}
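Assuming this factory lives on the wrapper JavaRDD class seen in the return statement, a call might look like the following; the path and partition count are placeholders, not from the source.

// Hypothetical usage of the factory above: builds the IR DAG with a single
// SparkTextFileBoundedSourceVertex whose parallelism matches the text RDD.
final JavaRDD<String> lines = JavaRDD.of(sparkContext, 4, "input.txt");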
    null : new AgeClassifyModelWrapper(classifyModel);

JavaRDD<String> data = spark.sparkContext().textFile(dataIn, 8).toJavaRDD().cache();
int iterations = getIterations(params);

JavaRDD<String> data = spark.sparkContext().textFile(eventDir, 24).toJavaRDD()
    .cache();