public static void modifySparkHadoopConfiguration(SparkContext sc) throws Exception {
    sc.hadoopConfiguration().set("dfs.replication", "2"); // cuboid intermediate files, replication=2
    sc.hadoopConfiguration().set("mapreduce.output.fileoutputformat.compress", "true");
    sc.hadoopConfiguration().set("mapreduce.output.fileoutputformat.compress.type", "BLOCK");
    sc.hadoopConfiguration().set("mapreduce.output.fileoutputformat.compress.codec",
            "org.apache.hadoop.io.compress.DefaultCodec"); // or org.apache.hadoop.io.compress.SnappyCodec
}
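// A hedged usage sketch, not from the original source: a hypothetical job entry point that calls
// modifySparkHadoopConfiguration before any output is written, so that replication=2 and block
// compression apply to files produced through this SparkContext's Hadoop configuration.
// Assumes the method above is in scope and that the Spark master is supplied by spark-submit.
public static void runCuboidJobSketch() throws Exception {
    SparkConf conf = new SparkConf().setAppName("cuboid-build-sketch");
    SparkContext sc = new SparkContext(conf);
    modifySparkHadoopConfiguration(sc);
    // ... build cuboid RDDs and save them with a Hadoop OutputFormat here ...
    sc.stop();
}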
/**
 * Read a UTF-8 format String from HDFS (or local)
 *
 * @param path Path to read the string from
 * @param sc   Spark context
 */
public static String readStringFromFile(String path, SparkContext sc) throws IOException {
    return readStringFromFile(path, sc.hadoopConfiguration());
}
/**
 * Read an object from HDFS (or local) using default Java object serialization
 *
 * @param path File to read
 * @param type Class of the object to read
 * @param sc   Spark context
 * @param <T>  Type of the object to read
 */
public static <T> T readObjectFromFile(String path, Class<T> type, SparkContext sc) throws IOException {
    return readObjectFromFile(path, type, sc.hadoopConfiguration());
}
/**
 * Write an object to HDFS (or local) using default Java object serialization
 *
 * @param path    Path to write the object to
 * @param toWrite Object to write
 * @param sc      Spark context
 */
public static void writeObjectToFile(String path, Object toWrite, SparkContext sc) throws IOException {
    writeObjectToFile(path, toWrite, sc.hadoopConfiguration());
}
/**
 * Write a String to a file (on HDFS or local) in UTF-8 format
 *
 * @param path    Path to write to
 * @param toWrite String to write
 * @param sc      Spark context
 */
public static void writeStringToFile(String path, String toWrite, SparkContext sc) throws IOException {
    writeStringToFile(path, toWrite, sc.hadoopConfiguration());
}
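// A hedged usage sketch, not part of the original utilities: round-trips a String and a
// serializable object through the helpers above. The HDFS paths are hypothetical placeholders.
public static void roundTripExample(SparkContext sc) throws IOException {
    String textPath = "hdfs:///tmp/example/notes.txt";
    writeStringToFile(textPath, "hello hdfs", sc);
    String restoredText = readStringFromFile(textPath, sc); // "hello hdfs"

    String objectPath = "hdfs:///tmp/example/ids.bin";
    java.util.ArrayList<Integer> ids = new java.util.ArrayList<>(java.util.Arrays.asList(1, 2, 3));
    writeObjectToFile(objectPath, ids, sc);
    java.util.ArrayList restoredIds = readObjectFromFile(objectPath, java.util.ArrayList.class, sc);
}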
private void updateSparkContext(@NonNull final SparkArgs sparkArgs, @NonNull final SparkContext sc) {
    for (SparkListener sparkListener : getSparkEventListeners()) {
        sc.addSparkListener(sparkListener);
    }
    sc.hadoopConfiguration().addResource(sparkArgs.getHadoopConfiguration());
}
protected Configuration lazyConf() {
    if (lazyConf == null) {
        this.lazyConf = lazySparkSession().sparkContext().hadoopConfiguration();
    }
    return lazyConf;
}
public static Seq<CatalogTablePartition> partitions(SparkSession spark, String name) {
    List<String> parts = Lists.newArrayList(Splitter.on('.').limit(2).split(name));
    String db = parts.size() == 1 ? "default" : parts.get(0);
    String table = parts.get(parts.size() == 1 ? 0 : 1);
    HiveClient client = HiveUtils$.MODULE$.newClientForMetadata(
            spark.sparkContext().conf(), spark.sparkContext().hadoopConfiguration());
    return client.getPartitions(db, table, Option.empty());
}
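// A hedged usage sketch, not from the original source: fetches and prints the partition specs of a
// Hive table through the helper above. The table name "sales_db.orders" is a hypothetical example.
public static void printPartitions(SparkSession spark) {
    Seq<CatalogTablePartition> tableParts = partitions(spark, "sales_db.orders");
    for (CatalogTablePartition p : scala.collection.JavaConverters.seqAsJavaList(tableParts)) {
        System.out.println(p.spec());
    }
}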
/**
 * Returns a Cells RDD from S3 fileSystem.
 *
 * @param config Amazon S3 ExtractorConfig.
 * @return RDD of Cells.
 */
public RDD<Cells> createS3RDD(ExtractorConfig<Cells> config) {
    Serializable bucket = config.getValues().get(ExtractorConstants.S3_BUCKET);
    Serializable path = config.getValues().get(ExtractorConstants.FS_FILE_PATH);

    final TextFileDataTable textFileDataTable = UtilFS.createTextFileMetaDataFromConfig(config, this);

    String filePath = path.toString();
    if (config.getExtractorImplClassName().equals(ExtractorConstants.S3)) {
        filePath = ExtractorConstants.S3_PREFIX + bucket.toString() + path.toString();
    }

    Configuration hadoopConf = this.sc().hadoopConfiguration();
    hadoopConf.set("fs.s3n.impl", "org.apache.hadoop.fs.s3native.NativeS3FileSystem");
    hadoopConf.set("fs.s3n.awsAccessKeyId", config.getString(ExtractorConstants.S3_ACCESS_KEY_ID));
    hadoopConf.set("fs.s3n.awsSecretAccessKey", config.getString(ExtractorConstants.S3_SECRET_ACCESS_KEY));

    return createRDDFromFilePath(filePath, textFileDataTable);
}
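// A hedged usage sketch, not from the original source: populates an existing ExtractorConfig with
// the keys createS3RDD reads (bucket, file path, credentials) and builds the RDD. It relies only on
// methods already used above (getValues(), createS3RDD()); the bucket, path, and credential values
// are placeholders, and the S3 prefix is applied only when the extractor impl class name is
// ExtractorConstants.S3.
public RDD<Cells> exampleS3Read(ExtractorConfig<Cells> config) {
    config.getValues().put(ExtractorConstants.S3_BUCKET, "my-bucket");
    config.getValues().put(ExtractorConstants.FS_FILE_PATH, "/data/input.csv");
    config.getValues().put(ExtractorConstants.S3_ACCESS_KEY_ID, "<access-key-placeholder>");
    config.getValues().put(ExtractorConstants.S3_SECRET_ACCESS_KEY, "<secret-key-placeholder>");
    return createS3RDD(config);
}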
public static void configureSparkForAddElements(final SparkSession spark, final ParquetStoreProperties props) {
    final Integer numberOfOutputFiles = props.getAddElementsOutputFilesPerGroup();
    // Fall back to the SQLConf default when spark.sql.shuffle.partitions is not set;
    // Option.get() would throw for a missing key rather than return null.
    final String shufflePartitions = spark.conf()
            .get("spark.sql.shuffle.partitions", SQLConf.SHUFFLE_PARTITIONS().defaultValueString());
    if (numberOfOutputFiles > Integer.parseInt(shufflePartitions)) {
        LOGGER.debug("Setting the number of Spark shuffle partitions to {}", numberOfOutputFiles);
        spark.conf().set("spark.sql.shuffle.partitions", numberOfOutputFiles);
    }
    final Configuration hadoopConf = spark.sparkContext().hadoopConfiguration();
    configureSparkConfForAddElements(hadoopConf, props);
}
private RDD<Element> doOperation(final GetRDDOfAllElements operation,
                                 final Context context,
                                 final AccumuloStore accumuloStore) throws OperationException {
    SparkSession sparkSession = SparkContextUtil.getSparkSession(context, accumuloStore.getProperties());
    if (sparkSession == null) {
        throw new OperationException("This operation requires an active SparkSession.");
    }
    sparkSession.sparkContext().hadoopConfiguration().addResource(getConfiguration(operation));
    final String useRFileReaderRDD = operation.getOption(USE_RFILE_READER_RDD);
    if (Boolean.parseBoolean(useRFileReaderRDD)) {
        return doOperationUsingRFileReaderRDD(operation, context, accumuloStore);
    } else {
        return doOperationUsingElementInputFormat(operation, context, accumuloStore);
    }
}
private void assertExpectationsOnSparkContext(@NonNull final SparkArgs sparkArgs, @NonNull final SparkContext sc) {
    final String registeredAvroSchemaStr = sc.conf().getAvroSchema().head()._2();
    final Schema expectedAvroSchema = sparkArgs.getAvroSchemas().get().get(0);
    Assert.assertEquals(expectedAvroSchema.toString(), registeredAvroSchemaStr);
    Assert.assertEquals("foo_bar", sc.appName());
    Assert.assertEquals("512", sc.hadoopConfiguration().get("mapreduce.map.memory.mb"));
}
private RDD<Element> doOperation(final GetRDDOfElements operation,
                                 final Context context,
                                 final AccumuloStore accumuloStore) throws OperationException {
    final Configuration conf = getConfiguration(operation);
    final SparkContext sparkContext = SparkContextUtil
            .getSparkSession(context, accumuloStore.getProperties())
            .sparkContext();
    sparkContext.hadoopConfiguration().addResource(conf);
    // Use batch scan option when performing seeded operation
    InputConfigurator.setBatchScan(AccumuloInputFormat.class, conf, true);
    addIterators(accumuloStore, conf, context.getUser(), operation);
    addRanges(accumuloStore, conf, operation);
    final RDD<Tuple2<Element, NullWritable>> pairRDD = sparkContext.newAPIHadoopRDD(conf,
            ElementInputFormat.class, Element.class, NullWritable.class);
    return pairRDD.map(new FirstElement(), ClassTagConstants.ELEMENT_CLASS_TAG);
}
private RDD<Element> doOperation(final GetRDDOfElementsInRanges operation,
                                 final Context context,
                                 final AccumuloStore accumuloStore) throws OperationException {
    final Configuration conf = getConfiguration(operation);
    final SparkContext sparkContext = SparkContextUtil
            .getSparkSession(context, accumuloStore.getProperties())
            .sparkContext();
    sparkContext.hadoopConfiguration().addResource(conf);
    // Use batch scan option when performing seeded operation
    InputConfigurator.setBatchScan(AccumuloInputFormat.class, conf, true);
    addIterators(accumuloStore, conf, context.getUser(), operation);
    addRangesFromPairs(accumuloStore, conf, operation);
    final RDD<Tuple2<Element, NullWritable>> pairRDD = sparkContext.newAPIHadoopRDD(conf,
            ElementInputFormat.class, Element.class, NullWritable.class);
    return pairRDD.map(new FirstElement(), ClassTagConstants.ELEMENT_CLASS_TAG);
}