public static void modifySparkHadoopConfiguration(SparkContext sc) throws Exception {
    sc.hadoopConfiguration().set("dfs.replication", "2"); // cuboid intermediate files, replication=2
    sc.hadoopConfiguration().set("mapreduce.output.fileoutputformat.compress", "true");
    sc.hadoopConfiguration().set("mapreduce.output.fileoutputformat.compress.type", "BLOCK");
    sc.hadoopConfiguration().set("mapreduce.output.fileoutputformat.compress.codec",
            "org.apache.hadoop.io.compress.DefaultCodec"); // or org.apache.hadoop.io.compress.SnappyCodec
}
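// A hedged usage sketch, not from the original source: a hypothetical job entry point that calls
// modifySparkHadoopConfiguration before any output is written, so that replication=2 and block
// compression apply to files produced through this SparkContext's Hadoop configuration.
// Assumes the method above is in scope and that the Spark master is supplied by spark-submit.
public static void runCuboidJobSketch() throws Exception {
    SparkConf conf = new SparkConf().setAppName("cuboid-build-sketch");
    SparkContext sc = new SparkContext(conf);
    modifySparkHadoopConfiguration(sc);
    // ... build cuboid RDDs and save them with a Hadoop OutputFormat here ...
    sc.stop();
}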
/**
 * Read a UTF-8 format String from HDFS (or local)
 *
 * @param path Path to read the string from
 * @param sc   Spark context
 */
public static String readStringFromFile(String path, SparkContext sc) throws IOException {
    return readStringFromFile(path, sc.hadoopConfiguration());
}
/**
 * Read an object from HDFS (or local) using default Java object serialization
 *
 * @param path File to read
 * @param type Class of the object to read
 * @param sc   Spark context
 * @param <T>  Type of the object to read
 */
public static <T> T readObjectFromFile(String path, Class<T> type, SparkContext sc) throws IOException {
    return readObjectFromFile(path, type, sc.hadoopConfiguration());
}
/**
 * Write an object to HDFS (or local) using default Java object serialization
 *
 * @param path    Path to write the object to
 * @param toWrite Object to write
 * @param sc      Spark context
 */
public static void writeObjectToFile(String path, Object toWrite, SparkContext sc) throws IOException {
    writeObjectToFile(path, toWrite, sc.hadoopConfiguration());
}
/**
 * Write a String to a file (on HDFS or local) in UTF-8 format
 *
 * @param path    Path to write to
 * @param toWrite String to write
 * @param sc      Spark context
 */
public static void writeStringToFile(String path, String toWrite, SparkContext sc) throws IOException {
    writeStringToFile(path, toWrite, sc.hadoopConfiguration());
}
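// A hedged usage sketch, not part of the original utilities: round-trips a String and a
// serializable object through the helpers above. The HDFS paths are hypothetical placeholders.
public static void roundTripExample(SparkContext sc) throws IOException {
    String textPath = "hdfs:///tmp/example/notes.txt";
    writeStringToFile(textPath, "hello hdfs", sc);
    String restoredText = readStringFromFile(textPath, sc); // "hello hdfs"

    String objectPath = "hdfs:///tmp/example/ids.bin";
    java.util.ArrayList<Integer> ids = new java.util.ArrayList<>(java.util.Arrays.asList(1, 2, 3));
    writeObjectToFile(objectPath, ids, sc);
    java.util.ArrayList restoredIds = readObjectFromFile(objectPath, java.util.ArrayList.class, sc);
}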
private void updateSparkContext(@NonNull final SparkArgs sparkArgs, @NonNull final SparkContext sc) {
    for (SparkListener sparkListener : getSparkEventListeners()) {
        sc.addSparkListener(sparkListener);
    }
    sc.hadoopConfiguration().addResource(sparkArgs.getHadoopConfiguration());
}
protected Configuration lazyConf() {
    if (lazyConf == null) {
        this.lazyConf = lazySparkSession().sparkContext().hadoopConfiguration();
    }
    return lazyConf;
}
public static Seq<CatalogTablePartition> partitions(SparkSession spark, String name) {
    List<String> parts = Lists.newArrayList(Splitter.on('.').limit(2).split(name));
    String db = parts.size() == 1 ? "default" : parts.get(0);
    String table = parts.get(parts.size() == 1 ? 0 : 1);
    HiveClient client = HiveUtils$.MODULE$.newClientForMetadata(
            spark.sparkContext().conf(), spark.sparkContext().hadoopConfiguration());
    return client.getPartitions(db, table, Option.empty());
}
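// A hedged usage sketch, not from the original source: fetches and prints the partition specs of a
// Hive table through the helper above. The table name "sales_db.orders" is a hypothetical example.
public static void printPartitions(SparkSession spark) {
    Seq<CatalogTablePartition> tableParts = partitions(spark, "sales_db.orders");
    for (CatalogTablePartition p : scala.collection.JavaConverters.seqAsJavaList(tableParts)) {
        System.out.println(p.spec());
    }
}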
/**
 * Returns a Cells RDD from S3 fileSystem.
 *
 * @param config Amazon S3 ExtractorConfig.
 * @return RDD of Cells.
 */
public RDD<Cells> createS3RDD(ExtractorConfig<Cells> config) {
    Serializable bucket = config.getValues().get(ExtractorConstants.S3_BUCKET);
    Serializable path = config.getValues().get(ExtractorConstants.FS_FILE_PATH);

    final TextFileDataTable textFileDataTable = UtilFS.createTextFileMetaDataFromConfig(config, this);

    String filePath = path.toString();
    if (config.getExtractorImplClassName().equals(ExtractorConstants.S3)) {
        filePath = ExtractorConstants.S3_PREFIX + bucket.toString() + path.toString();
    }

    Configuration hadoopConf = this.sc().hadoopConfiguration();
    hadoopConf.set("fs.s3n.impl", "org.apache.hadoop.fs.s3native.NativeS3FileSystem");
    hadoopConf.set("fs.s3n.awsAccessKeyId", config.getString(ExtractorConstants.S3_ACCESS_KEY_ID));
    hadoopConf.set("fs.s3n.awsSecretAccessKey", config.getString(ExtractorConstants.S3_SECRET_ACCESS_KEY));

    return createRDDFromFilePath(filePath, textFileDataTable);
}
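// A hedged usage sketch, not from the original source: populates an existing ExtractorConfig with
// the keys createS3RDD reads (bucket, file path, credentials) and builds the RDD. It relies only on
// methods already used above (getValues(), createS3RDD()); the bucket, path, and credential values
// are placeholders, and the S3 prefix is applied only when the extractor impl class name is
// ExtractorConstants.S3.
public RDD<Cells> exampleS3Read(ExtractorConfig<Cells> config) {
    config.getValues().put(ExtractorConstants.S3_BUCKET, "my-bucket");
    config.getValues().put(ExtractorConstants.FS_FILE_PATH, "/data/input.csv");
    config.getValues().put(ExtractorConstants.S3_ACCESS_KEY_ID, "<access-key-placeholder>");
    config.getValues().put(ExtractorConstants.S3_SECRET_ACCESS_KEY, "<secret-key-placeholder>");
    return createS3RDD(config);
}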
public static void configureSparkForAddElements(final SparkSession spark, final ParquetStoreProperties props) {
    final Integer numberOfOutputFiles = props.getAddElementsOutputFilesPerGroup();
    // Fall back to the SQLConf default when spark.sql.shuffle.partitions is not set;
    // Option.get() would throw for a missing key rather than return null.
    final String shufflePartitions = spark.conf()
            .get("spark.sql.shuffle.partitions", SQLConf.SHUFFLE_PARTITIONS().defaultValueString());
    if (numberOfOutputFiles > Integer.parseInt(shufflePartitions)) {
        LOGGER.debug("Setting the number of Spark shuffle partitions to {}", numberOfOutputFiles);
        spark.conf().set("spark.sql.shuffle.partitions", numberOfOutputFiles);
    }
    final Configuration hadoopConf = spark.sparkContext().hadoopConfiguration();
    configureSparkConfForAddElements(hadoopConf, props);
}
private RDD<Element> doOperation(final GetRDDOfAllElements operation,
                                 final Context context,
                                 final AccumuloStore accumuloStore) throws OperationException {
    SparkSession sparkSession = SparkContextUtil.getSparkSession(context, accumuloStore.getProperties());
    if (sparkSession == null) {
        throw new OperationException("This operation requires an active SparkSession.");
    }
    sparkSession.sparkContext().hadoopConfiguration().addResource(getConfiguration(operation));
    final String useRFileReaderRDD = operation.getOption(USE_RFILE_READER_RDD);
    if (Boolean.parseBoolean(useRFileReaderRDD)) {
        return doOperationUsingRFileReaderRDD(operation, context, accumuloStore);
    } else {
        return doOperationUsingElementInputFormat(operation, context, accumuloStore);
    }
}
private void assertExpectationsOnSparkContext(@NonNull final SparkArgs sparkArgs, @NonNull final SparkContext sc) {
    final String registeredAvroSchemaStr = sc.conf().getAvroSchema().head()._2();
    final Schema expectedAvroSchema = sparkArgs.getAvroSchemas().get().get(0);
    Assert.assertEquals(expectedAvroSchema.toString(), registeredAvroSchemaStr);
    Assert.assertEquals("foo_bar", sc.appName());
    Assert.assertEquals("512", sc.hadoopConfiguration().get("mapreduce.map.memory.mb"));
}
private RDD<Element> doOperation(final GetRDDOfElements operation,
                                 final Context context,
                                 final AccumuloStore accumuloStore) throws OperationException {
    final Configuration conf = getConfiguration(operation);
    final SparkContext sparkContext = SparkContextUtil
            .getSparkSession(context, accumuloStore.getProperties())
            .sparkContext();
    sparkContext.hadoopConfiguration().addResource(conf);
    // Use batch scan option when performing seeded operation
    InputConfigurator.setBatchScan(AccumuloInputFormat.class, conf, true);
    addIterators(accumuloStore, conf, context.getUser(), operation);
    addRanges(accumuloStore, conf, operation);
    final RDD<Tuple2<Element, NullWritable>> pairRDD = sparkContext.newAPIHadoopRDD(conf,
            ElementInputFormat.class, Element.class, NullWritable.class);
    return pairRDD.map(new FirstElement(), ClassTagConstants.ELEMENT_CLASS_TAG);
}
private RDD<Element> doOperation(final GetRDDOfElementsInRanges operation,
                                 final Context context,
                                 final AccumuloStore accumuloStore) throws OperationException {
    final Configuration conf = getConfiguration(operation);
    final SparkContext sparkContext = SparkContextUtil
            .getSparkSession(context, accumuloStore.getProperties())
            .sparkContext();
    sparkContext.hadoopConfiguration().addResource(conf);
    // Use batch scan option when performing seeded operation
    InputConfigurator.setBatchScan(AccumuloInputFormat.class, conf, true);
    addIterators(accumuloStore, conf, context.getUser(), operation);
    addRangesFromPairs(accumuloStore, conf, operation);
    final RDD<Tuple2<Element, NullWritable>> pairRDD = sparkContext.newAPIHadoopRDD(conf,
            ElementInputFormat.class, Element.class, NullWritable.class);
    return pairRDD.map(new FirstElement(), ClassTagConstants.ELEMENT_CLASS_TAG);
}