/**
 * Note that this method is tightly coupled to the Spark 1.4.1 documentation on
 * dynamic allocation, e.g. for the default values used below.
 * @return the number of executors to pre-warm
 */
private int getExecutorsToWarm() {
  int minExecutors =
      HiveConf.getIntVar(hiveConf, HiveConf.ConfVars.HIVE_PREWARM_NUM_CONTAINERS);
  boolean dynamicAllocation = hiveConf.getBoolean("spark.dynamicAllocation.enabled", false);
  if (dynamicAllocation) {
    int min = sparkConf.getInt("spark.dynamicAllocation.minExecutors", 0);
    int initExecutors = sparkConf.getInt("spark.dynamicAllocation.initialExecutors", min);
    minExecutors = Math.min(minExecutors, initExecutors);
  } else {
    int execInstances = sparkConf.getInt("spark.executor.instances", 2);
    minExecutors = Math.min(minExecutors, execInstances);
  }
  return minExecutors;
}
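A minimal sketch (not from the Hive source) of how the dynamic-allocation branch above caps the pre-warm count; the conf values and the pre-warm count of 10 are assumptions for illustration:

import org.apache.spark.SparkConf;

SparkConf sparkConf = new SparkConf()
    .set("spark.dynamicAllocation.minExecutors", "2")
    .set("spark.dynamicAllocation.initialExecutors", "4");
int prewarm = 10; // hypothetical HIVE_PREWARM_NUM_CONTAINERS value
int min = sparkConf.getInt("spark.dynamicAllocation.minExecutors", 0);
int initExecutors = sparkConf.getInt("spark.dynamicAllocation.initialExecutors", min);
int toWarm = Math.min(prewarm, initExecutors); // 4: never warm more than Spark will start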
if (masterURL.startsWith("spark") || masterURL.startsWith("local")) {
  totalCores = sparkConf.contains("spark.default.parallelism")
      ? sparkConf.getInt("spark.default.parallelism", 1)
      : hiveSparkClient.getDefaultParallelism();
  totalCores = Math.max(totalCores, numExecutors);
} else {
  int coresPerExecutor = sparkConf.getInt("spark.executor.cores", 1);
  totalCores = numExecutors * coresPerExecutor;
  totalCores = totalCores / sparkConf.getInt("spark.task.cpus", 1);
}
if (masterURL.startsWith("spark")) {
  totalCores = sparkConf.contains("spark.default.parallelism")
      ? sparkConf.getInt("spark.default.parallelism", 1)
      : hiveSparkClient.getDefaultParallelism();
  totalCores = Math.max(totalCores, numExecutors);
} else {
  int coresPerExecutor = sparkConf.getInt("spark.executor.cores", 1);
  totalCores = numExecutors * coresPerExecutor;
  totalCores = totalCores / sparkConf.getInt("spark.task.cpus", 1);
}
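A hedged sketch of the non-standalone branch above: the usable task-slot count is executors times cores per executor, divided by CPUs per task. The numbers here are made up:

SparkConf sparkConf = new SparkConf()
    .set("spark.executor.cores", "4")
    .set("spark.task.cpus", "2");
int numExecutors = 5; // assumed, e.g. resolved from spark.executor.instances
int totalCores = numExecutors * sparkConf.getInt("spark.executor.cores", 1);
totalCores = totalCores / sparkConf.getInt("spark.task.cpus", 1); // 10 concurrent tasks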
this.sparkConf = sparkConf;
this.transferToEnabled = sparkConf.getBoolean("spark.file.transferTo", true);
this.initialSortBufferSize =
    sparkConf.getInt("spark.shuffle.sort.initialBufferSize", DEFAULT_INITIAL_SORT_BUFFER_SIZE);
open();
this.sparkConf = sparkConf;
this.transferToEnabled = sparkConf.getBoolean("spark.file.transferTo", true);
this.initialSortBufferSize =
    sparkConf.getInt("spark.shuffle.sort.initialBufferSize", DEFAULT_INITIAL_SORT_BUFFER_SIZE);
this.inputBufferSizeInBytes =
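If you want to override the two shuffle-writer settings read above, a sketch of the conf keys involved; the values are arbitrary, and spark.shuffle.sort.initialBufferSize is an internal Spark setting, so treat this as illustrative only:

SparkConf sparkConf = new SparkConf()
    .set("spark.file.transferTo", "false")                // disable transferTo-based file merging
    .set("spark.shuffle.sort.initialBufferSize", "8192"); // larger initial in-memory sort buffer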
public GryoSerializer(final SparkConf sparkConfiguration) {
  final long bufferSizeKb = sparkConfiguration.getSizeAsKb("spark.kryoserializer.buffer", "64k");
  final long maxBufferSizeMb = sparkConfiguration.getSizeAsMb("spark.kryoserializer.buffer.max", "64m");
  this.referenceTracking = sparkConfiguration.getBoolean("spark.kryo.referenceTracking", true);
  this.registrationRequired = sparkConfiguration.getBoolean(Constants.SPARK_KRYO_REGISTRATION_REQUIRED, false);
  if (bufferSizeKb >= ByteUnit.GiB.toKiB(2L)) {
    throw new IllegalArgumentException("spark.kryoserializer.buffer must be less than 2048 mb, got: "
        + bufferSizeKb + " kb.");
  } else {
    this.bufferSize = (int) ByteUnit.KiB.toBytes(bufferSizeKb);
    if (maxBufferSizeMb >= ByteUnit.GiB.toMiB(2L)) {
      throw new IllegalArgumentException("spark.kryoserializer.buffer.max must be less than 2048 mb, got: "
          + maxBufferSizeMb + " mb.");
    } else {
      this.maxBufferSize = (int) ByteUnit.MiB.toBytes(maxBufferSizeMb);
      //this.userRegistrator = sparkConfiguration.getOption("spark.kryo.registrator");
    }
  }
  // create a GryoPool and store it in static HadoopPools
  final List<Object> ioRegistries = new ArrayList<>();
  ioRegistries.addAll(makeApacheConfiguration(sparkConfiguration)
      .getList(IoRegistry.IO_REGISTRY, Collections.emptyList()));
  ioRegistries.add(SparkIoRegistry.class.getCanonicalName()
      .replace("." + SparkIoRegistry.class.getSimpleName(), "$" + SparkIoRegistry.class.getSimpleName()));
  HadoopPools.initialize(GryoPool.build()
      .version(GryoVersion.valueOf(sparkConfiguration.get(
          GryoPool.CONFIG_IO_GRYO_VERSION, GryoPool.CONFIG_IO_GRYO_POOL_VERSION_DEFAULT.name())))
      .poolSize(sparkConfiguration.getInt(
          GryoPool.CONFIG_IO_GRYO_POOL_SIZE, GryoPool.CONFIG_IO_GRYO_POOL_SIZE_DEFAULT))
      .ioRegistries(ioRegistries)
      .initializeMapper(builder -> builder.referenceTracking(this.referenceTracking)
          .registrationRequired(this.registrationRequired))
      .create());
}
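One plausible way to wire the serializer above into a job, assuming the GryoSerializer class from the constructor shown; the buffer values are the same defaults the constructor reads:

SparkConf conf = new SparkConf()
    .setAppName("gryo-example")
    .set("spark.serializer", GryoSerializer.class.getCanonicalName())
    .set("spark.kryoserializer.buffer", "64k")
    .set("spark.kryoserializer.buffer.max", "64m");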
comparatorSupplier,
prefixComparator,
SparkEnv.get().conf().getInt("spark.shuffle.sort.initialBufferSize",
    UnsafeExternalRowSorter.DEFAULT_INITIAL_SORT_BUFFER_SIZE),
pageSizeBytes,
public int getNumPartitionsHint() throws AnalyticsException {
  /* not all workers will have the same CPU count; this is just an approximation */
  int workerCount = this.getWorkerCount();
  if (workerCount == 0) {
    throw new AnalyticsException("Error while calculating NumPartitionsHint. Worker count is zero.");
  }
  int workerCores = this.sparkConf.getInt(AnalyticsConstants.SPARK_WORKER_CORES, 1);
  int partitionCount = workerCount * workerCores;
  if (log.isDebugEnabled()) {
    log.debug("Partition count: " + partitionCount);
  }
  return partitionCount;
}
/**
 * Starts a Spark master with the given parameters.
 */
private synchronized void startMaster() throws AnalyticsClusterException {
  if (!this.masterActive) {
    String host = this.myHost;
    int port = this.sparkConf.getInt(AnalyticsConstants.SPARK_MASTER_PORT, 7077 + this.portOffset);
    int webUiPort = this.sparkConf.getInt(AnalyticsConstants.SPARK_MASTER_WEBUI_PORT, 8081 + this.portOffset);
    Master.startRpcEnvAndEndpoint(host, port, webUiPort, this.sparkConf);
    log.info("[Spark init - master] Started SPARK MASTER in spark://" + host + ":" + port
        + " with webUI port : " + webUiPort);
    updateMaster(this.sparkConf);
    this.masterActive = true;
  } else {
    logDebug("Master is already active in this node, therefore ignoring Master startup");
  }
}
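A hedged sketch of the port arithmetic above with an offset of 1; the literal key strings here stand in for AnalyticsConstants.SPARK_MASTER_PORT and AnalyticsConstants.SPARK_MASTER_WEBUI_PORT, whose actual values are not shown in the snippet:

SparkConf sparkConf = new SparkConf(); // neither port configured explicitly
int portOffset = 1; // assumed per-node offset
int port = sparkConf.getInt("spark.master.port", 7077 + portOffset);            // 7078
int webUiPort = sparkConf.getInt("spark.master.webui.port", 8081 + portOffset); // 8082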
public static JavaPairRDD<String, String> reduceJSON(JavaSparkContext sc,
    JavaPairRDD<String, String> input, final Properties karmaSettings) {
  return reduceJSON(sc, input, sc.getConf().getInt("spark.default.parallelism", 1), karmaSettings);
}

public static JavaPairRDD<String, String> reduceJSON(JavaSparkContext sc,
public static JavaRDD<String> reduceJSON(JavaSparkContext jsc, JavaRDD<String> input,
    final Properties karmaSettings) {
  return reduceJSON(jsc, input, jsc.getConf().getInt("spark.default.parallelism", 1), karmaSettings);
}
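A hypothetical call site for the JavaRDD overload above (assuming it is invoked from the same class; the input path is made up). The point is that parallelism silently falls back to spark.default.parallelism, or 1, when the caller omits it:

JavaSparkContext jsc = new JavaSparkContext(new SparkConf().setAppName("karma-reduce"));
JavaRDD<String> merged = reduceJSON(jsc, jsc.textFile("input.json"), new java.util.Properties());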
recordComparator,
prefixComparator,
SparkEnv.get().conf().getInt("spark.shuffle.sort.initialBufferSize",
    UnsafeExternalRowSorter.DEFAULT_INITIAL_SORT_BUFFER_SIZE),
pageSizeBytes,

new KVComparator(ordering, keySchema.length()),
prefixComparator,
SparkEnv.get().conf().getInt("spark.shuffle.sort.initialBufferSize",
    UnsafeExternalRowSorter.DEFAULT_INITIAL_SORT_BUFFER_SIZE),
pageSizeBytes,
public static void main(String[] args) {
  SparkConf conf = new SparkConf().setAppName("ImagenetSampler")
      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
  JavaSparkContext sc = new JavaSparkContext(conf);
  int numExecutors = conf.getInt("spark.executor.instances", -1);
  System.out.println("number of executors = " + numExecutors);
  System.out.println("Data Loading...");
  JavaPairRDD<FloatWritable, ArrayPrimitiveWritable> train_seq =
      sc.sequenceFile("imagenet_sampled.hsf", FloatWritable.class, ArrayPrimitiveWritable.class);
  train_seq.foreach(new VoidFunction<Tuple2<FloatWritable, ArrayPrimitiveWritable>>() {
    @Override
    public void call(Tuple2<FloatWritable, ArrayPrimitiveWritable> arg0) throws Exception {
      System.out.println(arg0._1.get() + " " + ((float[]) arg0._2.get()).length);
    }
  });
  sc.close();
}
public GeoSparkConf(SparkConf sparkConf) {
  this.useIndex = sparkConf.getBoolean("geospark.global.index", true);
  this.indexType = IndexType.getIndexType(sparkConf.get("geospark.global.indextype", "rtree"));
  this.joinApproximateTotalCount = sparkConf.getLong("geospark.join.approxcount", -1);
  String[] boundaryString = sparkConf.get("geospark.join.boundary", "0,0,0,0").split(",");
  this.datasetBoundary = new Envelope(
      Double.parseDouble(boundaryString[0]), Double.parseDouble(boundaryString[1]),
      Double.parseDouble(boundaryString[2]), Double.parseDouble(boundaryString[3]));
  this.joinGridType = GridType.getGridType(sparkConf.get("geospark.join.gridtype", "quadtree"));
  this.joinBuildSide = JoinBuildSide.getBuildSide(sparkConf.get("geospark.join.indexbuildside", "left"));
  this.joinSparitionDominantSide = JoinSparitionDominantSide.getJoinSparitionDominantSide(
      sparkConf.get("geospark.join.spatitionside", "left"));
  this.fallbackPartitionNum = sparkConf.getInt("geospark.join.numpartition", -1);
}
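The constructor above reads everything from plain SparkConf entries, so configuring it is just a matter of setting the string keys it expects; a sketch with illustrative values:

SparkConf sparkConf = new SparkConf()
    .set("geospark.global.index", "true")
    .set("geospark.global.indextype", "rtree")
    .set("geospark.join.boundary", "-180,180,-90,90") // x1,x2,y1,y2 for the Envelope
    .set("geospark.join.gridtype", "quadtree")
    .set("geospark.join.numpartition", "16");
GeoSparkConf geoSparkConf = new GeoSparkConf(sparkConf);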
private AnalyticsQueryResult toResult(DataFrame dataFrame) throws AnalyticsExecutionException {
  int resultsLimit = this.sparkConf.getInt("carbon.spark.results.limit", -1);
  if (resultsLimit != -1) {
    return new AnalyticsQueryResult(dataFrame.schema().fieldNames(),
        convertRowsToObjects(dataFrame.limit(resultsLimit).collect()));
  } else {
    return new AnalyticsQueryResult(dataFrame.schema().fieldNames(),
        convertRowsToObjects(dataFrame.collect()));
  }
}
public SparkExecutor(SparkPlatform platform, Job job) {
  super(job);
  this.platform = platform;
  this.sparkContextReference = this.platform.getSparkContext(job);
  this.sparkContextReference.noteObtainedReference();
  this.sc = this.sparkContextReference.get();
  if (this.sc.getConf().contains("spark.executor.cores")) {
    this.numDefaultPartitions = 2 * this.sc.getConf().getInt("spark.executor.cores", -1);
  } else {
    this.numDefaultPartitions =
        (int) (2 * this.getConfiguration().getLongProperty("rheem.spark.machines")
            * this.getConfiguration().getLongProperty("rheem.spark.cores-per-machine"));
  }
}
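A hedged illustration of the fallback above: when spark.executor.cores is absent, the default partition count comes from Rheem's own machine and core properties instead (the property values here are assumed):

SparkConf conf = new SparkConf(); // spark.executor.cores not set
long machines = 4;        // assumed rheem.spark.machines
long coresPerMachine = 8; // assumed rheem.spark.cores-per-machine
int numDefaultPartitions = conf.contains("spark.executor.cores")
    ? 2 * conf.getInt("spark.executor.cores", -1)
    : (int) (2 * machines * coresPerMachine); // 64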