public JavaPairDStream<Long, Integer> readStats(JavaStreamingContext jssc, String inputDirectory) {
  // Note: this example doesn't work until Spark 1.2
  JavaPairDStream<LongWritable, Text> input = jssc.fileStream(
      inputDirectory, LongWritable.class, Text.class, TextInputFormat.class);
  // Convert the input from Writables to native types
  JavaPairDStream<Long, Integer> usefulInput = input.mapToPair(
      new PairFunction<Tuple2<LongWritable, Text>, Long, Integer>() {
        public Tuple2<Long, Integer> call(Tuple2<LongWritable, Text> input) {
          return new Tuple2<Long, Integer>(input._1().get(), Integer.parseInt(input._2().toString()));
        }
      });
  return usefulInput;
}
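For orientation, a minimal driver that wires readStats into a job could look like the following sketch; the "local[2]" master, the app name, and the input directory are assumptions for illustration, not part of the original example.

  // Hypothetical usage sketch: master, app name, and directory are assumptions
  JavaStreamingContext jssc = new JavaStreamingContext(
      "local[2]", "ReadStats", new Duration(1000));
  JavaPairDStream<Long, Integer> stats = readStats(jssc, "/tmp/stats-input");
  stats.print();
  jssc.start();
  jssc.awaitTermination();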
@Override
public synchronized void close() {
  if (streamingContext != null) {
    log.info("Shutting down Spark Streaming; this may take some time");
    // stop(stopSparkContext, stopGracefully): also stop the underlying
    // SparkContext, and let queued batches finish first
    streamingContext.stop(true, true);
    streamingContext = null;
  }
}
public static void main(String[] args) throws Exception {
  String master = args[0];
  JavaSparkContext sc = new JavaSparkContext(master, "StreamingLogInput");
  // Create a StreamingContext with a 1-second batch size
  JavaStreamingContext jssc = new JavaStreamingContext(sc, new Duration(1000));
  // Create a DStream from all the input on port 7777
  JavaDStream<String> lines = jssc.socketTextStream("localhost", 7777);
  // Filter our DStream for lines with "error"
  JavaDStream<String> errorLines = lines.filter(new Function<String, Boolean>() {
    public Boolean call(String line) {
      return line.contains("error");
    }});
  // Print out the lines with errors, which causes this DStream to be evaluated
  errorLines.print();
  // Start our streaming context
  jssc.start();
  // Wait for 10 seconds, then exit; to run forever, call awaitTermination() with no timeout
  jssc.awaitTermination(10000);
  // Stop the streaming context
  jssc.stop();
}
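As an aside, on Java 8 the same filter collapses to a lambda; this is an equivalent sketch of the anonymous class above, not a change to the example:

  // Java 8 lambda form of the same filter (equivalent behavior)
  JavaDStream<String> errorLines = lines.filter(line -> line.contains("error"));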
JavaStreamingContext jssc = new JavaStreamingContext(conf,
    Flags.getInstance().getSlideInterval());
jssc.checkpoint(Flags.getInstance().getCheckpointDirectory());
JavaDStream<String> logData = jssc.textFileStream(Flags.getInstance().getLogsDirectory());
JavaDStream<ApacheAccessLog> accessLogsDStream =
    logData.map(new Functions.ParseFromLogLine()).cache();
accessLogsDStream.foreachRDD(new Function<JavaRDD<ApacheAccessLog>, Void>() {
  public Void call(JavaRDD<ApacheAccessLog> rdd) {
    // ... per-batch processing elided in the original ...
    return null;
  }});
jssc.start();             // Start the computation
jssc.awaitTermination();  // Wait for the computation to terminate
public static void main(String[] args) throws Exception {
  String zkQuorum = args[0];
  String group = args[1];
  SparkConf conf = new SparkConf().setAppName("KafkaInput");
  // Create a StreamingContext with a 1-second batch size
  JavaStreamingContext jssc = new JavaStreamingContext(conf, new Duration(1000));
  Map<String, Integer> topics = new HashMap<String, Integer>();
  topics.put("pandas", 1);
  JavaPairDStream<String, String> input =
      KafkaUtils.createStream(jssc, zkQuorum, group, topics);
  input.print();
  // Start our streaming context
  jssc.start();
  // Wait for 10 seconds, then exit; to run forever, call awaitTermination() with no timeout
  jssc.awaitTermination(10000);
  // Stop the streaming context
  jssc.stop();
}
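Each element of input is a (key, message) pair from Kafka. A quick way to sanity-check throughput, sketched here with Java 8 lambdas, is to drop the keys and count messages per batch:

  // Illustrative: count Kafka messages arriving in each 1-second batch
  JavaDStream<String> messages = input.map(kv -> kv._2());
  messages.count().print();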
accessLogsDStream.foreachRDD(new Function<JavaRDD<ApacheAccessLog>, Void>() {
  public Void call(JavaRDD<ApacheAccessLog> accessLogs) {
    Tuple4<Long, Long, Long, Long> stats =
        // ... content-size statistics computation elided in the original ...
  }});

JavaPairDStream<Integer, Long> responseCodeCountDStream = accessLogsDStream
    .transformToPair(new Function<JavaRDD<ApacheAccessLog>, JavaPairRDD<Integer, Long>>() {
      public JavaPairRDD<Integer, Long> call(JavaRDD<ApacheAccessLog> rdd) {
        return Functions.responseCodeCount(rdd);
      }})
    .updateStateByKey(new Functions.ComputeRunningSum());
responseCodeCountDStream.foreachRDD(new Function<JavaPairRDD<Integer, Long>, Void>() {
  public Void call(JavaPairRDD<Integer, Long> rdd) {
    currentResponseCodeCounts = rdd.take(100);
    // ... remainder elided in the original ...
  }});

JavaPairDStream<String, Long> ipRawDStream = accessLogsDStream.transformToPair(
    new Function<JavaRDD<ApacheAccessLog>, JavaPairRDD<String, Long>>() {
      public JavaPairRDD<String, Long> call(JavaRDD<ApacheAccessLog> rdd) {
        // ... per-batch IP-count computation elided in the original ...
      }});
JavaPairDStream<String, Long> ipCumDStream =
    ipRawDStream.updateStateByKey(new Functions.ComputeRunningSum());

JavaPairDStream<String, Long> ipDStream =
    accessLogsDStream.mapToPair(new Functions.IpTuple());
JavaPairDStream<String, Long> ipCountsDStream =
    ipDStream.reduceByKey(new Functions.LongSumReducer());
JavaPairDStream<String, Long> ipBytesDStream =
    accessLogsDStream.mapToPair(new Functions.IpContentTuple());
JavaPairDStream<String, Long> ipBytesSumDStream =
    ipBytesDStream.reduceByKey(new Functions.LongSumReducer());
JavaPairDStream<String, Tuple2<Long, Long>> ipBytesRequestCountDStream =
    ipBytesSumDStream.join(ipCountsDStream);

JavaPairDStream<Text, LongWritable> writableDStream = ipDStream.mapToPair(
    // ... PairFunction converting (String, Long) to (Text, LongWritable) elided in the original ...
    );
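The updateStateByKey calls above maintain a running total across batches, but Functions.ComputeRunningSum itself isn't shown in this section. A minimal sketch of such a function, assuming it simply sums each batch's new counts into the prior state (Optional here is Guava's, as used by the Spark 1.x Java API), would be:

  // Minimal sketch (assumption): fold each batch's new counts into the running state
  public static class ComputeRunningSum
      implements Function2<List<Long>, Optional<Long>, Optional<Long>> {
    public Optional<Long> call(List<Long> newCounts, Optional<Long> current) {
      long sum = current.or(0L);
      for (long count : newCounts) {
        sum += count;
      }
      return Optional.of(sum);
    }
  }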
JavaSparkContext sparkContext = streamingContext.sparkContext();
Configuration hadoopConf = sparkContext.hadoopConfiguration();
JavaInputDStream<ConsumerRecord<K,M>> kafkaDStream = buildInputDStream(streamingContext);
JavaPairDStream<K,M> pairDStream =
    kafkaDStream.mapToPair(mAndM -> new Tuple2<>(mAndM.key(), mAndM.value()));
pairDStream.foreachRDD(new BatchUpdateFunction<>(getConfig(), keyClass,
    /* ... remaining constructor arguments elided in the original ... */));
pairDStream.foreachRDD(new SaveToHDFSFunction<>(dataDirString + "/oryx", "data",
    /* ... remaining constructor arguments elided in the original ... */));
kafkaDStream.foreachRDD(new UpdateOffsetsFn<>(getGroupID(), getInputTopicLockMaster()));
pairDStream.foreachRDD(new DeleteOldDataFn<>(hadoopConf, dataDirString,
    Pattern.compile("-(\\d+)\\."), /* ... elided ... */));
pairDStream.foreachRDD(new DeleteOldDataFn<>(hadoopConf, modelDirString,
    Pattern.compile("(\\d+)"), /* ... elided ... */));
streamingContext.start();
public void processAccessLogs(String outDir, JavaDStream<ApacheAccessLog> accessLogsDStream) {
  JavaDStream<ApacheAccessLog> windowDStream = accessLogsDStream.window(
      Flags.getInstance().getWindowLength(), Flags.getInstance().getSlideInterval());

  JavaDStream<String> ip = accessLogsDStream.map(
      new Function<ApacheAccessLog, String>() {
        public String call(ApacheAccessLog entry) {
          return entry.getIpAddress();
        }});

  JavaDStream<Long> requestCountRBW = accessLogsDStream.map(
      new Function<ApacheAccessLog, Long>() {
        public Long call(ApacheAccessLog entry) {
          return 1L;
        }}).reduceByWindow(
          // Add counts from the new batches entering the window
          new Function2<Long, Long, Long>() {
            public Long call(Long v1, Long v2) {
              return v1 + v2;
            }},
          // Subtract counts from the old batches leaving the window
          new Function2<Long, Long, Long>() {
            public Long call(Long v1, Long v2) {
              return v1 - v2;
            }},
          Flags.getInstance().getWindowLength(), Flags.getInstance().getSlideInterval());
  requestCountRBW.print();

  JavaPairDStream<String, Long> ipAddressPairDStream = accessLogsDStream.mapToPair(
      new PairFunction<ApacheAccessLog, String, Long>() {
        public Tuple2<String, Long> call(ApacheAccessLog entry) {
          return new Tuple2<String, Long>(entry.getIpAddress(), 1L);
        }});
  JavaPairDStream<String, Long> ipCountDStream = ipAddressPairDStream.reduceByKeyAndWindow(
      // Add counts from the new batches entering the window
      new Function2<Long, Long, Long>() {
        public Long call(Long v1, Long v2) {
          return v1 + v2;
        }},
      // Subtract counts from the old batches leaving the window
      new Function2<Long, Long, Long>() {
        public Long call(Long v1, Long v2) {
          return v1 - v2;
        }},
      Flags.getInstance().getWindowLength(), Flags.getInstance().getSlideInterval());
  ipCountDStream.print();
}
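The windowed per-IP count can also be expressed more compactly with countByValueAndWindow on the plain ip stream; this is an equivalent sketch under the same Flags assumptions (like the inverse-function forms above, it requires checkpointing to be enabled):

  // Equivalent, more compact form of the windowed per-IP request count
  JavaPairDStream<String, Long> ipAddressRequestCount = ip.countByValueAndWindow(
      Flags.getInstance().getWindowLength(), Flags.getInstance().getSlideInterval());
  ipAddressRequestCount.print();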
protected final JavaStreamingContext buildStreamingContext() {
  log.info("Starting SparkContext with interval {} seconds", generationIntervalSec);
  SparkConf sparkConf = new SparkConf();
  // Only for tests, really
  if (sparkConf.getOption("spark.master").isEmpty()) {
    log.info("Overriding master to {} for tests", streamingMaster);
    sparkConf.setMaster(streamingMaster);
  }
  // Only for tests, really
  if (sparkConf.getOption("spark.app.name").isEmpty()) {
    String appName = "Oryx" + getLayerName();
    if (id != null) {
      appName = appName + "-" + id;
    }
    log.info("Overriding app name to {} for tests", appName);
    sparkConf.setAppName(appName);
  }
  extraSparkConfig.forEach((key, value) -> sparkConf.setIfMissing(key, value.toString()));
  // Turn this down to prevent long blocking at shutdown
  sparkConf.setIfMissing("spark.streaming.gracefulStopTimeout",
      Long.toString(TimeUnit.MILLISECONDS.convert(generationIntervalSec, TimeUnit.SECONDS)));
  sparkConf.setIfMissing("spark.cleaner.ttl", Integer.toString(20 * generationIntervalSec));
  long generationIntervalMS =
      TimeUnit.MILLISECONDS.convert(generationIntervalSec, TimeUnit.SECONDS);
  JavaSparkContext jsc = JavaSparkContext.fromSparkContext(SparkContext.getOrCreate(sparkConf));
  return new JavaStreamingContext(jsc, new Duration(generationIntervalMS));
}
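A hypothetical caller builds the context once, attaches its DStreams, and then starts it, mirroring the start-up pattern in the batch- and speed-layer excerpts in this section:

  // Hypothetical start-up path for a layer using buildStreamingContext()
  JavaStreamingContext streamingContext = buildStreamingContext();
  // ... attach the layer's input DStreams here ...
  streamingContext.start();
  streamingContext.awaitTermination();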
@Override
public void onReceiverError(JavaStreamingListenerReceiverError receiverError) {
  // Touch every JavaReceiverInfo accessor exposed to the listener
  JavaReceiverInfo receiverInfo = receiverError.receiverInfo();
  receiverInfo.streamId();
  receiverInfo.name();
  receiverInfo.active();
  receiverInfo.location();
  receiverInfo.executorId();
  receiverInfo.lastErrorMessage();
  receiverInfo.lastError();
  receiverInfo.lastErrorTime();
}
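This callback only enumerates the JavaReceiverInfo accessors; a variant that actually reports the failure, assuming an SLF4J logger named log, might read:

  // Sketch: log the receiver failure instead of discarding the info (assumes an SLF4J "log")
  @Override
  public void onReceiverError(JavaStreamingListenerReceiverError receiverError) {
    JavaReceiverInfo info = receiverError.receiverInfo();
    log.warn("Receiver {} (stream {}) on {} failed: {}",
        info.name(), info.streamId(), info.executorId(), info.lastErrorMessage());
  }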
public void await() throws InterruptedException {
  Preconditions.checkState(streamingContext != null);
  log.info("Spark Streaming is running");
  streamingContext.awaitTermination();
}
BatchUpdateFunction(Config config,
                    Class<K> keyClass,
                    Class<M> messageClass,
                    Class<? extends Writable> keyWritableClass,
                    Class<? extends Writable> messageWritableClass,
                    String dataDirString,
                    String modelDirString,
                    BatchLayerUpdate<K,M,U> updateInstance,
                    JavaStreamingContext streamingContext) {
  this.keyClass = keyClass;
  this.messageClass = messageClass;
  this.keyWritableClass = keyWritableClass;
  this.messageWritableClass = messageWritableClass;
  this.dataDirString = dataDirString;
  this.modelDirString = modelDirString;
  this.updateBroker = ConfigUtils.getOptionalString(config, "oryx.update-topic.broker");
  this.updateTopic = ConfigUtils.getOptionalString(config, "oryx.update-topic.message.topic");
  this.updateInstance = updateInstance;
  this.sparkContext = streamingContext.sparkContext();
}
JavaInputDStream<ConsumerRecord<K,M>> kafkaDStream = buildInputDStream(streamingContext);
JavaPairDStream<K,M> pairDStream =
    kafkaDStream.mapToPair(mAndM -> new Tuple2<>(mAndM.key(), mAndM.value()));
Configuration hadoopConf = streamingContext.sparkContext().hadoopConfiguration();

// A background thread is spawned here; its body and the rest of its
// construction are elided in the original
new Thread(LoggingCallable.log(() -> {
  try {
    // ...

pairDStream.foreachRDD(new SpeedLayerUpdate<>(modelManager, updateBroker, updateTopic));
kafkaDStream.foreachRDD(new UpdateOffsetsFn<>(getGroupID(), getInputTopicLockMaster()));
streamingContext.start();
@Override
public synchronized void close() {
  if (modelManager != null) {
    log.info("Shutting down model manager");
    modelManager.close();
    modelManager = null;
  }
  if (consumerIterator != null) {
    log.info("Shutting down consumer");
    consumerIterator.close();
    consumerIterator = null;
  }
  if (streamingContext != null) {
    log.info("Shutting down Spark Streaming; this may take some time");
    streamingContext.stop(true, true);
    streamingContext = null;
  }
}
public void await() throws InterruptedException {
  JavaStreamingContext theStreamingContext;
  synchronized (this) {
    theStreamingContext = streamingContext;
    Preconditions.checkState(theStreamingContext != null);
  }
  log.info("Spark Streaming is running");
  // Must not hold the lock here: a synchronized close() needs it to stop the context
  theStreamingContext.awaitTermination();
}
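The local copy matters: awaitTermination() blocks until the context stops, so calling it inside the synchronized block would prevent the synchronized close() above from ever acquiring the lock to stop the context, deadlocking shutdown.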
@Override
public void onReceiverStarted(JavaStreamingListenerReceiverStarted receiverStarted) {
  // Touch every JavaReceiverInfo accessor exposed to the listener
  JavaReceiverInfo receiverInfo = receiverStarted.receiverInfo();
  receiverInfo.streamId();
  receiverInfo.name();
  receiverInfo.active();
  receiverInfo.location();
  receiverInfo.executorId();
  receiverInfo.lastErrorMessage();
  receiverInfo.lastError();
  receiverInfo.lastErrorTime();
}
@Override
public void onReceiverStopped(JavaStreamingListenerReceiverStopped receiverStopped) {
  // Touch every JavaReceiverInfo accessor exposed to the listener
  JavaReceiverInfo receiverInfo = receiverStopped.receiverInfo();
  receiverInfo.streamId();
  receiverInfo.name();
  receiverInfo.active();
  receiverInfo.location();
  receiverInfo.executorId();
  receiverInfo.lastErrorMessage();
  receiverInfo.lastError();
  receiverInfo.lastErrorTime();
}