= logData.map(new Functions.ParseFromLogLine()).cache();
@Override public SparkCollection<T> cache() { return wrap(stream.cache()); }
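For context, a minimal self-contained sketch of the decorator pattern used in the snippet above. SparkCollection below is a one-method stand-in interface (an assumption, not the real interface from that codebase); the point is that cache() delegates to the wrapped JavaDStream and re-wraps the result so callers keep working with the same abstraction.

import org.apache.spark.streaming.api.java.JavaDStream;

// Stand-in for the real SparkCollection interface (trimmed to one method for illustration).
interface SparkCollection<T> {
  SparkCollection<T> cache();
}

// Wraps a JavaDStream; cache() delegates to the underlying stream and re-wraps
// the cached stream so the caller still sees a SparkCollection.
class DStreamCollection<T> implements SparkCollection<T> {
  private final JavaDStream<T> stream;

  DStreamCollection(JavaDStream<T> stream) {
    this.stream = stream;
  }

  private SparkCollection<T> wrap(JavaDStream<T> s) {
    return new DStreamCollection<>(s);
  }

  @Override
  public SparkCollection<T> cache() {
    return wrap(stream.cache());
  }
}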
@Override
public void run() {
  Compat.foreachRDD(stream.cache(), new StreamingSparkSinkFunction<T>(sec, stageSpec));
} };
@Override
public void run() {
  // Cache since the streaming sink function will check if the RDD is empty,
  // which can cause recomputation and confusing metrics if it's not cached.
  Compat.foreachRDD(stream.cache(), new StreamingBatchSinkFunction<>(sinkFunction, sec, stageSpec));
} };
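The comment captures the key point: checking emptiness is itself a Spark action, so an uncached dataset gets recomputed by the write that follows. A minimal batch-mode sketch of the same pitfall (class and variable names here are illustrative, not from the snippet's codebase):

import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class CacheBeforeEmptyCheck {
  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("CacheBeforeEmptyCheck").setMaster("local[*]");
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
      // isEmpty() is an action; without cache() the map would run again
      // for the second action below, inflating any metrics it reports.
      JavaRDD<String> parsed = sc.parallelize(Arrays.asList("a", "b", "c"))
          .map(String::toUpperCase)
          .cache();
      if (!parsed.isEmpty()) {
        System.out.println(parsed.collect()); // served from the cached partitions
      }
    }
  }
}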
@Override
@SuppressWarnings("unchecked")
public void cache(String storageLevel, Coder<?> coder) {
  // We "force" the MEMORY storage level in streaming.
  if (!StorageLevel.fromString(storageLevel).equals(StorageLevel.MEMORY_ONLY_SER())) {
    LOG.warn(
        "Provided StorageLevel: {} is ignored for streams, using the default level: {}",
        storageLevel,
        StorageLevel.MEMORY_ONLY_SER());
  }
  // Caching can cause serialization, so we need to encode to bytes first;
  // more details in https://issues.apache.org/jira/browse/BEAM-2669
  Coder<WindowedValue<T>> wc = (Coder<WindowedValue<T>>) coder;
  this.dStream = dStream.map(CoderHelpers.toByteFunction(wc)).cache().map(CoderHelpers.fromByteFunction(wc));
}
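The snippet caches the serialized form and decodes on read. Below is a hedged, self-contained sketch of that map-to-bytes / cache / map-back round trip, substituting plain Java serialization for Beam's CoderHelpers; toBytes, fromBytes, and cacheSerialized are illustrative helpers, not Beam APIs:

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import org.apache.spark.streaming.api.java.JavaDStream;

// Illustrative helpers only; Beam uses CoderHelpers.toByteFunction/fromByteFunction.
final class SerializedCache {
  static byte[] toBytes(Serializable value) {
    try {
      ByteArrayOutputStream bos = new ByteArrayOutputStream();
      ObjectOutputStream oos = new ObjectOutputStream(bos);
      oos.writeObject(value);
      oos.flush();
      return bos.toByteArray();
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }

  @SuppressWarnings("unchecked")
  static <T> T fromBytes(byte[] bytes) {
    try (ObjectInputStream ois = new ObjectInputStream(new ByteArrayInputStream(bytes))) {
      return (T) ois.readObject();
    } catch (IOException | ClassNotFoundException e) {
      throw new RuntimeException(e);
    }
  }

  // Cache the serialized bytes, then decode on access: the cached blocks hold
  // byte arrays, sidestepping the serialization issue described in BEAM-2669.
  static <T extends Serializable> JavaDStream<T> cacheSerialized(JavaDStream<T> stream) {
    return stream.map(SerializedCache::toBytes)
        .cache()
        .<T>map(SerializedCache::fromBytes);
  }
}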
filteredIotDataStream.cache();
return list; }).cache();
public static void main(String[] args) throws DataIngestException {
  CmdLineParser cmdLineParser = new CmdLineParser();
  final AppArgs appArgs = cmdLineParser.validateArgs(args);
  System.setProperty("HADOOP_USER_NAME", appArgs.getProperty(DiPConfiguration.HADOOP_USER_NAME));
  SparkConf conf = new SparkConf().setAppName("SparkTwitterStreaming").setMaster("local[*]");
  try (JavaStreamingContext jsc = new JavaStreamingContext(new JavaSparkContext(conf), new Duration(1000))) {
    JavaPairReceiverInputDStream<String, String> stream = KafkaUtils.createStream(jsc,
        appArgs.getProperty(DiPConfiguration.ZK_HOST) + ":" + appArgs.getProperty(DiPConfiguration.ZK_PORT),
        "spark-stream", getKafkaTopics(appArgs));
    JavaDStream<Object[]> twitterStreams = stream
        .map(tuple -> FlatJsonConverter.convertToValuesArray(tuple._2))
        .cache();
    SparkHdfsWriter.write(twitterStreams, appArgs);
    new SparkHBaseWriter(jsc.sparkContext(), appArgs).write(twitterStreams);
    SparkJdbcSourceWriter jdbcSourceWriter = new SparkJdbcSourceWriter(new SQLContext(jsc.sparkContext()), appArgs);
    new TopNLocationByTweets(jdbcSourceWriter, Integer.valueOf(appArgs.getProperty("topN"))).compute(twitterStreams);
    new TopNUsersWithMaxFollowers(jdbcSourceWriter, Integer.valueOf(appArgs.getProperty("topN"))).compute(twitterStreams);
    jsc.start();
    jsc.awaitTermination();
  }
}
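The cache() in this last example matters because twitterStreams feeds four consumers: the HDFS writer, the HBase writer, and the two top-N JDBC computations. Without it, each of those output operations would recompute the FlatJsonConverter mapping for every batch.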