/**
 * Builds a Spark RDD of (timestamp, query) pairs from search-event text files on HDFS.
 *
 * <p>Each input line is run through {@link SearchEventsParser}; records that fail to
 * parse (null) or whose query is null/empty are dropped.
 *
 * @param sc        Spark context used to read the input
 * @param inputPath URI of the HDFS text input
 * @return RDD of (event timestamp, query string) pairs
 * @throws IOException declared for parity with the sibling source factories
 */
static JavaRDD<Pair<Long, String>> getHdfsSource(JavaSparkContext sc, URI inputPath) throws IOException {
    SearchEventsParser parser = new SearchEventsParser();
    JavaRDD<String> lines = sc.textFile(inputPath.toString());
    return lines
        .map(parser::parse)
        .filter(event -> event != null && event.query != null && !event.query.isEmpty())
        .map(event -> Pair.of(event.timestamp, event.query));
}
/**
 * Builds a Flink DataSet of (timestamp, query) tuples from search-event text files on HDFS.
 *
 * <p>Each input line is run through {@link SearchEventsParser}; records that fail to
 * parse (null) or whose query is null/empty are dropped. The explicit
 * {@code returns(TypeHint)} is required because the lambda's tuple type is erased.
 *
 * @param env       Flink batch execution environment
 * @param inputPath URI of the HDFS text input
 * @return DataSet of (event timestamp, query string) tuples
 * @throws IOException declared for parity with the sibling source factories
 */
static DataSet<Tuple2<Long, String>> getHdfsSource(ExecutionEnvironment env, URI inputPath) throws IOException {
    SearchEventsParser parser = new SearchEventsParser();
    TextInputFormat inputFormat = new TextInputFormat(new Path(inputPath));
    return env
        .readFile(inputFormat, inputPath.toString())
        .map(parser::parse)
        .filter(event -> event != null && event.query != null && !event.query.isEmpty())
        .map(event -> Tuple2.of(event.timestamp, event.query))
        .returns(new TypeHint<Tuple2<Long, String>>() {});
}
return FlatMap.of(input) .using(new UnaryFunctor<Pair<byte[], byte[]>, Pair<Long, String>>() { private final SearchEventsParser parser = new SearchEventsParser(); @Override public void apply(Pair<byte[], byte[]> pair, Collector<Pair<Long, String>> context) { .of(in) .using(new UnaryFunctor<String, Pair<Long, String>>() { SearchEventsParser parser = new SearchEventsParser(); @Override public void apply(String line, Collector<Pair<Long, String>> context) {
return ppl.apply(Read.from(HDFSFileSource.fromText(inputUri))) .apply("MapSource", ParDo.of(new DoFn<String, Tuple2<Long, String>>() { SearchEventsParser parser = new SearchEventsParser(); @ProcessElement public void processElement(ProcessContext c) {