private static Pipeline buildPipeline() {
    // Split on any run of non-word characters.
    Pattern delimiter = Pattern.compile("\\W+");

    Pipeline p = Pipeline.create();
    p.drawFrom(Sources.<Long, String>map(BOOK_LINES))
     // Break each line into lowercase words, dropping empty tokens.
     .flatMap(e -> traverseArray(delimiter.split(e.getValue().toLowerCase())))
     .filter(word -> !word.isEmpty())
     // Count the occurrences of each distinct word.
     .groupingKey(wholeItem())
     .aggregate(counting())
     .drainTo(Sinks.map(COUNTS));
    return p;
}
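The pipeline does nothing until it is submitted to a Jet member. A minimal sketch of running it on an embedded member, assuming BOOK_LINES has already been populated with line-number-to-line entries:

// Submit the word-count pipeline to an embedded Jet member and wait for
// completion; the results land in the COUNTS map.
JetInstance jet = Jet.newJetInstance();
try {
    jet.newJob(buildPipeline()).join();
} finally {
    Jet.shutdownAll();
}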
private static Pipeline buildPipeline() {
    Pipeline p = Pipeline.create();
    p.drawFrom(Sources.files(getBooksPath()))
     .filter(line -> line.startsWith("The "))
     .drainTo(buildTopicSink());
    return p;
}
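buildTopicSink() is defined elsewhere in the sample. To smoke-test the file source and filter in isolation, one option (an assumption, not part of the original sample) is to swap in Jet's built-in logging sink:

// Testing variant: print each matching line to the console instead of
// draining to the topic sink.
p.drawFrom(Sources.files(getBooksPath()))
 .filter(line -> line.startsWith("The "))
 .drainTo(Sinks.logger());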
private static Pipeline buildPipeline(JobConf jobConfig) {
    Pipeline p = Pipeline.create();
    p.drawFrom(HdfsSources.<AvroWrapper<User>, NullWritable>hdfs(jobConfig))
     // Keep only records whose fourth Avro field (index 3) is true.
     .filter(entry -> entry.getKey().datum().get(3).equals(Boolean.TRUE))
     // Log each passing record, rendered via the Avro datum's toString().
     .peek(entry -> entry.getKey().datum().toString())
     .drainTo(HdfsSinks.hdfs(jobConfig));
    return p;
}
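The JobConf passed in has to describe the Avro input and output formats. A possible configuration, assuming the avro-mapred dependency on the classpath and a generated User class with its SCHEMA$ constant; the paths are hypothetical:

// Configure the old-style Hadoop mapred API for Avro records.
JobConf jobConfig = new JobConf();
jobConfig.setInputFormat(AvroInputFormat.class);
jobConfig.setOutputFormat(AvroOutputFormat.class);
AvroJob.setInputSchema(jobConfig, User.SCHEMA$);
AvroJob.setOutputSchema(jobConfig, User.SCHEMA$);
FileInputFormat.addInputPath(jobConfig, new Path("/input/users"));    // hypothetical path
FileOutputFormat.setOutputPath(jobConfig, new Path("/output/users")); // hypothetical path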
/**
 * Helper method to construct the pipeline for the job.
 *
 * @return the pipeline for the job
 */
public static Pipeline buildPipeline() {
    final Pipeline p = Pipeline.create();

    // Apply the predicate and projection server side, inside the map source,
    // so only Horse objects travel to the job.
    final BatchStage<Horse> c = p.drawFrom(Sources.map(EVENTS_BY_NAME, t -> true, HORSE_FROM_EVENT));

    // Keep only the horses that appear in more than one event.
    final BatchStage<Entry<Horse, Long>> c2 = c.groupingKey(wholeItem())
                                               .aggregate(counting())
                                               .filter(ent -> ent.getValue() > 1);
    c2.drainTo(Sinks.map(MULTIPLE));

    return p;
}
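Note that groupingKey(wholeItem()) groups by the Horse objects themselves, so correct counts depend on Horse implementing equals() and hashCode() consistently (and on Horse being serializable, so Jet can move it between members).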
private static Pipeline buildPipeline(String sourceDir, String targetDir) {
    Pipeline p = Pipeline.create();
    p.drawFrom(Sources.files(sourceDir))
     .map(LogLine::parse)
     // Keep only successful (2xx) and redirect (3xx) responses.
     .filter((LogLine log) -> log.getResponseCode() >= 200 && log.getResponseCode() < 400)
     // Expand each request into its sub-paths, then count each sub-path.
     .flatMap(AccessLogAnalyzer::explodeSubPaths)
     .groupingKey(wholeItem())
     .aggregate(counting())
     .drainTo(Sinks.files(targetDir));
    return p;
}
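explodeSubPaths is the analyzer's own helper; a plausible sketch of its shape, assuming LogLine exposes the requested path via a getEndpoint() accessor (the real implementation may differ):

// Hypothetical sketch: emit every path prefix of the endpoint, so that
// "/a/b/c" yields "/a", "/a/b" and "/a/b/c".
private static Traverser<String> explodeSubPaths(LogLine log) {
    String endpoint = log.getEndpoint(); // assumed accessor
    List<String> prefixes = new ArrayList<>();
    for (int i = endpoint.indexOf('/', 1); i > 0; i = endpoint.indexOf('/', i + 1)) {
        prefixes.add(endpoint.substring(0, i));
    }
    prefixes.add(endpoint);
    return Traversers.traverseIterable(prefixes);
}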