/** Narrows the flow step's generically-typed config to a Hadoop {@link Configuration}. */
private Configuration getConfig() {
  return (Configuration) this.getFlowStep().getConfig();
}
/** Convenience accessor: the step's config, cast to a Hadoop {@link Configuration}. */
private Configuration getConfig() {
  // The planner hands the config back without the concrete type; cast it in one place.
  Object stepConfig = this.getFlowStep().getConfig();
  return (Configuration) stepConfig;
}
/**
 * Configures the bloom-filter-building step so its reducer count equals the number of
 * bloom filter splits, making each reducer emit one part of the filter.
 *
 * @param currentStep the flow step that will run the bloom filter builder job
 */
private void prepareBloomFilterBuilder(FlowStep<JobConf> currentStep) {
  JobConf currentStepConf = currentStep.getConfig();
  // setNumReduceTasks writes the same "mapred.reduce.tasks" property the previous
  // hand-rolled conf.set(key, Integer.toString(...)) did, without hard-coding the key.
  currentStepConf.setNumReduceTasks(BloomProps.getNumSplits(currentStepConf));
}
@Override public void apply(Flow<JobConf> flow, List<FlowStep<JobConf>> predecessorSteps, FlowStep<JobConf> flowStep) { JobConf conf = flowStep.getConfig(); String targetBloomID = conf.get(BloomProps.TARGET_BLOOM_FILTER_ID); if (targetBloomID != null) { prepareBloomFilterBuilder(flowStep); } // the job is the filter which needs to use the bloom filter String sourceBloomID = conf.get(BloomProps.SOURCE_BLOOM_FILTER_ID); if (sourceBloomID != null) { buildBloomfilter(sourceBloomID, flowStep, predecessorSteps); } }
@Override public void apply(Flow<JobConf> flow, List<FlowStep<JobConf>> predecessorSteps, FlowStep<JobConf> flowStep) { // Give jobs human readable names. The default naming scheme includes a bunch of randomly // generated IDs. flowStep.getConfig().setJobName(formatJobName(flowStep)); }
// Narrow the step's planner-typed config down to a Hadoop Configuration.
Configuration config = (Configuration) flowStep.getConfig();
// The step config is returned without its concrete type; cast to Hadoop's Configuration.
Configuration config = (Configuration) flowStep.getConfig();
public static Pair<Flow, Supplier<BloomFilter>> createBloomFlowForKeys(Tap source, Pipe keys, String keyField, FlowConnector connector) throws IOException { String bloomJobID = UUID.randomUUID().toString(); Path bloomTempDir = FileSystemHelper.getRandomTemporaryPath("/tmp/bloom_tmp/"); String bloomPartsDir = bloomTempDir + "/parts"; String bloomFinalFilter = bloomTempDir + "/filter.bloomfilter"; String approxCountPartsDir = bloomTempDir + "/approx_distinct_keys_parts/"; // These pipes write the bloom filter to 100 part files, representing the first 1/100 of the bits in the filter, then // the second 1/100 etc. These splits are the concatenated in in BloomUtil.writeFilterToHdfs. // The end result is that we run this pipe without any apparent output, then read the assembled filter from the side file // on HDFS and return it to the user Pipe filterPipe = new CreateBloomFilter(keys, bloomJobID, approxCountPartsDir, bloomPartsDir, keyField); Flow flow = connector.connect(source, new NullTap(), filterPipe); FlowStep<JobConf> last = Accessors.<FlowStep<JobConf>>last(flow.getFlowSteps()); JobConf conf = last.getConfig(); Supplier<BloomFilter> filterSupplier = () -> { try { return BloomUtil.retrieveFilter(conf); } catch (IOException | CardinalityMergeException e) { throw new RuntimeException(e); } }; return Pair.of(flow, filterSupplier); }
/** Verifies that planning a flow whose sink is an Lfs tap forces the step into local mode. */
@Test
public void testLocalModeSink() throws Exception {
  Tap source = new Hfs(new TextLine(), "input/path");
  Tap sink = new Lfs(new TextLine(), "output/path", SinkMode.REPLACE);

  Flow flow = getPlatform().getFlowConnector().connect(source, sink, new Pipe("test"));

  List<FlowStep> steps = flow.getFlowSteps();
  assertEquals("wrong size", 1, steps.size());

  FlowStep onlyStep = steps.get(0);
  assertTrue("is not local", HadoopUtil.isLocal((Configuration) onlyStep.getConfig()));
}
/** An Lfs sink should make the planner schedule the single resulting step locally. */
@Test
public void testLocalModeSink() throws Exception {
  Tap source = new Hfs(new TextLine(), "input/path");
  Tap sink = new Lfs(new TextLine(), "output/path", SinkMode.REPLACE);
  Pipe pipe = new Pipe("test");

  Flow flow = getPlatform().getFlowConnector().connect(source, sink, pipe);
  List<FlowStep> steps = flow.getFlowSteps();

  assertEquals("wrong size", 1, steps.size());
  boolean isLocal = HadoopUtil.isLocal((Configuration) steps.get(0).getConfig());
  assertTrue("is not local", isLocal);
}
/** Verifies that planning a flow whose source is an Lfs tap forces the step into local mode. */
@Test
public void testLocalModeSource() throws Exception {
  Tap source = new Lfs(new TextLine(), "input/path");
  Tap sink = new Hfs(new TextLine(), "output/path", SinkMode.REPLACE);

  Flow flow = getPlatform().getFlowConnector().connect(source, sink, new Pipe("test"));

  List<FlowStep> steps = flow.getFlowSteps();
  assertEquals("wrong size", 1, steps.size());

  FlowStep onlyStep = steps.get(0);
  assertTrue("is not local", HadoopUtil.isLocal((Configuration) onlyStep.getConfig()));
}
/** An Lfs source should make the planner schedule the single resulting step locally. */
@Test
public void testLocalModeSource() throws Exception {
  Tap source = new Lfs(new TextLine(), "input/path");
  Tap sink = new Hfs(new TextLine(), "output/path", SinkMode.REPLACE);
  Pipe pipe = new Pipe("test");

  Flow flow = getPlatform().getFlowConnector().connect(source, sink, pipe);
  List<FlowStep> steps = flow.getFlowSteps();

  assertEquals("wrong size", 1, steps.size());
  boolean isLocal = HadoopUtil.isLocal((Configuration) steps.get(0).getConfig());
  assertTrue("is not local", isLocal);
}