TupleEntryIterator in = approxCountsTap.openForRead(CascadingUtil.get().getFlowProcess());
List<HyperLogLog> countParts = new LinkedList<HyperLogLog>();
TupleEntryIterator output = sink.openForRead(CascadingUtil.get().getFlowProcess());
System.out.println("Output tuples from flow:");
while (output.hasNext()) {
  System.out.println(output.next().getTuple());
}
output.close();
public static void main(String[] args) throws IOException {
  if (args.length != 1) {
    System.out.println("Usage: hadoop jar cascading_ext.job.jar com.liveramp.cascading_ext.example.BloomJoinExample <output dir>");
    return;
  }

  Map<String, Tap> sources = new HashMap<String, Tap>();
  sources.put("source1", ExampleFixtures.SOURCE_TAP_1);
  sources.put("source2", ExampleFixtures.SOURCE_TAP_2);

  String outputDir = args[0];
  Hfs sink = new Hfs(new SequenceFile(new Fields("field1", "field2", "field3", "field4")), outputDir);

  Pipe source1 = new Pipe("source1");
  Pipe source2 = new Pipe("source2");

  Pipe joined = new BloomJoin(source1, new Fields("field1"), source2, new Fields("field3"));

  CascadingUtil.get().getFlowConnector().connect("Example flow", sources, sink, joined).complete();

  // Take a look at the output tuples
  TupleEntryIterator output = sink.openForRead(CascadingUtil.get().getFlowProcess());
  System.out.println("Output tuples from flow:");
  while (output.hasNext()) {
    System.out.println(output.next().getTuple());
  }
}
public static void main(String[] args) throws IOException {
  if (args.length != 1) {
    System.out.println("Usage: hadoop jar cascading_ext.job.jar com.liveramp.cascading_ext.example.SimpleFlowExample <output dir>");
    return;
  }

  String outputDir = args[0];
  Hfs sink = new Hfs(new SequenceFile(new Fields("field1", "field2", "field3", "field4")), outputDir);

  Pipe source1 = new Pipe("source1");
  Pipe source2 = new Pipe("source2");

  Pipe joined = new CoGroup(source1, new Fields("field1"), source2, new Fields("field3"));

  Map<String, Tap> sources = new HashMap<String, Tap>();
  sources.put("source1", ExampleFixtures.SOURCE_TAP_1);
  sources.put("source2", ExampleFixtures.SOURCE_TAP_2);

  CascadingUtil.get().getFlowConnector().connect("Example flow", sources, sink, joined).complete();

  // Take a look at the output tuples
  TupleEntryIterator output = sink.openForRead(CascadingUtil.get().getFlowProcess());
  System.out.println("Output tuples from flow:");
  while (output.hasNext()) {
    System.out.println(output.next().getTuple());
  }
}
reader = temp.openForRead( new HadoopFlowProcess( jobConf ) );
public static void main(String[] args) throws IOException {
  if (args.length != 1) {
    System.out.println("Usage: hadoop jar cascading_ext.job.jar com.liveramp.cascading_ext.example.BloomJoinExampleWithoutCascadingUtil <output dir>");
    return;
  }

  String outputDir = args[0];
  Hfs sink = new Hfs(new SequenceFile(new Fields("field1", "field2", "field3", "field4")), outputDir);

  Pipe source1 = new Pipe("source1");
  Pipe source2 = new Pipe("source2");

  Pipe joined = new BloomJoin(source1, new Fields("field1"), source2, new Fields("field3"));

  Map<String, Tap> sources = new HashMap<String, Tap>();
  sources.put("source1", ExampleFixtures.SOURCE_TAP_1);
  sources.put("source2", ExampleFixtures.SOURCE_TAP_2);

  // set some default properties and set the flow step strategy
  Flow f = new HadoopFlowConnector(BloomProps.getDefaultProperties()).connect("Example BloomJoin", sources, sink, joined);
  f.setFlowStepStrategy(new BloomAssemblyStrategy());
  f.complete();

  // Take a look at the output tuples; use a plain HadoopFlowProcess here, since
  // this example deliberately avoids CascadingUtil
  TupleEntryIterator output = sink.openForRead(new HadoopFlowProcess());
  System.out.println("Output tuples from flow:");
  while (output.hasNext()) {
    System.out.println(output.next().getTuple());
  }
}
private static BloomFilter mergeBloomParts(String tapPath, long numBloomBits, long splitSize,
                                           int numBloomHashes, long numElems,
                                           HashFunctionFactory hashFactory) throws IOException {
  FixedSizeBitSet bitSet = new FixedSizeBitSet(numBloomBits);

  if (FileSystemHelper.getFS().exists(new Path(tapPath))) {
    Hfs tap = new Hfs(new SequenceFile(new Fields("split", "filter")), tapPath);
    TupleEntryIterator itr = tap.openForRead(CascadingUtil.get().getFlowProcess());
    while (itr.hasNext()) {
      TupleEntry cur = itr.next();
      long split = cur.getLong(0);
      FixedSizeBitSet curSet = new FixedSizeBitSet(splitSize, ((BytesWritable) cur.getObject(1)).getBytes());
      // OR this split's bits into the full-size filter at the split's offset
      for (long i = 0; i < curSet.numBits(); i++) {
        if (curSet.get(i)) {
          bitSet.set(split * splitSize + i);
        }
      }
    }
    itr.close();
  }

  return new BloomFilter(numBloomBits, numBloomHashes, bitSet, numElems, hashFactory);
}
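The merge loop above reassembles a distributed bloom filter by OR-ing each split's bits into the full-size bit set at offset split * splitSize. A minimal standalone sketch of the same idea, using java.util.BitSet instead of the cascading_ext FixedSizeBitSet (the class name and sizes here are illustrative, not part of the library):

import java.util.BitSet;

public class BitSetMergeSketch {

  // OR fixed-size partial bit sets into one filter-sized bit set,
  // placing each part at its split offset.
  static BitSet merge(BitSet[] parts, int splitSize, int totalBits) {
    BitSet merged = new BitSet(totalBits);
    for (int split = 0; split < parts.length; split++) {
      BitSet part = parts[split];
      for (int i = part.nextSetBit(0); i >= 0; i = part.nextSetBit(i + 1)) {
        merged.set(split * splitSize + i);
      }
    }
    return merged;
  }

  public static void main(String[] args) {
    BitSet a = new BitSet(8);
    a.set(1);
    a.set(3);
    BitSet b = new BitSet(8);
    b.set(0);
    // bits 1 and 3 of split 0, bit 0 of split 1 -> prints {1, 3, 8}
    System.out.println(merge(new BitSet[]{ a, b }, 8, 16));
  }
}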
@Test
public void testFlow() throws IOException {
  getPlatform().copyFromLocal( inputFileApache );

  String outputPath1 = getOutputPath( "flowTest1" );
  String outputPath2 = getOutputPath( "flowTest2" );
  String outputPath3 = getOutputPath( "flowTest3" );

  remove( outputPath1, true );
  remove( outputPath2, true );
  remove( outputPath3, true );

  JobConf defaultConf = (JobConf) ( (BaseHadoopPlatform) getPlatform() ).getConfiguration();

  JobConf conf1 = createJob( defaultConf, "mr1", InputData.inputFileApache, outputPath1 );
  JobConf conf2 = createJob( defaultConf, "mr2", outputPath1, outputPath2 );
  JobConf conf3 = createJob( defaultConf, "mr3", outputPath2, outputPath3 );

  MultiMapReduceFlow flow = new MultiMapReduceFlow( "mrflow", conf1, conf2, conf3 );

  validateLength( new Hfs( new TextLine(), InputData.inputFileApache ).openForRead( new HadoopFlowProcess( defaultConf ) ), 10 );

  flow.complete();

  validateLength( new Hfs( new TextLine(), outputPath1 ).openForRead( new HadoopFlowProcess( defaultConf ) ), 10 );

  Collection<Tap> sinks = flow.getSinks().values();
  assertEquals( 1, sinks.size() );

  String identifier = sinks.iterator().next().getIdentifier();
  assertEquals( "flowTest3", identifier.substring( identifier.lastIndexOf( '/' ) + 1 ) );
}
@Test
public void testFlow() throws IOException {
  getPlatform().copyFromLocal( inputFileApache );

  JobConf defaultConf = (JobConf) ( (BaseHadoopPlatform) getPlatform() ).getConfiguration();

  JobConf conf = new JobConf( defaultConf );
  conf.setJobName( "mrflow" );

  conf.setOutputKeyClass( LongWritable.class );
  conf.setOutputValueClass( Text.class );

  conf.setMapperClass( IdentityMapper.class );
  conf.setReducerClass( IdentityReducer.class );

  conf.setInputFormat( TextInputFormat.class );
  conf.setOutputFormat( TextOutputFormat.class );

  FileInputFormat.setInputPaths( conf, new Path( inputFileApache ) );
  String outputPath = getOutputPath( "flowTest" );
  FileOutputFormat.setOutputPath( conf, new Path( outputPath ) );

  Flow flow = new MapReduceFlow( "mrflow", conf, true );

  validateLength( new Hfs( new TextLine(), inputFileApache ).openForRead( new HadoopFlowProcess( defaultConf ) ), 10 );

  flow.complete();

  validateLength( new Hfs( new TextLine(), outputPath ).openForRead( new HadoopFlowProcess( defaultConf ) ), 10 );
}
@Test
public void testFlowLazy() throws IOException {
  getPlatform().copyFromLocal( inputFileApache );

  String outputPath1 = getOutputPath( "flowTest1" );
  String outputPath2 = getOutputPath( "flowTest2" );
  String outputPath3 = getOutputPath( "flowTest3" );

  remove( outputPath1, true );
  remove( outputPath2, true );
  remove( outputPath3, true );

  JobConf defaultConf = (JobConf) ( (BaseHadoopPlatform) getPlatform() ).getConfiguration();

  JobConf conf1 = createJob( defaultConf, "mr1", InputData.inputFileApache, outputPath1 );
  JobConf conf2 = createJob( defaultConf, "mr2", outputPath1, outputPath2 );
  JobConf conf3 = createJob( defaultConf, "mr3", outputPath2, outputPath3 );

  validateLength( new Hfs( new TextLine(), InputData.inputFileApache ).openForRead( new HadoopFlowProcess( defaultConf ) ), 10 );

  // start with a single step, then attach the remaining steps while the flow is running
  MultiMapReduceFlow flow = new MultiMapReduceFlow( "mrflow", conf1 );

  flow.start();

  Util.safeSleep( 3000 );

  flow.attachFlowStep( conf2 );

  Util.safeSleep( 3000 );

  flow.attachFlowStep( conf3 );

  flow.complete();

  validateLength( new Hfs( new TextLine(), outputPath1 ).openForRead( new HadoopFlowProcess( defaultConf ) ), 10 );

  Collection<Tap> sinks = flow.getSinks().values();
  assertEquals( 1, sinks.size() );

  String identifier = sinks.iterator().next().getIdentifier();
  assertEquals( "flowTest3", identifier.substring( identifier.lastIndexOf( '/' ) + 1 ) );
}
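Condensed from the test above, the lazy-attachment lifecycle: a MultiMapReduceFlow can start with a single step and accept further steps while it is already running, and complete() blocks until every attached step has finished. The JobConf names below are placeholders:

// Sketch only; firstConf and secondConf stand in for real JobConf instances.
MultiMapReduceFlow flow = new MultiMapReduceFlow( "mrflow", firstConf );
flow.start();                      // begins executing the first step immediately
flow.attachFlowStep( secondConf ); // steps may be attached while the flow runs
flow.complete();                   // blocks until all attached steps finish
// Attaching after complete() throws IllegalStateException (see testFlowLazyFail below).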
@Test
public void testHfsAsterisk() throws Exception {
  getPlatform().copyFromLocal( inputFileLower );
  getPlatform().copyFromLocal( inputFileUpper );

  Hfs sourceExists = new Hfs( new TextLine( new Fields( "offset", "line" ) ), InputData.inputPath + "*" );
  assertTrue( sourceExists.resourceExists( getPlatform().getFlowProcess() ) );

  TupleEntryIterator iterator = sourceExists.openForRead( getPlatform().getFlowProcess() );
  assertTrue( iterator.hasNext() );
  iterator.close();

  try {
    Hfs sourceNotExists = new Hfs( new TextLine( new Fields( "offset", "line" ) ), InputData.inputPath + "/blah/" );
    iterator = sourceNotExists.openForRead( getPlatform().getFlowProcess() );
    fail();
  }
  catch( IOException exception ) {
    // do nothing
  }
}
@Test
public void testHfsBracketAsterisk() throws Exception {
  getPlatform().copyFromLocal( inputFileLower );
  getPlatform().copyFromLocal( inputFileUpper );

  Hfs sourceExists = new Hfs( new TextLine( new Fields( "offset", "line" ) ), InputData.inputPath + "{*}" );
  assertTrue( sourceExists.resourceExists( getPlatform().getFlowProcess() ) );

  TupleEntryIterator iterator = sourceExists.openForRead( getPlatform().getFlowProcess() );
  assertTrue( iterator.hasNext() );
  iterator.close();

  try {
    Hfs sourceNotExists = new Hfs( new TextLine( new Fields( "offset", "line" ) ), InputData.inputPath + "/blah/" );
    iterator = sourceNotExists.openForRead( getPlatform().getFlowProcess() );
    fail();
  }
  catch( IOException exception ) {
    // do nothing
  }
}
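Both tests depend on Hfs accepting Hadoop glob syntax ("*" and "{*}") in source paths. For reference, a sketch of how the same patterns expand against the raw Hadoop FileSystem API (the input path is a placeholder):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// List everything the glob matches, much as Hfs does when sourcing the tap.
FileSystem fs = FileSystem.get( new Configuration() );
FileStatus[] matches = fs.globStatus( new Path( "/some/input/path" + "*" ) );
if( matches != null ) {
  for( FileStatus status : matches ) {
    System.out.println( status.getPath() );
  }
}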
@Test(expected = IllegalStateException.class)
public void testFlowLazyFail() throws IOException {
  getPlatform().copyFromLocal( inputFileApache );

  String outputPath1 = getOutputPath( "flowTest1" );
  String outputPath2 = getOutputPath( "flowTest2" );

  remove( outputPath1, true );
  remove( outputPath2, true );

  JobConf defaultConf = (JobConf) ( (BaseHadoopPlatform) getPlatform() ).getConfiguration();

  JobConf conf1 = createJob( defaultConf, "mr1", InputData.inputFileApache, outputPath1 );
  JobConf conf2 = createJob( defaultConf, "mr2", outputPath1, outputPath2 );

  validateLength( new Hfs( new TextLine(), InputData.inputFileApache ).openForRead( new HadoopFlowProcess( defaultConf ) ), 10 );

  MultiMapReduceFlow flow = new MultiMapReduceFlow( "mrflow", conf1 );

  flow.complete();

  // attaching a step after the flow has completed must throw IllegalStateException
  flow.attachFlowStep( conf2 );
}