/**
 * Plans a flow in which the same Hfs tap ("foo/merge") is bound to both the
 * "left" and "right" heads of a CoGroup keyed on "offset".
 * <p>
 * The test makes no assertions; it passes as long as the connector returns
 * without throwing.
 */
@Test
public void testDupeSource2() {
  Tap source1 = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "foo/merge" );
  Tap sink = new Hfs( new TextLine(), "foo" );

  Pipe left = new Pipe( "left" );
  Pipe right = new Pipe( "right" );

  Pipe merge = new CoGroup( "cogroup", left, new Fields( "offset" ), right, new Fields( "offset" ), Fields.size( 4 ) );

  // typed maps instead of raw Map/HashMap so the head/tail bindings are checked at compile time
  Map<String, Tap> sources = new HashMap<String, Tap>();
  sources.put( "left", source1 );
  sources.put( "right", source1 ); // same tap instance bound to a second head on purpose

  Map<String, Tap> sinks = new HashMap<String, Tap>();
  sinks.put( "cogroup", sink );

  // the returned Flow is not inspected; only successful planning matters here
  getPlatform().getFlowConnector().connect( sources, sinks, merge );
}
/**
 * Logs the cardinality reported by {@code approxCounter} and writes the
 * counter's serialized bytes as a single one-field ("bytes") tuple to a
 * SequenceFile-backed Hfs tap located at
 * {@code BloomProps.getApproxCountsDir(conf)}.
 * <p>
 * The collector is closed in a {@code finally} block so it is released even
 * when serializing or adding the tuple fails.
 *
 * @param flowProcess   source of the job configuration (its config copy is cast to JobConf)
 * @param operationCall not used by this method
 * @throws RuntimeException wrapping any IOException raised while opening or writing the tap
 */
@Override
public void cleanup(FlowProcess flowProcess, OperationCall operationCall) {
  JobConf conf = (JobConf) flowProcess.getConfigCopy();
  try {
    LOG.info("HLL counter found " + approxCounter.cardinality() + " distinct keys");

    Hfs tap = new Hfs(new SequenceFile(new Fields("bytes")), BloomProps.getApproxCountsDir(conf));
    TupleEntryCollector out = tap.openForWrite(new HadoopFlowProcess(conf));
    try {
      out.add(new Tuple(new BytesWritable(approxCounter.getBytes())));
    } finally {
      // always release the collector, including on a failed add/serialization
      out.close();
    }
  } catch (IOException e) {
    throw new RuntimeException("couldn't write approximate counts to side bucket", e);
  }
}
/**
 * Runs two local-mode flows back to back: the first copies a space-delimited
 * local file ("num", "char") into a PartitionTap over an Hfs parent,
 * partitioned on "num" with "/" as the delimiter; the second reads that
 * partitioned tap back out to a local file.
 * <p>
 * Both flows are validated to have length 13 via {@code validateLength}.
 */
@Test
public void testPartitionedWriteReadHDFS() throws Exception {
  copyFromLocal( inputFileLhs );

  Tap localInput = new FileTap( new cascading.scheme.local.TextDelimited( new Fields( "num", "char" ), " " ), inputFileLhs );

  // Hadoop-side parent tap that the partition tap writes beneath
  Hfs partitionParent = new Hfs( new TextDelimited( new Fields( "num", "char" ), " " ), getOutputPath( "/intermediate" ), SinkMode.REPLACE );
  DelimitedPartition byNum = new DelimitedPartition( new Fields( "num" ), "/" );
  Tap partitioned = new LocalHfsAdaptor( new PartitionTap( partitionParent, byNum ) );

  Tap localOutput = new FileTap( new cascading.scheme.local.TextDelimited( new Fields( "num", "char" ), " " ), getOutputPath( "/final" ), SinkMode.REPLACE );

  // the same pass-through pipe is reused for both flows
  Pipe passThrough = new Pipe( "test" );

  Flow writeFlow = new LocalFlowConnector( getPlatform().getProperties() ).connect( localInput, partitioned, passThrough );
  writeFlow.complete();
  validateLength( writeFlow, 13 );

  Flow readFlow = new LocalFlowConnector( getPlatform().getProperties() ).connect( partitioned, localOutput, passThrough );
  readFlow.complete();
  validateLength( readFlow, 13 );
}
}
/**
 * Plans a flow in which the same Hfs tap ("foo/merge") is bound to both the
 * "left" and "right" heads of a CoGroup keyed on "offset".
 * <p>
 * The test makes no assertions; it passes as long as the connector returns
 * without throwing.
 */
@Test
public void testDupeSource2() {
  Tap source1 = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "foo/merge" );
  Tap sink = new Hfs( new TextLine(), "foo" );

  Pipe left = new Pipe( "left" );
  Pipe right = new Pipe( "right" );

  Pipe merge = new CoGroup( "cogroup", left, new Fields( "offset" ), right, new Fields( "offset" ), Fields.size( 4 ) );

  // typed maps instead of raw Map/HashMap so the head/tail bindings are checked at compile time
  Map<String, Tap> sources = new HashMap<String, Tap>();
  sources.put( "left", source1 );
  sources.put( "right", source1 ); // same tap instance bound to a second head on purpose

  Map<String, Tap> sinks = new HashMap<String, Tap>();
  sinks.put( "cogroup", sink );

  // the returned Flow is not inspected; only successful planning matters here
  getPlatform().getFlowConnector().connect( sources, sinks, merge );
}
/**
 * Runs two local-mode flows back to back: the first copies a local text file
 * ("offset", "line") into an Hfs TextLine tap wrapped in a LocalHfsAdaptor;
 * the second reads that tap back out to a local file.
 * <p>
 * Both flows are validated to have length 10 via {@code validateLength}.
 */
@Test
public void testWriteReadHDFS() throws Exception {
  copyFromLocal( inputFileApache );

  Tap localInput = new FileTap( new cascading.scheme.local.TextLine( new Fields( "offset", "line" ) ), inputFileApache );

  Hfs hdfsTap = new Hfs( new cascading.scheme.hadoop.TextLine(), getOutputPath( "/intermediate" ), SinkMode.REPLACE );
  Tap adapted = new LocalHfsAdaptor( hdfsTap );

  Tap localOutput = new FileTap( new cascading.scheme.local.TextLine(), getOutputPath( "/final" ), SinkMode.REPLACE );

  // the same pass-through pipe is reused for both flows
  Pipe passThrough = new Pipe( "test" );

  Flow writeFlow = new LocalFlowConnector( getPlatform().getProperties() ).connect( localInput, adapted, passThrough );
  writeFlow.complete();
  validateLength( writeFlow, 10 );

  Flow readFlow = new LocalFlowConnector( getPlatform().getProperties() ).connect( adapted, localOutput, passThrough );
  readFlow.complete();
  validateLength( readFlow, 10 );
}
@Test public void testSourceIsSink() { Tap tap = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "foo/merge" ); Pipe pipe = new Pipe( "left" ); try { Flow flow = getPlatform().getFlowConnector().connect( tap, tap, pipe ); fail( "did not throw planner exception" ); } catch( Exception exception ) { // exception.printStackTrace(); } }
@Test public void testCoGroupWithResultGroupFields() throws Exception { Tap sourceLower = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "foo" ); Tap sourceUpper = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "bar" ); Map sources = new HashMap(); sources.put( "lower", sourceLower ); sources.put( "upper", sourceUpper ); Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " ); // using null pos so all fields are written Tap sink = new Hfs( new TextLine(), "/complex/cogroup/", SinkMode.REPLACE ); Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter ); Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter ); Pipe splice = new CoGroup( pipeLower, new Fields( "num" ), pipeUpper, new Fields( "num" ), new Fields( "num1", "lhs", "num2", "rhs" ), new Fields( "somenum", "somenum2" ) ); splice = new Every( splice, new First( new Fields( "value" ) ), new Fields( "somenum", "value" ) ); Flow countFlow = getPlatform().getFlowConnector().connect( sources, sink, splice ); }
@Test public void testCoGroupWithResultGroupFields() throws Exception { Tap sourceLower = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "foo" ); Tap sourceUpper = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "bar" ); Map sources = new HashMap(); sources.put( "lower", sourceLower ); sources.put( "upper", sourceUpper ); Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " ); // using null pos so all fields are written Tap sink = new Hfs( new TextLine(), "/complex/cogroup/", SinkMode.REPLACE ); Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter ); Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter ); Pipe splice = new CoGroup( pipeLower, new Fields( "num" ), pipeUpper, new Fields( "num" ), new Fields( "num1", "lhs", "num2", "rhs" ), new Fields( "somenum", "somenum2" ) ); splice = new Every( splice, new First( new Fields( "value" ) ), new Fields( "somenum", "value" ) ); Flow countFlow = getPlatform().getFlowConnector().connect( sources, sink, splice ); }
@Test public void testCoGroupWithResultGroupFieldsDefault() throws Exception { Tap sourceLower = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "foo" ); Tap sourceUpper = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "bar" ); Map sources = new HashMap(); sources.put( "lower", sourceLower ); sources.put( "upper", sourceUpper ); Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " ); // using null pos so all fields are written Tap sink = new Hfs( new TextLine(), "/complex/cogroup/", SinkMode.REPLACE ); Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter ); Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter ); Pipe splice = new CoGroup( pipeLower, new Fields( "num" ), pipeUpper, new Fields( "num" ), new Fields( "num1", "lhs", "num2", "rhs" ) ); splice = new Every( splice, new First( new Fields( "value" ) ), new Fields( "num1", "value" ) ); Flow countFlow = getPlatform().getFlowConnector().connect( sources, sink, splice ); }
@Test public void testCoGroupWithResultGroupFieldsDefault() throws Exception { Tap sourceLower = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "foo" ); Tap sourceUpper = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "bar" ); Map sources = new HashMap(); sources.put( "lower", sourceLower ); sources.put( "upper", sourceUpper ); Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " ); // using null pos so all fields are written Tap sink = new Hfs( new TextLine(), "/complex/cogroup/", SinkMode.REPLACE ); Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter ); Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter ); Pipe splice = new CoGroup( pipeLower, new Fields( "num" ), pipeUpper, new Fields( "num" ), new Fields( "num1", "lhs", "num2", "rhs" ) ); splice = new Every( splice, new First( new Fields( "value" ) ), new Fields( "num1", "value" ) ); Flow countFlow = getPlatform().getFlowConnector().connect( sources, sink, splice ); }
@Test public void testCoGroupAroundCoGroup() throws Exception { Tap source10 = new Hfs( new TextLine( new Fields( "num" ) ), "foo" ); Tap source20 = new Hfs( new TextLine( new Fields( "num" ) ), "bar" ); Map sources = new HashMap(); sources.put( "source20", source20 ); sources.put( "source101", source10 ); sources.put( "source102", source10 ); // using null pos so all fields are written Tap sink = new Hfs( new TextLine(), "baz", SinkMode.REPLACE ); Pipe pipeNum20 = new Pipe( "source20" ); Pipe pipeNum101 = new Pipe( "source101" ); Pipe pipeNum102 = new Pipe( "source102" ); Pipe splice1 = new CoGroup( pipeNum20, new Fields( "num" ), pipeNum101, new Fields( "num" ), new Fields( "num1", "num2" ) ); Pipe splice2 = new CoGroup( splice1, new Fields( "num1" ), pipeNum102, new Fields( "num" ), new Fields( "num1", "num2", "num3" ) ); Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice2 ); assertEquals( "not equal: steps.size()", 2, flow.getFlowSteps().size() ); }
@Test public void testSourceIsSink() { Tap tap = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "foo/merge" ); Pipe pipe = new Pipe( "left" ); try { Flow flow = getPlatform().getFlowConnector().connect( tap, tap, pipe ); fail( "did not throw planner exception" ); } catch( Exception exception ) { // exception.printStackTrace(); } }
/**
 * Plans a CoGroup built from a single pipe on "offset", using the CoGroup
 * constructor that takes a numeric argument (1) and declares
 * {@code Fields.size( 4 )} output fields.
 * <p>
 * Asserts that the planned flow has exactly 1 step.
 */
@Test
public void testDupeSourceRepeat() {
  Tap source1 = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "foo/merge" );
  Tap sink = new Hfs( new TextLine(), "foo" );

  Pipe pipe = new Pipe( "pipe" );

  Pipe merge = new CoGroup( "cogroup", pipe, new Fields( "offset" ), 1, Fields.size( 4 ) );

  // typed maps instead of raw Map/HashMap so the head/tail bindings are checked at compile time
  Map<String, Tap> sources = new HashMap<String, Tap>();
  sources.put( "pipe", source1 );

  Map<String, Tap> sinks = new HashMap<String, Tap>();
  sinks.put( "cogroup", sink );

  Flow flow = getPlatform().getFlowConnector().connect( sources, sinks, merge );

  List<FlowStep> steps = flow.getFlowSteps();

  assertEquals( "not equal: steps.size()", 1, steps.size() );
}
@Test public void testCoGroupAroundCoGroup() throws Exception { Tap source10 = new Hfs( new TextLine( new Fields( "num" ) ), "foo" ); Tap source20 = new Hfs( new TextLine( new Fields( "num" ) ), "bar" ); Map sources = new HashMap(); sources.put( "source20", source20 ); sources.put( "source101", source10 ); sources.put( "source102", source10 ); // using null pos so all fields are written Tap sink = new Hfs( new TextLine(), "baz", SinkMode.REPLACE ); Pipe pipeNum20 = new Pipe( "source20" ); Pipe pipeNum101 = new Pipe( "source101" ); Pipe pipeNum102 = new Pipe( "source102" ); Pipe splice1 = new CoGroup( pipeNum20, new Fields( "num" ), pipeNum101, new Fields( "num" ), new Fields( "num1", "num2" ) ); Pipe splice2 = new CoGroup( splice1, new Fields( "num1" ), pipeNum102, new Fields( "num" ), new Fields( "num1", "num2", "num3" ) ); Flow flow = getPlatform().getFlowConnector().connect( sources, sink, splice2 ); assertEquals( "not equal: steps.size()", 2, flow.getFlowSteps().size() ); }
/**
 * Plans a CoGroup built from a single pipe on "offset", using the CoGroup
 * constructor that takes a numeric argument (1) and declares
 * {@code Fields.size( 4 )} output fields.
 * <p>
 * Asserts that the planned flow has exactly 1 step.
 */
@Test
public void testDupeSourceRepeat() {
  Tap source1 = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "foo/merge" );
  Tap sink = new Hfs( new TextLine(), "foo" );

  Pipe pipe = new Pipe( "pipe" );

  Pipe merge = new CoGroup( "cogroup", pipe, new Fields( "offset" ), 1, Fields.size( 4 ) );

  // typed maps instead of raw Map/HashMap so the head/tail bindings are checked at compile time
  Map<String, Tap> sources = new HashMap<String, Tap>();
  sources.put( "pipe", source1 );

  Map<String, Tap> sinks = new HashMap<String, Tap>();
  sinks.put( "cogroup", sink );

  Flow flow = getPlatform().getFlowConnector().connect( sources, sinks, merge );

  List<FlowStep> steps = flow.getFlowSteps();

  assertEquals( "not equal: steps.size()", 1, steps.size() );
}
@Test public void testCoGroupAroundCoGroupOptimized() throws Exception { Tap source10 = new Hfs( new TextLine( new Fields( "num" ) ), "foo" ); Tap source20 = new Hfs( new TextLine( new Fields( "num" ) ), "bar" ); Map sources = new HashMap(); sources.put( "source20", source20 ); sources.put( "source101", source10 ); sources.put( "source102", source10 ); // using null pos so all fields are written Tap sink = new Hfs( new TextLine(), "baz", SinkMode.REPLACE ); Pipe pipeNum20 = new Pipe( "source20" ); Pipe pipeNum101 = new Pipe( "source101" ); Pipe pipeNum102 = new Pipe( "source102" ); Pipe splice1 = new CoGroup( pipeNum20, new Fields( "num" ), pipeNum101, new Fields( "num" ), new Fields( "num1", "num2" ) ); Pipe splice2 = new CoGroup( splice1, new Fields( "num1" ), pipeNum102, new Fields( "num" ), new Fields( "num1", "num2", "num3" ) ); Properties properties = new Properties(); FlowConnectorProps.setIntermediateSchemeClass( properties, TextLine.class ); FlowConnector flowConnector = getPlatform().getFlowConnector( properties ); Flow flow = flowConnector.connect( sources, sink, splice2 ); assertEquals( "not equal: steps.size()", 2, flow.getFlowSteps().size() ); }
/**
 * Plans a three-way CoGroup on "offset" in which the "left" and "right"
 * heads are bound to the same tap ("foo/merge") and "middle" to a different
 * one ("bar/merge").
 * <p>
 * The test makes no assertions; it passes as long as the connector returns
 * without throwing.
 */
@Test
public void testDupeSource3() {
  Tap source1 = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "foo/merge" );
  Tap source2 = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "bar/merge" );

  Tap sink = new Hfs( new TextLine(), "foo" );

  Pipe left = new Pipe( "left" );
  Pipe middle = new Pipe( "middle" );
  Pipe right = new Pipe( "right" );

  Pipe[] pipes = Pipe.pipes( left, middle, right );
  Fields[] fields = Fields.fields( new Fields( "offset" ), new Fields( "offset" ), new Fields( "offset" ) );

  Pipe merge = new CoGroup( "cogroup", pipes, fields, Fields.size( 6 ) );

  // typed maps instead of raw Map/HashMap so the head/tail bindings are checked at compile time
  Map<String, Tap> sources = new HashMap<String, Tap>();
  sources.put( "left", source1 );
  sources.put( "middle", source2 );
  sources.put( "right", source1 ); // same tap instance as "left" on purpose

  Map<String, Tap> sinks = new HashMap<String, Tap>();
  sinks.put( "cogroup", sink );

  // the returned Flow is not inspected; only successful planning matters here
  getPlatform().getFlowConnector().connect( sources, sinks, merge );
}
/**
 * Plans a three-way CoGroup on "offset" in which the "left" and "right"
 * heads are bound to the same tap ("foo/merge") and "middle" to a different
 * one ("bar/merge").
 * <p>
 * The test makes no assertions; it passes as long as the connector returns
 * without throwing.
 */
@Test
public void testDupeSource3() {
  Tap source1 = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "foo/merge" );
  Tap source2 = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "bar/merge" );

  Tap sink = new Hfs( new TextLine(), "foo" );

  Pipe left = new Pipe( "left" );
  Pipe middle = new Pipe( "middle" );
  Pipe right = new Pipe( "right" );

  Pipe[] pipes = Pipe.pipes( left, middle, right );
  Fields[] fields = Fields.fields( new Fields( "offset" ), new Fields( "offset" ), new Fields( "offset" ) );

  Pipe merge = new CoGroup( "cogroup", pipes, fields, Fields.size( 6 ) );

  // typed maps instead of raw Map/HashMap so the head/tail bindings are checked at compile time
  Map<String, Tap> sources = new HashMap<String, Tap>();
  sources.put( "left", source1 );
  sources.put( "middle", source2 );
  sources.put( "right", source1 ); // same tap instance as "left" on purpose

  Map<String, Tap> sinks = new HashMap<String, Tap>();
  sinks.put( "cogroup", sink );

  // the returned Flow is not inspected; only successful planning matters here
  getPlatform().getFlowConnector().connect( sources, sinks, merge );
}