private void verifyCoGrouper() { if( isJoin() && joiner instanceof BufferJoin ) throw new IllegalArgumentException( "invalid joiner, may not use BufferJoiner in a HashJoin" ); if( joiner == null ) { joiner = new InnerJoin(); return; } if( joiner.numJoins() == -1 ) return; int joins = Math.max( numSelfJoins, keyFieldsMap.size() - 1 ); // joining two streams is one join if( joins != joiner.numJoins() ) throw new IllegalArgumentException( "invalid joiner, only accepts " + joiner.numJoins() + " joins, there are: " + joins ); }
/** * Constructor Splice creates a new Splice instance. * * @param spliceName of type String * @param pipes of type Pipe[] * @param groupFields of type Fields * @param sortFields of type Fields * @param reverseOrder of type boolean */ protected Splice( String spliceName, Pipe[] pipes, Fields groupFields, Fields sortFields, boolean reverseOrder ) { if( pipes == null ) throw new IllegalArgumentException( "pipes array may not be null" ); if( groupFields == null ) throw new IllegalArgumentException( "groupFields may not be null" ); setKind(); this.spliceName = spliceName; for( Pipe pipe : pipes ) { addPipe( pipe ); this.keyFieldsMap.put( pipe.getName(), groupFields ); if( sortFields != null ) this.sortFieldsMap.put( pipe.getName(), sortFields ); } this.reverseOrder = reverseOrder; this.joiner = new InnerJoin(); }
private Joiner getJoiner() { switch( getJoinType() ) { case INNER: return new InnerJoin(); case LEFT: return new LeftJoin(); case RIGHT: return new RightJoin(); case FULL: return new OuterJoin(); default: throw new IllegalStateException( "unknown join type" ); } } }
@Test public void testCross() throws Exception { getPlatform().copyFromLocal( inputFileLhs ); getPlatform().copyFromLocal( inputFileRhs ); Map sources = new HashMap(); sources.put( "lhs", getPlatform().getTextFile( inputFileLhs ) ); sources.put( "rhs", getPlatform().getTextFile( inputFileRhs ) ); Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "cross" ), SinkMode.REPLACE ); Pipe pipeLower = new Each( "lhs", new Fields( "line" ), new RegexSplitter( new Fields( "numLHS", "charLHS" ), " " ) ); Pipe pipeUpper = new Each( "rhs", new Fields( "line" ), new RegexSplitter( new Fields( "numRHS", "charRHS" ), " " ) ); Pipe cross = new CoGroup( pipeLower, new Fields( "numLHS" ), pipeUpper, new Fields( "numRHS" ), new InnerJoin() ); Flow flow = getPlatform().getFlowConnector().connect( sources, sink, cross ); flow.complete(); validateLength( flow, 37, null ); List<Tuple> values = getSinkAsList( flow ); assertTrue( values.contains( new Tuple( "1\ta\t1\tA" ) ) ); assertTrue( values.contains( new Tuple( "1\ta\t1\tB" ) ) ); }
@Test public void testCross() throws Exception { getPlatform().copyFromLocal( inputFileLhs ); getPlatform().copyFromLocal( inputFileRhs ); Map sources = new HashMap(); sources.put( "lhs", getPlatform().getTextFile( inputFileLhs ) ); sources.put( "rhs", getPlatform().getTextFile( inputFileRhs ) ); Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "cross" ), SinkMode.REPLACE ); Pipe pipeLower = new Each( "lhs", new Fields( "line" ), new RegexSplitter( new Fields( "numLHS", "charLHS" ), " " ) ); Pipe pipeUpper = new Each( "rhs", new Fields( "line" ), new RegexSplitter( new Fields( "numRHS", "charRHS" ), " " ) ); Pipe cross = new HashJoin( pipeLower, new Fields( "numLHS" ), pipeUpper, new Fields( "numRHS" ), new InnerJoin() ); Flow flow = getPlatform().getFlowConnector().connect( sources, sink, cross ); flow.complete(); validateLength( flow, 37, null ); List<Tuple> values = getSinkAsList( flow ); assertTrue( values.contains( new Tuple( "1\ta\t1\tA" ) ) ); assertTrue( values.contains( new Tuple( "1\ta\t1\tB" ) ) ); }
@Test public void testCross() throws Exception { getPlatform().copyFromLocal( inputFileLhs ); getPlatform().copyFromLocal( inputFileRhs ); Map sources = new HashMap(); sources.put( "lhs", getPlatform().getTextFile( inputFileLhs ) ); sources.put( "rhs", getPlatform().getTextFile( inputFileRhs ) ); Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "cross" ), SinkMode.REPLACE ); Pipe pipeLower = new Each( "lhs", new Fields( "line" ), new RegexSplitter( new Fields( "numLHS", "charLHS" ), " " ) ); Pipe pipeUpper = new Each( "rhs", new Fields( "line" ), new RegexSplitter( new Fields( "numRHS", "charRHS" ), " " ) ); Pipe cross = new CoGroup( pipeLower, new Fields( "numLHS" ), pipeUpper, new Fields( "numRHS" ), new InnerJoin() ); Flow flow = getPlatform().getFlowConnector().connect( sources, sink, cross ); flow.complete(); validateLength( flow, 37, null ); List<Tuple> values = getSinkAsList( flow ); assertTrue( values.contains( new Tuple( "1\ta\t1\tA" ) ) ); assertTrue( values.contains( new Tuple( "1\ta\t1\tB" ) ) ); }
@Test public void testCoGroup() throws Exception { getPlatform().copyFromLocal( inputFileLower ); getPlatform().copyFromLocal( inputFileUpper ); Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower ); Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper ); Map sources = new HashMap(); sources.put( "lower", sourceLower ); sources.put( "upper", sourceUpper ); Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "cogroup" ), SinkMode.REPLACE ); Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " ); Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter ); Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter ); Pipe splice = new CoGroup( pipeLower, new Fields( "num" ), pipeUpper, new Fields( "num" ), new InnerJoin( Fields.size( 4 ) ) ); Map<Object, Object> properties = getProperties(); // make sure hasher is getting called, but does nothing special FlowProps.setDefaultTupleElementComparator( properties, getPlatform().getStringComparator( false ).getClass().getCanonicalName() ); Flow flow = getPlatform().getFlowConnector( properties ).connect( sources, sink, splice ); flow.complete(); validateLength( flow, 5 ); List<Tuple> values = getSinkAsList( flow ); assertTrue( values.contains( new Tuple( "1\ta\t1\tA" ) ) ); assertTrue( values.contains( new Tuple( "2\tb\t2\tB" ) ) ); }
@Test public void testCoGroup() throws Exception { getPlatform().copyFromLocal( inputFileLower ); getPlatform().copyFromLocal( inputFileUpper ); Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower ); Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper ); Map sources = new HashMap(); sources.put( "lower", sourceLower ); sources.put( "upper", sourceUpper ); Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "cogroup" ), SinkMode.REPLACE ); Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " ); Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter ); Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter ); Pipe splice = new CoGroup( pipeLower, new Fields( "num" ), pipeUpper, new Fields( "num" ), new InnerJoin( Fields.size( 4 ) ) ); Map<Object, Object> properties = getProperties(); // make sure hasher is getting called, but does nothing special FlowProps.setDefaultTupleElementComparator( properties, getPlatform().getStringComparator( false ).getClass().getCanonicalName() ); Flow flow = getPlatform().getFlowConnector( properties ).connect( sources, sink, splice ); flow.complete(); validateLength( flow, 5 ); List<Tuple> values = getSinkAsList( flow ); assertTrue( values.contains( new Tuple( "1\ta\t1\tA" ) ) ); assertTrue( values.contains( new Tuple( "2\tb\t2\tB" ) ) ); }
@Test public void testCross() throws Exception { getPlatform().copyFromLocal( inputFileLhs ); getPlatform().copyFromLocal( inputFileRhs ); Map sources = new HashMap(); sources.put( "lhs", getPlatform().getTextFile( inputFileLhs ) ); sources.put( "rhs", getPlatform().getTextFile( inputFileRhs ) ); Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "cross" ), SinkMode.REPLACE ); Pipe pipeLower = new Each( "lhs", new Fields( "line" ), new RegexSplitter( new Fields( "numLHS", "charLHS" ), " " ) ); Pipe pipeUpper = new Each( "rhs", new Fields( "line" ), new RegexSplitter( new Fields( "numRHS", "charRHS" ), " " ) ); Pipe cross = new HashJoin( pipeLower, new Fields( "numLHS" ), pipeUpper, new Fields( "numRHS" ), new InnerJoin() ); Flow flow = getPlatform().getFlowConnector().connect( sources, sink, cross ); flow.complete(); validateLength( flow, 37, null ); List<Tuple> values = getSinkAsList( flow ); assertTrue( values.contains( new Tuple( "1\ta\t1\tA" ) ) ); assertTrue( values.contains( new Tuple( "1\ta\t1\tB" ) ) ); }
Pipe merge = new HashJoin( "join", Pipe.pipes( left, right ), fields, Fields.size( 4 ), new InnerJoin() );
Pipe merge = new HashJoin( "join", Pipe.pipes( left, right ), fields, Fields.size( 4 ), new InnerJoin() );
Pipe pipeUpper = new Each( "rhs", new Fields( "line" ), new RegexSplitter( new Fields( "numRHS", "charRHS" ), " " ) ); Pipe join = new HashJoin( pipeLower, new Fields( "numLHS" ), pipeUpper, new Fields( "numRHS" ), new InnerJoin() );
Pipe pipeUpper = new Each( "rhs", new Fields( "line" ), new RegexSplitter( new Fields( "numRHS", "charRHS" ), " " ) ); Pipe join = new HashJoin( pipeLower, new Fields( "numLHS" ), pipeUpper, new Fields( "numRHS" ), new InnerJoin() );
Pipe pipeUpper = new Each( "rhs", new Fields( "line" ), new RegexSplitter( new Fields( "numRHS", "charRHS" ), " " ) ); Pipe joinFirst = new HashJoin( pipeLower, new Fields( "numLHS" ), pipeUpper, new Fields( "numRHS" ), new InnerJoin() );
@Test public void testCoGroup() throws Exception { getPlatform().copyFromLocal( inputFileLower ); getPlatform().copyFromLocal( inputFileUpper ); Tap sourceLower = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileLower ); Tap sourceUpper = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileUpper ); Map sources = new HashMap(); sources.put( "lower", sourceLower ); sources.put( "upper", sourceUpper ); Tap sink = getPlatform().getTextFile( new Fields( "line" ), getOutputPath( "cogroup" ), SinkMode.REPLACE ); Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " ); Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter ); pipeLower = new Each( pipeLower, new Fields( "num", "char" ), new Stop( new Limit( 2 ) ) ); Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter ); pipeUpper = new Each( pipeUpper, new Fields( "num", "char" ), new Stop( new Limit( 2 ) ) ); Pipe splice = new CoGroup( pipeLower, new Fields( "num" ), pipeUpper, new Fields( "num" ), new InnerJoin( Fields.size( 4 ) ) ); splice = new Each( splice, Fields.ALL, new Stop( new Limit( 2 ) ) ); Map<Object, Object> properties = getProperties(); Flow flow = getPlatform().getFlowConnector( properties ).connect( sources, sink, splice ); flow.complete(); validateLength( flow, 2 ); List<Tuple> values = getSinkAsList( flow ); assertTrue( values.contains( new Tuple( "1\ta\t1\tA" ) ) ); assertTrue( values.contains( new Tuple( "2\tb\t2\tB" ) ) ); assertEquals( 2, flow.getFlowStats().getCounterValue( StepCounters.Tuples_Written ) ); assertEquals( 6, flow.getFlowStats().getCounterValue( StepCounters.Tuples_Read ) ); } }
Pipe pipeUpper = new Each( "rhs", new Fields( "line" ), new RegexSplitter( new Fields( "numRHS", "charRHS" ), " " ) ); Pipe joinFirst = new HashJoin( pipeLower, new Fields( "numLHS" ), pipeUpper, new Fields( "numRHS" ), new InnerJoin() );
pipeUpper1 = new Each( pipeUpper1, new Identity() ); Pipe splice1 = new CoGroup( "group", Pipe.pipes( pipeLower1, pipeLower2, pipeUpper1 ), Fields.fields( new Fields( "num" ), new Fields( "num" ), new Fields( "num" ) ), new Fields( "num1", "char1", "num2", "char2", "num3", "char3" ), new InnerJoin() );
Pipe inner = new CoGroup( "inner", fielded, new Fields( 0 ), uniques, new Fields( "word" ), new InnerJoin() ); Pipe outer = new CoGroup( "outer", fielded, new Fields( 0 ), uniques, new Fields( "word" ), new OuterJoin() ); Pipe left = new CoGroup( "left", fielded, new Fields( 0 ), uniques, new Fields( "word" ), new LeftJoin() );
Pipe inner = new CoGroup( "inner", fielded, new Fields( 0 ), uniques, new Fields( "word" ), new InnerJoin() ); Pipe outer = new CoGroup( "outer", fielded, new Fields( 0 ), uniques, new Fields( "word" ), new OuterJoin() ); Pipe left = new CoGroup( "left", fielded, new Fields( 0 ), uniques, new Fields( "word" ), new LeftJoin() );