// Factory callback: builds the Hadoop-specific FlowStep for a planned step sub-graph.
// NOTE(review): the trailing '};' closes an anonymous class whose declaration begins
// outside this chunk — do not move this method without that context.
@Override
public FlowStep<JobConf> createFlowStep( ElementGraph stepElementGraph, FlowNodeGraph flowNodeGraph )
  {
  return new HadoopFlowStep( stepElementGraph, flowNodeGraph );
  }
};
// NOTE(review): disjoint statement fragments — the enclosing method bodies (and the
// catch blocks supplying 'exception') begin outside this chunk.
logWarn( "unable to remove step state file: " + stepStatePath, exception ); // best-effort cleanup: log, don't rethrow
logWarn( "unable to remove temporary file: " + tempSink, exception ); // best-effort cleanup: log, don't rethrow

// remove intermediate data for every sink tap of this step
for( Tap sink : getSinkTaps() )
  cleanIntermediateData( config, sink );

// remove metadata associated with each trap tap
for( Tap tap : getTraps() )
  cleanTapMetaData( config, tap );
/** * sources are specific to step, remove all known accumulated sources, if any */ private Set<Tap> getUniqueStreamedSources() { Set<Tap> allAccumulatedSources = getAllAccumulatedSources(); // if a source is dual accumulated and streamed, honor the streamed annotation allAccumulatedSources.removeAll( getAllStreamedSources() ); // start with the full source declaration and removed undesired taps. the above methods are dependent on // annotations which may not exist, so we are safeguarding a declared tap is treated streamed by default HashSet<Tap> set = new HashSet<>( sources.keySet() ); set.removeAll( allAccumulatedSources ); return set; }
conf.setJobName( getStepDisplayName( conf.getInt( "cascading.display.id.truncate", Util.ID_LENGTH ) ) ); Set<String> serializations = getFieldDeclaredSerializations( Serialization.class ); initFromSources( flowProcess, conf ); initFromSink( flowProcess, conf ); initFromTraps( flowProcess, conf ); initFromStepConfigDef( conf ); int numSinkParts = getSink().getScheme().getNumSinkParts(); if( getGroup() != null ) conf.setNumReduceTasks( numSinkParts ); else conf.setNumMapTasks( numSinkParts ); else if( getGroup() != null ) throw new FlowException( getName(), "a default number of gather partitions must be set, see FlowRuntimeProps" ); ProcessEdge processEdge = Util.getFirst( getFlowNodeGraph().edgeSet() ); if( getGroup() == null ) if( getGroup().isSortReversed() ) conf.setOutputKeyComparatorClass( ReverseTupleComparator.class );
/**
 * Plans a single trivial Pipe; the planner should yield exactly one step
 * carrying one source, no grouping, and a sink. Should not fail.
 *
 * @throws Exception if planning fails
 */
@Test
public void testIdentity() throws Exception
  {
  Pipe pipe = new Pipe( "test" );

  Tap source = new Hfs( new TextLine(), "input/path" );
  Tap sink = new Hfs( new TextLine(), "output/path", SinkMode.REPLACE );

  Flow flow = getPlatform().getFlowConnector().connect( source, sink, pipe );

  List<FlowStep> flowSteps = flow.getFlowSteps();

  assertEquals( "wrong size", 1, flowSteps.size() );

  HadoopFlowStep firstStep = (HadoopFlowStep) flowSteps.get( 0 );

  assertEquals( "not equal: step.sources.size()", 1, firstStep.getSourceTaps().size() );
  assertNull( "not null: step.groupBy", firstStep.getGroup() );
  assertNotNull( "null: step.sink", firstStep.getSink() );
  }
protected void cleanIntermediateData( JobConf config, Tap sink ) { if( sink.isTemporary() && ( getFlow().getFlowStats().isSuccessful() || getFlow().getRunID() == null ) ) { try { sink.deleteResource( config ); } catch( Exception exception ) { // sink all exceptions, don't fail app logWarn( "unable to remove temporary file: " + sink, exception ); } } else { cleanTapMetaData( config, sink ); } }
/**
 * Initializes the job's output configuration from this step's sink tap.
 * <p>
 * The real sink is initialized first so that a substituted {@code TempHfs}
 * sink, when one is created below, can take precedence over any values the
 * sink already set in {@code conf}.
 */
protected void initFromSink( FlowProcess<JobConf> flowProcess, JobConf conf )
  {
  // init sink first so tempSink can take precedence
  if( getSink() != null )
    getSink().sinkConfInit( flowProcess, conf );

  // inspect what the sink (if any) registered as the job's OutputFormat
  Class<? extends OutputFormat> outputFormat = conf.getClass( "mapred.output.format.class", null, OutputFormat.class );
  boolean isFileOutputFormat = false;

  if( outputFormat != null )
    isFileOutputFormat = FileOutputFormat.class.isAssignableFrom( outputFormat );

  Path outputPath = FileOutputFormat.getOutputPath( conf );

  // if no output path is set, we need to substitute an alternative if the OutputFormat is file based
  // PartitionTap won't set the output, but will set an OutputFormat
  // MultiSinkTap won't set the output or set the OutputFormat
  // Non file based OutputFormats don't have an output path, but do have an OutputFormat set (JDBCTap..)
  // NOTE(review): this dereferences getSink() without a null check — presumably a sink
  // is always present when this branch is reachable; confirm against callers
  if( outputPath == null && ( isFileOutputFormat || outputFormat == null ) )
    tempSink = new TempHfs( conf, "tmp:/" + new Path( getSink().getIdentifier() ).toUri().getPath(), true );

  // tempSink exists because sink is writeDirect
  if( tempSink != null )
    tempSink.sinkConfInit( flowProcess, conf );
  }
// NOTE(review): statement fragment — the enclosing method begins outside this chunk.
// Partition this step's sources: taps streamed by default vs. taps annotated accumulated.
Set<Tap> uniqueSources = getUniqueStreamedSources();
Set<Tap> accumulatedSources = getAllAccumulatedSources();
/**
 * Returns the current configuration converted into a key/value {@link Map}.
 */
@Override
public Map<Object, Object> getConfigAsProperties()
  {
  Map<Object, Object> properties = HadoopUtil.createProperties( getConfig() );

  return properties;
  }
/**
 * Verifies that default properties supplied to the connector are visible both
 * on the planned flow and inside the initialized per-step configuration.
 *
 * @throws IOException if planning fails
 */
@Test
public void testNestedProperties() throws IOException
  {
  Tap source = new Hfs( new TextLine( new Fields( "line" ) ), "/input" );
  Tap sink = new Hfs( new TextLine(), "output", SinkMode.REPLACE );

  Pipe pipe = new Pipe( "test" );

  pipe = new Each( pipe, new RegexSplitter( new Fields( "first", "second", "third" ), "\\s" ), Fields.ALL );

  Properties defaults = new Properties();

  defaults.setProperty( "test.key", "test.value" );

  HadoopFlow flow = (HadoopFlow) getPlatform().getFlowConnector( new Properties( defaults ) ).connect( source, sink, pipe );

  // the property must surface on the flow itself...
  assertEquals( "test flow", "test.value", flow.getProperty( "test.key" ) );

  // ...and inside the first step's fully initialized configuration
  HadoopFlowStep firstStep = (HadoopFlowStep) flow.getFlowSteps().get( 0 );

  assertEquals( "test step", "test.value", firstStep.createInitializedConfig( flow.getFlowProcess(), flow.getConfig() ).get( "test.key" ) );
  }
conf.setJobName( getStepDisplayName( conf.getInt( "cascading.display.id.truncate", Util.ID_LENGTH ) ) ); Set<String> serializations = getFieldDeclaredSerializations( Serialization.class ); initFromSources( flowProcess, conf ); initFromSink( flowProcess, conf ); initFromTraps( flowProcess, conf ); initFromStepConfigDef( conf ); int numSinkParts = getSink().getScheme().getNumSinkParts(); if( getGroup() != null ) conf.setNumReduceTasks( numSinkParts ); else conf.setNumMapTasks( numSinkParts ); else if( getGroup() != null ) throw new FlowException( getName(), "a default number of gather partitions must be set, see FlowRuntimeProps" ); ProcessEdge processEdge = Util.getFirst( getFlowNodeGraph().edgeSet() ); if( getGroup() == null ) if( getGroup().isSortReversed() ) conf.setOutputKeyComparatorClass( ReverseTupleComparator.class );
/**
 * Plans a single trivial Pipe; the planner should yield exactly one step
 * carrying one source, no grouping, and a sink. Should not fail.
 *
 * @throws Exception if planning fails
 */
@Test
public void testIdentity() throws Exception
  {
  Pipe pipe = new Pipe( "test" );

  Tap source = new Hfs( new TextLine(), "input/path" );
  Tap sink = new Hfs( new TextLine(), "output/path", SinkMode.REPLACE );

  Flow flow = getPlatform().getFlowConnector().connect( source, sink, pipe );

  List<FlowStep> flowSteps = flow.getFlowSteps();

  assertEquals( "wrong size", 1, flowSteps.size() );

  HadoopFlowStep firstStep = (HadoopFlowStep) flowSteps.get( 0 );

  assertEquals( "not equal: step.sources.size()", 1, firstStep.getSourceTaps().size() );
  assertNull( "not null: step.groupBy", firstStep.getGroup() );
  assertNotNull( "null: step.sink", firstStep.getSink() );
  }
protected void cleanIntermediateData( JobConf config, Tap sink ) { if( sink.isTemporary() && ( getFlow().getFlowStats().isSuccessful() || getFlow().getRunID() == null ) ) { try { sink.deleteResource( config ); } catch( Exception exception ) { // sink all exceptions, don't fail app logWarn( "unable to remove temporary file: " + sink, exception ); } } else { cleanTapMetaData( config, sink ); } }
/**
 * Initializes the job's output configuration from this step's sink tap.
 * <p>
 * The real sink is initialized first so that a substituted {@code TempHfs}
 * sink, when one is created below, can take precedence over any values the
 * sink already set in {@code conf}.
 */
protected void initFromSink( FlowProcess<JobConf> flowProcess, JobConf conf )
  {
  // init sink first so tempSink can take precedence
  if( getSink() != null )
    getSink().sinkConfInit( flowProcess, conf );

  // inspect what the sink (if any) registered as the job's OutputFormat
  Class<? extends OutputFormat> outputFormat = conf.getClass( "mapred.output.format.class", null, OutputFormat.class );
  boolean isFileOutputFormat = false;

  if( outputFormat != null )
    isFileOutputFormat = FileOutputFormat.class.isAssignableFrom( outputFormat );

  Path outputPath = FileOutputFormat.getOutputPath( conf );

  // if no output path is set, we need to substitute an alternative if the OutputFormat is file based
  // PartitionTap won't set the output, but will set an OutputFormat
  // MultiSinkTap won't set the output or set the OutputFormat
  // Non file based OutputFormats don't have an output path, but do have an OutputFormat set (JDBCTap..)
  // NOTE(review): this dereferences getSink() without a null check — presumably a sink
  // is always present when this branch is reachable; confirm against callers
  if( outputPath == null && ( isFileOutputFormat || outputFormat == null ) )
    tempSink = new TempHfs( conf, "tmp:/" + new Path( getSink().getIdentifier() ).toUri().getPath(), true );

  // tempSink exists because sink is writeDirect
  if( tempSink != null )
    tempSink.sinkConfInit( flowProcess, conf );
  }
// NOTE(review): statement fragment — the enclosing method begins outside this chunk.
// Partition this step's sources: taps streamed by default vs. taps annotated accumulated.
Set<Tap> uniqueSources = getUniqueStreamedSources();
Set<Tap> accumulatedSources = getAllAccumulatedSources();
/**
 * Returns the current configuration converted into a key/value {@link Map}.
 */
@Override
public Map<Object, Object> getConfigAsProperties()
  {
  Map<Object, Object> properties = HadoopUtil.createProperties( getConfig() );

  return properties;
  }
/**
 * Verifies that default properties supplied to the connector are visible both
 * on the planned flow and inside the initialized per-step configuration.
 *
 * @throws IOException if planning fails
 */
@Test
public void testNestedProperties() throws IOException
  {
  Tap source = new Hfs( new TextLine( new Fields( "line" ) ), "/input" );
  Tap sink = new Hfs( new TextLine(), "output", SinkMode.REPLACE );

  Pipe pipe = new Pipe( "test" );

  pipe = new Each( pipe, new RegexSplitter( new Fields( "first", "second", "third" ), "\\s" ), Fields.ALL );

  Properties defaults = new Properties();

  defaults.setProperty( "test.key", "test.value" );

  HadoopFlow flow = (HadoopFlow) getPlatform().getFlowConnector( new Properties( defaults ) ).connect( source, sink, pipe );

  // the property must surface on the flow itself...
  assertEquals( "test flow", "test.value", flow.getProperty( "test.key" ) );

  // ...and inside the first step's fully initialized configuration
  HadoopFlowStep firstStep = (HadoopFlowStep) flow.getFlowSteps().get( 0 );

  assertEquals( "test step", "test.value", firstStep.createInitializedConfig( flow.getFlowProcess(), flow.getConfig() ).get( "test.key" ) );
  }
// NOTE(review): disjoint statement fragments — the enclosing method bodies (and the
// catch blocks supplying 'exception') begin outside this chunk.
logWarn( "unable to remove step state file: " + stepStatePath, exception ); // best-effort cleanup: log, don't rethrow
logWarn( "unable to remove temporary file: " + tempSink, exception ); // best-effort cleanup: log, don't rethrow

// remove intermediate data for every sink tap of this step
for( Tap sink : getSinkTaps() )
  cleanIntermediateData( config, sink );

// remove metadata associated with each trap tap
for( Tap tap : getTraps() )
  cleanTapMetaData( config, tap );
/** * sources are specific to step, remove all known accumulated sources, if any */ private Set<Tap> getUniqueStreamedSources() { Set<Tap> allAccumulatedSources = getAllAccumulatedSources(); // if a source is dual accumulated and streamed, honor the streamed annotation allAccumulatedSources.removeAll( getAllStreamedSources() ); // start with the full source declaration and removed undesired taps. the above methods are dependent on // annotations which may not exist, so we are safeguarding a declared tap is treated streamed by default HashSet<Tap> set = new HashSet<>( sources.keySet() ); set.removeAll( allAccumulatedSources ); return set; }
// Factory callback: builds the Hadoop-specific FlowStep for a planned step sub-graph.
// NOTE(review): the trailing '};' closes an anonymous class whose declaration begins
// outside this chunk — do not move this method without that context.
@Override
public FlowStep<JobConf> createFlowStep( ElementGraph stepElementGraph, FlowNodeGraph flowNodeGraph )
  {
  return new HadoopFlowStep( stepElementGraph, flowNodeGraph );
  }
};