@Override
public Tuple<Collection<ExecutionLineageNode>, Collection<ChannelInstance>> evaluate(
        ChannelInstance[] inputs,
        ChannelInstance[] outputs,
        SparkExecutor sparkExecutor,
        OptimizationContext.OperatorContext operatorContext) {
    // Read either from the configured source path or from the single incoming FileChannel.
    final String sourcePath;
    if (this.sourcePath != null) {
        assert inputs.length == 0;
        sourcePath = this.sourcePath;
    } else {
        FileChannel.Instance input = (FileChannel.Instance) inputs[0];
        sourcePath = input.getSinglePath();
    }
    RddChannel.Instance output = (RddChannel.Instance) outputs[0];

    final String actualInputPath = FileSystems.findActualSingleInputPath(sourcePath);
    final JavaRDD<String> linesRdd = sparkExecutor.sc.textFile(actualInputPath);
    this.name(linesRdd);
    final JavaRDD<T> dataQuantaRdd = linesRdd
            .map(line -> {
                // TODO: Important. Enrich type information to create the correct parser!
                int tabPos = line.indexOf('\t');
                return (T) new Tuple2<>(
                        Integer.valueOf(line.substring(0, tabPos)),
                        Float.valueOf(line.substring(tabPos + 1)));
            });
    this.name(dataQuantaRdd);
    output.accept(dataQuantaRdd, sparkExecutor);

    return ExecutionOperator.modelLazyExecution(inputs, outputs, operatorContext);
}
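// Hedged sketch (not part of the original source): the parsing done in the map function
// above, isolated with a minimal stand-in pair type. It assumes every line has the shape
// "<int>\t<float>"; a line without a tab makes indexOf('\t') return -1 and the substring
// calls throw.
class TsvLineParseExample {
    static final class IntFloatPair {
        final int key;
        final float value;
        IntFloatPair(int key, float value) { this.key = key; this.value = value; }
    }

    static IntFloatPair parseTsvLine(String line) {
        int tabPos = line.indexOf('\t');
        return new IntFloatPair(
                Integer.parseInt(line.substring(0, tabPos)),
                Float.parseFloat(line.substring(tabPos + 1)));
    }

    public static void main(String[] args) {
        IntFloatPair pair = parseTsvLine("42\t0.25");
        System.out.println(pair.key + " -> " + pair.value); // prints: 42 -> 0.25
    }
}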
throws IOException {
    assert inputFileChannelInstance.wasProduced();

    final String inputPath = inputFileChannelInstance.getSinglePath();
    final String actualInputPath = FileSystems.findActualSingleInputPath(inputPath);
    final FileSystem inputFs = FileSystems.getFileSystem(inputPath).orElseThrow(
            // Reconstructed: the exception supplier was garbled in the source.
            () -> new RheemException(String.format("Could not identify filesystem of \"%s\".", inputPath))
    );

    // Reconstructed around the surviving configuration key: attach the operator's load
    // profile estimate and link the lineage to the consumed input channel.
    final ExecutionLineageNode mainExecutionLineage = new ExecutionLineageNode(operatorContext);
    mainExecutionLineage.add(LoadProfileEstimators.createFromSpecification(
            "rheem.graphchi.pagerank.load.main", configuration
    ));
    mainExecutionLineage.addPredecessor(inputFileChannelInstance.getLineage());
final String path = output.addGivenOrTempPath(this.targetPath, javaExecutor.getCompiler().getConfiguration());
final FileSystem fileSystem = FileSystems.getFileSystem(path).orElseThrow(
        () -> new IllegalStateException(String.format("No file system found for \"%s\".", this.targetPath))
);
@Override
public Tuple<Collection<ExecutionLineageNode>, Collection<ChannelInstance>> evaluate(
        ChannelInstance[] inputs,
        ChannelInstance[] outputs,
        JavaExecutor javaExecutor,
        OptimizationContext.OperatorContext operatorContext) {
    assert outputs.length == this.getNumOutputs();

    final String path;
    if (this.sourcePath == null) {
        final FileChannel.Instance input = (FileChannel.Instance) inputs[0];
        path = input.getSinglePath();
    } else {
        assert inputs.length == 0;
        path = this.sourcePath;
    }

    SequenceFileIterator<?> sequenceFileIterator;
    try {
        final String actualInputPath = FileSystems.findActualSingleInputPath(path);
        sequenceFileIterator = new SequenceFileIterator<>(actualInputPath);
        Stream<?> sequenceFileStream =
                StreamSupport.stream(Spliterators.spliteratorUnknownSize(sequenceFileIterator, 0), false);
        ((StreamChannel.Instance) outputs[0]).accept(sequenceFileStream);
    } catch (IOException e) {
        throw new RheemException(String.format("%s failed to read from %s.", this, path), e);
    }

    return ExecutionOperator.modelEagerExecution(inputs, outputs, operatorContext);
}
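// Hedged, self-contained sketch (not from the original source) of the Spliterators/StreamSupport
// pattern used above, shown on a plain Iterator. spliteratorUnknownSize is the right choice here
// because the number of records in the sequence file is not known up front; 0 means no
// spliterator characteristics, and false requests a sequential (non-parallel) stream.
import java.util.Arrays;
import java.util.Iterator;
import java.util.Spliterators;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;

class IteratorToStreamExample {
    static <T> Stream<T> toStream(Iterator<T> iterator) {
        return StreamSupport.stream(Spliterators.spliteratorUnknownSize(iterator, 0), false);
    }

    public static void main(String[] args) {
        toStream(Arrays.asList("a", "b", "c").iterator()).forEach(System.out::println);
    }
}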
@Override
public Tuple<Collection<ExecutionLineageNode>, Collection<ChannelInstance>> evaluate(
        ChannelInstance[] inputs,
        ChannelInstance[] outputs,
        SparkExecutor sparkExecutor,
        OptimizationContext.OperatorContext operatorContext) {
    final String sourcePath;
    if (this.sourcePath != null) {
        assert inputs.length == 0;
        sourcePath = this.sourcePath;
    } else {
        FileChannel.Instance input = (FileChannel.Instance) inputs[0];
        sourcePath = input.getSinglePath();
    }
    RddChannel.Instance output = (RddChannel.Instance) outputs[0];

    final String actualInputPath = FileSystems.findActualSingleInputPath(sourcePath);
    final JavaRDD<Object> rdd = sparkExecutor.sc.objectFile(actualInputPath);
    this.name(rdd);
    output.accept(rdd, sparkExecutor);

    return ExecutionOperator.modelLazyExecution(inputs, outputs, operatorContext);
}
@Override
public Tuple<Collection<ExecutionLineageNode>, Collection<ChannelInstance>> evaluate(
        ChannelInstance[] inputs,
        ChannelInstance[] outputs,
        SparkExecutor sparkExecutor,
        OptimizationContext.OperatorContext operatorContext) {
    assert inputs.length == this.getNumInputs();

    final FileChannel.Instance output = (FileChannel.Instance) outputs[0];
    final String targetPath = output.addGivenOrTempPath(this.targetPath, sparkExecutor.getConfiguration());

    final RddChannel.Instance input = (RddChannel.Instance) inputs[0];
    final JavaRDD<Object> rdd = input.provideRdd();
    final JavaRDD<String> serializedRdd = rdd
            .map(dataQuantum -> {
                // TODO: Once there are more tuple types, make this generic.
                @SuppressWarnings("unchecked")
                Tuple2<Object, Object> tuple2 = (Tuple2<Object, Object>) dataQuantum;
                return String.valueOf(tuple2.field0) + '\t' + String.valueOf(tuple2.field1);
            });
    this.name(serializedRdd);
    serializedRdd
            .coalesce(1) // TODO: Allow more than one TSV file?
            .saveAsTextFile(targetPath);

    return ExecutionOperator.modelEagerExecution(inputs, outputs, operatorContext);
}
@Override
public Tuple<Collection<ExecutionLineageNode>, Collection<ChannelInstance>> evaluate(
        ChannelInstance[] inputs,
        ChannelInstance[] outputs,
        JavaExecutor javaExecutor,
        OptimizationContext.OperatorContext operatorContext) {
    assert outputs.length == this.getNumOutputs();

    final String path;
    if (this.sourcePath == null) {
        final FileChannel.Instance input = (FileChannel.Instance) inputs[0];
        path = input.getSinglePath();
    } else {
        assert inputs.length == 0;
        path = this.sourcePath;
    }
    final String actualInputPath = FileSystems.findActualSingleInputPath(path);
    Stream<T> stream = this.createStream(actualInputPath);
    ((StreamChannel.Instance) outputs[0]).accept(stream);

    return ExecutionOperator.modelLazyExecution(inputs, outputs, operatorContext);
}
@Override
public Tuple<Collection<ExecutionLineageNode>, Collection<ChannelInstance>> evaluate(
        ChannelInstance[] inputs,
        ChannelInstance[] outputs,
        SparkExecutor sparkExecutor,
        OptimizationContext.OperatorContext operatorContext) {
    assert inputs.length == this.getNumInputs();
    assert outputs.length <= 1;

    final FileChannel.Instance output = (FileChannel.Instance) outputs[0];
    final String targetPath = output.addGivenOrTempPath(this.targetPath, sparkExecutor.getConfiguration());

    RddChannel.Instance input = (RddChannel.Instance) inputs[0];
    input.provideRdd()
            .coalesce(1) // TODO: Remove. This only hotfixes the issue that JavaObjectFileSource reads only a single file.
            .saveAsObjectFile(targetPath);
    LoggerFactory.getLogger(this.getClass()).info("Writing dataset to {}.", targetPath);

    return ExecutionOperator.modelEagerExecution(inputs, outputs, operatorContext);
}
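// Hedged sketch (not from the original source): the plain-Spark round trip that pairs this
// sink with the objectFile source above. saveAsObjectFile writes a directory of serialized
// Java objects; objectFile reads it back. The path and the local[*] master are hypothetical,
// chosen only to make the example self-contained.
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

import java.util.Arrays;

class ObjectFileRoundTrip {
    public static void main(String[] args) {
        JavaSparkContext sc = new JavaSparkContext(
                new SparkConf().setAppName("round-trip").setMaster("local[*]"));
        sc.parallelize(Arrays.asList(1, 2, 3))
                .coalesce(1) // single output file, mirroring the hotfix above
                .saveAsObjectFile("/tmp/example-object-file");
        JavaRDD<Object> restored = sc.objectFile("/tmp/example-object-file");
        System.out.println(restored.collect()); // [1, 2, 3]
        sc.stop();
    }
}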
@Override
public void doDispose() throws RheemException {
    Actions.doSafe(() -> {
        logger.info("Deleting file channel instances {}.", this.paths);
        final String path = this.getSinglePath();
        final Optional<FileSystem> fileSystemOptional = FileSystems.getFileSystem(path);
        fileSystemOptional.ifPresent(fs -> {
            try {
                fs.delete(path, true);
            } catch (IOException e) {
                throw new UncheckedIOException(e);
            }
        });
    });
}
@Override
public ChannelInstance createInstance(Executor executor,
                                      OptimizationContext.OperatorContext producerOperatorContext,
                                      int producerOutputIndex) {
    // NB: File channels are not inherent to a certain Platform and are therefore not tied to the executor.
    return new Instance(producerOperatorContext, producerOutputIndex);
}
public String addGivenOrTempPath(String pathOrNull, Configuration configuration) {
    final String path = pathOrNull == null ? this.generateTempPath(configuration) : pathOrNull;
    this.addPath(path);
    return path;
}
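// Hedged, self-contained illustration of the "given or temp" contract above (this is a
// stand-in class, not the Rheem API; the temp-path format is a hypothetical substitute for
// generateTempPath): an explicitly given path is registered and returned as-is, while a
// null path yields a freshly generated temporary path that is likewise registered.
import java.util.ArrayList;
import java.util.List;
import java.util.UUID;

class GivenOrTempPathExample {
    final List<String> paths = new ArrayList<>();

    String addGivenOrTempPath(String pathOrNull) {
        final String path = pathOrNull == null
                ? "file:///tmp/rheem-" + UUID.randomUUID() // stand-in for generateTempPath(...)
                : pathOrNull;
        this.paths.add(path);
        return path;
    }

    public static void main(String[] args) {
        GivenOrTempPathExample channel = new GivenOrTempPathExample();
        System.out.println(channel.addGivenOrTempPath("hdfs://namenode/out.seq")); // given path reused
        System.out.println(channel.addGivenOrTempPath(null));                      // temp path generated
    }
}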