/**
 * Retrieves the {@link FileSystem} that is responsible for the given URL.
 *
 * @param fileUrl URL whose {@link FileSystem} should be determined
 * @return the responsible {@link FileSystem}
 * @throws RheemException if no {@link FileSystem} could be identified
 */
public static FileSystem requireFileSystem(String fileUrl) {
    final Optional<FileSystem> fileSystemOptional = getFileSystem(fileUrl);
    if (!fileSystemOptional.isPresent()) {
        throw new RheemException(String.format("Could not identify filesystem for \"%s\".", fileUrl));
    }
    return fileSystemOptional.get();
}
/** * As {@link #findActualInputPaths(String)} but requires the presence of only a single input file. */ public static String findActualSingleInputPath(String ostensibleInputFile) { final Collection<String> inputPaths = FileSystems.findActualInputPaths(ostensibleInputFile); if (inputPaths.size() != 1) { throw new RheemException(String.format( "Illegal number of files for \"%s\": %s", ostensibleInputFile, inputPaths )); // TODO: Add support. } return inputPaths.iterator().next(); }
final String actualInputPath = FileSystems.findActualSingleInputPath(inputPath); final FileSystem inputFs = FileSystems.getFileSystem(inputPath).orElseThrow( () -> new RheemException(String.format("Could not identify filesystem for \"%s\".", inputPath)) );
/**
 * Reads a sequence file (either from the configured {@code sourcePath} or from an incoming
 * {@link FileChannel}) and feeds its records into the output {@link StreamChannel}.
 *
 * @throws RheemException if the file could not be read
 */
@Override
public Tuple<Collection<ExecutionLineageNode>, Collection<ChannelInstance>> evaluate(
        ChannelInstance[] inputs,
        ChannelInstance[] outputs,
        JavaExecutor javaExecutor,
        OptimizationContext.OperatorContext operatorContext) {
    assert outputs.length == this.getNumOutputs();

    // Determine the file to read: either the statically configured path or an incoming FileChannel.
    final String path;
    if (this.sourcePath == null) {
        final FileChannel.Instance input = (FileChannel.Instance) inputs[0];
        path = input.getSinglePath();
    } else {
        assert inputs.length == 0;
        path = this.sourcePath;
    }

    try {
        // Resolve potentially split output files (e.g., Spark "part-..." files) to the single actual file.
        final String actualInputPath = FileSystems.findActualSingleInputPath(path);
        // Fix: was declared as a raw type; parameterize to avoid unchecked-call warnings and
        // declare it at its use site instead of ahead of the branching logic.
        final SequenceFileIterator<?> sequenceFileIterator = new SequenceFileIterator<>(actualInputPath);
        final Stream<?> sequenceFileStream =
                StreamSupport.stream(Spliterators.spliteratorUnknownSize(sequenceFileIterator, 0), false);
        ((StreamChannel.Instance) outputs[0]).accept(sequenceFileStream);
    } catch (IOException e) {
        throw new RheemException(String.format("%s failed to read from %s.", this, path), e);
    }

    return ExecutionOperator.modelEagerExecution(inputs, outputs, operatorContext);
}
/**
 * Writes the incoming data quanta, one formatted line each, to {@code this.textFileUrl}.
 *
 * @throws RheemException if writing fails
 */
@Override
public Tuple<Collection<ExecutionLineageNode>, Collection<ChannelInstance>> evaluate(
        ChannelInstance[] inputs,
        ChannelInstance[] outputs,
        JavaExecutor javaExecutor,
        OptimizationContext.OperatorContext operatorContext) {
    assert inputs.length == 1;
    assert outputs.length == 0;

    JavaChannelInstance input = (JavaChannelInstance) inputs[0];
    final FileSystem fs = FileSystems.requireFileSystem(this.textFileUrl);
    final Function<T, String> formatter = javaExecutor.getCompiler().compile(this.formattingDescriptor);

    // Fix: use an explicit charset instead of the platform default so the produced files are
    // identical across machines; "UTF-8" is guaranteed to be supported by every JVM.
    try (BufferedWriter writer = new BufferedWriter(
            new OutputStreamWriter(fs.create(this.textFileUrl), "UTF-8"))) {
        input.<T>provideStream().forEach(
                dataQuantum -> {
                    try {
                        writer.write(formatter.apply(dataQuantum));
                        writer.write('\n');
                    } catch (IOException e) {
                        // Tunnel the checked exception out of the lambda.
                        throw new UncheckedIOException(e);
                    }
                }
        );
    } catch (IOException | UncheckedIOException e) {
        // Fix: also catch the tunneled UncheckedIOException, which previously escaped unwrapped.
        throw new RheemException("Writing failed.", e);
    }

    return ExecutionOperator.modelEagerExecution(inputs, outputs, operatorContext);
}
OptionalLong fileSize = FileSystems.getFileSize(TextFileSource.this.inputUrl); if (!fileSize.isPresent()) { TextFileSource.this.logger.warn("Could not determine size of {}... deliver fallback estimate.",
/**
 * Reads a text file into an RDD, parsing each line into a data quantum.
 */
@Override
public Tuple<Collection<ExecutionLineageNode>, Collection<ChannelInstance>> evaluate(
        ChannelInstance[] inputs,
        ChannelInstance[] outputs,
        SparkExecutor sparkExecutor,
        OptimizationContext.OperatorContext operatorContext) {
    // Determine the source path: either statically configured or delivered via a FileChannel.
    final String sourcePath;
    if (this.sourcePath != null) {
        assert inputs.length == 0;
        sourcePath = this.sourcePath;
    } else {
        FileChannel.Instance input = (FileChannel.Instance) inputs[0];
        sourcePath = input.getSinglePath();
    }
    RddChannel.Instance output = (RddChannel.Instance) outputs[0];

    // Resolve potentially split output files (e.g., Spark "part-..." files) to the single actual file.
    final String actualInputPath = FileSystems.findActualSingleInputPath(sourcePath);
    final JavaRDD<String> linesRdd = sparkExecutor.sc.textFile(actualInputPath);
    this.name(linesRdd);
    final JavaRDD<T> dataQuantaRdd = linesRdd
            .map(line -> { // TODO: Important. Enrich type informations to create the correct parser!
                // NOTE(review): hard-codes a "<int>\t<float>" line layout and unconditionally
                // casts the Tuple2 to T — any other schema will fail at runtime; confirm callers.
                int tabPos = line.indexOf('\t');
                return (T) new Tuple2<>(
                        Integer.valueOf(line.substring(0, tabPos)),
                        Float.valueOf(line.substring(tabPos + 1)));
            });
    this.name(dataQuantaRdd);
    output.accept(dataQuantaRdd, sparkExecutor);

    // RDD construction is lazy; model lazy execution for lineage accounting.
    return ExecutionOperator.modelLazyExecution(inputs, outputs, operatorContext);
}
/**
 * Creates a {@link Stream} of the lines of the given file.
 *
 * @param path of the file
 * @return the {@link Stream} of lines
 */
private Stream<String> streamLines(String path) {
    final Optional<FileSystem> fileSystemOptional = FileSystems.getFileSystem(path);
    if (!fileSystemOptional.isPresent()) {
        throw new IllegalStateException(String.format("No file system found for %s", path));
    }
    try {
        final Iterator<String> lineIterator = this.createLineIterator(fileSystemOptional.get(), path);
        return StreamSupport.stream(Spliterators.spliteratorUnknownSize(lineIterator, 0), false);
    } catch (IOException e) {
        throw new RheemException(String.format("%s failed to read %s.", this, path), e);
    }
}
/**
 * Streams the data quanta of a file (configured path or incoming {@link FileChannel})
 * into the output {@link StreamChannel}.
 */
@Override
public Tuple<Collection<ExecutionLineageNode>, Collection<ChannelInstance>> evaluate(
        ChannelInstance[] inputs,
        ChannelInstance[] outputs,
        JavaExecutor javaExecutor,
        OptimizationContext.OperatorContext operatorContext) {
    assert outputs.length == this.getNumOutputs();

    // Pick the statically configured path or the one handed in via a FileChannel.
    final String path;
    if (this.sourcePath != null) {
        assert inputs.length == 0;
        path = this.sourcePath;
    } else {
        path = ((FileChannel.Instance) inputs[0]).getSinglePath();
    }

    // Resolve potentially split output files (e.g., Spark "part-..." files) to the single actual file.
    final String actualInputPath = FileSystems.findActualSingleInputPath(path);
    final Stream<T> stream = this.createStream(actualInputPath);
    ((StreamChannel.Instance) outputs[0]).accept(stream);

    // The Stream is consumed lazily downstream.
    return ExecutionOperator.modelLazyExecution(inputs, outputs, operatorContext);
}
/**
 * Merges the given partial progress into the accumulated per-operator progress and writes
 * the result (overall average plus details) as JSON to {@code this.progressUrl}.
 *
 * @param partialProgress progress values per operator name to merge in
 * @throws IOException if the progress file could not be written
 */
@Override
public void updateProgress(HashMap<String, Integer> partialProgress) throws IOException {
    // Merge the new partial values into the accumulated progress.
    this.progress.putAll(partialProgress);

    // Average all per-operator progress values into an overall figure (integer division,
    // as in the original calculation); use a primitive to avoid per-iteration boxing.
    int overall = 0;
    for (Integer operatorProgress : this.progress.values()) {
        overall += operatorProgress;
    }
    if (!this.progress.isEmpty()) {
        overall /= this.progress.size();
    }

    // Fix: fail with a descriptive message instead of a bare NoSuchElementException
    // when the progress URL's file system cannot be identified.
    final FileSystem progressFile = FileSystems.getFileSystem(progressUrl)
            .orElseThrow(() -> new IllegalStateException(
                    String.format("No file system found for %s", progressUrl)));
    try (final OutputStreamWriter writer = new OutputStreamWriter(progressFile.create(progressUrl, true))) {
        HashMap<String, Object> progressBar = new HashMap<>();
        progressBar.put("overall", overall);
        progressBar.put("details", progress);
        JSONObject jsonProgress = new JSONObject(progressBar);
        writer.write(jsonProgress.toString());
    } catch (UncheckedIOException e) {
        // Unwrap the tunneled I/O failure to honor this method's checked signature.
        throw e.getCause();
    }
}
}
/**
 * Loads an object file into an RDD and forwards it to the output {@link RddChannel}.
 */
@Override
public Tuple<Collection<ExecutionLineageNode>, Collection<ChannelInstance>> evaluate(
        ChannelInstance[] inputs,
        ChannelInstance[] outputs,
        SparkExecutor sparkExecutor,
        OptimizationContext.OperatorContext operatorContext) {
    // Pick the statically configured path or the one delivered via the incoming FileChannel.
    final String sourcePath;
    if (this.sourcePath == null) {
        sourcePath = ((FileChannel.Instance) inputs[0]).getSinglePath();
    } else {
        assert inputs.length == 0;
        sourcePath = this.sourcePath;
    }
    final RddChannel.Instance output = (RddChannel.Instance) outputs[0];

    // Resolve potentially split output files (e.g., Spark "part-..." files) to the single actual file.
    final String actualInputPath = FileSystems.findActualSingleInputPath(sourcePath);
    final JavaRDD<Object> rdd = sparkExecutor.sc.objectFile(actualInputPath);
    this.name(rdd);
    output.accept(rdd, sparkExecutor);

    // RDD construction is lazy; model lazy execution for lineage accounting.
    return ExecutionOperator.modelLazyExecution(inputs, outputs, operatorContext);
}
/** * Systems such as Spark do not produce a single output file often times. That method tries to detect such * split object files to reassemble them correctly. As of now assumes either a Spark layout or a single file. * * @param ostensibleInputFile the path to that has been written using some framework; might be a dictionary * @return all actual input files */ public static Collection<String> findActualInputPaths(String ostensibleInputFile) { final Optional<FileSystem> fsOptional = getFileSystem(ostensibleInputFile); if (!fsOptional.isPresent()) { LoggerFactory.getLogger(FileSystems.class).warn("Could not inspect input file {}.", ostensibleInputFile); return Collections.singleton(ostensibleInputFile); } final FileSystem fs = fsOptional.get(); if (fs.isDirectory(ostensibleInputFile)) { final Collection<String> children = fs.listChildren(ostensibleInputFile); // Look for Spark-like directory structure. if (children.stream().anyMatch(child -> child.endsWith("_SUCCESS"))) { return children.stream().filter(child -> child.matches(".*/part-\\d+")).collect(Collectors.toList()); } else { throw new RheemException("Could not identify directory structure: " + children); } } return Collections.singleton(ostensibleInputFile); }
/**
 * Best-effort deletion of the file(s) backing this channel instance.
 */
@Override
public void doDispose() throws RheemException {
    Actions.doSafe(() -> {
        logger.info("Deleting file channel instances {}.", this.paths);
        final String path = this.getSinglePath();
        // If no file system is responsible for the path, there is nothing to delete.
        FileSystems.getFileSystem(path).ifPresent(fs -> {
            try {
                fs.delete(path, true);
            } catch (IOException e) {
                // Tunnel the checked exception out of the lambda; Actions.doSafe handles it.
                throw new UncheckedIOException(e);
            }
        });
    });
}
}
/**
 * Initializes this monitor: publishes the initial execution plan as JSON and seeds the
 * per-operator progress with zeros.
 *
 * @param config               provides the monitor base URL
 * @param runId                identifies this run; used as the directory name
 * @param initialExecutionPlan the execution plan to be published
 * @throws IOException if the execution plan or progress file could not be written
 */
@Override
public void initialize(Configuration config, String runId, List<Map> initialExecutionPlan) throws IOException {
    this.initialExecutionPlan = initialExecutionPlan;
    this.runId = runId;

    String runsDir = config.getStringProperty(DEFAULT_MONITOR_BASE_URL_PROPERTY_KEY, DEFAULT_MONITOR_BASE_URL);
    final String path = runsDir + "/" + runId;
    this.exPlanUrl = path + "/execplan.json";
    this.progressUrl = path + "/progress.json";

    // Fix: fail with a descriptive message instead of a bare NoSuchElementException
    // when the plan URL's file system cannot be identified.
    final FileSystem execplanFile = FileSystems.getFileSystem(exPlanUrl)
            .orElseThrow(() -> new IllegalStateException(
                    String.format("No file system found for %s", exPlanUrl)));
    try (final OutputStreamWriter writer = new OutputStreamWriter(execplanFile.create(exPlanUrl, true))) {
        HashMap<String, Object> jsonPlanMap = new HashMap<>();
        jsonPlanMap.put("stages", initialExecutionPlan);
        jsonPlanMap.put("run_id", runId);
        JSONObject jsonPlan = new JSONObject(jsonPlanMap);
        writer.write(jsonPlan.toString());
    } catch (UncheckedIOException e) {
        // Unwrap the tunneled I/O failure to honor this method's checked signature.
        throw e.getCause();
    }

    // Seed every operator's progress with 0 so that later updates can average over all of them.
    HashMap<String, Integer> initialProgress = new HashMap<>();
    for (Map stage : initialExecutionPlan) {
        // NOTE(review): assumes each stage map carries an "operators" list of maps with a
        // "name" entry — confirm against the producer of the execution plan.
        @SuppressWarnings("unchecked")
        List<Map> operators = (List<Map>) stage.get("operators");
        for (Map operator : operators) {
            initialProgress.put((String) operator.get("name"), 0);
        }
    }
    updateProgress(initialProgress);
}
/**
 * Adjusts this instance to the properties specified in the given file.
 *
 * @param configurationUrl URL to the configuration file
 */
public void load(String configurationUrl) {
    final FileSystem fileSystem = FileSystems.getFileSystem(configurationUrl)
            .orElseThrow(() -> new RheemException(String.format("Could not access %s.", configurationUrl)));
    try (InputStream configInputStream = fileSystem.open(configurationUrl)) {
        this.load(configInputStream);
    } catch (Exception e) {
        throw new RheemException(String.format("Could not load configuration from %s.", configurationUrl), e);
    }
}
/**
 * Determine the number of bytes of a given file. This method is not only a short-cut to
 * {@link FileSystem#getFileSize(String)} but also caches file sizes for performance reasons.
 *
 * @param fileUrl the URL of the file
 * @return the number of bytes of the file if it could be determined
 */
public static OptionalLong getFileSize(String fileUrl) {
    // Fix: single cache lookup instead of containsKey+get — avoids a second hash lookup
    // and a check-then-act race if the cache is modified concurrently.
    final Long cachedFileSize = fileSizeCache.get(fileUrl);
    if (cachedFileSize != null) {
        return OptionalLong.of(cachedFileSize);
    }
    final Optional<FileSystem> fileSystem = FileSystems.getFileSystem(fileUrl);
    if (fileSystem.isPresent()) {
        try {
            final long fileSize = fileSystem.get().getFileSize(fileUrl);
            fileSizeCache.put(fileUrl, fileSize);
            return OptionalLong.of(fileSize);
        } catch (FileNotFoundException e) {
            // Best effort: a missing file merely means the size cannot be determined.
            LOGGER.warn("Could not determine file size.", e);
        }
    }
    return OptionalLong.empty();
}
final FileSystem fileSystem = FileSystems.getFileSystem(path).orElseThrow( () -> new IllegalStateException(String.format("No file system found for \"%s\".", this.targetPath)) );
final Optional<FileSystem> fileSystem = FileSystems.getFileSystem(TextFileSource.this.inputUrl); if (fileSystem.isPresent()) {
/**
 * Opens the text file at this source's input URL and feeds its lines into the output
 * {@link StreamChannel}; registers preparation (eager) and main (lazy) lineage nodes.
 *
 * @throws RheemException if the file system cannot be accessed or the file cannot be opened
 */
@Override
public Tuple<Collection<ExecutionLineageNode>, Collection<ChannelInstance>> evaluate(
        ChannelInstance[] inputs,
        ChannelInstance[] outputs,
        JavaExecutor javaExecutor,
        OptimizationContext.OperatorContext operatorContext) {
    assert inputs.length == this.getNumInputs();
    assert outputs.length == this.getNumOutputs();

    String url = this.getInputUrl().trim();
    FileSystem fs = FileSystems.getFileSystem(url).orElseThrow(
            () -> new RheemException(String.format("Cannot access file system of %s.", url))
    );

    try {
        // The line Stream is consumed lazily downstream, so the InputStream must remain open here.
        final InputStream inputStream = fs.open(url);
        // Fix: decode with an explicit charset instead of the platform default so results do
        // not vary across machines; "UTF-8" is guaranteed to be supported by every JVM.
        Stream<String> lines = new BufferedReader(new InputStreamReader(inputStream, "UTF-8")).lines();
        ((StreamChannel.Instance) outputs[0]).accept(lines);
    } catch (IOException e) {
        throw new RheemException(String.format("Reading %s failed.", url), e);
    }

    // Account eagerly for the preparation work and lazily for the main line-processing work.
    ExecutionLineageNode prepareLineageNode = new ExecutionLineageNode(operatorContext);
    prepareLineageNode.add(LoadProfileEstimators.createFromSpecification(
            "rheem.java.textfilesource.load.prepare", javaExecutor.getConfiguration()
    ));
    ExecutionLineageNode mainLineageNode = new ExecutionLineageNode(operatorContext);
    mainLineageNode.add(LoadProfileEstimators.createFromSpecification(
            "rheem.java.textfilesource.load.main", javaExecutor.getConfiguration()
    ));
    outputs[0].getLineage().addPredecessor(mainLineageNode);

    return prepareLineageNode.collectAndMark();
}
final FileSystem fileSystem = FileSystems.getFileSystem(yamlUrl).orElseThrow( () -> new RheemException(String.format("No filesystem for %s.", yamlUrl)) );