/** * Systems such as Spark do not produce a single output file often times. That method tries to detect such * split object files to reassemble them correctly. As of now assumes either a Spark layout or a single file. * * @param ostensibleInputFile the path to that has been written using some framework; might be a dictionary * @return all actual input files */ public static Collection<String> findActualInputPaths(String ostensibleInputFile) { final Optional<FileSystem> fsOptional = getFileSystem(ostensibleInputFile); if (!fsOptional.isPresent()) { LoggerFactory.getLogger(FileSystems.class).warn("Could not inspect input file {}.", ostensibleInputFile); return Collections.singleton(ostensibleInputFile); } final FileSystem fs = fsOptional.get(); if (fs.isDirectory(ostensibleInputFile)) { final Collection<String> children = fs.listChildren(ostensibleInputFile); // Look for Spark-like directory structure. if (children.stream().anyMatch(child -> child.endsWith("_SUCCESS"))) { return children.stream().filter(child -> child.matches(".*/part-\\d+")).collect(Collectors.toList()); } else { throw new RheemException("Could not identify directory structure: " + children); } } return Collections.singleton(ostensibleInputFile); }