/**
 * Builds the extractor for the given work unit.
 *
 * <p>When the state does not already carry a value for
 * {@code SOURCE_FILEBASED_OPTIONAL_DOWNLOADER_CLASS}, the class name of
 * {@link TokenizedFileDownloader} is stored under that key before the extractor is created.
 *
 * @param state the work unit state; may be updated with the default downloader class
 * @return a {@link FileBasedExtractor} backed by a {@link HadoopFsHelper} built from {@code state}
 * @throws IOException declared by the overridden method
 */
@Override
public Extractor<String, String> getExtractor(WorkUnitState state) throws IOException {
  final boolean downloaderConfigured = state.contains(SOURCE_FILEBASED_OPTIONAL_DOWNLOADER_CLASS);
  if (!downloaderConfigured) {
    // Fall back to the tokenized downloader when none was configured.
    final String defaultDownloader = TokenizedFileDownloader.class.getName();
    state.setProp(SOURCE_FILEBASED_OPTIONAL_DOWNLOADER_CLASS, defaultDownloader);
  }

  final HadoopFsHelper fsHelper = new HadoopFsHelper(state);
  return new FileBasedExtractor<>(state, fsHelper);
}
/**
 * Advances to the next readable file.
 *
 * <p>If a file was previously opened, it is closed and the bytes-read counter is updated first.
 * Entries are then removed from the front of {@link #filesToPull} and downloaded one by one until
 * a file whose iterator has at least one record is found, or the list is exhausted.
 *
 * @throws IOException if downloading a file fails
 */
private void getNextFileToRead() throws IOException {
  final boolean previousFileOpen = this.currentFile != null && this.currentFileItr != null;
  if (previousFileOpen) {
    closeCurrentFile();
    incrementBytesReadCounter();
  }

  // Keep pulling files until one yields a record or nothing is left to pull.
  while (!(this.hasNext || this.filesToPull.isEmpty())) {
    this.currentFile = this.filesToPull.remove(0);
    this.currentFileItr = downloadFile(this.currentFile);
    // A null iterator is treated the same as an empty file.
    this.hasNext = this.currentFileItr != null && this.currentFileItr.hasNext();
    LOG.info("Will start downloading file: " + this.currentFile);
  }
}
/**
 * Opens {@code file} through the extractor's file-system helper and exposes it line by line.
 *
 * <p>The opened stream is registered with the extractor's closer. Lines are decoded with
 * {@code ConfigurationKeys.DEFAULT_CHARSET_ENCODING}. When the extractor reports that the first
 * record should be skipped and the file is not empty, the first line is consumed before the
 * iterator is returned.
 *
 * @param file the path of the file to read
 * @return an iterator over the lines of the file
 * @throws IOException if the stream cannot be read, or if the file-system helper fails with a
 *     {@code FileBasedHelperException} (wrapped as the cause)
 */
@SuppressWarnings("unchecked")
public Iterator<D> downloadFile(String file) throws IOException {
  log.info("Beginning to download file: " + file);
  try {
    final InputStream registeredStream = this.fileBasedExtractor.getCloser()
        .register(this.fileBasedExtractor.getFsHelper().getFileStream(file));
    final Iterator<D> lines =
        (Iterator<D>) IOUtils.lineIterator(registeredStream, ConfigurationKeys.DEFAULT_CHARSET_ENCODING);

    if (this.fileBasedExtractor.isShouldSkipFirstRecord()) {
      if (lines.hasNext()) {
        // Discard the first record.
        lines.next();
      }
    }
    return lines;
  } catch (FileBasedHelperException cause) {
    throw new IOException(
        "Exception while downloading file " + file + " with message " + cause.getMessage(), cause);
  }
}
}
/**
 * Opens {@code filePath} through the extractor's file-system helper and returns a
 * {@code RecordIterator} over it, built from this downloader's token and charset.
 *
 * <p>The opened stream is registered with the extractor's closer. The token is checked for
 * non-null via {@code Preconditions.checkArgument} before anything else happens.
 *
 * @param filePath the path of the file to read
 * @return an iterator over the records of the file
 * @throws IOException if the file-system helper fails with a {@code FileBasedHelperException}
 *     (wrapped as the cause)
 */
@Override
public Iterator<String> downloadFile(String filePath) throws IOException {
  final boolean tokenPresent = this.token != null;
  Preconditions.checkArgument(tokenPresent);

  try {
    log.info("downloading file: " + filePath);
    final InputStream registeredStream = this.fileBasedExtractor.getCloser()
        .register(this.fileBasedExtractor.getFsHelper().getFileStream(filePath));
    return new RecordIterator(registeredStream, this.token, this.charset);
  } catch (FileBasedHelperException cause) {
    throw new IOException("Exception when trying to download file " + filePath, cause);
  }
}
/** * Initializes a list of files to pull on the first call to the method * Iterates through the file and returns a new record upon each call until * there are no more records left in the file, then it moves on to the next * file */ @Override public D readRecordImpl(@Deprecated D reuse) throws DataRecordException, IOException { this.totalRecordCount++; if (this.statusCount > 0 && this.totalRecordCount % this.statusCount == 0) { LOG.info("Total number of records processed so far: " + this.totalRecordCount); } // If records have been read, check the hasNext value, if not then get the next file to process if (this.currentFile != null && this.currentFileItr != null) { this.hasNext = this.currentFileItr.hasNext(); // If the current file is done, move to the next one if (!this.hasNext) { getNextFileToRead(); } } else { // If no records have been read yet, get the first file to process getNextFileToRead(); } if (this.hasNext) { return this.currentFileItr.next(); } LOG.info("Finished reading records from all files"); return null; }
// Initialize the counters with the metric context, the CounterNames class and this object's runtime class.
this.counters.initialize(getMetricContext(), CounterNames.class, this.getClass());
/**
 * Opens {@code filePath} through the extractor's file-system helper and returns a
 * {@code RecordIterator} over it, built from this downloader's token and charset.
 *
 * <p>The opened stream is registered with the extractor's closer. The token is checked for
 * non-null via {@code Preconditions.checkArgument} before anything else happens.
 *
 * @param filePath the path of the file to read
 * @return an iterator over the records of the file
 * @throws IOException if the file-system helper fails with a {@code FileBasedHelperException}
 *     (wrapped as the cause)
 */
@Override
public Iterator<String> downloadFile(String filePath) throws IOException {
  final boolean tokenPresent = this.token != null;
  Preconditions.checkArgument(tokenPresent);

  try {
    log.info("downloading file: " + filePath);
    final InputStream registeredStream = this.fileBasedExtractor.getCloser()
        .register(this.fileBasedExtractor.getFsHelper().getFileStream(filePath));
    return new RecordIterator(registeredStream, this.token, this.charset);
  } catch (FileBasedHelperException cause) {
    throw new IOException("Exception when trying to download file " + filePath, cause);
  }
}
/** * Initializes a list of files to pull on the first call to the method * Iterates through the file and returns a new record upon each call until * there are no more records left in the file, then it moves on to the next * file */ @Override public D readRecordImpl(@Deprecated D reuse) throws DataRecordException, IOException { this.totalRecordCount++; if (this.statusCount > 0 && this.totalRecordCount % this.statusCount == 0) { LOG.info("Total number of records processed so far: " + this.totalRecordCount); } // If records have been read, check the hasNext value, if not then get the next file to process if (this.currentFile != null && this.currentFileItr != null) { this.hasNext = this.currentFileItr.hasNext(); // If the current file is done, move to the next one if (!this.hasNext) { getNextFileToRead(); } } else { // If no records have been read yet, get the first file to process getNextFileToRead(); } if (this.hasNext) { return this.currentFileItr.next(); } LOG.info("Finished reading records from all files"); return null; }
// Initialize the counters with the metric context, the CounterNames class and this object's runtime class.
this.counters.initialize(getMetricContext(), CounterNames.class, this.getClass());
/**
 * Opens {@code file} through the extractor's file-system helper, wraps it in a
 * {@link GZIPInputStream} and exposes the decompressed content line by line.
 *
 * <p>Both the raw stream and the gzip wrapper are registered with the extractor's closer. The raw
 * stream is registered first so that it is still released if the gzip header cannot be read; the
 * wrapper is registered so that its decompressor is released when the closer runs rather than
 * whenever the garbage collector gets to it. Lines are decoded with
 * {@code ConfigurationKeys.DEFAULT_CHARSET_ENCODING}. When the extractor reports that the first
 * record should be skipped and the file is not empty, the first line is consumed before the
 * iterator is returned.
 *
 * @param file the path of the gzip compressed file to read
 * @return an iterator over the decompressed lines of the file
 * @throws IOException if the stream cannot be read or is not valid gzip, or if the file-system
 *     helper fails with a {@code FileBasedHelperException} (wrapped as the cause)
 */
@SuppressWarnings("unchecked")
public Iterator<D> downloadFile(String file) throws IOException {
  log.info("Beginning to download gzip compressed file: " + file);
  try {
    InputStream rawStream = this.fileBasedExtractor.getCloser()
        .register(this.fileBasedExtractor.getFsHelper().getFileStream(file));
    // NOTE(review): the raw stream ends up closed twice (via the wrapper and directly); the
    // java.io.Closeable contract makes a repeated close a no-op - confirm for the helper's stream.
    InputStream gzipStream = this.fileBasedExtractor.getCloser().register(new GZIPInputStream(rawStream));

    Iterator<D> fileItr =
        (Iterator<D>) IOUtils.lineIterator(gzipStream, ConfigurationKeys.DEFAULT_CHARSET_ENCODING);
    if (this.fileBasedExtractor.isShouldSkipFirstRecord() && fileItr.hasNext()) {
      // Discard the first record.
      fileItr.next();
    }
    return fileItr;
  } catch (FileBasedHelperException e) {
    throw new IOException("Exception while downloading file " + file + " with message " + e.getMessage(), e);
  }
}
}
/**
 * Advances to the next readable file.
 *
 * <p>If a file was previously opened, it is closed and the bytes-read counter is updated first.
 * Entries are then removed from the front of {@link #filesToPull} and downloaded one by one until
 * a file whose iterator has at least one record is found, or the list is exhausted.
 *
 * @throws IOException if downloading a file fails
 */
private void getNextFileToRead() throws IOException {
  final boolean previousFileOpen = this.currentFile != null && this.currentFileItr != null;
  if (previousFileOpen) {
    closeCurrentFile();
    incrementBytesReadCounter();
  }

  // Keep pulling files until one yields a record or nothing is left to pull.
  while (!(this.hasNext || this.filesToPull.isEmpty())) {
    this.currentFile = this.filesToPull.remove(0);
    this.currentFileItr = downloadFile(this.currentFile);
    // A null iterator is treated the same as an empty file.
    this.hasNext = this.currentFileItr != null && this.currentFileItr.hasNext();
    LOG.info("Will start downloading file: " + this.currentFile);
  }
}
/**
 * Builds the extractor for the given work unit.
 *
 * <p>When the state does not already carry a value for
 * {@code SOURCE_FILEBASED_OPTIONAL_DOWNLOADER_CLASS}, the class name of
 * {@link TokenizedFileDownloader} is stored under that key before the extractor is created.
 *
 * @param state the work unit state; may be updated with the default downloader class
 * @return a {@link FileBasedExtractor} backed by a {@link HadoopFsHelper} built from {@code state}
 * @throws IOException declared by the overridden method
 */
@Override
public Extractor<String, String> getExtractor(WorkUnitState state) throws IOException {
  final boolean downloaderConfigured = state.contains(SOURCE_FILEBASED_OPTIONAL_DOWNLOADER_CLASS);
  if (!downloaderConfigured) {
    // Fall back to the tokenized downloader when none was configured.
    final String defaultDownloader = TokenizedFileDownloader.class.getName();
    state.setProp(SOURCE_FILEBASED_OPTIONAL_DOWNLOADER_CLASS, defaultDownloader);
  }

  final HadoopFsHelper fsHelper = new HadoopFsHelper(state);
  return new FileBasedExtractor<>(state, fsHelper);
}
log.info("Using " + delimiter + " as a delimiter."); reader = this.fileBasedExtractor.getCloser().register( new CSVReader(new InputStreamReader( this.fileBasedExtractor.getFsHelper().getFileStream(file), ConfigurationKeys.DEFAULT_CHARSET_ENCODING), delimiter)); } else { reader = this.fileBasedExtractor.getCloser().register( new CSVReader(new InputStreamReader( this.fileBasedExtractor.getFsHelper().getFileStream(file), ConfigurationKeys.DEFAULT_CHARSET_ENCODING))); if (this.fileBasedExtractor.isShouldSkipFirstRecord() && iterator.hasNext()) { log.info("Skipping first record"); iterator.next();
/**
 * Opens {@code file} through the extractor's file-system helper and exposes it line by line.
 *
 * <p>The opened stream is registered with the extractor's closer. Lines are decoded with
 * {@code ConfigurationKeys.DEFAULT_CHARSET_ENCODING}. When the extractor reports that the first
 * record should be skipped and the file is not empty, the first line is consumed before the
 * iterator is returned.
 *
 * @param file the path of the file to read
 * @return an iterator over the lines of the file
 * @throws IOException if the stream cannot be read, or if the file-system helper fails with a
 *     {@code FileBasedHelperException} (wrapped as the cause)
 */
@SuppressWarnings("unchecked")
public Iterator<D> downloadFile(String file) throws IOException {
  log.info("Beginning to download file: " + file);
  try {
    final InputStream registeredStream = this.fileBasedExtractor.getCloser()
        .register(this.fileBasedExtractor.getFsHelper().getFileStream(file));
    final Iterator<D> lines =
        (Iterator<D>) IOUtils.lineIterator(registeredStream, ConfigurationKeys.DEFAULT_CHARSET_ENCODING);

    if (this.fileBasedExtractor.isShouldSkipFirstRecord()) {
      if (lines.hasNext()) {
        // Discard the first record.
        lines.next();
      }
    }
    return lines;
  } catch (FileBasedHelperException cause) {
    throw new IOException(
        "Exception while downloading file " + file + " with message " + cause.getMessage(), cause);
  }
}
}
/**
 * Opens {@code file} through the extractor's file-system helper, wraps it in a
 * {@link GZIPInputStream} and exposes the decompressed content line by line.
 *
 * <p>Both the raw stream and the gzip wrapper are registered with the extractor's closer. The raw
 * stream is registered first so that it is still released if the gzip header cannot be read; the
 * wrapper is registered so that its decompressor is released when the closer runs rather than
 * whenever the garbage collector gets to it. Lines are decoded with
 * {@code ConfigurationKeys.DEFAULT_CHARSET_ENCODING}. When the extractor reports that the first
 * record should be skipped and the file is not empty, the first line is consumed before the
 * iterator is returned.
 *
 * @param file the path of the gzip compressed file to read
 * @return an iterator over the decompressed lines of the file
 * @throws IOException if the stream cannot be read or is not valid gzip, or if the file-system
 *     helper fails with a {@code FileBasedHelperException} (wrapped as the cause)
 */
@SuppressWarnings("unchecked")
public Iterator<D> downloadFile(String file) throws IOException {
  log.info("Beginning to download gzip compressed file: " + file);
  try {
    InputStream rawStream = this.fileBasedExtractor.getCloser()
        .register(this.fileBasedExtractor.getFsHelper().getFileStream(file));
    // NOTE(review): the raw stream ends up closed twice (via the wrapper and directly); the
    // java.io.Closeable contract makes a repeated close a no-op - confirm for the helper's stream.
    InputStream gzipStream = this.fileBasedExtractor.getCloser().register(new GZIPInputStream(rawStream));

    Iterator<D> fileItr =
        (Iterator<D>) IOUtils.lineIterator(gzipStream, ConfigurationKeys.DEFAULT_CHARSET_ENCODING);
    if (this.fileBasedExtractor.isShouldSkipFirstRecord() && fileItr.hasNext()) {
      // Discard the first record.
      fileItr.next();
    }
    return fileItr;
  } catch (FileBasedHelperException e) {
    throw new IOException("Exception while downloading file " + file + " with message " + e.getMessage(), e);
  }
}
}
log.info("Using " + delimiter + " as a delimiter."); reader = this.fileBasedExtractor.getCloser().register( new CSVReader(new InputStreamReader( this.fileBasedExtractor.getFsHelper().getFileStream(file), ConfigurationKeys.DEFAULT_CHARSET_ENCODING), delimiter)); } else { reader = this.fileBasedExtractor.getCloser().register( new CSVReader(new InputStreamReader( this.fileBasedExtractor.getFsHelper().getFileStream(file), ConfigurationKeys.DEFAULT_CHARSET_ENCODING))); if (this.fileBasedExtractor.isShouldSkipFirstRecord() && iterator.hasNext()) { log.info("Skipping first record"); iterator.next();