/**
 * Opens the Avro container file at the given path as a seekable input.
 *
 * @param conf The hadoop configuration.
 * @param path The path to the avro container file.
 * @return A new {@link FsInput} constructed from {@code path} and {@code conf}.
 * @throws IOException If there is an error reading from the path.
 */
protected SeekableInput createSeekableInput(Configuration conf, Path path) throws IOException {
  final SeekableInput seekable = new FsInput(path, conf);
  return seekable;
}
/**
 * Opens the named file as a seekable {@link FsInput}.
 * The input is built from a {@link Path} wrapping {@code filename} and a
 * freshly created {@link Configuration}.
 *
 * @param filename The filename to be opened
 * @return a new {@link FsInput} for {@code filename}
 * @throws IOException if the {@link FsInput} cannot be created
 */
static FsInput openSeekableFromFS(String filename) throws IOException {
  final Path target = new Path(filename);
  final Configuration defaults = new Configuration();
  return new FsInput(target, defaults);
}
/**
 * Builds an {@link FsInput} for the path stored in {@code filePathInHdfs},
 * using the configuration returned by {@code getConfiguration()}.
 *
 * @return a new {@link FsInput} for the stored path
 * @throws IOException if the {@link FsInput} cannot be created
 */
public FsInput getFsInput() throws IOException {
  final Path hdfsPath = new Path(this.filePathInHdfs);
  return new FsInput(hdfsPath, getConfiguration());
}
/**
 * Reads the schema of an Avro data file.
 *
 * <p>Both the underlying input and the data file reader are closed before this method returns.
 *
 * @param dataFile path of the Avro data file to inspect
 * @param fs file system whose configuration is used to open {@code dataFile}
 * @return the schema reported by the data file reader
 * @throws IOException if the file cannot be opened or read
 */
public static Schema getSchemaFromDataFile(Path dataFile, FileSystem fs) throws IOException {
  try (SeekableInput input = new FsInput(dataFile, fs.getConf());
      DataFileReader<GenericRecord> dataReader =
          new DataFileReader<>(input, new GenericDatumReader<GenericRecord>())) {
    final Schema fileSchema = dataReader.getSchema();
    return fileSchema;
  }
}
/**
 * Creates a record reader for the given split.
 *
 * <p>Opens an {@link FsInput} on {@code split.getPath()} using {@code job} as the
 * configuration, wraps it with {@code DataFileReader.openReader} and a new
 * {@link GenericDatumReader}, and delegates to the (reader, split) constructor.
 *
 * <p>NOTE(review): nothing in this constructor closes the {@link FsInput} if
 * {@code openReader} throws - confirm whether that path can leak the stream.
 *
 * @param job the job configuration used to open the split's file
 * @param split the file split to read
 * @throws IOException if the input cannot be opened or the delegated constructor fails
 */
public AvroAsTextRecordReader(JobConf job, FileSplit split) throws IOException { this(DataFileReader.openReader (new FsInput(split.getPath(), job), new GenericDatumReader<>()), split); }
/**
 * Get the latest (or oldest) avro schema for a directory.
 *
 * <p>The candidate files come from {@code getDirectorySchemaHelper}; the first entry is used
 * when {@code latest} is true and the last entry otherwise (presumably the helper orders files
 * newest-first - verify against the helper).
 *
 * @param directory the input dir that contains avro files
 * @param fs the {@link FileSystem} for the given directory.
 * @param latest true to return latest schema, false to return oldest schema
 * @return the latest/oldest schema in the directory, or {@code null} if the helper finds no files
 * @throws IOException if the schema cannot be read; the original failure is attached as the cause
 */
public static Schema getDirectorySchema(Path directory, FileSystem fs, boolean latest) throws IOException {
  Schema schema = null;
  try (Closer closer = Closer.create()) {
    List<FileStatus> files = getDirectorySchemaHelper(directory, fs);
    if (files == null || files.isEmpty()) {
      LOG.warn("There is no previous avro file in the directory: " + directory);
    } else {
      FileStatus file = latest ? files.get(0) : files.get(files.size() - 1);
      LOG.debug("Path to get the avro schema: " + file);
      // Register the raw input before building the reader so the stream is released even
      // when the DataFileReader constructor throws (e.g. the file is not a valid Avro file).
      FsInput fi = closer.register(new FsInput(file.getPath(), fs.getConf()));
      GenericDatumReader<GenericRecord> genReader = new GenericDatumReader<>();
      schema = closer.register(new DataFileReader<>(fi, genReader)).getSchema();
    }
  } catch (IOException ioe) {
    throw new IOException("Cannot get the schema for directory " + directory, ioe);
  }
  return schema;
}
/**
 * Opens a {@link DataFileReader} on the given avro file.
 * <p>
 * Note: The caller owns the returned {@link DataFileReader} and must close it.
 * </p>
 *
 * @param file The path to the avro file to open.
 * @return A {@link DataFileReader} for the specified avro file, or {@code null} (after logging a
 *         warning) when the file does not exist.
 * @throws FileBasedHelperException if an {@link IOException} occurs while checking for or opening the file.
 */
public DataFileReader<GenericRecord> getAvroFile(String file) throws FileBasedHelperException {
  try {
    if (this.getFileSystem().exists(new Path(file))) {
      final boolean proxyAsUser = this.getState().getPropAsBoolean(
          ConfigurationKeys.SHOULD_FS_PROXY_AS_USER,
          ConfigurationKeys.DEFAULT_SHOULD_FS_PROXY_AS_USER);
      final Path avroPath = new Path(file);
      if (!proxyAsUser) {
        // Plain access: open through the file system's own configuration.
        return new DataFileReader<>(
            new FsInput(avroPath, this.getFileSystem().getConf()),
            new GenericDatumReader<GenericRecord>());
      }
      // Proxy access: hand the file system itself to the proxy-aware input.
      return new DataFileReader<>(
          new ProxyFsInput(avroPath, this.getFileSystem()),
          new GenericDatumReader<GenericRecord>());
    }
    LOGGER.warn(file + " does not exist.");
    return null;
  } catch (IOException openFailure) {
    final String message = "Failed to open avro file " + file + " due to error " + openFailure.getMessage();
    throw new FileBasedHelperException(message, openFailure);
  }
}
/**
 * Reads the schema of the given avro file.
 *
 * <p>The reader opened for the lookup is always closed before returning; a failure while
 * closing is logged and not propagated.
 *
 * @param file The path to the avro file to inspect.
 * @return The schema reported by the file's {@link DataFileReader}.
 * @throws FileBasedHelperException if an {@link IOException} occurs while opening the file.
 */
public Schema getAvroSchema(String file) throws FileBasedHelperException {
  DataFileReader<GenericRecord> schemaReader = null;
  try {
    final boolean proxyAsUser = this.getState().getPropAsBoolean(
        ConfigurationKeys.SHOULD_FS_PROXY_AS_USER,
        ConfigurationKeys.DEFAULT_SHOULD_FS_PROXY_AS_USER);
    final Path avroPath = new Path(file);
    if (!proxyAsUser) {
      schemaReader = new DataFileReader<>(
          new FsInput(avroPath, this.getFileSystem().getConf()),
          new GenericDatumReader<GenericRecord>());
    } else {
      schemaReader = new DataFileReader<>(
          new ProxyFsInput(avroPath, this.getFileSystem()),
          new GenericDatumReader<GenericRecord>());
    }
    return schemaReader.getSchema();
  } catch (IOException openFailure) {
    final String message = "Failed to open avro file " + file + " due to error " + openFailure.getMessage();
    throw new FileBasedHelperException(message, openFailure);
  } finally {
    if (schemaReader != null) {
      try {
        schemaReader.close();
      } catch (IOException closeFailure) {
        // Best effort: the schema (or the primary failure) has already been determined.
        LOGGER.error("Failed to close avro file " + file, closeFailure);
      }
    }
  }
}
/**
 * Prepares the fixture: a configuration pointing at the local file system, a file holding
 * {@code FILE_CONTENTS} encoded as UTF-8, and an {@link FsInput} opened on that file.
 */
@Before
public void setUp() throws Exception {
  conf = new Configuration();
  conf.set("fs.default.name", "file:///");

  file = new File(DIR.getRoot(), "file.txt");
  final Charset utf8 = Charset.forName("UTF-8");
  try (PrintWriter contentWriter =
      new PrintWriter(new OutputStreamWriter(new FileOutputStream(file), utf8))) {
    contentWriter.print(FILE_CONTENTS);
  }

  final Path localPath = new Path(file.getPath());
  fsInput = new FsInput(localPath, conf);
}
/**
 * Creates a record reader positioned at the first sync point at or after the start of the split.
 *
 * <p>Also publishes the file's schema to the job under {@code AvroJob.INPUT_SCHEMA}.
 * If anything fails after the input has been opened, the input is closed before the
 * exception propagates so the underlying stream is not leaked.
 *
 * @param job the job configuration used to open the split's file
 * @param split the file split to read
 * @throws IOException if the file cannot be opened or positioned
 */
public TetherRecordReader(JobConf job, FileSplit split) throws IOException {
  this.in = new FsInput(split.getPath(), job);
  try {
    this.reader = new DataFileReader<>(in, new GenericDatumReader<>());

    reader.sync(split.getStart()); // sync to start
    this.start = in.tell();
    this.end = split.getStart() + split.getLength();

    job.set(AvroJob.INPUT_SCHEMA, reader.getSchema().toString());
  } catch (IOException | RuntimeException e) {
    // Construction failed: no caller will ever get a chance to close this reader.
    try {
      in.close();
    } catch (IOException closeFailure) {
      e.addSuppressed(closeFailure);
    }
    throw e;
  }
}
/**
 * Reads every record of the Avro file at {@code path} into a list.
 *
 * <p>The input and the file reader are closed on every exit path, including when opening the
 * reader or iterating over the records throws.
 *
 * @param path location of the Avro file, resolved with a default {@link Configuration}
 * @return the records of the file, in file order
 * @throws IOException if the file cannot be opened or closed
 */
public static List<GenericRecord> getRecordFromFile(String path) throws IOException {
  Configuration config = new Configuration();
  DatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
  List<GenericRecord> records = new ArrayList<>();
  try (SeekableInput input = new FsInput(new Path(path), config);
      FileReader<GenericRecord> fileReader = DataFileReader.openReader(input, datumReader)) {
    for (GenericRecord datum : fileReader) {
      records.add(datum);
    }
  }
  return records;
}
/**
 * Loads all records from the Avro file at {@code path}.
 *
 * <p>Uses try-with-resources so that both the seekable input and the file reader are released
 * even if opening the reader or reading a record fails.
 *
 * @param path location of the Avro file, resolved with a default {@link Configuration}
 * @return the records of the file, in file order
 * @throws IOException if the file cannot be opened or closed
 */
public static List<GenericRecord> getRecordFromFile(String path) throws IOException {
  Configuration config = new Configuration();
  DatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
  List<GenericRecord> records = new ArrayList<>();
  try (SeekableInput input = new FsInput(new Path(path), config);
      FileReader<GenericRecord> fileReader = DataFileReader.openReader(input, datumReader)) {
    for (GenericRecord datum : fileReader) {
      records.add(datum);
    }
  }
  return records;
}
public TestExtractor(WorkUnitState workUnitState) { //super(workUnitState); Schema schema = new Schema.Parser().parse(AVRO_SCHEMA); Path sourceFile = new Path(workUnitState.getWorkunit().getProp(TestSource.SOURCE_FILE_KEY)); LOG.info("Reading from source file " + sourceFile); DatumReader<GenericRecord> datumReader = new GenericDatumReader<GenericRecord>(schema); try { FileSystem fs = FileSystem .get(URI.create(workUnitState.getProp(ConfigurationKeys.FS_URI_KEY, ConfigurationKeys.LOCAL_FS_URI)), new Configuration()); fs.makeQualified(sourceFile); this.dataFileReader = new DataFileReader<GenericRecord>(new FsInput(sourceFile, new Configuration()), datumReader); } catch (IOException ioe) { LOG.error("Failed to read the source file " + sourceFile, ioe); } }
/**
 * Verifies that an {@link FsInput} built with the (Path, Configuration) constructor can be read.
 */
@Test
public void testConfigurationConstructor() throws Exception {
  try (FsInput in = new FsInput(new Path(file.getPath()), conf)) {
    int expectedByteCount = 1;
    byte[] readBytes = new byte[expectedByteCount];
    // Read from the instance created by the constructor under test, not the shared fixture.
    int actualByteCount = in.read(readBytes, 0, expectedByteCount);
    assertThat(actualByteCount, is(equalTo(expectedByteCount)));
  }
}
/**
 * Verifies that an {@link FsInput} built with the (Path, FileSystem) constructor can be read.
 */
@Test
public void testFileSystemConstructor() throws Exception {
  Path path = new Path(file.getPath());
  FileSystem fs = path.getFileSystem(conf);
  try (FsInput in = new FsInput(path, fs)) {
    int expectedByteCount = 1;
    byte[] readBytes = new byte[expectedByteCount];
    // Read from the instance created by the constructor under test, not the shared fixture.
    int actualByteCount = in.read(readBytes, 0, expectedByteCount);
    assertThat(actualByteCount, is(equalTo(expectedByteCount)));
  }
}
/** * Constructs a reader. * * @param options The options. * @throws IOException If there is an error. */ public Reader(Options options) throws IOException { mKeySchema = options.getKeySchema(); this.model = options.getDataModel(); // Load the whole index file into memory. Path indexFilePath = new Path(options.getPath(), INDEX_FILENAME); LOG.debug("Loading the index from " + indexFilePath); mIndex = loadIndexFile(options.getConfiguration(), indexFilePath, mKeySchema); // Open the data file. Path dataFilePath = new Path(options.getPath(), DATA_FILENAME); LOG.debug("Loading the data file " + dataFilePath); Schema recordSchema = AvroKeyValue.getSchema(mKeySchema, options.getValueSchema()); DatumReader<GenericRecord> datumReader = model.createDatumReader(recordSchema); mDataFileReader = new DataFileReader<> (new FsInput(dataFilePath, options.getConfiguration()), datumReader); }
/**
 * Creates a record reader for the given split.
 *
 * <p>Opens an {@link FsInput} on {@code split.getPath()} using {@code job} as the
 * configuration, builds a datum reader from the job's input data model and input schema
 * (both obtained through {@code AvroJob}), wraps them with {@code DataFileReader.openReader},
 * and delegates to the (reader, split) constructor.
 *
 * <p>NOTE(review): nothing in this constructor closes the {@link FsInput} if a later step
 * throws - confirm whether that path can leak the stream.
 *
 * @param job the job configuration used to open the split's file and resolve the schema
 * @param split the file split to read
 * @throws IOException if the input cannot be opened or the delegated constructor fails
 */
public AvroRecordReader(JobConf job, FileSplit split) throws IOException { this(DataFileReader.openReader (new FsInput(split.getPath(), job), AvroJob.createInputDataModel(job) .createDatumReader(AvroJob.getInputSchema(job))), split); }
/**
 * Creates a reader of generic Avro records for the given split.
 *
 * <p>The schema returned by {@code getSchema(job, split)}, when non-null, is set as the expected
 * schema of the datum reader. A zero-length split is treated as empty input and no file is
 * opened; otherwise the file is opened and synced to the split's start offset.
 *
 * @param job the job configuration
 * @param split the file split to read
 * @param reporter not used by this constructor
 * @throws IOException if the schema cannot be resolved or the file cannot be opened
 */
public AvroGenericRecordReader(JobConf job, FileSplit split, Reporter reporter) throws IOException {
  this.jobConf = job;

  Schema expectedSchema;
  try {
    expectedSchema = getSchema(job, split);
  } catch (AvroSerdeException e) {
    throw new IOException(e);
  }

  GenericDatumReader<GenericRecord> datumReader = new GenericDatumReader<GenericRecord>();
  if (expectedSchema != null) {
    datumReader.setExpected(expectedSchema);
  }

  final boolean emptySplit = split.getLength() == 0;
  this.isEmptyInput = emptySplit;
  if (!emptySplit) {
    this.reader = new DataFileReader<GenericRecord>(new FsInput(split.getPath(), job), datumReader);
    this.reader.sync(split.getStart());
    this.start = reader.tell();
  } else {
    // Nothing to read: no file is opened for an empty split.
    this.start = 0;
    this.reader = null;
  }

  this.stop = split.getStart() + split.getLength();
  this.recordReaderID = new UID();
}
/**
 * Reads back every log file matching the application's {@code data*} glob and checks it against
 * the expected values.
 *
 * <p>Each record's header must equal {@code header}; the record payloads, collected in read
 * order across all matched files, must equal {@code testLogs}.
 *
 * @param header the header every stored record is expected to carry
 * @param testLogs the payloads expected to be stored, in order
 * @throws IOException if a file cannot be listed or opened
 */
private void readAndCheckResultsFromHdfs(RecordHeader header, List<TestLogData> testLogs) throws IOException {
  Path logsGlob = new Path("/logs"
      + Path.SEPARATOR + applicationToken
      + Path.SEPARATOR + logSchemaVersion
      + Path.SEPARATOR + "data*");
  FileStatus[] matches = fileSystem.globStatus(logsGlob);

  List<TestLogData> collected = new ArrayList<>();
  Schema wrapperSchema =
      RecordWrapperSchemaGenerator.generateRecordWrapperSchema(TestLogData.getClassSchema().toString());

  for (FileStatus match : matches) {
    FileReader<GenericRecord> recordsInFile = null;
    try {
      DatumReader<GenericRecord> wrapperReader = new SpecificDatumReader<>(wrapperSchema);
      SeekableInput source = new FsInput(match.getPath(), fileSystem.getConf());
      recordsInFile = DataFileReader.openReader(source, wrapperReader);
      for (GenericRecord wrapped : recordsInFile) {
        Object storedHeader = wrapped.get(RecordWrapperSchemaGenerator.RECORD_HEADER_FIELD);
        Assert.assertEquals(header, (RecordHeader) storedHeader);
        Object storedData = wrapped.get(RecordWrapperSchemaGenerator.RECORD_DATA_FIELD);
        collected.add((TestLogData) storedData);
      }
    } finally {
      IOUtils.closeQuietly(recordsInFile);
    }
  }

  Assert.assertEquals(testLogs, collected);
}
/**
 * Verifies that a {@code SortedKeyValueFile} written with {@code CodecFactory.deflateCodec(9)}
 * records {@code deflate} as the {@code avro.codec} metadata of its data file.
 */
@Test
public void testDeflateClassCodec() throws IOException {
  Configuration conf = new Configuration();
  Path myfile = new Path(mTempDir.getRoot().getPath(), "myfile");
  Schema key = Schema.create(Schema.Type.STRING);
  Schema value = Schema.create(Schema.Type.STRING);
  Schema recordSchema = AvroKeyValue.getSchema(key, value);
  DatumReader<GenericRecord> datumReader = SpecificData.get().createDatumReader(recordSchema);

  LOG.debug("Using CodecFactory.deflateCodec() for a SortedKeyValueFile...");
  SortedKeyValueFile.Writer.Options options = new SortedKeyValueFile.Writer.Options()
      .withKeySchema(key)
      .withValueSchema(value)
      .withConfiguration(conf)
      .withPath(myfile)
      .withCodec(CodecFactory.deflateCodec(9));

  // Writing no entries is enough: only the file-level codec metadata is inspected.
  SortedKeyValueFile.Writer<CharSequence, CharSequence> writer = new SortedKeyValueFile.Writer<>(options);
  writer.close();

  // try-with-resources so the reader is released even when the assertion fails.
  try (DataFileReader<GenericRecord> reader = new DataFileReader<>(
      new FsInput(new Path(myfile, SortedKeyValueFile.DATA_FILENAME), conf), datumReader)) {
    assertEquals("deflate", reader.getMetaString("avro.codec"));
  }
}