/**
 * (Re)opens {@code avroDataStream} over {@code avroFile} while indexing is
 * still active; once indexing has stopped the stream reference is cleared.
 */
private void createStream() throws FileNotFoundException, IOException {
    if (!keepIndexing) {
        // Indexing finished: drop the stream reference instead of opening a new one.
        avroDataStream = null;
        return;
    }
    avroDataStream = new DataFileStream<GenericRecord>(
        new FileInputStream(avroFile), new GenericDatumReader<GenericRecord>());
}
/**
 * Opens an Avro reader for the given file, transparently decompressing files
 * whose name ends in {@code .gz}.
 *
 * <p>The caller owns the returned stream and must close it. If construction of
 * the {@link DataFileStream} (or the GZIP wrapper) fails, the underlying file
 * stream is closed here so no file descriptor leaks — the original version
 * leaked it on that failure path.
 *
 * @param avroFile the Avro container file (optionally gzipped) to read
 * @return a data file stream over the file's records
 * @throws IOException if the file cannot be opened or is not valid Avro
 */
public static DataFileStream<GenericRecord> getAvroReader(File avroFile) throws IOException {
    FileInputStream fileStream = new FileInputStream(avroFile);
    try {
        if (avroFile.getName().endsWith(".gz")) {
            return new DataFileStream<>(new GZIPInputStream(fileStream),
                new GenericDatumReader<GenericRecord>());
        }
        return new DataFileStream<>(fileStream, new GenericDatumReader<GenericRecord>());
    } catch (IOException | RuntimeException e) {
        // Construction failed: close the file handle before propagating.
        try {
            fileStream.close();
        } catch (IOException closeFailure) {
            e.addSuppressed(closeFailure);
        }
        throw e;
    }
}
/**
 * Extracts the writer schema embedded in the Avro container header of
 * {@code contentStream} and converts it to a {@link RecordSchema}.
 *
 * <p>NOTE(review): the {@link DataFileStream} is intentionally not closed here,
 * since closing it would also close the caller-supplied {@code contentStream}.
 */
@Override
public RecordSchema getSchema(Map<String, String> variables, final InputStream contentStream,
        final RecordSchema readSchema) throws SchemaNotFoundException, IOException {
    final DataFileStream<GenericRecord> avroStream =
        new DataFileStream<>(contentStream, new GenericDatumReader<GenericRecord>());
    return AvroTypeUtil.createSchema(avroStream.getSchema());
}
/**
 * Reads a single record from an Avro container serialized into {@code bytes},
 * resolving it from {@code writerSchema} to {@code readerSchema}, and stores
 * the result in {@code record}.
 */
public void readFields(byte[] bytes, Schema writerSchema, Schema readerSchema) throws IOException {
    fileSchema = writerSchema;
    record = new GenericData.Record(writerSchema);

    final GenericDatumReader<GenericRecord> datumReader =
        new GenericDatumReader<GenericRecord>();
    // Resolve from the writer's schema to the caller's expected reader schema.
    datumReader.setExpected(readerSchema);

    final DataFileStream<GenericRecord> stream = new DataFileStream<GenericRecord>(
        new ByteArrayInputStream(bytes), datumReader);
    record = stream.next(record);
    stream.close();
}
/**
 * Deserializes the first object found in the given byte range, which must
 * contain an Avro container written with a reflect writer.
 *
 * @throws SerialisationException if the bytes cannot be read as Avro
 */
@Override
public Object deserialise(final byte[] allBytes, final int offset, final int length)
        throws SerialisationException {
    final DatumReader<Object> reflectReader = new ReflectDatumReader<>();
    try (final InputStream bytes = new ByteArrayInputStream(allBytes, offset, length);
         final DataFileStream<Object> stream = new DataFileStream<>(bytes, reflectReader)) {
        return stream.next();
    } catch (final IOException e) {
        // Preserve the cause so callers can see the underlying Avro failure.
        throw new SerialisationException("Unable to deserialise object, failed to read input bytes", e);
    }
}
private void nextInput() throws IOException{ currentInput++; Path path = inFiles.get(currentInput); FSDataInputStream input = new FSDataInputStream(Util.openFromFS(path)); reader = new DataFileStream<>(input, new GenericDatumReader<>()); if (schema == null) { // if this is the first file, the schema gets saved schema = reader.getSchema(); } else if (!schema.equals(reader.getSchema())) { // subsequent files have to have equal schemas throw new IOException("schemas dont match"); } }
/**
 * Counts the rows in the given Avro output file.
 *
 * @param output the Avro container file to scan
 * @return the number of records in the file
 */
private int numRowsInFile(File output) throws Exception {
    int rows = 0;
    try (DataFileStream<Utf8> stream =
             new DataFileStream<>(new FileInputStream(output), new GenericDatumReader<>())) {
        Iterator<Utf8> it = stream.iterator();
        while (it.hasNext()) {
            it.next();
            rows++;
        }
    }
    return rows;
}
/**
 * Reads the column (top-level field) names from the schema of an Avro file.
 *
 * <p>Uses try-with-resources: the original never closed the
 * {@link DataFileStream}, leaking the underlying file handle.
 *
 * @param avro the Avro container file to inspect
 * @return the schema's field names, in declaration order
 * @throws IOException if the file cannot be opened or parsed
 */
public static List<String> getColumnNamesFromAvro(File avro) throws IOException {
    List<String> ret = new ArrayList<String>();
    try (DataFileStream<GenericRecord> dataStream = new DataFileStream<GenericRecord>(
            new FileInputStream(avro), new GenericDatumReader<GenericRecord>())) {
        for (final Field field : dataStream.getSchema().getFields()) {
            ret.add(field.name());
        }
    }
    return ret;
}
/**
 * Opens the Avro data stream over {@code avroFile} and caches the file's
 * embedded writer schema.
 */
public void init() throws FileNotFoundException, IOException {
    dataStream = new DataFileStream<GenericRecord>(
        new FileInputStream(avroFile),
        new GenericDatumReader<GenericRecord>());
    // Cache the writer schema read from the container header.
    schema = dataStream.getSchema();
}
@Override public Optional<Map<DecoderColumnHandle, FieldValueProvider>> decodeRow(byte[] data, Map<String, String> dataMap) { GenericRecord avroRecord; DataFileStream<GenericRecord> dataFileReader = null; try { // Assumes producer uses DataFileWriter or data comes in this particular format. // TODO: Support other forms for producers dataFileReader = new DataFileStream<>(new ByteArrayInputStream(data), avroRecordReader); if (!dataFileReader.hasNext()) { throw new PrestoException(GENERIC_INTERNAL_ERROR, "No avro record found"); } avroRecord = dataFileReader.next(); if (dataFileReader.hasNext()) { throw new PrestoException(GENERIC_INTERNAL_ERROR, "Unexpected extra record found"); } } catch (Exception e) { throw new PrestoException(GENERIC_INTERNAL_ERROR, "Decoding Avro record failed.", e); } finally { closeQuietly(dataFileReader); } return Optional.of(columnDecoders.entrySet().stream() .collect(toImmutableMap( Map.Entry::getKey, entry -> entry.getValue().decodeField(avroRecord)))); }
/**
 * Reads the first record of the given Avro file and returns its first field
 * as an {@code int}.
 *
 * <p>Uses try-with-resources: the original leaked the reader if {@code next()}
 * threw. The stray debug {@code System.out.println} was removed.
 *
 * @param file the Avro container file to read
 * @return the first field of the first record, as an int
 */
private int getFirstIntDatum(File file) throws Exception {
    try (DataFileStream<GenericRecord> reader =
             new DataFileStream<>(new FileInputStream(file), new GenericDatumReader<>())) {
        return (Integer) reader.next().get(0);
    }
}
/**
 * Counts the rows in the given Avro output file.
 *
 * <p>Uses try-with-resources: the original called {@code close()} outside a
 * finally block, leaking the stream if iteration threw.
 *
 * @param output the Avro container file to scan
 * @return the number of records in the file
 */
private int numRowsInFile(File output) throws Exception {
    int rowcount = 0;
    try (DataFileStream<GenericRecord> reader = new DataFileStream<>(
            new FileInputStream(output), new GenericDatumReader<>())) {
        for (GenericRecord ignored : reader) {
            ++rowcount;
        }
    }
    return rowcount;
}
/**
 * Creates a reader over an Avro stream whose schema is embedded in the
 * container header, caching both the Avro schema and its RecordSchema form.
 *
 * @param in the Avro container stream to read from
 * @throws IOException if the container header cannot be read
 */
public AvroReaderWithEmbeddedSchema(final InputStream in) throws IOException {
    this.in = in;
    this.dataFileStream = new DataFileStream<>(in, new NonCachingDatumReader<>());
    // The writer schema comes from the container header itself.
    this.avroSchema = dataFileStream.getSchema();
    this.recordSchema = AvroTypeUtil.createSchema(avroSchema);
}
/**
 * Deserializes the first record of an Avro container held in {@code data},
 * using the given expected schema.
 *
 * <p>Uses try-with-resources instead of try/finally: if both {@code next()}
 * and {@code close()} threw, the finally-based close replaced the primary
 * exception; try-with-resources keeps the primary and suppresses the close
 * failure.
 *
 * @param schema the expected reader schema
 * @param data   the serialized Avro container bytes
 * @return the first record in the container
 */
private GenericData.Record readRecord(Schema schema, byte[] data) throws Exception {
    ByteArrayInputStream byteStream = new ByteArrayInputStream(data);
    GenericDatumReader<GenericData.Record> datumReader = new GenericDatumReader<>(schema);
    try (DataFileStream<GenericData.Record> reader = new DataFileStream<>(byteStream, datumReader)) {
        return reader.next();
    }
} }
/**
 * Reads the compression codec recorded in the given Avro file's metadata.
 *
 * @param output the Avro container file to inspect
 * @return the file's codec, or the null codec when none is recorded
 */
private CodecFactory getCodec(File output) throws Exception {
    try (DataFileStream<GenericRecord> stream = new DataFileStream<>(
            new FileInputStream(output), new GenericDatumReader<>())) {
        String codecName = stream.getMetaString(DataFileConstants.CODEC);
        if (codecName == null) {
            // No codec metadata means the file is uncompressed.
            return CodecFactory.nullCodec();
        }
        return CodecFactory.fromString(codecName);
    }
}
/**
 * Asserts that every word count in the given Avro file matches
 * {@code WordCountUtil.COUNTS} and that no words are missing.
 *
 * <p>Uses try-with-resources: the original closed {@code in} outside a finally
 * block (leaking it when an assertion failed) and never closed the
 * {@link DataFileStream} at all.
 */
private void validateCountsFile(File file) throws Exception {
    DatumReader<WordCount> reader = new ReflectDatumReader<>();
    int numWords = 0;
    try (InputStream in = new BufferedInputStream(new FileInputStream(file));
         DataFileStream<WordCount> counts = new DataFileStream<>(in, reader)) {
        for (WordCount wc : counts) {
            assertEquals(wc.word, WordCountUtil.COUNTS.get(wc.word), (Long) wc.count);
            numWords++;
        }
    }
    assertEquals(WordCountUtil.COUNTS.size(), numWords);
}
/**
 * Asserts that the given Avro file contains exactly the lines of {@code LINES}
 * in sorted order, and nothing more.
 */
public static void validateSortedFile(File file) throws Exception {
    DatumReader<ByteBuffer> reader = new GenericDatumReader<>();
    // Single try-with-resources with both resources; closed in reverse order.
    try (InputStream in = new BufferedInputStream(new FileInputStream(file));
         DataFileStream<ByteBuffer> lines = new DataFileStream<>(in, reader)) {
        List<String> expected = new ArrayList<>(Arrays.asList(LINES));
        Collections.sort(expected);
        for (String expectedLine : expected) {
            ByteBuffer buf = lines.next();
            byte[] raw = new byte[buf.remaining()];
            buf.get(raw);
            assertEquals(expectedLine, new String(raw, StandardCharsets.UTF_8).trim());
        }
        // The file must contain no records beyond the expected lines.
        assertFalse(lines.hasNext());
    }
}
private long getNumberOfRecordsFromStream(InputStream in) throws IOException { final DatumReader<GenericRecord> datumReader = new GenericDatumReader<>(); try (DataFileStream<GenericRecord> dataFileReader = new DataFileStream<>(in, datumReader)) { GenericRecord record = null; long recordsFromStream = 0; while (dataFileReader.hasNext()) { // Reuse record object by passing it to next(). This saves us from // allocating and garbage collecting many objects for files with // many items. record = dataFileReader.next(record); recordsFromStream += 1; } return recordsFromStream; } }
/**
 * Verifies that metadata written with {@code setMeta} before {@code create}
 * can be read back from the resulting file.
 *
 * <p>Uses try-with-resources for both writer and reader: the original never
 * closed the {@link DataFileStream}, leaking the file handle.
 */
@Test()
public void testUseMeta() throws IOException {
    File f = new File(DIR.getRoot().getPath(), "testDataFileMeta.avro");
    try (DataFileWriter<?> w = new DataFileWriter<>(new GenericDatumWriter<>())) {
        // Metadata must be set before create().
        w.setMeta("hello", "bar");
        w.create(Schema.create(Type.NULL), f);
    }
    try (DataFileStream<Void> r =
             new DataFileStream<>(new FileInputStream(f), new GenericDatumReader<>())) {
        assertTrue(r.getMetaKeys().contains("hello"));
        assertEquals("bar", r.getMetaString("hello"));
    }
}
/**
 * Asserts that every (word, count) pair in the given Avro file matches
 * {@code COUNTS}, that the file's metadata passes {@code checkMeta}, and that
 * no words are missing.
 */
public static void validateCountsFile(File file) throws Exception {
    DatumReader<Pair<Utf8, Long>> reader = new SpecificDatumReader<>();
    int numWords = 0;
    try (InputStream in = new BufferedInputStream(new FileInputStream(file))) {
        try (DataFileStream<Pair<Utf8, Long>> counts = new DataFileStream<>(in, reader)) {
            for (Pair<Utf8, Long> entry : counts) {
                String word = entry.key().toString();
                assertEquals(word, COUNTS.get(word), entry.value());
                numWords++;
            }
            // Metadata is validated while the stream is still open.
            checkMeta(counts);
        }
    }
    // Every expected word must have appeared exactly once.
    assertEquals(COUNTS.size(), numWords);
}