@Override
public void process(InputStream rawIn) throws IOException {
    try (final InputStream in = new BufferedInputStream(rawIn);
         final DataFileStream<GenericRecord> reader = new DataFileStream<>(in, new GenericDatumReader<GenericRecord>())) {

        final Schema schema = reader.getSchema();
        if (schema == null) {
            throw new ProcessException("Avro schema was null");
        }

        for (String key : reader.getMetaKeys()) {
            if (requestedMetadataKeys.contains(key)) {
                avroMetadata.put(key, reader.getMetaString(key));
            }
        }

        try {
            final byte[] rawFingerprint = SchemaNormalization.parsingFingerprint(fingerprintAlgorithm, schema);
            avroMetadata.put(SCHEMA_FINGERPRINT_ATTR, Hex.encodeHexString(rawFingerprint));
            avroMetadata.put(SCHEMA_TYPE_ATTR, schema.getType().getName());
            avroMetadata.put(SCHEMA_NAME_ATTR, schema.getName());
        } catch (NoSuchAlgorithmException e) {
            // ...
        }

        long recordCount = reader.getBlockCount();
        try {
            while (reader.nextBlock() != null) {
                recordCount += reader.getBlockCount();
            }
        } catch (NoSuchElementException e) {
            // nextBlock() throws once the end of the file is reached; the count is complete
        }
    }
}
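The fingerprinting step above can be reproduced standalone. A minimal sketch, assuming only Avro on the classpath; the inline schema literal is illustrative, and the hex loop stands in for commons-codec's Hex.encodeHexString:

import java.security.NoSuchAlgorithmException;
import org.apache.avro.Schema;
import org.apache.avro.SchemaNormalization;

public class FingerprintSketch {
    public static void main(String[] args) throws NoSuchAlgorithmException {
        // Hypothetical record schema, just to have something to fingerprint.
        Schema schema = new Schema.Parser().parse(
            "{\"type\":\"record\",\"name\":\"User\",\"fields\":[{\"name\":\"id\",\"type\":\"long\"}]}");

        // "CRC-64-AVRO" is Avro's built-in fingerprint; "MD5" and "SHA-256"
        // also work wherever the JDK provides those digests.
        byte[] fp = SchemaNormalization.parsingFingerprint("CRC-64-AVRO", schema);

        // Hex-encode by hand to keep this sketch dependency-free.
        StringBuilder hex = new StringBuilder();
        for (byte b : fp) {
            hex.append(String.format("%02x", b));
        }
        System.out.println(hex);
    }
}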
@Override
public Optional<Map<DecoderColumnHandle, FieldValueProvider>> decodeRow(byte[] data, Map<String, String> dataMap) {
    GenericRecord avroRecord;
    DataFileStream<GenericRecord> dataFileReader = null;
    try {
        // Assumes producer uses DataFileWriter or data comes in this particular format.
        // TODO: Support other forms for producers
        dataFileReader = new DataFileStream<>(new ByteArrayInputStream(data), avroRecordReader);
        if (!dataFileReader.hasNext()) {
            throw new PrestoException(GENERIC_INTERNAL_ERROR, "No avro record found");
        }
        avroRecord = dataFileReader.next();
        if (dataFileReader.hasNext()) {
            throw new PrestoException(GENERIC_INTERNAL_ERROR, "Unexpected extra record found");
        }
    }
    catch (Exception e) {
        throw new PrestoException(GENERIC_INTERNAL_ERROR, "Decoding Avro record failed.", e);
    }
    finally {
        closeQuietly(dataFileReader);
    }
    return Optional.of(columnDecoders.entrySet().stream()
            .collect(toImmutableMap(
                    Map.Entry::getKey,
                    entry -> entry.getValue().decodeField(avroRecord))));
}
public void readFields(byte[] bytes, Schema writerSchema, Schema readerSchema) throws IOException {
    fileSchema = writerSchema;
    record = new GenericData.Record(writerSchema);
    GenericDatumReader<GenericRecord> gdr = new GenericDatumReader<GenericRecord>();
    gdr.setExpected(readerSchema);
    ByteArrayInputStream is = new ByteArrayInputStream(bytes);
    DataFileStream<GenericRecord> dfr = new DataFileStream<GenericRecord>(is, gdr);
    record = dfr.next(record);
    dfr.close();
}
/**
 * Read the next datum from the file.
 * @param reuse an instance to reuse.
 * @throws NoSuchElementException if no more remain in the file.
 */
public D next(D reuse) throws IOException {
    if (!hasNext()) throw new NoSuchElementException();
    D result = reader.read(reuse, datumIn);
    if (0 == --blockRemaining) {
        blockFinished();
    }
    return result;
}
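A minimal sketch of the reuse pattern this method enables: passing the previous datum back into next(...) lets the reader fill it in place instead of allocating a fresh record per datum. The input file name is hypothetical:

import java.io.FileInputStream;
import java.io.IOException;
import org.apache.avro.file.DataFileStream;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;

public class ReuseSketch {
    public static void main(String[] args) throws IOException {
        try (DataFileStream<GenericRecord> stream = new DataFileStream<>(
                new FileInputStream("users.avro"),   // hypothetical input file
                new GenericDatumReader<>())) {
            GenericRecord record = null;
            while (stream.hasNext()) {
                // next(reuse) may return a different instance than it was given, so reassign.
                record = stream.next(record);
                System.out.println(record);
            }
        }
    }
}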
public static Schema extractSchemaFromAvroWithoutTime(File avroFile) throws IOException {
    DataFileStream<GenericRecord> dataStream =
        new DataFileStream<GenericRecord>(new FileInputStream(avroFile), new GenericDatumReader<GenericRecord>());
    Schema schema = new Schema();

    for (final Field field : dataStream.getSchema().getFields()) {
        try {
            getColumnType(field);
        } catch (Exception e) {
            LOGGER.warn("Caught exception while converting Avro field {} of type {}, field will not be in schema.",
                field.name(), field.schema().getType());
            continue;
        }
        final String columnName = field.name();
        final String pinotType = field.getProp("pinotType");

        final FieldSpec fieldSpec;
        if (pinotType != null && "METRIC".equals(pinotType)) {
            fieldSpec = new MetricFieldSpec();
        } else {
            fieldSpec = new DimensionFieldSpec();
        }

        fieldSpec.setName(columnName);
        fieldSpec.setDataType(getColumnType(dataStream.getSchema().getField(columnName)));
        fieldSpec.setSingleValueField(isSingleValueField(dataStream.getSchema().getField(columnName)));
        schema.addField(fieldSpec);
    }

    dataStream.close();
    return schema;
}
try (final DataFileStream<GenericRecord> dataFileReader = new DataFileStream<>(instream, datumReader)) {
    final Schema generatedUnion = dataFileReader.getSchema().getField("The_Chairman").schema();
    assertEquals(2, generatedUnion.getTypes().size());

    final LogicalType logicalType = generatedUnion.getTypes().get(1).getLogicalType();
    assertNotNull(logicalType);
    assertEquals("decimal", logicalType.getName());

    while (dataFileReader.hasNext()) {
        record = dataFileReader.next(record);
        assertEquals("_1the__table", record.getSchema().getName());
        assertEquals(bigDecimal, record.get("The_Chairman"));
    }
}
@Test
public void testWrite() throws IOException {
    Schema writerSchema = Schema.create(Schema.Type.INT);
    GenericData dataModel = new ReflectData();
    CodecFactory compressionCodec = CodecFactory.nullCodec();
    ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
    TaskAttemptContext context = createMock(TaskAttemptContext.class);

    replay(context);

    // Write an avro container file with two records: 1 and 2.
    AvroKeyRecordWriter<Integer> recordWriter = new AvroKeyRecordWriter<>(
        writerSchema, dataModel, compressionCodec, outputStream);
    recordWriter.write(new AvroKey<>(1), NullWritable.get());
    recordWriter.write(new AvroKey<>(2), NullWritable.get());
    recordWriter.close(context);

    verify(context);

    // Verify that the file was written as expected.
    InputStream inputStream = new ByteArrayInputStream(outputStream.toByteArray());
    Schema readerSchema = Schema.create(Schema.Type.INT);
    DatumReader<Integer> datumReader = new SpecificDatumReader<>(readerSchema);
    DataFileStream<Integer> dataFileReader = new DataFileStream<>(inputStream, datumReader);

    assertTrue(dataFileReader.hasNext());  // Record 1.
    assertEquals(1, dataFileReader.next().intValue());
    assertTrue(dataFileReader.hasNext());  // Record 2.
    assertEquals(2, dataFileReader.next().intValue());
    assertFalse(dataFileReader.hasNext()); // No more records.

    dataFileReader.close();
}
try (final DataFileStream<GenericRecord> dataFileReader = new DataFileStream<>(in, new GenericDatumReader<>())) {
    Schema avroSchema = dataFileReader.getSchema();
    getLogger().debug(avroSchema.toString(true));

    ParquetWriter<GenericRecord> writer = createParquetWriter(context, flowFile, rawOut, avroSchema);

    int recordCount = 0;
    GenericRecord record = null;
    while (dataFileReader.hasNext()) {
        record = dataFileReader.next();
        writer.write(record);
        recordCount++;
    }
}
public static List<String> getColumnNamesFromAvro(File avro) throws IOException {
    List<String> ret = new ArrayList<String>();
    DataFileStream<GenericRecord> dataStream =
        new DataFileStream<GenericRecord>(new FileInputStream(avro), new GenericDatumReader<GenericRecord>());
    for (final Field field : dataStream.getSchema().getFields()) {
        ret.add(field.name());
    }
    dataStream.close();
    return ret;
}
AvroJob.setInputSchema(job, Schema.create(Schema.Type.STRING));
job.set(AvroJob.OUTPUT_SCHEMA, outscheme.toString());

DataFileStream<Pair<Utf8, Long>> counts = new DataFileStream<>(cin, reader);
int numWords = 0;
for (Pair<Utf8, Long> wc : counts) {
    // ...
}
public static Schema extractSchemaFromAvro(File avroFile, Map<String, FieldType> fieldTypeMap, TimeUnit granularity)
    throws IOException {
    DataFileStream<GenericRecord> dataStream =
        new DataFileStream<>(new FileInputStream(avroFile), new GenericDatumReader<GenericRecord>());
    Schema schema = new Schema();

    for (final Field field : dataStream.getSchema().getFields()) {
        final String columnName = field.name();
        FieldType fieldType = fieldTypeMap.get(columnName);
        // ...
    }

    dataStream.close();
    return schema;
}
@BeforeClass
public static void before() throws Exception {
    final String filePath =
        TestUtils.getFileFromResourceUrl(DictionariesTest.class.getClassLoader().getResource(AVRO_DATA));
    if (INDEX_DIR.exists()) {
        FileUtils.deleteQuietly(INDEX_DIR);
    }

    final SegmentIndexCreationDriver driver = SegmentCreationDriverFactory.get(null);
    final SegmentGeneratorConfig config = SegmentTestUtils.getSegmentGenSpecWithSchemAndProjectedColumns(
        new File(filePath), INDEX_DIR, "weeksSinceEpochSunday", TimeUnit.DAYS, "test");
    config.setTimeColumnName("weeksSinceEpochSunday");
    driver.init(config);
    driver.build();

    final DataFileStream<GenericRecord> avroReader = AvroUtils.getAvroReader(new File(filePath));
    final org.apache.avro.Schema avroSchema = avroReader.getSchema();
    final String[] columns = new String[avroSchema.getFields().size()];
    int i = 0;
    for (final Field f : avroSchema.getFields()) {
        columns[i] = f.name();
        i++;
    }
}
final org.apache.avro.Schema avroSchema = avroReader.getSchema();
final String[] columns = new String[avroSchema.getFields().size()];
int i = 0;
for (final Field f : avroSchema.getFields()) {
    columns[i] = f.name();
    i++;
}

while (avroReader.hasNext()) {
    final GenericRecord rec = avroReader.next();
    for (final String column : columns) {
        Object val = rec.get(column);
        // ...
    }
}
DataFileStream<Object> fileReader = new DataFileStream<>(inStream, reader);
if (!fileReader.getSchema().equals(new Schema.Parser().parse(TEXT_FILE_SCHEMA))) {
    err.println("Avro file is not generic text schema");
    p.printHelpOn(err);
    fileReader.close();
    return 1;
}

while (fileReader.hasNext()) {
    ByteBuffer outBuff = (ByteBuffer) fileReader.next();
    outStream.write(outBuff.array());
    outStream.write(LINE_SEPARATOR);
}

fileReader.close();
Util.close(inStream);
Util.close(outStream);
DataFileStream<GenericRecord> reader = new DataFileStream<>(input, new GenericDatumReader<>());

// First input file: record its schema, metadata, and codec.
schema = reader.getSchema();
for (String key : reader.getMetaKeys()) {
    if (!DataFileWriter.isReservedMeta(key)) {
        byte[] metadatum = reader.getMeta(key);
        metadata.put(key, metadatum);
        writer.setMeta(key, metadatum);
    }
}
inputCodec = reader.getMetaString(DataFileConstants.CODEC);
if (inputCodec == null) {
    inputCodec = DataFileConstants.NULL_CODEC;
}

// Subsequent input files must match the first file's schema and metadata.
if (!schema.equals(reader.getSchema())) {
    err.println("input files have different schemas");
    reader.close();
    return 1;
}
for (String key : reader.getMetaKeys()) {
    if (!DataFileWriter.isReservedMeta(key)) {
        byte[] metadatum = reader.getMeta(key);
        byte[] writersMetadatum = metadata.get(key);
        if (!Arrays.equals(metadatum, writersMetadatum)) {
            err.println("input files have different non-reserved metadata");
            reader.close();
            return 2;
        }
    }
}
String thisCodec = reader.getMetaString(DataFileConstants.CODEC);
@Test()
public void testUseMeta() throws IOException {
    DataFileWriter<?> w = new DataFileWriter<>(new GenericDatumWriter<>());
    File f = new File(DIR.getRoot().getPath(), "testDataFileMeta.avro");
    w.setMeta("hello", "bar");
    w.create(Schema.create(Type.NULL), f);
    w.close();

    DataFileStream<Void> r = new DataFileStream<>(new FileInputStream(f), new GenericDatumReader<>());
    assertTrue(r.getMetaKeys().contains("hello"));
    assertEquals("bar", r.getMetaString("hello"));
}
DataFileStream<GenericRecord> reader = new DataFileStream<>(input, new GenericDatumReader<>());
Schema schema = reader.getSchema();
DataFileWriter<GenericRecord> writer = new DataFileWriter<>(new GenericDatumWriter<>());
for (String key : reader.getMetaKeys()) {
    if (!DataFileWriter.isReservedMeta(key)) {
        writer.setMeta(key, reader.getMeta(key));
    }
}
private void validateSchema() {
    org.apache.avro.Schema avroSchema = _avroReader.getSchema();
    for (FieldSpec fieldSpec : _schema.getAllFieldSpecs()) {
        String fieldName = fieldSpec.getName();
        Field avroField = avroSchema.getField(fieldName);
        if (avroField == null) {
            LOGGER.warn("Pinot field: {} does not exist in Avro Schema", fieldName);
        } else {
            boolean isPinotFieldSingleValue = fieldSpec.isSingleValueField();
            boolean isAvroFieldSingleValue = AvroUtils.isSingleValueField(avroField);
            if (isPinotFieldSingleValue != isAvroFieldSingleValue) {
                String errorMessage = "Pinot field: " + fieldName + " is "
                    + (isPinotFieldSingleValue ? "Single" : "Multi")
                    + "-valued in Pinot schema but not in Avro schema";
                LOGGER.error(errorMessage);
                throw new IllegalStateException(errorMessage);
            }

            DataType pinotFieldDataType = fieldSpec.getDataType();
            DataType avroFieldDataType = AvroUtils.extractFieldDataType(avroField);
            if (pinotFieldDataType != avroFieldDataType) {
                LOGGER.warn("Pinot field: {} of type: {} mismatches with corresponding field in Avro Schema of type: {}",
                    fieldName, pinotFieldDataType, avroFieldDataType);
            }
        }
    }
}
private void nextInput() throws IOException {
    currentInput++;
    Path path = inFiles.get(currentInput);
    FSDataInputStream input = new FSDataInputStream(Util.openFromFS(path));
    reader = new DataFileStream<>(input, new GenericDatumReader<>());
    if (schema == null) {
        // If this is the first file, save its schema.
        schema = reader.getSchema();
    } else if (!schema.equals(reader.getSchema())) {
        // Subsequent files must have an equal schema.
        throw new IOException("schemas don't match");
    }
}
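Once the schemas are confirmed equal, the appending itself can be done with DataFileWriter.appendAllFrom, which copies data from a DataFileStream block by block. A minimal local-file sketch, assuming both files already exist and share a schema; the file names are hypothetical:

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import org.apache.avro.file.DataFileStream;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;

public class ConcatSketch {
    public static void main(String[] args) throws IOException {
        File target = new File("part-1.avro"); // hypothetical: file to extend
        File source = new File("part-2.avro"); // hypothetical: file whose records are appended

        try (DataFileWriter<GenericRecord> writer =
                 new DataFileWriter<>(new GenericDatumWriter<GenericRecord>()).appendTo(target);
             DataFileStream<GenericRecord> reader = new DataFileStream<>(
                 new FileInputStream(source), new GenericDatumReader<>())) {
            // With recompress = false, compatible compressed blocks are copied as-is.
            writer.appendAllFrom(reader, false);
        }
    }
}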