/**
 * Checks whether a Parquet type is a valid logical 'map' type.
 *
 * A valid map is a non-repeated group annotated MAP (or the legacy
 * MAP_KEY_VALUE) containing exactly one repeated group with two fields
 * named "key" (primitive) and "value".
 *
 * @param groupType the Parquet type to inspect
 * @return true if the type matches the logical map layout described above
 */
private static boolean isLogicalMapType(Type groupType) {
  OriginalType ot = groupType.getOriginalType();
  // A map must be a non-repeated, annotated group.
  if (groupType.isPrimitive() || ot == null
      || groupType.isRepetition(Type.Repetition.REPEATED)) {
    return false;
  }
  // Use the cached annotation instead of re-querying the type twice.
  if (ot.equals(OriginalType.MAP) || ot.equals(OriginalType.MAP_KEY_VALUE)) {
    GroupType myMapType = groupType.asGroupType();
    // The wrapper must hold exactly one non-primitive (repeated group) child.
    if (myMapType.getFieldCount() != 1 || myMapType.getFields().get(0).isPrimitive()) {
      return false;
    }
    GroupType mapItemType = myMapType.getFields().get(0).asGroupType();
    // Inner group: repeated, two fields — primitive "key" and "value".
    return mapItemType.isRepetition(Type.Repetition.REPEATED)
        && mapItemType.getFieldCount() == 2
        && mapItemType.getFields().get(0).getName().equalsIgnoreCase("key")
        && mapItemType.getFields().get(0).isPrimitive()
        && mapItemType.getFields().get(1).getName().equalsIgnoreCase("value");
  }
  return false;
}
/**
 * Resolves the primitive element type of the given Parquet type.
 *
 * A primitive type is returned as-is; a group type is assumed to be a
 * 3-level list wrapper whose repeated inner group holds a single primitive.
 *
 * @throws RuntimeException if the group has more than one field (nested
 *         types are not supported by the vectorized reader)
 */
private PrimitiveType getElementType(Type type) {
  if (type.isPrimitive()) {
    return type.asPrimitiveType();
  }
  GroupType listGroup = type.asGroupType();
  if (listGroup.getFields().size() > 1) {
    throw new RuntimeException(
        "Current Parquet Vectorization reader doesn't support nested type");
  }
  // Descend: outer list group -> repeated group -> primitive element.
  Type repeated = listGroup.getFields().get(0);
  return repeated.asGroupType().getFields().get(0).asPrimitiveType();
}
@Test public void testMapOriginalType() throws Exception { final String hiveColumnTypes = "map<string,string>"; final String hiveColumnNames = "mapCol"; final List<String> columnNames = createHiveColumnsFrom(hiveColumnNames); final List<TypeInfo> columnTypes = createHiveTypeInfoFrom(hiveColumnTypes); final MessageType messageTypeFound = HiveSchemaConverter.convert(columnNames, columnTypes); // this messageType only has one optional field, whose name is mapCol, original Type is MAP assertEquals(1, messageTypeFound.getFieldCount()); org.apache.parquet.schema.Type topLevel = messageTypeFound.getFields().get(0); assertEquals("mapCol",topLevel.getName()); assertEquals(OriginalType.MAP, topLevel.getOriginalType()); assertEquals(Repetition.OPTIONAL, topLevel.getRepetition()); assertEquals(1, topLevel.asGroupType().getFieldCount()); org.apache.parquet.schema.Type secondLevel = topLevel.asGroupType().getFields().get(0); //there is one repeated field for mapCol, the field name is "map" and its original Type is MAP_KEY_VALUE; assertEquals("map", secondLevel.getName()); assertEquals(OriginalType.MAP_KEY_VALUE, secondLevel.getOriginalType()); assertEquals(Repetition.REPEATED, secondLevel.getRepetition()); } }
/**
 * Checks whether a Parquet type is a valid logical 'list' type: a
 * non-primitive group annotated LIST whose single child is repeated.
 */
private static boolean isLogicalListType(Type listType) {
  if (listType.isPrimitive()) {
    return false;
  }
  // OriginalType.LIST.equals(null) is false, covering the null annotation case.
  if (!OriginalType.LIST.equals(listType.getOriginalType())) {
    return false;
  }
  GroupType group = listType.asGroupType();
  return group.getFieldCount() == 1
      && group.getFields().get(0).isRepetition(Type.Repetition.REPEATED);
}
private static boolean isElementType(Type repeatedType, String parentName) { if (repeatedType.isPrimitive() || (repeatedType.asGroupType().getFieldCount() != 1)) { return true; } else if (repeatedType.getName().equals("array")) { return true; // existing avro data } else if (repeatedType.getName().equals(parentName + "_tuple")) { return true; // existing thrift data } // false for the following cases: // * name is "list", which matches the spec // * name is "bag", which indicates existing hive or pig data // * ambiguous case, which should be assumed is 3-level according to spec return false; } }
this.parent = parent; this.avroSchema = avroSchema; int schemaSize = parquetSchema.getFieldCount(); this.converters = new Converter[schemaSize]; this.specificClass = getDatumClass(baseModel, avroSchema); for (Schema.Field field: avroSchema.getFields()) { avroFieldIndexes.put(field.name(), avroFieldIndex++); for (Type parquetField: parquetSchema.getFields()) { Schema.Field avroField = getAvroField(parquetField.getName()); Schema nonNullSchema = AvroSchemaConverter.getNonNull(avroField.schema()); final int finalAvroIndex = avroFieldIndexes.remove(avroField.name()); Schema.Field field = avroSchema.getField(fieldName); if (field.schema().getType() == Schema.Type.NULL) { continue; // skip null since Parquet does not write nulls
/**
 * Builds converters for each non-null member of an Avro union.
 *
 * Parquet represents a union as a group with one field per non-null member
 * schema; NULL members have no corresponding Parquet field, so the Parquet
 * field index (parquetIndex) can lag behind the Avro member index.
 *
 * @param parent        container receiving the resolved union value
 * @param parquetSchema the Parquet group holding one field per non-null member
 * @param avroSchema    the Avro union schema
 * @param model         the Avro data model used to build child converters
 */
public AvroUnionConverter(ParentValueContainer parent, Type parquetSchema,
                          Schema avroSchema, GenericData model) {
  this.parent = parent;
  GroupType parquetGroup = parquetSchema.asGroupType();
  this.memberConverters = new Converter[ parquetGroup.getFieldCount()];

  int parquetIndex = 0;
  for (int index = 0; index < avroSchema.getTypes().size(); index++) {
    Schema memberSchema = avroSchema.getTypes().get(index);
    if (!memberSchema.getType().equals(Schema.Type.NULL)) {
      Type memberType = parquetGroup.getType(parquetIndex);
      memberConverters[parquetIndex] = newConverter(memberSchema, memberType, model,
          new ParentValueContainer() {
        @Override
        public void add(Object value) {
          // A record must resolve to at most one union member per value.
          Preconditions.checkArgument(memberValue==null,
              "Union is resolving to more than one type");
          memberValue = value;
        }
      });
      parquetIndex++; // Note: for nulls the parquetIndex is not increased
    }
  }
}
private void writeRecordFields(GroupType schema, Schema avroSchema, Object record) { List<Type> fields = schema.getFields(); List<Schema.Field> avroFields = avroSchema.getFields(); int index = 0; // parquet ignores Avro nulls, so index may differ for (int avroIndex = 0; avroIndex < avroFields.size(); avroIndex++) { Schema.Field avroField = avroFields.get(avroIndex); if (avroField.schema().getType().equals(Schema.Type.NULL)) { continue; } Type fieldType = fields.get(index); Object value = model.getField(record, avroField.name(), avroIndex); if (value != null) { recordConsumer.startField(fieldType.getName(), index); writeValue(fieldType, avroField.schema(), value); recordConsumer.endField(fieldType.getName(), index); } else if (fieldType.isRepetition(Type.Repetition.REQUIRED)) { throw new RuntimeException("Null-value for required field: " + avroField.name()); } index++; } }
private static Converter newConverter(Schema schema, Type type, GenericData model, ParentValueContainer setter) { LogicalType logicalType = schema.getLogicalType(); .getConversionContainer(setter, conversion, schema); if (schema.getType().equals(Schema.Type.BOOLEAN)) { return new AvroConverters.FieldBooleanConverter(parent); } else if (schema.getType().equals(Schema.Type.INT)) { return new AvroConverters.FieldIntegerConverter(parent); } else if (schema.getType().equals(Schema.Type.LONG)) { return new AvroConverters.FieldStringConverter(parent); } else if (schema.getType().equals(Schema.Type.RECORD)) { return new AvroIndexedRecordConverter(parent, type.asGroupType(), schema, model); } else if (schema.getType().equals(Schema.Type.ENUM)) { return new FieldEnumConverter(parent, schema, model); } else if (schema.getType().equals(Schema.Type.ARRAY)) { return new AvroArrayConverter(parent, type.asGroupType(), schema, model); } else if (schema.getType().equals(Schema.Type.MAP)) { return new MapConverter(parent, type.asGroupType(), schema, model); } else if (schema.getType().equals(Schema.Type.UNION)) { return new AvroUnionConverter(parent, type, schema, model);
switch (avroSchema.getType()) { case BOOLEAN: recordConsumer.addBoolean((Boolean) value); break; case INT: if (value instanceof Character) { recordConsumer.addInteger((Character) value); } else { recordConsumer.addInteger(((Number) value).intValue()); break; case RECORD: writeRecord(type.asGroupType(), avroSchema, value); break; case ENUM: break; case ARRAY: listWriter.writeList(type.asGroupType(), avroSchema, value); break; case MAP: writeMap(type.asGroupType(), avroSchema, (Map<CharSequence, ?>) value); break; case UNION: writeUnion(type.asGroupType(), avroSchema, value); break;
/**
 * Writes a map value using the 2-level Parquet map layout:
 * an outer group (annotated MAP) containing a repeated key/value group.
 *
 * @param schema     the Parquet group for the map (outer wrapper)
 * @param avroSchema the Avro map schema (supplies the value schema)
 * @param map        the map to write; an empty map writes only the wrapper group
 * @throws RuntimeException if a value is null but the Parquet value field
 *         is not OPTIONAL
 */
private <V> void writeMap(GroupType schema, Schema avroSchema,
                          Map<CharSequence, V> map) {
  GroupType innerGroup = schema.getType(0).asGroupType();
  Type keyType = innerGroup.getType(0);
  Type valueType = innerGroup.getType(1);

  recordConsumer.startGroup(); // group wrapper (original type MAP)
  if (map.size() > 0) {
    // The repeated field must not be started for an empty map.
    recordConsumer.startField(MAP_REPEATED_NAME, 0);

    for (Map.Entry<CharSequence, V> entry : map.entrySet()) {
      recordConsumer.startGroup(); // repeated group key_value, middle layer
      recordConsumer.startField(MAP_KEY_NAME, 0);
      writeValue(keyType, MAP_KEY_SCHEMA, entry.getKey());
      recordConsumer.endField(MAP_KEY_NAME, 0);
      V value = entry.getValue();
      if (value != null) {
        recordConsumer.startField(MAP_VALUE_NAME, 1);
        writeValue(valueType, avroSchema.getValueType(), value);
        recordConsumer.endField(MAP_VALUE_NAME, 1);
      } else if (!valueType.isRepetition(Type.Repetition.OPTIONAL)) {
        // Null is only representable when the value field is OPTIONAL.
        throw new RuntimeException("Null map value for " + avroSchema.getName());
      }
      recordConsumer.endGroup();
    }

    recordConsumer.endField(MAP_REPEATED_NAME, 0);
  }
  recordConsumer.endGroup();
}
/**
 * Writes a single TupleEntry as a Parquet record, skipping absent values.
 * Only primitive column types are supported.
 *
 * @throws UnsupportedOperationException for non-primitive schema fields
 */
@Override
public void write(TupleEntry record) {
  recordConsumer.startMessage();
  final List<Type> fields = rootSchema.getFields();
  int index = 0;
  for (Type field : fields) {
    // Null record or null value: the field is simply omitted.
    Object value = (record == null) ? null : record.getObject(field.getName());
    if (value != null) {
      recordConsumer.startField(field.getName(), index);
      if (field.isPrimitive()) {
        writePrimitive(record, field.asPrimitiveType());
      } else {
        throw new UnsupportedOperationException("Complex type not implemented");
      }
      recordConsumer.endField(field.getName(), index);
    }
    index++;
  }
  recordConsumer.endMessage();
}
/** * Returns whether the given type is the element type of a list or is a * synthetic group with one field that is the element type. This is * determined by checking whether the type can be a synthetic group and by * checking whether a potential synthetic group matches the expected schema. * <p> * Unlike {@link AvroSchemaConverter#isElementType(Type, String)}, this * method never guesses because the expected schema is known. * * @param repeatedType a type that may be the element type * @param elementSchema the expected Schema for list elements * @return {@code true} if the repeatedType is the element schema */ static boolean isElementType(Type repeatedType, Schema elementSchema) { if (repeatedType.isPrimitive() || repeatedType.asGroupType().getFieldCount() > 1 || repeatedType.asGroupType().getType(0).isRepetition(REPEATED)) { // The repeated type must be the element type because it is an invalid // synthetic wrapper. Must be a group with one optional or required field return true; } else if (elementSchema != null && elementSchema.getType() == Schema.Type.RECORD) { Schema schemaFromRepeated = CONVERTER.convert(repeatedType.asGroupType()); if (checkReaderWriterCompatibility(elementSchema, schemaFromRepeated) .getType() == COMPATIBLE) { return true; } } return false; }
/**
 * Converter for reading a Parquet list into a Java array.
 *
 * @param parent     container receiving the completed array
 * @param type       the Parquet group for the list
 * @param avroSchema the Avro array schema
 * @param model      the Avro data model
 * @param arrayClass the target Java array class; must be an array type
 */
public AvroArrayConverter(ParentValueContainer parent, GroupType type,
                          Schema avroSchema, GenericData model,
                          Class<?> arrayClass) {
  Preconditions.checkArgument(arrayClass.isArray(),
      "Cannot convert non-array: " + arrayClass.getName());
  this.parent = parent;
  this.avroSchema = avroSchema;
  this.elementClass = arrayClass.getComponentType();

  ParentValueContainer setter = createSetterAndContainer();
  Schema elementSchema = this.avroSchema.getElementType();
  Type repeatedType = type.getType(0);
  // always determine whether the repeated type is the element type by
  // matching it against the element schema.
  if (isElementType(repeatedType, elementSchema)) {
    // 2-level layout: the repeated type is the (required) element itself.
    converter = newConverter(elementSchema, repeatedType, model, elementClass, setter);
  } else {
    // 3-level layout: the element is wrapped in a synthetic group and may
    // be optional.
    converter = new ArrayElementConverter(
        repeatedType.asGroupType(), elementSchema, model, setter);
  }
}
private Schema convertFields(String name, List<Type> parquetFields) { List<Schema.Field> fields = new ArrayList<Schema.Field>(); for (Type parquetType : parquetFields) { Schema fieldSchema = convertField(parquetType); if (parquetType.isRepetition(REPEATED)) { throw new UnsupportedOperationException("REPEATED not supported outside LIST or MAP. Type: " + parquetType); } else if (parquetType.isRepetition(Type.Repetition.OPTIONAL)) { fields.add(new Schema.Field( parquetType.getName(), optional(fieldSchema), null, NULL_VALUE)); } else { // REQUIRED fields.add(new Schema.Field( parquetType.getName(), fieldSchema, null, (Object) null)); } } Schema schema = Schema.createRecord(name, null, null, false); schema.setFields(fields); return schema; }
private void initializeInternal() throws IOException, UnsupportedOperationException { // Check that the requested schema is supported. missingColumns = new boolean[requestedSchema.getFieldCount()]; List<ColumnDescriptor> columns = requestedSchema.getColumns(); List<String[]> paths = requestedSchema.getPaths(); for (int i = 0; i < requestedSchema.getFieldCount(); ++i) { Type t = requestedSchema.getFields().get(i); if (!t.isPrimitive() || t.isRepetition(Type.Repetition.REPEATED)) { throw new UnsupportedOperationException("Complex types not supported."); } String[] colPath = paths.get(i); if (fileSchema.containsPath(colPath)) { ColumnDescriptor fd = fileSchema.getColumnDescription(colPath); if (!fd.equals(columns.get(i))) { throw new UnsupportedOperationException("Schema evolution not supported."); } missingColumns[i] = false; } else { if (columns.get(i).getMaxDefinitionLevel() == 0) { // Column is missing in data but the required data is non-nullable. This file is invalid. throw new IOException("Required column is missing in data file. Col: " + Arrays.toString(colPath)); } missingColumns[i] = true; } } }
/**
 * Changes the list inner '$data$' vector name to 'element' in the schema.
 *
 * Rebuilds the child type with an identical repetition, content and
 * annotation but with the name replaced by "element"; only the name changes.
 *
 * @param childType the original inner type of the list
 * @return a copy of {@code childType} renamed to "element"
 */
private Type renameChildTypeToElement(Type childType) {
  if (childType.isPrimitive()) {
    PrimitiveType childPrimitiveType = childType.asPrimitiveType();
    // Copy every primitive attribute; the trailing null is the type id.
    return new PrimitiveType(childType.getRepetition(),
        childPrimitiveType.getPrimitiveTypeName(),
        childPrimitiveType.getTypeLength(),
        "element",
        childPrimitiveType.getOriginalType(),
        childPrimitiveType.getDecimalMetadata(),
        null);
  } else {
    // Group type: reuse the existing fields under the new name.
    GroupType childGroupType = childType.asGroupType();
    return new GroupType(childType.getRepetition(),
        "element",
        childType.getOriginalType(),
        childGroupType.getFields());
  }
}
/**
 * Builds a converter per top-level schema field, keyed by field position.
 *
 * Primitive fields use a {@code PrimitiveConverter}; group fields are
 * handled by a {@code BypassGroupConverter}.
 */
private Map<Integer, Converter> buildFieldToConverter(final MessageType schema) {
  final Map<Integer, Converter> fieldToConverter = new HashMap<>(fieldCount);
  int index = 0;
  for (final Type field : schema.getFields()) {
    final String[] fieldPath = new String[]{field.getName()};
    final Converter converter;
    if (field.isPrimitive()) {
      // Record the Java class name of the primitive for downstream mapping.
      final String javaTypeName =
          field.asPrimitiveType().getPrimitiveTypeName().javaType.getSimpleName();
      converter = new PrimitiveConverter(parquetColumnToObject, javaTypeName,
          fieldPath, field.getOriginalType());
    } else {
      converter = new BypassGroupConverter(parquetColumnToObject,
          field.asGroupType(), fieldPath);
    }
    fieldToConverter.put(index, converter);
    index++;
  }
  return fieldToConverter;
}
/**
 * Recursively projects a dotted path onto the schema, returning a pruned
 * type chain: each level keeps only the single child on the path.
 *
 * @param pathSegments the full column path
 * @param depth        the current level (0-based) within pathSegments
 * @param schema       the message schema to project against
 */
private static Type getType(String[] pathSegments, int depth, MessageType schema) {
  Type type = schema.getType(Arrays.copyOfRange(pathSegments, 0, depth + 1));
  boolean isLeaf = depth + 1 == pathSegments.length;
  if (isLeaf) {
    return type;
  }
  // Only group types can contain the remaining path segments.
  Preconditions.checkState(!type.isPrimitive());
  return new GroupType(type.getRepetition(), type.getName(), type.getOriginalType(),
      getType(pathSegments, depth + 1, schema));
}
/**
 * Validates that a primitive value being written matches the current field.
 *
 * Checks two invariants: a non-REPEATED field receives at most one value,
 * and the field's primitive type name matches the value's type {@code p}.
 *
 * @param p the primitive type of the value being written
 * @throws InvalidRecordException on a repetition or type mismatch
 */
private void validate(PrimitiveTypeName p) {
  // Resolve the field currently being written from the type/field stacks.
  Type currentType = types.peek().asGroupType().getType(fields.peek());
  // Bump the per-field value count (pop/push keeps the stack depth intact).
  int c = fieldValueCount.pop() + 1;
  fieldValueCount.push(c);
  LOG.debug("validate {} for {}",p ,currentType.getName());
  switch (currentType.getRepetition()) {
    case OPTIONAL:
    case REQUIRED:
      // Non-repeated fields may hold at most one value per record.
      if (c > 1) {
        throw new InvalidRecordException(
            "repeated value when the type is not repeated in " + currentType);
      }
      break;
    case REPEATED:
      break;
    default:
      throw new InvalidRecordException(
          "unknown repetition " + currentType.getRepetition() + " in " + currentType);
  }
  // The written primitive must match the declared primitive type exactly.
  if (!currentType.isPrimitive()
      || currentType.asPrimitiveType().getPrimitiveTypeName() != p) {
    throw new InvalidRecordException("expected type " + p + " but got "+ currentType);
  }
}