/**
 * Gathers every occurrence of a repeated field of {@code g} into a list.
 *
 * <p>Primitive occurrences are converted with {@code convertPrimitiveField}; group
 * occurrences are added as the {@code Group} returned by {@code g.getGroup}.
 *
 * @param g the group holding the repeated field
 * @param fieldIndex index of the field within the group's type
 * @param binaryAsString forwarded unchanged to {@code convertPrimitiveField}
 * @return one list entry per occurrence of the field, in occurrence order
 */
private static List<Object> convertRepeatedFieldToList(Group g, int fieldIndex, boolean binaryAsString)
{
  Type fieldType = g.getType().getFields().get(fieldIndex);
  assert fieldType.getRepetition().equals(Type.Repetition.REPEATED);

  int occurrences = g.getFieldRepetitionCount(fieldIndex);
  List<Object> converted = new ArrayList<>();
  int occurrence = 0;
  while (occurrence < occurrences) {
    final Object value;
    if (fieldType.isPrimitive()) {
      value = convertPrimitiveField(g, fieldIndex, occurrence, binaryAsString);
    } else {
      value = g.getGroup(fieldIndex, occurrence);
    }
    converted.add(value);
    occurrence++;
  }
  return converted;
}
if (fieldType.getRepetition().equals(Type.Repetition.REPEATED)) { int repeated = g.getFieldRepetitionCount(fieldIndex); List<Object> vals = new ArrayList<>();
return Types.buildGroup(fieldType.getRepetition()) .addFields(typesArray) .named(fieldType.getName());
@Test public void testMapOriginalType() throws Exception { final String hiveColumnTypes = "map<string,string>"; final String hiveColumnNames = "mapCol"; final List<String> columnNames = createHiveColumnsFrom(hiveColumnNames); final List<TypeInfo> columnTypes = createHiveTypeInfoFrom(hiveColumnTypes); final MessageType messageTypeFound = HiveSchemaConverter.convert(columnNames, columnTypes); // this messageType only has one optional field, whose name is mapCol, original Type is MAP assertEquals(1, messageTypeFound.getFieldCount()); org.apache.parquet.schema.Type topLevel = messageTypeFound.getFields().get(0); assertEquals("mapCol",topLevel.getName()); assertEquals(OriginalType.MAP, topLevel.getOriginalType()); assertEquals(Repetition.OPTIONAL, topLevel.getRepetition()); assertEquals(1, topLevel.asGroupType().getFieldCount()); org.apache.parquet.schema.Type secondLevel = topLevel.asGroupType().getFields().get(0); //there is one repeated field for mapCol, the field name is "map" and its original Type is MAP_KEY_VALUE; assertEquals("map", secondLevel.getName()); assertEquals(OriginalType.MAP_KEY_VALUE, secondLevel.getOriginalType()); assertEquals(Repetition.REPEATED, secondLevel.getRepetition()); } }
/**
 * Maps a Parquet type using its own name and repetition.
 *
 * @param type the Parquet type to map
 * @return the mapping produced by the three-argument {@code fromParquet} overload
 */
private TypeMapping fromParquet(Type type) {
  final TypeMapping mapping = fromParquet(type, type.getName(), type.getRepetition());
  return mapping;
}
/**
 * Builds a map from top-level Parquet field name to the Flink {@link InternalType}
 * produced by {@code convertType} for that field.
 *
 * @param parquetSchema the Parquet {@link MessageType} whose top-level fields are converted
 * @return field name to converted type, one entry per top-level field
 * @throws UnsupportedOperationException if a top-level field is neither OPTIONAL nor REQUIRED
 */
public Map<String, InternalType> convertToInternalType(MessageType parquetSchema) {
  final List<Type> topLevelFields = parquetSchema.asGroupType().getFields();
  final Map<String, InternalType> typesByName = new HashMap<>();
  for (Type field : topLevelFields) {
    final String fieldName = field.getName();
    switch (field.getRepetition()) {
      case OPTIONAL:
      case REQUIRED:
        typesByName.put(fieldName, convertType(field));
        break;
      default:
        // Any other repetition is rejected at the top level.
        throw new UnsupportedOperationException(field + " is not supported");
    }
  }
  return typesByName;
}
/**
 * Builds the type for the path {@code pathSegments}, starting at {@code depth}.
 *
 * <p>At the last segment the schema's own type is returned. At every earlier segment a new
 * {@link GroupType} is created with the schema type's repetition, name, and original type,
 * and with the result for the next depth as its only field.
 *
 * @param pathSegments the full column path
 * @param depth zero-based index of the segment being resolved
 * @param schema the schema the path is resolved against
 * @return the leaf type, or a single-field group wrapping the remainder of the path
 */
private static Type getType(String[] pathSegments, int depth, MessageType schema) {
  final int nextDepth = depth + 1;
  final Type resolved = schema.getType(Arrays.copyOfRange(pathSegments, 0, nextDepth));
  if (nextDepth != pathSegments.length) {
    // An intermediate segment must be a group to have something nested below it.
    Preconditions.checkState(!resolved.isPrimitive());
    return new GroupType(
        resolved.getRepetition(),
        resolved.getName(),
        resolved.getOriginalType(),
        getType(pathSegments, nextDepth, schema));
  }
  return resolved;
}
/**
 * Checks that a value of primitive type {@code p} may be written to the current field.
 *
 * <p>Increments the value count on top of {@code fieldValueCount}, then rejects a second
 * value for an OPTIONAL or REQUIRED field and any field whose type is not the primitive
 * {@code p}.
 *
 * @param p the primitive type of the value being added
 * @throws InvalidRecordException if the repetition or the type does not allow the value
 */
private void validate(PrimitiveTypeName p) {
  final Type fieldType = types.peek().asGroupType().getType(fields.peek());
  final int valuesSoFar = fieldValueCount.pop() + 1;
  fieldValueCount.push(valuesSoFar);
  LOG.debug("validate {} for {}",p ,fieldType.getName());

  switch (fieldType.getRepetition()) {
    case REPEATED:
      // Any number of values is acceptable.
      break;
    case OPTIONAL:
    case REQUIRED:
      if (valuesSoFar > 1) {
        throw new InvalidRecordException("repeated value when the type is not repeated in " + fieldType);
      }
      break;
    default:
      throw new InvalidRecordException("unknown repetition " + fieldType.getRepetition() + " in " + fieldType);
  }

  final boolean typeMatches =
      fieldType.isPrimitive() && fieldType.asPrimitiveType().getPrimitiveTypeName() == p;
  if (!typeMatches) {
    throw new InvalidRecordException("expected type " + p + " but got "+ fieldType);
  }
}
/**
 * Verifies that the current field accepts one more value of primitive type {@code p}.
 *
 * <p>The count on top of {@code fieldValueCount} is bumped by one first. A field that is
 * OPTIONAL or REQUIRED may hold at most one value; a REPEATED field may hold any number.
 * The field must also be a primitive whose type name is exactly {@code p}.
 *
 * @param p the primitive type of the value being added
 * @throws InvalidRecordException on a repetition violation or a type mismatch
 */
private void validate(PrimitiveTypeName p) {
  final Type expected = types.peek().asGroupType().getType(fields.peek());
  final int count = fieldValueCount.pop() + 1;
  fieldValueCount.push(count);
  LOG.debug("validate {} for {}",p ,expected.getName());

  switch (expected.getRepetition()) {
    case REPEATED:
      break;
    case OPTIONAL:
    case REQUIRED:
      if (count > 1) {
        throw new InvalidRecordException("repeated value when the type is not repeated in " + expected);
      }
      break;
    default:
      throw new InvalidRecordException("unknown repetition " + expected.getRepetition() + " in " + expected);
  }

  if (expected.isPrimitive() && expected.asPrimitiveType().getPrimitiveTypeName() == p) {
    return;
  }
  throw new InvalidRecordException("expected type " + p + " but got "+ expected);
}
private void validate(PrimitiveTypeName... ptypes) { Type currentType = types.peek().asGroupType().getType(fields.peek()); int c = fieldValueCount.pop() + 1; fieldValueCount.push(c); if (LOG.isDebugEnabled()) LOG.debug("validate " + Arrays.toString(ptypes) + " for " + currentType.getName()); switch (currentType.getRepetition()) { case OPTIONAL: case REQUIRED: if (c > 1) { throw new InvalidRecordException("repeated value when the type is not repeated in " + currentType); } break; case REPEATED: break; default: throw new InvalidRecordException("unknown repetition " + currentType.getRepetition() + " in " + currentType); } if (!currentType.isPrimitive()) { throw new InvalidRecordException( "expected type in " + Arrays.toString(ptypes) + " but got " + currentType); } for (PrimitiveTypeName p : ptypes) { if (currentType.asPrimitiveType().getPrimitiveTypeName() == p) { return; // type is valid } } throw new InvalidRecordException( "expected type in " + Arrays.toString(ptypes) + " but got " + currentType); }
/**
 * Builds the type for the path {@code pathSegments}, starting at {@code depth}.
 *
 * <p>At the last segment the schema's own type is returned. At every earlier segment a new
 * {@link GroupType} is created with the schema type's repetition and name, and with the
 * result for the next depth as its only field.
 *
 * @param pathSegments the full column path
 * @param depth zero-based index of the segment being resolved
 * @param schema the schema the path is resolved against
 * @return the leaf type, or a single-field group wrapping the remainder of the path
 */
private static Type getType(String[] pathSegments, int depth, MessageType schema) {
  final int nextDepth = depth + 1;
  final Type resolved = schema.getType(Arrays.copyOfRange(pathSegments, 0, nextDepth));
  if (nextDepth != pathSegments.length) {
    // An intermediate segment must be a group to have something nested below it.
    Preconditions.checkState(!resolved.isPrimitive());
    return new GroupType(
        resolved.getRepetition(),
        resolved.getName(),
        getType(pathSegments, nextDepth, schema));
  }
  return resolved;
}
private void visitChildren(GroupColumnIO newIO, GroupType groupType, GroupType requestedGroupType) { GroupColumnIO oldIO = current; current = newIO; for (Type type : groupType.getFields()) { // if the file schema does not contain the field it will just stay null if (requestedGroupType.containsField(type.getName())) { currentRequestedIndex = requestedGroupType.getFieldIndex(type.getName()); currentRequestedType = requestedGroupType.getType(currentRequestedIndex); if (currentRequestedType.getRepetition().isMoreRestrictiveThan(type.getRepetition())) { incompatibleSchema(type, currentRequestedType); } type.accept(this); } } current = oldIO; }
private void visitChildren(GroupColumnIO newIO, GroupType groupType, GroupType requestedGroupType) { GroupColumnIO oldIO = current; current = newIO; for (Type type : groupType.getFields()) { // if the file schema does not contain the field it will just stay null if (requestedGroupType.containsField(type.getName())) { currentRequestedIndex = requestedGroupType.getFieldIndex(type.getName()); currentRequestedType = requestedGroupType.getType(currentRequestedIndex); if (currentRequestedType.getRepetition().isMoreRestrictiveThan(type.getRepetition())) { incompatibleSchema(type, currentRequestedType); } type.accept(this); } } current = oldIO; }
/**
 * Returns a copy of {@code childType} whose name is {@code "element"}, replacing the list's
 * inner '$data$' vector name in the schema.
 *
 * <p>A primitive keeps its repetition, primitive type name, type length, original type, and
 * decimal metadata, and is built with a {@code null} last argument. A group keeps its
 * repetition, original type, and fields.
 *
 * @param childType the list's child type
 * @return an equivalent type named {@code "element"}
 */
private Type renameChildTypeToElement(Type childType) {
  if (!childType.isPrimitive()) {
    GroupType group = childType.asGroupType();
    return new GroupType(
        childType.getRepetition(),
        "element",
        childType.getOriginalType(),
        group.getFields());
  }

  PrimitiveType primitive = childType.asPrimitiveType();
  return new PrimitiveType(
      childType.getRepetition(),
      primitive.getPrimitiveTypeName(),
      primitive.getTypeLength(),
      "element",
      primitive.getOriginalType(),
      primitive.getDecimalMetadata(),
      null);
}
/**
 * Merges {@code toMerge} into this group type.
 *
 * <p>The result takes its repetition and original type from {@code toMerge}, its name and id
 * from this type, and its fields from {@code mergeFields}.
 *
 * @param toMerge the type to merge in; must be a group
 * @param strict not consulted directly by this method
 * @return the merged group type
 * @throws IncompatibleSchemaModificationException if {@code toMerge} is a primitive type
 */
@Override
protected Type union(Type toMerge, boolean strict) {
  final boolean mergingPrimitive = toMerge.isPrimitive();
  if (mergingPrimitive) {
    throw new IncompatibleSchemaModificationException("can not merge primitive type " + toMerge + " into group type " + this);
  }
  return new GroupType(
      toMerge.getRepetition(),
      getName(),
      toMerge.getOriginalType(),
      mergeFields(toMerge.asGroupType()),
      getId());
}
/**
 * Merges {@code toMerge} into this group type.
 *
 * <p>The result takes its repetition and logical type annotation from {@code toMerge}, its
 * name and id from this type, and its fields from {@code mergeFields}.
 *
 * @param toMerge the type to merge in; must be a group
 * @param strict not consulted directly by this method
 * @return the merged group type
 * @throws IncompatibleSchemaModificationException if {@code toMerge} is a primitive type
 */
@Override
protected Type union(Type toMerge, boolean strict) {
  final boolean mergingPrimitive = toMerge.isPrimitive();
  if (mergingPrimitive) {
    throw new IncompatibleSchemaModificationException("can not merge primitive type " + toMerge + " into group type " + this);
  }
  return new GroupType(
      toMerge.getRepetition(),
      getName(),
      toMerge.getLogicalTypeAnnotation(),
      mergeFields(toMerge.asGroupType()),
      getId());
}
private boolean hasMissingRequiredFieldInGroupType(GroupType requested, GroupType fullSchema) { for (Type field : fullSchema.getFields()) { if (requested.containsField(field.getName())) { Type requestedType = requested.getType(field.getName()); // if a field is in requested schema and the type of it is a group type, then do recursive check if (!field.isPrimitive()) { if (hasMissingRequiredFieldInGroupType(requestedType.asGroupType(), field.asGroupType())) { return true; } else { continue;// check next field } } } else { if (field.getRepetition() == Type.Repetition.REQUIRED) { return true; // if a field is missing in requested schema and it's required } else { continue; // the missing field is not required, then continue checking next field } } } return false; }
/**
 * Maps a Parquet group to its Arrow counterpart.
 *
 * <p>A group with no original type becomes a struct whose children are the mappings of the
 * group's fields. A group annotated as LIST becomes an Arrow list whose single child is the
 * mapping of the list's element. Any other original type is rejected.
 *
 * @param type parquet group type
 * @param name overrides parquet.getName()
 * @return the mapping
 * @throws UnsupportedOperationException if the original type is neither absent nor LIST
 */
private TypeMapping fromParquetGroup(GroupType type, String name) {
  final OriginalType annotation = type.getOriginalType();

  if (annotation == null) {
    List<TypeMapping> children = fromParquet(type.getFields());
    Field structField = new Field(name, type.isRepetition(OPTIONAL), new Struct_(), fields(children));
    return new StructTypeMapping(structField, type, children);
  }

  if (annotation != OriginalType.LIST) {
    throw new UnsupportedOperationException("Unsupported type " + type);
  }

  List3Levels list3Levels = new List3Levels(type);
  // The element is mapped with a null name and its own repetition.
  TypeMapping elementMapping =
      fromParquet(list3Levels.getElement(), null, list3Levels.getElement().getRepetition());
  Field listField =
      new Field(name, type.isRepetition(OPTIONAL), new ArrowType.List(), asList(elementMapping.getArrowField()));
  return new ListTypeMapping(listField, list3Levels, elementMapping);
}
@Override protected Type union(Type toMerge, boolean strict) { if (!toMerge.isPrimitive()) { reportSchemaMergeError(toMerge); } if (strict) { // Can't merge primitive fields of different type names or different original types if (!primitive.equals(toMerge.asPrimitiveType().getPrimitiveTypeName()) || !Objects.equals(getLogicalTypeAnnotation(), toMerge.getLogicalTypeAnnotation())) { reportSchemaMergeError(toMerge); } // Can't merge FIXED_LEN_BYTE_ARRAY fields of different lengths int toMergeLength = toMerge.asPrimitiveType().getTypeLength(); if (primitive == PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY && length != toMergeLength) { reportSchemaMergeError(toMerge); } // Can't merge primitive fields with different column orders if (!columnOrder().equals(toMerge.asPrimitiveType().columnOrder())) { reportSchemaMergeErrorWithColumnOrder(toMerge); } } Types.PrimitiveBuilder<PrimitiveType> builder = Types.primitive(primitive, toMerge.getRepetition()); if (PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY == primitive) { builder.length(length); } return builder.as(getLogicalTypeAnnotation()).named(getName()); }
@Override protected Type union(Type toMerge, boolean strict) { if (!toMerge.isPrimitive()) { reportSchemaMergeError(toMerge); } if (strict) { // Can't merge primitive fields of different type names or different original types if (!primitive.equals(toMerge.asPrimitiveType().getPrimitiveTypeName()) || getOriginalType() != toMerge.getOriginalType()) { reportSchemaMergeError(toMerge); } // Can't merge FIXED_LEN_BYTE_ARRAY fields of different lengths int toMergeLength = toMerge.asPrimitiveType().getTypeLength(); if (primitive == PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY && length != toMergeLength) { reportSchemaMergeError(toMerge); } // Can't merge primitive fields with different column orders if (!columnOrder().equals(toMerge.asPrimitiveType().columnOrder())) { reportSchemaMergeErrorWithColumnOrder(toMerge); } } Types.PrimitiveBuilder<PrimitiveType> builder = Types.primitive(primitive, toMerge.getRepetition()); if (PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY == primitive) { builder.length(length); } return builder.as(getOriginalType()).named(getName()); }