return Types.primitive(PrimitiveTypeName.BINARY, repetition).as(OriginalType.UTF8) .named(name); typeInfo.equals(TypeInfoFactory.shortTypeInfo) || typeInfo.equals(TypeInfoFactory.byteTypeInfo)) { return Types.primitive(PrimitiveTypeName.INT32, repetition).named(name); return Types.primitive(PrimitiveTypeName.INT64, repetition).named(name); return Types.primitive(PrimitiveTypeName.DOUBLE, repetition).named(name); return Types.primitive(PrimitiveTypeName.FLOAT, repetition).named(name); return Types.primitive(PrimitiveTypeName.BOOLEAN, repetition).named(name); return Types.primitive(PrimitiveTypeName.BINARY, repetition).named(name); return Types.primitive(PrimitiveTypeName.INT96, repetition).named(name); serdeConstants.CHAR_TYPE_NAME)) { if (repetition == Repetition.OPTIONAL) { return Types.optional(PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named(name); return Types.repeated(PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named(name); serdeConstants.VARCHAR_TYPE_NAME)) { if (repetition == Repetition.OPTIONAL) { return Types.optional(PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named(name);
/**
 * Rebuilds a {@link MessageType} from a flat list of schema elements.
 *
 * <p>The first element of the list is treated as the root: its child count drives
 * {@code readTypeSchema}, and its {@code name} becomes the name of the resulting message.
 *
 * @param schema schema elements, root first; {@code iterator().next()} is called
 *               unconditionally, so the list must contain at least one element
 * @return the message type assembled by the builder
 */
private static MessageType readParquetSchema(List<SchemaElement> schema) {
  final Iterator<SchemaElement> remaining = schema.iterator();
  final SchemaElement root = remaining.next();

  final Types.MessageTypeBuilder messageBuilder = Types.buildMessage();
  // The same iterator is handed on, so the helper continues right after the root.
  readTypeSchema(messageBuilder, remaining, root.getNum_children());

  return messageBuilder.named(root.name);
}
/**
 * Builds the group type describing this column.
 *
 * <p>The key converter's schema and the element converter's schema are placed, in that
 * order, inside a repeated group named {@code MAP_KEY}; that group is then wrapped in
 * an optional or required group named after the column.
 *
 * @return the wrapping group type, or {@code null} when {@code optionalOrRequired()}
 *         yields anything other than {@code OPTIONAL} or {@code REQUIRED}
 */
@Override
protected Type buildSchema() {
  final JsonElementConverter valueConverter = this.elementConverter;
  final Type keyField = getKeyConverter().schema();
  final Type valueField = valueConverter.schema();

  final GroupType entries = Types.repeatedGroup()
      .addFields(keyField, valueField)
      .named(MAP_KEY)
      .asGroupType();

  final String name = this.jsonSchema.getColumnName();
  switch (this.jsonSchema.optionalOrRequired()) {
    case REQUIRED:
      return Types.requiredGroup().addFields(entries).named(name).asGroupType();
    case OPTIONAL:
      return Types.optionalGroup().addFields(entries).named(name).asGroupType();
    default:
      // Any other value produces no schema at all.
      return null;
  }
}
/**
 * Builds a BINARY primitive annotated as UTF8 and named after the column.
 *
 * <p>When {@code this.repeated} is set the type is repeated; otherwise it is optional or
 * required according to {@code jsonSchema.optionalOrRequired()}.
 *
 * @return the primitive type for this column
 * @throws RuntimeException if the column is not repeated and its setting is neither
 *         {@code OPTIONAL} nor {@code REQUIRED}
 */
@Override
protected Type buildSchema() {
  final String name = this.jsonSchema.getColumnName();

  if (!this.repeated) {
    switch (this.jsonSchema.optionalOrRequired()) {
      case REQUIRED:
        return Types.required(BINARY).as(UTF8).named(name);
      case OPTIONAL:
        return Types.optional(BINARY).as(UTF8).named(name);
      default:
        throw new RuntimeException("Unsupported Repetition type");
    }
  }

  return Types.repeated(BINARY).as(UTF8).named(name);
}
} // closes the enclosing type, whose header lies outside this block
return Types.primitive(PrimitiveTypeName.BINARY, repetition).as(OriginalType.UTF8) .named(name); typeInfo.equals(TypeInfoFactory.shortTypeInfo) || typeInfo.equals(TypeInfoFactory.byteTypeInfo)) { return Types.primitive(PrimitiveTypeName.INT32, repetition).named(name); return Types.primitive(PrimitiveTypeName.INT64, repetition).named(name); return Types.primitive(PrimitiveTypeName.DOUBLE, repetition).named(name); return Types.primitive(PrimitiveTypeName.FLOAT, repetition).named(name); return Types.primitive(PrimitiveTypeName.BOOLEAN, repetition).named(name); return Types.primitive(PrimitiveTypeName.BINARY, repetition).named(name); return Types.primitive(PrimitiveTypeName.INT96, repetition).named(name); return Types.optional(PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named(name); return Types.optional(PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named(name); int scale = decimalTypeInfo.scale(); int bytes = ParquetHiveSerDe.PRECISION_TO_BYTE_COUNT[prec - 1]; return Types.optional(PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY).length(bytes).as(OriginalType.DECIMAL).scale(scale).precision(prec).named(name); return Types.primitive(PrimitiveTypeName.INT32, repetition).as(OriginalType.DATE).named(name);
@Override public Type getType(String name) { int byteLength = getByteLength(precision); PrimitiveTypeName typeName = USE_BINARY ? PrimitiveTypeName.BINARY : PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY; PrimitiveBuilder<PrimitiveType> builder = Types.optional(typeName).as(OriginalType.DECIMAL); // NOTE: return types of PrimitiveBuilder.{length,precision,scale} are unstable try { BUILDER_LENGTH_METHOD.invoke(builder, byteLength); BUILDER_PRECISION_METHOD.invoke(builder, precision); BUILDER_SCALE_METHOD.invoke(builder, scale); } catch (ReflectiveOperationException e) { throw new IllegalArgumentException("error occurred while resolving decimal type", e); } return builder.named(name); }
schemaTypes.add(Types.buildGroup(groupFieldType.getRepetition()) .addFields(typesArray) .named(fieldType.getName()) schemaTypes.add(Types.optional(PrimitiveTypeName.BINARY).named(colName));
/**
 * Records a primitive type as {@code currentType}, but only while the current
 * position matches the filter.
 *
 * <p>The type is built from {@code type} with {@code currentRepetition}, is annotated
 * with {@code orig} when one is supplied, and is named {@code currentName}.
 *
 * @param type physical primitive type
 * @param orig original-type annotation to attach, or {@code null} for none
 */
private void primitiveType(PrimitiveTypeName type, OriginalType orig) {
  if (!isCurrentlyMatchedFilter()) {
    return;
  }

  final PrimitiveBuilder<PrimitiveType> base = primitive(type, currentRepetition);
  final PrimitiveBuilder<PrimitiveType> annotated = (orig == null) ? base : base.as(orig);
  currentType = annotated.named(currentName);
}
/**
 * Builds a BINARY primitive annotated as UTF8 and named after the column.
 *
 * <p>When {@code this.repeated} is set the type is repeated; otherwise it is optional or
 * required according to {@code jsonSchema.optionalOrRequired()}.
 *
 * @return the primitive type for this column
 * @throws RuntimeException if the column is not repeated and its setting is neither
 *         {@code OPTIONAL} nor {@code REQUIRED}
 */
@Override
protected Type buildSchema() {
  final String name = this.jsonSchema.getColumnName();

  if (!this.repeated) {
    switch (this.jsonSchema.optionalOrRequired()) {
      case REQUIRED:
        return Types.required(BINARY).as(UTF8).named(name);
      case OPTIONAL:
        return Types.optional(BINARY).as(UTF8).named(name);
      default:
        throw new RuntimeException("Unsupported Repetition type");
    }
  }

  return Types.repeated(BINARY).as(UTF8).named(name);
}
} // closes the enclosing type, whose header lies outside this block
if (typeInfo.getCategory().equals(Category.PRIMITIVE)) { if (typeInfo.equals(TypeInfoFactory.stringTypeInfo)) { return Types.primitive(PrimitiveTypeName.BINARY, repetition).as(OriginalType.UTF8) .named(name); } else if (typeInfo.equals(TypeInfoFactory.intTypeInfo) || typeInfo.equals(TypeInfoFactory.shortTypeInfo) || typeInfo.equals(TypeInfoFactory.byteTypeInfo)) { return Types.primitive(PrimitiveTypeName.INT32, repetition).named(name); } else if (typeInfo.equals(TypeInfoFactory.longTypeInfo)) { return Types.primitive(PrimitiveTypeName.INT64, repetition).named(name); } else if (typeInfo.equals(TypeInfoFactory.doubleTypeInfo)) { return Types.primitive(PrimitiveTypeName.DOUBLE, repetition).named(name); } else if (typeInfo.equals(TypeInfoFactory.floatTypeInfo)) { return Types.primitive(PrimitiveTypeName.FLOAT, repetition).named(name); } else if (typeInfo.equals(TypeInfoFactory.booleanTypeInfo)) { return Types.primitive(PrimitiveTypeName.BOOLEAN, repetition).named(name); } else if (typeInfo.equals(TypeInfoFactory.binaryTypeInfo)) { return Types.primitive(PrimitiveTypeName.BINARY, repetition).named(name); } else if (typeInfo.equals(TypeInfoFactory.timestampTypeInfo)) { return Types.primitive(PrimitiveTypeName.INT96, repetition).named(name); } else if (typeInfo.equals(TypeInfoFactory.voidTypeInfo)) { throw new UnsupportedOperationException("Void type not implemented"); } else if (typeInfo.getTypeName().toLowerCase().startsWith( serdeConstants.CHAR_TYPE_NAME)) { return Types.optional(PrimitiveTypeName.BINARY).as(OriginalType.UTF8) .named(name); } else if (typeInfo.getTypeName().toLowerCase().startsWith( serdeConstants.VARCHAR_TYPE_NAME)) { return Types.optional(PrimitiveTypeName.BINARY).as(OriginalType.UTF8)
/**
 * Builds an optional BINARY primitive annotated as UTF8.
 *
 * @param name name of the resulting type
 * @return the optional UTF8 binary type carrying {@code name}
 */
@Override
public Type getType(String name) {
  // Types.optional(t) is the OPTIONAL shorthand of Types.primitive(t, repetition).
  return Types.primitive(PrimitiveTypeName.BINARY, Type.Repetition.OPTIONAL)
      .as(OriginalType.UTF8)
      .named(name);
}
/**
 * Merges another type into this primitive type.
 *
 * <p>The result keeps this type's primitive type name and name, and takes its
 * repetition from {@code toMerge}. For FIXED_LEN_BYTE_ARRAY the length of this type
 * is carried over as well.
 *
 * @param toMerge the type to merge in; must be primitive
 * @param strict  when {@code true}, {@code toMerge} must have the same primitive type
 *                name as this type
 * @return the merged primitive type
 * @throws IncompatibleSchemaModificationException if {@code toMerge} is not primitive,
 *         or if {@code strict} is set and the primitive type names differ
 */
@Override
protected Type union(Type toMerge, boolean strict) {
  final boolean mergeable = toMerge.isPrimitive()
      && (!strict || primitive.equals(toMerge.asPrimitiveType().getPrimitiveTypeName()));
  if (!mergeable) {
    throw new IncompatibleSchemaModificationException("can not merge type " + toMerge + " into " + this);
  }

  final Types.PrimitiveBuilder<PrimitiveType> merged =
      Types.primitive(primitive, toMerge.getRepetition());
  if (primitive == PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY) {
    merged.length(length);
  }
  return merged.named(getName());
}
} // closes the enclosing type, whose header lies outside this block
return Types.primitive(PrimitiveTypeName.BINARY, repetition).as(OriginalType.UTF8) .named(name); typeInfo.equals(TypeInfoFactory.shortTypeInfo) || typeInfo.equals(TypeInfoFactory.byteTypeInfo)) { return Types.primitive(PrimitiveTypeName.INT32, repetition).named(name); return Types.primitive(PrimitiveTypeName.INT64, repetition).named(name); return Types.primitive(PrimitiveTypeName.DOUBLE, repetition).named(name); return Types.primitive(PrimitiveTypeName.FLOAT, repetition).named(name); return Types.primitive(PrimitiveTypeName.BOOLEAN, repetition).named(name); return Types.primitive(PrimitiveTypeName.BINARY, repetition).named(name); return Types.primitive(PrimitiveTypeName.INT96, repetition).named(name); serdeConstants.CHAR_TYPE_NAME)) { if (repetition == Repetition.OPTIONAL) { return Types.optional(PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named(name); return Types.repeated(PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named(name); serdeConstants.VARCHAR_TYPE_NAME)) { if (repetition == Repetition.OPTIONAL) { return Types.optional(PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named(name);
return Types.primitive(PrimitiveTypeName.BINARY, repetition).as(OriginalType.UTF8) .named(name); typeInfo.equals(TypeInfoFactory.shortTypeInfo) || typeInfo.equals(TypeInfoFactory.byteTypeInfo)) { return Types.primitive(PrimitiveTypeName.INT32, repetition).named(name); return Types.primitive(PrimitiveTypeName.INT64, repetition).named(name); return Types.primitive(PrimitiveTypeName.DOUBLE, repetition).named(name); return Types.primitive(PrimitiveTypeName.FLOAT, repetition).named(name); return Types.primitive(PrimitiveTypeName.BOOLEAN, repetition).named(name); return Types.primitive(PrimitiveTypeName.BINARY, repetition).named(name); return Types.primitive(PrimitiveTypeName.INT96, repetition).named(name); return Types.optional(PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named(name); return Types.optional(PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named(name); int scale = decimalTypeInfo.scale(); int bytes = ParquetHiveSerDe.PRECISION_TO_BYTE_COUNT[prec - 1]; return Types.optional(PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY).length(bytes).as(OriginalType.DECIMAL).scale(scale).precision(prec).named(name); return Types.primitive(PrimitiveTypeName.INT32, repetition).as(OriginalType.DATE).named(name);
/**
 * Rebuilds a {@link MessageType} from a flat list of schema elements.
 *
 * <p>The first element of the list is treated as the root: its child count drives
 * {@code readTypeSchema}, and its {@code name} becomes the name of the resulting message.
 *
 * @param schema schema elements, root first; {@code iterator().next()} is called
 *               unconditionally, so the list must contain at least one element
 * @return the message type assembled by the builder
 */
private static MessageType readParquetSchema(List<SchemaElement> schema) {
  final Iterator<SchemaElement> remaining = schema.iterator();
  final SchemaElement root = remaining.next();

  final Types.MessageTypeBuilder messageBuilder = Types.buildMessage();
  // The same iterator is handed on, so the helper continues right after the root.
  readTypeSchema(messageBuilder, remaining, root.getNum_children());

  return messageBuilder.named(root.name);
}
/** * Searchs column names by index on a given Parquet file schema, and returns its corresponded * Parquet schema types. * * @param schema Message schema where to search for column names. * @param colNames List of column names. * @param colIndexes List of column indexes. * @return A MessageType object of the column names found. */ private static MessageType getSchemaByIndex(MessageType schema, List<String> colNames, List<Integer> colIndexes) { List<Type> schemaTypes = new ArrayList<Type>(); for (Integer i : colIndexes) { if (i < colNames.size()) { if (i < schema.getFieldCount()) { schemaTypes.add(schema.getType(i)); } else { //prefixing with '_mask_' to ensure no conflict with named //columns in the file schema schemaTypes.add(Types.optional(PrimitiveTypeName.BINARY).named("_mask_" + colNames.get(i))); } } } return new MessageType(schema.getName(), schemaTypes); }
/**
 * Builds the group type describing this column.
 *
 * <p>The key converter's schema and the element converter's schema are placed, in that
 * order, inside a repeated group named {@code MAP_KEY}; that group is then wrapped in
 * an optional or required group named after the column.
 *
 * @return the wrapping group type, or {@code null} when {@code optionalOrRequired()}
 *         yields anything other than {@code OPTIONAL} or {@code REQUIRED}
 */
@Override
protected Type buildSchema() {
  final JsonElementConverter valueConverter = this.elementConverter;
  final Type keyField = getKeyConverter().schema();
  final Type valueField = valueConverter.schema();

  final GroupType entries = Types.repeatedGroup()
      .addFields(keyField, valueField)
      .named(MAP_KEY)
      .asGroupType();

  final String name = this.jsonSchema.getColumnName();
  switch (this.jsonSchema.optionalOrRequired()) {
    case REQUIRED:
      return Types.requiredGroup().addFields(entries).named(name).asGroupType();
    case OPTIONAL:
      return Types.optionalGroup().addFields(entries).named(name).asGroupType();
    default:
      // Any other value produces no schema at all.
      return null;
  }
}
return Types.primitive(PrimitiveTypeName.BINARY, repetition).as(OriginalType.UTF8) .named(name); typeInfo.equals(TypeInfoFactory.shortTypeInfo) || typeInfo.equals(TypeInfoFactory.byteTypeInfo)) { return Types.primitive(PrimitiveTypeName.INT32, repetition).named(name); return Types.primitive(PrimitiveTypeName.INT64, repetition).named(name); return Types.primitive(PrimitiveTypeName.DOUBLE, repetition).named(name); return Types.primitive(PrimitiveTypeName.FLOAT, repetition).named(name); return Types.primitive(PrimitiveTypeName.BOOLEAN, repetition).named(name); return Types.primitive(PrimitiveTypeName.BINARY, repetition).named(name); return Types.primitive(PrimitiveTypeName.INT96, repetition).named(name); serdeConstants.CHAR_TYPE_NAME)) { if (repetition == Repetition.OPTIONAL) { return Types.optional(PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named(name); return Types.repeated(PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named(name); serdeConstants.VARCHAR_TYPE_NAME)) { if (repetition == Repetition.OPTIONAL) { return Types.optional(PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named(name);
/**
 * Converts a flat list of schema elements into a {@link MessageType}.
 *
 * <p>The first element is taken as the root; {@code buildChildren} is then invoked with
 * the message builder, the remaining iterator and the root's child count. The root's
 * {@code name} becomes the message name.
 *
 * @param schema schema elements, root first; {@code iterator().next()} is called
 *               unconditionally, so the list must contain at least one element
 * @return the message type assembled by the builder
 */
MessageType fromParquetSchema(List<SchemaElement> schema) {
  final Iterator<SchemaElement> elements = schema.iterator();
  final SchemaElement rootElement = elements.next();

  final Types.MessageTypeBuilder messageBuilder = Types.buildMessage();
  buildChildren(messageBuilder, elements, rootElement.getNum_children());

  return messageBuilder.named(rootElement.name);
}
return Types.primitive(PrimitiveTypeName.BINARY, repetition).as(OriginalType.UTF8) .named(name); typeInfo.equals(TypeInfoFactory.shortTypeInfo) || typeInfo.equals(TypeInfoFactory.byteTypeInfo)) { return Types.primitive(PrimitiveTypeName.INT32, repetition).named(name); return Types.primitive(PrimitiveTypeName.INT64, repetition).named(name); return Types.primitive(PrimitiveTypeName.DOUBLE, repetition).named(name); return Types.primitive(PrimitiveTypeName.FLOAT, repetition).named(name); return Types.primitive(PrimitiveTypeName.BOOLEAN, repetition).named(name); return Types.primitive(PrimitiveTypeName.BINARY, repetition).named(name); return Types.primitive(PrimitiveTypeName.INT96, repetition).named(name); serdeConstants.CHAR_TYPE_NAME)) { if (repetition == Repetition.OPTIONAL) { return Types.optional(PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named(name); return Types.repeated(PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named(name); serdeConstants.VARCHAR_TYPE_NAME)) { if (repetition == Repetition.OPTIONAL) { return Types.optional(PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named(name);