/**
 * Resolves each requested column name against a Parquet group schema and returns the
 * corresponding Parquet schema types, in the same order as {@code colNames}.
 *
 * <p>Names are looked up through {@code getFieldTypeIgnoreCase}. A column that is not found in
 * the schema is represented by an optional BINARY placeholder carrying the requested name;
 * a column that is found is projected through {@code getProjectedType} using the type at the
 * same position in {@code colTypes}.
 *
 * @param schema Group schema in which the column names are searched.
 * @param colNames List of column names.
 * @param colTypes List of column types, positionally aligned with {@code colNames}.
 * @return List of Parquet types for the projected columns.
 */
private static List<Type> getProjectedGroupFields(GroupType schema, List<String> colNames, List<TypeInfo> colTypes) {
  List<Type> projected = new ArrayList<Type>();

  for (int pos = 0; pos < colNames.size(); pos++) {
    // The type is fetched before the name so a short colTypes list fails first.
    TypeInfo requestedType = colTypes.get(pos);
    String requestedName = colNames.get(pos);

    Type match = getFieldTypeIgnoreCase(schema, requestedName);
    if (match != null) {
      projected.add(getProjectedType(requestedType, match));
    } else {
      // Column missing from the file schema: stand in an optional BINARY of the same name.
      projected.add(Types.optional(PrimitiveTypeName.BINARY).named(requestedName));
    }
  }

  return projected;
}
/** * Searches column names by indexes on a given Parquet file schema, and returns its corresponded * Parquet schema types. * * @param schema Message schema where to search for column names. * @param colNames List of column names. * @param colIndexes List of column indexes. * @return A MessageType object of the column names found. */ public static MessageType getSchemaByIndex(MessageType schema, List<String> colNames, List<Integer> colIndexes) { List<Type> schemaTypes = new ArrayList<Type>(); for (Integer i : colIndexes) { if (i < colNames.size()) { if (i < schema.getFieldCount()) { schemaTypes.add(schema.getType(i)); } else { //prefixing with '_mask_' to ensure no conflict with named //columns in the file schema schemaTypes.add( Types.optional(PrimitiveTypeName.BINARY).named("_mask_" + colNames.get(i))); } } } return new MessageType(schema.getName(), schemaTypes); }
schemaTypes.add(Types.optional(PrimitiveTypeName.BINARY).named("_mask_" + colNames.get(i)));
} else if (typeInfo.getTypeName().toLowerCase().startsWith( serdeConstants.CHAR_TYPE_NAME)) { return Types.optional(PrimitiveTypeName.BINARY).as(OriginalType.UTF8) .named(name); } else if (typeInfo.getTypeName().toLowerCase().startsWith( serdeConstants.VARCHAR_TYPE_NAME)) { return Types.optional(PrimitiveTypeName.BINARY).as(OriginalType.UTF8) .named(name); } else if (typeInfo instanceof DecimalTypeInfo) { int scale = decimalTypeInfo.scale(); int bytes = ParquetHiveSerDe.PRECISION_TO_BYTE_COUNT[prec - 1]; return Types.optional(PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY).length(bytes).as(OriginalType.DECIMAL). scale(scale).precision(prec).named(name); } else if (typeInfo.equals(TypeInfoFactory.dateTypeInfo)) {
/**
 * Creates the mapping for an optional primitive of the given physical type, named after
 * {@code fieldName}.
 *
 * @param type Parquet primitive type name of the field.
 * @return the result of {@code mapping(...)} applied to the built Parquet type.
 */
private TypeMapping primitive(PrimitiveTypeName type) {
  PrimitiveType parquetType = Types.optional(type).named(fieldName);
  return mapping(parquetType);
}
/**
 * Creates the mapping for an optional primitive of the given physical type, annotated with
 * the given original type and named after {@code fieldName}.
 *
 * @param type Parquet primitive type name of the field.
 * @param otype Original (logical) type annotation applied to the primitive.
 * @return the result of {@code mapping(...)} applied to the built Parquet type.
 */
private TypeMapping primitive(PrimitiveTypeName type, OriginalType otype) {
  PrimitiveType parquetType = Types.optional(type).as(otype).named(fieldName);
  return mapping(parquetType);
}
/**
 * Creates the mapping for an optional FIXED_LEN_BYTE_ARRAY primitive of the given length,
 * annotated with the given original type and named after {@code fieldName}.
 *
 * @param length Length passed to the builder's {@code length(...)} for the fixed-length array.
 * @param otype Original (logical) type annotation applied to the primitive.
 * @return the result of {@code mapping(...)} applied to the built Parquet type.
 */
// NOTE(review): the trailing "});" closes an enclosing construct opened outside this view.
private TypeMapping primitiveFLBA(int length, OriginalType otype) { return mapping(Types.optional(FIXED_LEN_BYTE_ARRAY).length(length).as(otype).named(fieldName)); } });
/**
 * Builds statistics for a BINARY-backed DECIMAL column from the given minimum and maximum.
 *
 * <p>The bounds are stored as the byte representation of each value's unscaled integer, and
 * the null count is set to zero.
 *
 * @param min Minimum value of the column.
 * @param max Maximum value of the column.
 * @param decimalMetadata Supplies the precision and scale of the decimal type.
 * @return Statistics built for a primitive type named {@code "decimal_type"}.
 */
@SuppressWarnings("unchecked")
private Statistics<T> getStatistics(BigDecimal min, BigDecimal max, DecimalMetadata decimalMetadata) {
  int precision = decimalMetadata.getPrecision();
  int scale = decimalMetadata.getScale();

  PrimitiveType binaryDecimal =
      org.apache.parquet.schema.Types.optional(PrimitiveType.PrimitiveTypeName.BINARY)
          .as(OriginalType.DECIMAL)
          .precision(precision)
          .scale(scale)
          .named("decimal_type");

  Statistics<?> built = Statistics.getBuilderForReading(binaryDecimal)
      .withMin(min.unscaledValue().toByteArray())
      .withMax(max.unscaledValue().toByteArray())
      .withNumNulls(0)
      .build();

  // Unchecked: the caller's T is expected to match the statistics type produced for BINARY.
  return (Statistics<T>) built;
}
/**
 * Creates the mapping for an optional DECIMAL-annotated primitive of the given physical type,
 * precision and scale, named after {@code fieldName}.
 *
 * @param type Parquet primitive type name that physically stores the decimal.
 * @param precision Decimal precision.
 * @param scale Decimal scale.
 * @return the result of {@code mapping(...)} applied to the built Parquet type.
 */
private TypeMapping decimal(PrimitiveTypeName type, int precision, int scale) {
  PrimitiveType decimalType = Types.optional(type)
      .as(DECIMAL)
      .precision(precision)
      .scale(scale)
      .named(fieldName);
  return mapping(decimalType);
}
PrimitiveType decimalType = org.apache.parquet.schema.Types.optional(PrimitiveType.PrimitiveTypeName.BINARY) .as(OriginalType.DECIMAL) .length(length)
long totalSize, long totalUncompressedSize) { return get(path, Types.optional(type).named("fake_type"), codec, encodingStats, encodings, statistics, firstDataPage, dictionaryPageOffset, valueCount, totalSize, totalUncompressedSize);
long totalSize, long totalUncompressedSize) { return get(path, Types.optional(type).named("fake_type"), codec, encodingStats, encodings, statistics, firstDataPage, dictionaryPageOffset, valueCount, totalSize, totalUncompressedSize);
case DECIMAL: if ( f.getAllowNull() ) { return Types.optional( PrimitiveType.PrimitiveTypeName.BINARY ).as( OriginalType.DECIMAL ).precision( f.getPrecision() ).scale( f.getScale() ).named( formatFieldName ); } else { return Types.required( PrimitiveType.PrimitiveTypeName.BINARY ).as( OriginalType.DECIMAL ).precision( f.getPrecision() ).scale( f.getScale() ).named( formatFieldName ); return Types.optional( PrimitiveType.PrimitiveTypeName.INT32 ).as( OriginalType.DECIMAL ).precision( f.getPrecision() ).scale( f.getScale() ).named( formatFieldName ); } else { return Types.required( PrimitiveType.PrimitiveTypeName.INT32 ).as( OriginalType.DECIMAL ).precision( f.getPrecision() ).scale( f.getScale() ).named( formatFieldName ); return Types.optional( PrimitiveType.PrimitiveTypeName.INT64 ).as( OriginalType.DECIMAL ).precision( f.getPrecision() ).scale( f.getScale() ).named( formatFieldName ); } else { return Types.required( PrimitiveType.PrimitiveTypeName.INT64 ).as( OriginalType.DECIMAL ).precision( f.getPrecision() ).scale( f.getScale() ).named( formatFieldName );
return Types.optional(PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named(name); return Types.optional(PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named(name); int scale = decimalTypeInfo.scale(); int bytes = ParquetHiveSerDe.PRECISION_TO_BYTE_COUNT[prec - 1]; return Types.optional(PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY).length(bytes).as(OriginalType.DECIMAL).scale(scale).precision(prec).named(name);
return Types.optional(PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named(name); return Types.optional(PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named(name); int scale = decimalTypeInfo.scale(); int bytes = ParquetHiveSerDe.PRECISION_TO_BYTE_COUNT[prec - 1]; return Types.optional(PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY).length(bytes).as(OriginalType.DECIMAL).scale(scale).precision(prec).named(name);
serdeConstants.CHAR_TYPE_NAME)) { if (repetition == Repetition.OPTIONAL) { return Types.optional(PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named(name); serdeConstants.VARCHAR_TYPE_NAME)) { if (repetition == Repetition.OPTIONAL) { return Types.optional(PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named(name); int bytes = ParquetHiveSerDe.PRECISION_TO_BYTE_COUNT[prec - 1]; if (repetition == Repetition.OPTIONAL) { return Types.optional(PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY).length(bytes).as(OriginalType.DECIMAL).scale(scale).precision(prec).named(name);
serdeConstants.CHAR_TYPE_NAME)) { if (repetition == Repetition.OPTIONAL) { return Types.optional(PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named(name); serdeConstants.VARCHAR_TYPE_NAME)) { if (repetition == Repetition.OPTIONAL) { return Types.optional(PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named(name); int bytes = ParquetHiveSerDe.PRECISION_TO_BYTE_COUNT[prec - 1]; if (repetition == Repetition.OPTIONAL) { return Types.optional(PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY).length(bytes).as(OriginalType.DECIMAL).scale(scale).precision(prec).named(name);
serdeConstants.CHAR_TYPE_NAME)) { if (repetition == Repetition.OPTIONAL) { return Types.optional(PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named(name); serdeConstants.VARCHAR_TYPE_NAME)) { if (repetition == Repetition.OPTIONAL) { return Types.optional(PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named(name); int bytes = ParquetHiveSerDe.PRECISION_TO_BYTE_COUNT[prec - 1]; if (repetition == Repetition.OPTIONAL) { return Types.optional(PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY).length(bytes).as(OriginalType.DECIMAL).scale(scale).precision(prec).named(name);
serdeConstants.CHAR_TYPE_NAME)) { if (repetition == Repetition.OPTIONAL) { return Types.optional(PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named(name); serdeConstants.VARCHAR_TYPE_NAME)) { if (repetition == Repetition.OPTIONAL) { return Types.optional(PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named(name); int bytes = ParquetHiveSerDe.PRECISION_TO_BYTE_COUNT[prec - 1]; if (repetition == Repetition.OPTIONAL) { return Types.optional(PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY).length(bytes).as(OriginalType.DECIMAL).scale(scale).precision(prec).named(name);