/**
 * Returns the property names exposed by the given object.
 *
 * <p>For a {@code Map} the keys (stringified) are returned; for a Parquet
 * {@code Group} the schema field names are returned. A {@code null} object
 * has no properties.
 *
 * @throws UnsupportedOperationException for any other object type
 */
@Override
public Collection<String> getPropertyKeys(final Object o) {
  if (o == null) {
    return Collections.emptySet();
  }
  if (o instanceof Map) {
    // Keys may be of any type; render each one as a String.
    final Map<Object, Object> map = (Map<Object, Object>) o;
    return map.keySet().stream().map(String::valueOf).collect(Collectors.toSet());
  }
  if (o instanceof Group) {
    // The group's schema supplies the available field names.
    final Group group = (Group) o;
    return group.getType().getFields().stream()
        .map(field -> field.getName())
        .collect(Collectors.toSet());
  }
  throw new UnsupportedOperationException(o.getClass().getName());
}
/**
 * Searches a Parquet {@link GroupType} for a field by name, ignoring case.
 *
 * <p>{@code GroupType#getType(String fieldName)} is case sensitive, so this
 * helper performs the case-insensitive scan instead.
 *
 * @param groupType the group whose fields are searched
 * @param fieldName the field name to match case-insensitively
 * @return the {@link Type} of the matching field, or {@code null} when none matches
 */
private static Type getFieldTypeIgnoreCase(GroupType groupType, String fieldName) {
  Type match = null;
  for (Type candidate : groupType.getFields()) {
    if (candidate.getName().equalsIgnoreCase(fieldName)) {
      match = candidate;
      break;
    }
  }
  return match;
}
private static boolean isElementType(Type repeatedType, String parentName) { if (repeatedType.isPrimitive() || (repeatedType.asGroupType().getFieldCount() != 1)) { return true; } else if (repeatedType.getName().equals("array")) { return true; // existing avro data } else if (repeatedType.getName().equals(parentName + "_tuple")) { return true; // existing thrift data } // false for the following cases: // * name is "list", which matches the spec // * name is "bag", which indicates existing hive or pig data // * ambiguous case, which should be assumed is 3-level according to spec return false; } }
/**
 * Collects the column descriptors whose path component at {@code depth}
 * matches the given type's name.
 *
 * @throws InvalidSchemaException when a descriptor's path is shorter than the
 *     requested depth (corrupted schema)
 */
private List<ColumnDescriptor> getAllColumnDescriptorByType(
    int depth, Type type, List<ColumnDescriptor> columns) throws ParquetRuntimeException {
  final String targetName = type.getName();
  final List<ColumnDescriptor> matches = new ArrayList<>();
  for (ColumnDescriptor descriptor : columns) {
    final String[] path = descriptor.getPath();
    if (path.length <= depth) {
      throw new InvalidSchemaException("Corrupted Parquet schema");
    }
    if (targetName.equals(path[depth])) {
      matches.add(descriptor);
    }
  }
  return matches;
}
/**
 * Translate the search argument to the filter predicate parquet uses. It includes
 * only the columns from the passed schema.
 *
 * @return a filter predicate translated from search argument. null is returned
 *     if failed to convert.
 */
public static FilterPredicate toFilterPredicate(SearchArgument sarg, MessageType schema) {
  Set<String> columns;
  if (schema == null) {
    // No schema: do not restrict the predicate to any column set.
    columns = null;
  } else {
    columns = new HashSet<String>();
    for (Type schemaField : schema.getFields()) {
      columns.add(schemaField.getName());
    }
  }
  try {
    return translate(sarg.getExpression(), sarg.getLeaves(), columns, schema);
  } catch (Exception e) {
    // Deliberate best-effort: a failed translation simply disables pushdown.
    return null;
  }
}
// Lower-cased field name used for case-insensitive matching.
// NOTE(review): default-locale toLowerCase() — assumes ASCII field names; under a
// Turkish locale 'I' would not map to 'i'. Confirm against how lookup keys are built.
String tn = type.getName().toLowerCase();
// Only indices inside the schema are considered; fields whose lower-cased name is
// absent from prunedCols are retained in schemaTypes.
// NOTE(review): fragment — the enclosing loop and closing braces are outside this view.
if (i < schema.getFieldCount()) {
  Type t = schema.getType(i);
  // Lower-cased name is the pruning key — presumably because Hive column names are
  // case-insensitive; confirm prunedCols keys are lower-cased the same way.
  String tn = t.getName().toLowerCase();
  if (!prunedCols.containsKey(tn)) {
    schemaTypes.add(schema.getType(i));
public MapDataWriter(MapObjectInspector inspector, GroupType groupType) { this.inspector = inspector; // Get the internal map structure (MAP_KEY_VALUE) GroupType repeatedType = groupType.getType(0).asGroupType(); this.repeatedGroupName = repeatedType.getName(); // Get key element information Type keyType = repeatedType.getType(0); ObjectInspector keyInspector = this.inspector.getMapKeyObjectInspector(); this.keyName = keyType.getName(); this.keyWriter = createWriter(keyInspector, keyType); // Get value element information Type valuetype = repeatedType.getType(1); ObjectInspector valueInspector = this.inspector.getMapValueObjectInspector(); this.valueName = valuetype.getName(); this.valueWriter = createWriter(valueInspector, valuetype); }
/**
 * Checks whether a parquet type is a valid logical 'map' type: an optional/required
 * group annotated MAP or MAP_KEY_VALUE, wrapping exactly one repeated group of
 * two fields named "key" (primitive) and "value".
 */
private static boolean isLogicalMapType(Type groupType) {
  final OriginalType annotation = groupType.getOriginalType();
  if (groupType.isPrimitive()
      || annotation == null
      || groupType.isRepetition(Type.Repetition.REPEATED)) {
    return false;
  }
  if (!annotation.equals(OriginalType.MAP) && !annotation.equals(OriginalType.MAP_KEY_VALUE)) {
    return false;
  }
  final GroupType mapType = groupType.asGroupType();
  // The map must wrap exactly one non-primitive (key/value) child group.
  if (mapType.getFieldCount() != 1 || mapType.getFields().get(0).isPrimitive()) {
    return false;
  }
  final GroupType keyValueType = mapType.getFields().get(0).asGroupType();
  return keyValueType.isRepetition(Type.Repetition.REPEATED)
      && keyValueType.getFieldCount() == 2
      && keyValueType.getFields().get(0).getName().equalsIgnoreCase("key")
      && keyValueType.getFields().get(0).isPrimitive()
      && keyValueType.getFields().get(1).getName().equalsIgnoreCase("value");
}
private boolean isSubType( final GroupType groupType, final Type subtype) { if (subtype.isPrimitive() || subtype.isRepetition(Type.Repetition.REPEATED)) { return groupType.getFields().contains(subtype); } else { for (Type g : groupType.getFields()) { if (!g.isPrimitive() && g.getName().equals(subtype.getName())) { // check all elements are contained in g boolean containsAll = false; for (Type subSubType : subtype.asGroupType().getFields()) { containsAll = isSubType(g.asGroupType(), subSubType); if (!containsAll) { break; } } if (containsAll) { return containsAll; } } } return false; } }
public ListDataWriter(ListObjectInspector inspector, GroupType groupType) { this.inspector = inspector; // Get the internal array structure GroupType repeatedType = groupType.getType(0).asGroupType(); this.repeatedGroupName = repeatedType.getName(); Type elementType = repeatedType.getType(0); this.elementName = elementType.getName(); ObjectInspector elementInspector = this.inspector.getListElementObjectInspector(); this.elementWriter = createWriter(elementInspector, elementType); }
/**
 * Wires up one converter per selected field, mapping each selected field to its
 * index in the containing (file) schema.
 *
 * <p>Every field of {@code selectedGroupType} must also exist in
 * {@code containingGroupType}; otherwise an {@link IllegalStateException} is thrown.
 * When {@code hiveTypeInfo} is a STRUCT, its field names/types are cached so each
 * converter can be given the matching Hive type info.
 *
 * NOTE(review): {@code index} is accepted but not read here — presumably consumed
 * by the enclosing class; confirm before removing.
 */
private void init(final GroupType selectedGroupType, final ConverterParent parent,
    final int index, final GroupType containingGroupType, TypeInfo hiveTypeInfo) {
  // Propagate metadata from the parent converter chain, when there is one.
  if (parent != null) {
    setMetadata(parent.getMetadata());
  }
  final int selectedFieldCount = selectedGroupType.getFieldCount();
  converters = new Converter[selectedFieldCount];
  this.repeatedConverters = new ArrayList<Repeated>();

  // Cache Hive struct field names/types so converters can resolve per-field type info.
  if (hiveTypeInfo != null && hiveTypeInfo.getCategory().equals(ObjectInspector.Category.STRUCT)) {
    this.hiveFieldNames = ((StructTypeInfo) hiveTypeInfo).getAllStructFieldNames();
    this.hiveFieldTypeInfos = ((StructTypeInfo) hiveTypeInfo).getAllStructFieldTypeInfos();
  }

  List<Type> selectedFields = selectedGroupType.getFields();
  for (int i = 0; i < selectedFieldCount; i++) {
    Type subtype = selectedFields.get(i);
    if (isSubType(containingGroupType, subtype)) {
      // Field index in the file schema may differ from the selected-schema index i.
      int fieldIndex = containingGroupType.getFieldIndex(subtype.getName());
      TypeInfo _hiveTypeInfo = getFieldTypeIgnoreCase(hiveTypeInfo, subtype.getName(), fieldIndex);
      converters[i] = getFieldConverter(subtype, fieldIndex, _hiveTypeInfo);
    } else {
      throw new IllegalStateException("Group type [" + containingGroupType +
          "] does not contain requested field: " + subtype);
    }
  }
}
/**
 * Prints the given group in the row of Parquet file, recursing into nested
 * groups and emitting one {@code name value} pair per primitive occurrence.
 *
 * @param g The given group.
 */
private static void printGroup(Group g) {
  final int fieldCount = g.getType().getFieldCount();
  for (int fieldIndex = 0; fieldIndex < fieldCount; fieldIndex++) {
    final Type fieldType = g.getType().getType(fieldIndex);
    final String fieldName = fieldType.getName();
    final int repetitions = g.getFieldRepetitionCount(fieldIndex);
    for (int rep = 0; rep < repetitions; rep++) {
      if (fieldType.isPrimitive()) {
        System.out.println(fieldName + " " + g.getValueToString(fieldIndex, rep));
      } else {
        // Nested group: recurse one level down.
        printGroup(g.getGroup(fieldIndex, rep));
      }
    }
  }
  System.out.println();
}
// Rebuild the struct group with the (possibly pruned) child types.
return Types.buildGroup(fieldType.getRepetition())
    .addFields(typesArray)
    .named(fieldType.getName());
case LIST:
  // NOTE(review): fragment — the enclosing switch and the rest of this case are
  // outside this view; the final line below is a truncated builder expression.
  TypeInfo elemType = ((ListTypeInfo) colType).getListElementTypeInfo();
  // First child of the Parquet list group is the repeated wrapper/element.
  Type subFieldType = fieldType.asGroupType().getType(0);
  if (!subFieldType.isPrimitive()) {
    String subFieldName = subFieldType.getName();
    Text name = new Text(subFieldName);
    // Recognize the standard 3-level wrapper names ("array"/"list" constants).
    if (name.equals(ParquetHiveSerDe.ARRAY) || name.equals(ParquetHiveSerDe.LIST)) {
      subFieldType).named(fieldType.getName());
@Test public void testMapOriginalType() throws Exception { final String hiveColumnTypes = "map<string,string>"; final String hiveColumnNames = "mapCol"; final List<String> columnNames = createHiveColumnsFrom(hiveColumnNames); final List<TypeInfo> columnTypes = createHiveTypeInfoFrom(hiveColumnTypes); final MessageType messageTypeFound = HiveSchemaConverter.convert(columnNames, columnTypes); // this messageType only has one optional field, whose name is mapCol, original Type is MAP assertEquals(1, messageTypeFound.getFieldCount()); org.apache.parquet.schema.Type topLevel = messageTypeFound.getFields().get(0); assertEquals("mapCol",topLevel.getName()); assertEquals(OriginalType.MAP, topLevel.getOriginalType()); assertEquals(Repetition.OPTIONAL, topLevel.getRepetition()); assertEquals(1, topLevel.asGroupType().getFieldCount()); org.apache.parquet.schema.Type secondLevel = topLevel.asGroupType().getFields().get(0); //there is one repeated field for mapCol, the field name is "map" and its original Type is MAP_KEY_VALUE; assertEquals("map", secondLevel.getName()); assertEquals(OriginalType.MAP_KEY_VALUE, secondLevel.getOriginalType()); assertEquals(Repetition.REPEATED, secondLevel.getRepetition()); } }
/**
 * Builds a struct writer by wrapping each field writer in an option writer
 * keyed on that field's max definition level.
 */
@Override
public ParquetValueWriter<?> struct(GroupType struct, List<ParquetValueWriter<?>> fieldWriters) {
  final List<Type> fields = struct.getFields();
  final List<ParquetValueWriter<?>> writers =
      Lists.newArrayListWithExpectedSize(fieldWriters.size());
  int fieldIndex = 0;
  for (final Type fieldType : fields) {
    // The definition level tells the option wrapper how nulls are encoded.
    final int definitionLevel = type.getMaxDefinitionLevel(path(fieldType.getName()));
    writers.add(option(fieldType, definitionLevel, fieldWriters.get(fieldIndex)));
    fieldIndex++;
  }
  return new RecordWriter(writers);
}
/**
 * Creates the writer for a struct: each child writer is wrapped as optional
 * using the child field's max definition level within the full schema.
 */
@Override
public ParquetValueWriter<?> struct(GroupType struct, List<ParquetValueWriter<?>> fieldWriters) {
  final int fieldCount = struct.getFields().size();
  final List<ParquetValueWriter<?>> wrapped =
      Lists.newArrayListWithExpectedSize(fieldWriters.size());
  for (int idx = 0; idx < fieldCount; idx++) {
    final Type childType = struct.getType(idx);
    // Definition level of this child within the overall message schema.
    final int childD = type.getMaxDefinitionLevel(path(childType.getName()));
    wrapped.add(option(childType, childD, fieldWriters.get(idx)));
  }
  return new RecordWriter(wrapped);
}
/**
 * Resolves the schema node at {@code pathSegments[0..depth]} and, for interior
 * nodes, rebuilds a group chain that contains only the requested path.
 */
private static Type getType(String[] pathSegments, int depth, MessageType schema) {
  final Type resolved = schema.getType(Arrays.copyOfRange(pathSegments, 0, depth + 1));
  final boolean isLeaf = (depth + 1 == pathSegments.length);
  if (isLeaf) {
    return resolved;
  }
  // Only group nodes can have a deeper path segment.
  Preconditions.checkState(!resolved.isPrimitive());
  return new GroupType(
      resolved.getRepetition(),
      resolved.getName(),
      resolved.getOriginalType(),
      getType(pathSegments, depth + 1, schema));
}