  public static ColumnIdMap deserialize(TypeDescription schema, ByteBuffer serial) {
    ColumnIdMap result = new ColumnIdMap();
    String[] parts = StandardCharsets.UTF_8.decode(serial).toString().split(",");
    for (int i = 0; i < parts.length; ++i) {
      String[] subparts = parts[i].split(":");
      result.put(schema.findSubtype(Integer.parseInt(subparts[0])),
          Integer.parseInt(subparts[1]));
    }
    return result;
  }
}
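// A minimal sketch of the serialize() side that deserialize() above expects:
// comma-separated "orcColumnId:icebergId" pairs, UTF-8 encoded. This is an
// inferred counterpart, not the actual implementation; keySet() and get()
// are assumed available on ColumnIdMap, matching their use in metrics() below.
public ByteBuffer serialize() {
  StringBuilder buffer = new StringBuilder();
  boolean first = true;
  for (TypeDescription type : keySet()) {
    if (!first) {
      buffer.append(',');
    }
    buffer.append(type.getId()).append(':').append(get(type));
    first = false;
  }
  return ByteBuffer.wrap(buffer.toString().getBytes(StandardCharsets.UTF_8));
}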
  OrcFileAppender(Schema schema, OutputFile file,
                  OrcFile.WriterOptions options, Map<String, byte[]> metadata) {
    orcSchema = TypeConversion.toOrc(schema, columnIds);
    options.setSchema(orcSchema);
    path = new Path(file.location());
    try {
      writer = OrcFile.createWriter(path, options);
    } catch (IOException e) {
      throw new RuntimeException("Can't create file " + path, e);
    }
    // Persist the Iceberg field ids in the ORC footer so readers can map ORC
    // columns back to Iceberg columns, then pass through any user metadata.
    writer.addUserMetadata(COLUMN_NUMBERS_ATTRIBUTE, columnIds.serialize());
    metadata.forEach((key, value) -> writer.addUserMetadata(key, ByteBuffer.wrap(value)));
  }
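// Hedged construction sketch for the appender above. icebergSchema and
// outputFile are hypothetical placeholders; OrcFile.writerOptions(conf) is the
// stock ORC entry point for building writer options.
Map<String, byte[]> extra = new HashMap<>();
extra.put("written.by", "example".getBytes(StandardCharsets.UTF_8));  // illustrative key
OrcFileAppender appender = new OrcFileAppender(
    icebergSchema, outputFile, OrcFile.writerOptions(new Configuration()), extra);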
  public OrcIterator build() {
    Preconditions.checkNotNull(schema, "Schema is required");
    try {
      Path path = new Path(file.location());
      Reader reader = OrcFile.createReader(path, OrcFile.readerOptions(conf));
      // Convert the Iceberg schema to an ORC schema so the reader projects
      // exactly the requested columns.
      ColumnIdMap columnIds = new ColumnIdMap();
      TypeDescription orcSchema = TypeConversion.toOrc(schema, columnIds);
      Reader.Options options = reader.options();
      if (start != null) {
        // Restrict the read to the requested byte range of the file split.
        options.range(start, length);
      }
      options.schema(orcSchema);
      return new OrcIterator(path, orcSchema, reader.rows(options));
    } catch (IOException e) {
      throw new RuntimeException("Can't open " + file.location(), e);
    }
  }
}
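// Hedged usage sketch of the builder above, mirroring the call chain used by
// the SparkOrcReader constructor below. That OrcIterator iterates over ORC
// VectorizedRowBatch instances is an assumption here, not confirmed by the source.
OrcIterator rows = ORC.read(file)        // file: an Iceberg InputFile
    .schema(projectedSchema)             // projectedSchema: hypothetical Iceberg Schema
    .build();
while (rows.hasNext()) {
  VectorizedRowBatch batch = rows.next();
  // consume batch.size rows from batch.cols here
}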
  @Override
  public Metrics metrics() {
    try {
      long rows = writer.getNumberOfRows();
      ColumnStatistics[] stats = writer.getStatistics();
      // we don't currently have columnSizes or distinct counts.
      Map<Integer, Long> valueCounts = new HashMap<>();
      Map<Integer, Long> nullCounts = new HashMap<>();
      // Invert the type-to-id map into a lookup from ORC column id to Iceberg
      // field id; entries stay null for ORC columns without a mapping.
      Integer[] icebergIds = new Integer[orcSchema.getMaximumId() + 1];
      for (TypeDescription type : columnIds.keySet()) {
        icebergIds[type.getId()] = columnIds.get(type);
      }
      // Start at 1 to skip column 0, the root struct.
      for (int c = 1; c < stats.length; ++c) {
        if (icebergIds[c] != null) {
          valueCounts.put(icebergIds[c], stats[c].getNumberOfValues());
        }
      }
      // Null counts are only derived for top-level columns, where a value
      // missing from the ORC statistics means the row was null.
      for (TypeDescription child : orcSchema.getChildren()) {
        int c = child.getId();
        if (icebergIds[c] != null) {
          nullCounts.put(icebergIds[c], rows - stats[c].getNumberOfValues());
        }
      }
      return new Metrics(rows, null, valueCounts, nullCounts);
    } catch (IOException e) {
      throw new RuntimeException("Can't get statistics " + path, e);
    }
  }
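// Worked example for the id translation above (ids illustrative): for an
// Iceberg schema struct<a: int (field id 1), b: string (field id 2)>, the
// converted ORC schema is struct<a:int,b:string> with ORC column ids
// 0 (root struct), 1 (a), 2 (b); icebergIds becomes [null, 1, 2], so stats[1]
// and stats[2] populate valueCounts, and the null count for field 1 is
// rows - stats[1].getNumberOfValues().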
      case STRUCT: {
        List<String> fieldNames = schema.getFieldNames();
        List<TypeDescription> fieldTypes = schema.getChildren();
        List<Types.NestedField> fields = new ArrayList<>(fieldNames.size());
        for (int c = 0; c < fieldNames.size(); ++c) {
          String name = fieldNames.get(c);
          TypeDescription type = fieldTypes.get(c);
          fields.add(Types.NestedField.optional(columnIds.get(type), name,
              convertOrcToType(type, columnIds)));
        }
        return Types.StructType.of(fields);
      }
      case LIST: {
        TypeDescription child = schema.getChildren().get(0);
        return Types.ListType.ofOptional(columnIds.get(child),
            convertOrcToType(child, columnIds));
      }
      case MAP: {
        TypeDescription key = schema.getChildren().get(0);
        TypeDescription value = schema.getChildren().get(1);
        return Types.MapType.ofOptional(columnIds.get(key), columnIds.get(value),
            convertOrcToType(key, columnIds), convertOrcToType(value, columnIds));
      }
      // Record the Iceberg field id for the ORC type that was just converted.
      columnIds.put(result, fieldId);
  public SparkOrcReader(InputFile location, FileScanTask task, Schema readSchema) {
    ColumnIdMap columnIds = new ColumnIdMap();
    orcSchema = TypeConversion.toOrc(readSchema, columnIds);
    reader = ORC.read(location)
        .split(task.start(), task.length())
        .schema(readSchema)
        .build();
    // Reuse a single UnsafeRow, growing its buffer on demand, and build one
    // converter per top-level column to write values into the row.
    int numFields = readSchema.columns().size();
    row = new UnsafeRow(numFields);
    holder = new BufferHolder(row, INITIAL_SIZE);
    writer = new UnsafeRowWriter(holder, numFields);
    converter = new Converter[numFields];
    for (int c = 0; c < numFields; ++c) {
      converter[c] = buildConverter(holder, orcSchema.getChildren().get(c));
    }
  }