public void start(Schema schema, DictionaryProvider provider) throws IOException {
  List<Field> fields = new ArrayList<>(schema.getFields().size());
  Set<Long> dictionaryIdsUsed = new HashSet<>();
  this.schema = schema; // Store original Schema to ensure batches written match

  // Convert fields with dictionaries to have dictionary type
  for (Field field : schema.getFields()) {
    fields.add(DictionaryUtility.toMessageFormat(field, provider, dictionaryIdsUsed));
  }
  Schema updatedSchema = new Schema(fields, schema.getCustomMetadata());

  generator.writeStartObject();
  generator.writeObjectField("schema", updatedSchema);

  // Write all dictionaries that were used
  if (!dictionaryIdsUsed.isEmpty()) {
    writeDictionaryBatches(generator, dictionaryIdsUsed, provider);
  }

  // Start writing of record batches
  generator.writeArrayFieldStart("batches");
}
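// Sketch (not part of the source): driving the writer above. The class name and
// constructor are assumptions modeled on Arrow's JsonFileWriter; start() is
// followed by one write() per record batch, and close() ends the "batches" array.
public static void writeExample(VectorSchemaRoot root, DictionaryProvider provider, File file) throws IOException {
  try (JsonFileWriter writer = new JsonFileWriter(file)) {
    writer.start(root.getSchema(), provider);
    writer.write(root);
  }
}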
public static CompleteType deserialize(byte[] bytes) {
  Schema schema = Schema.getRootAsSchema(ByteBuffer.wrap(bytes));
  org.apache.arrow.vector.types.pojo.Schema s =
      org.apache.arrow.vector.types.pojo.Schema.convertSchema(schema);
  // serialize() wraps the type in a single-field schema, so the type is field 0
  return CompleteType.fromField(s.getFields().get(0));
}
public int serialize(FlatBufferBuilder builder) {
  // Wrap this type in a single-field Schema named "f" so deserialize() can round-trip it
  org.apache.arrow.vector.types.pojo.Schema schema =
      new org.apache.arrow.vector.types.pojo.Schema(Collections.singletonList(this.toField("f")));
  return schema.getSchema(builder);
}
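// Sketch (not part of the source): round-tripping a CompleteType through the two
// methods above. Assumes a sample instance such as CompleteType.INT is available.
public static CompleteType roundTrip(CompleteType type) {
  FlatBufferBuilder builder = new FlatBufferBuilder();
  builder.finish(type.serialize(builder));
  return CompleteType.deserialize(builder.sizedByteArray());
}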
public VectorSchemaRoot(Schema schema, List<FieldVector> fieldVectors, int rowCount) {
  if (schema.getFields().size() != fieldVectors.size()) {
    throw new IllegalArgumentException("Fields must match field vectors. Found " +
        fieldVectors.size() + " vectors and " + schema.getFields().size() + " fields");
  }
  this.schema = schema;
  this.rowCount = rowCount;
  this.fieldVectors = fieldVectors;
  // Index the vectors by field name for lookup by name
  for (int i = 0; i < schema.getFields().size(); ++i) {
    Field field = schema.getFields().get(i);
    FieldVector vector = fieldVectors.get(i);
    fieldVectorsMap.put(field.getName(), vector);
  }
}
public VectorSchemaRoot(List<Field> fields, List<FieldVector> fieldVectors, int rowCount) {
  this(new Schema(fields), fieldVectors, rowCount);
}
@Test
public void testDataSetSchema() throws Exception {
  try (final KVStoreProvider kvstore =
      new LocalKVStoreProvider(DremioTest.CLASSPATH_SCAN_RESULT, null, true, false)) {
    kvstore.start();
    final NamespaceService ns = new NamespaceServiceImpl(kvstore);
    Field field1 = new Field("a", true, new Int(32, true), null);
    Field child1 = new Field("c", true, Utf8.INSTANCE, null);
    Field field2 = new Field("b", true, Struct.INSTANCE, ImmutableList.of(child1));
    Schema schema = new Schema(ImmutableList.of(field1, field2));
    FlatBufferBuilder builder = new FlatBufferBuilder();
    // Serialize the schema once and finish the buffer at its root offset
    builder.finish(schema.getSchema(builder));
    addSource(ns, "s");
    addPhysicalDS(ns, "s.foo", builder.sizedByteArray());
    ByteBuffer bb = ByteBuffer.wrap(
        DatasetHelper.getSchemaBytes(
            ns.getDataset(new NamespaceKey(PathUtils.parseFullPath("s.foo")))).toByteArray());
    Schema returnedSchema = Schema.convertSchema(org.apache.arrow.flatbuf.Schema.getRootAsSchema(bb));
    assertEquals(schema, returnedSchema);
  }
}
private void newSchema() throws IOException {
  // Reset the next memory-check threshold to half the current record count, bounded within the limits
  recordCountForNextMemCheck = min(
      max(MINIMUM_RECORD_COUNT_FOR_CHECK, recordCountForNextMemCheck / 2),
      MAXIMUM_RECORD_COUNT_FOR_CHECK);

  // Embed the Arrow schema as JSON in the Parquet footer metadata
  String json = new Schema(batchSchema).toJson();
  extraMetaData.put(DREMIO_ARROW_SCHEMA_2_1, json);

  List<Type> types = Lists.newArrayList();
  for (Field field : batchSchema) {
    if (field.getName().equalsIgnoreCase(WriterPrel.PARTITION_COMPARATOR_FIELD)) {
      continue; // internal partition comparator column is not written to Parquet
    }
    Type childType = getType(field);
    if (childType != null) {
      types.add(childType);
    }
  }
  Preconditions.checkState(types.size() > 0, "No types for parquet schema");
  schema = new MessageType("root", types);

  int dictionarySize = (int) context.getOptions().getOption(ExecConstants.PARQUET_DICT_PAGE_SIZE_VALIDATOR);
  final ParquetProperties parquetProperties = new ParquetProperties(dictionarySize, writerVersion,
      enableDictionary, new ParquetDirectByteBufferAllocator(columnEncoderAllocator), pageSize, true,
      enableDictionaryForBinary);
  pageStore = ColumnChunkPageWriteStoreExposer.newColumnChunkPageWriteStore(
      codecFactory.getCompressor(codec), schema, parquetProperties);
  store = new ColumnWriteStoreV1(pageStore, pageSize, parquetProperties);
  MessageColumnIO columnIO = new ColumnIOFactory(false).getColumnIO(this.schema);
  consumer = columnIO.getRecordWriter(store);
  setUp(schema, consumer);
}
public static Schema deserialize(ByteBuffer buffer) {
  return convertSchema(org.apache.arrow.flatbuf.Schema.getRootAsSchema(buffer));
}
public byte[] toByteArray() {
  FlatBufferBuilder builder = new FlatBufferBuilder();
  int schemaOffset = this.getSchema(builder);
  builder.finish(schemaOffset);
  ByteBuffer bb = builder.dataBuffer();
  byte[] bytes = new byte[bb.remaining()];
  bb.get(bytes);
  return bytes;
}
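// Sketch (not part of the source): toByteArray() pairs with deserialize(ByteBuffer)
// above, since both use the flatbuffer Schema root. The sample schema is illustrative.
public static void roundTripExample() {
  Schema schema = new Schema(ImmutableList.of(new Field("a", true, new Int(32, true), null)));
  Schema restored = Schema.deserialize(ByteBuffer.wrap(schema.toByteArray()));
  assert schema.equals(restored);
}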
Converter groupConverterFromArrowSchema(String nameForChild, String fieldName, GroupType groupType,
    Collection<SchemaPath> c) {
  final Field arrowField = Schema.findField(arrowSchema, fieldName);
  final ArrowTypeID arrowTypeType = arrowField.getType().getTypeID();
  final List<Field> arrowChildren = arrowField.getChildren();
  if (arrowTypeType == ArrowTypeID.Union) {
    // if it's a union we will add the children directly to the parent
    return new UnionGroupConverter(mutator, getWriterProvider(), groupType, c, options, arrowChildren,
        nameForChild, schemaHelper);
  } else if (arrowTypeType == ArrowTypeID.List) {
    // make sure the parquet schema matches the arrow schema and delegate handling
    // the logical list to defaultGroupConverter()
    Preconditions.checkState(groupType.getOriginalType() == OriginalType.LIST,
        "parquet schema doesn't match the arrow schema for LIST " + nameForChild);
  }
  return defaultGroupConverter(mutator, groupType, nameForChild, c, arrowChildren);
}
@Test
public void testBackwardCompatofSchema() throws Exception {
  // should not fail with a serialization exception
  Schema schema = DremioArrowSchema.fromJSON(OLD_SCHEMA);
  String newJson = schema.toJson();
  // the legacy typeLayout property must not be re-emitted
  assertFalse(newJson.contains("typeLayout"));
}
public static VectorSchemaRoot create(Schema schema, BufferAllocator allocator) {
  List<FieldVector> fieldVectors = new ArrayList<>();
  for (Field field : schema.getFields()) {
    FieldVector vector = field.createVector(allocator);
    fieldVectors.add(vector);
  }
  if (fieldVectors.size() != schema.getFields().size()) {
    throw new IllegalArgumentException("The root vector did not create the right number of children. found " +
        fieldVectors.size() + " expected " + schema.getFields().size());
  }
  return new VectorSchemaRoot(schema, fieldVectors, 0);
}
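// Sketch (not part of the source): typical use of create(). RootAllocator is
// Arrow's standard BufferAllocator implementation; the sample schema is illustrative.
public static void createExample() {
  try (BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE)) {
    Schema schema = new Schema(ImmutableList.of(new Field("a", true, new Int(32, true), null)));
    try (VectorSchemaRoot root = VectorSchemaRoot.create(schema, allocator)) {
      // one FieldVector exists per field; rowCount starts at 0
    }
  }
}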
@Override
public Schema deserialize(JsonParser jsonParser, DeserializationContext deserializationContext)
    throws IOException, JsonProcessingException {
  JsonNode node = jsonParser.getCodec().readTree(jsonParser);
  // custom metadata is serialized as a plain string-to-string map
  JsonNode metadataNode = node.get("metadata");
  Map<String, String> metadata = mapper.convertValue(metadataNode, Map.class);
  JsonNode fieldsNode = node.get("fields");
  Iterable<Field> fields = fieldsReader.readValue(fieldsNode);
  return new Schema(fields, metadata);
}
/**
 * Deserializes an Arrow Schema object from a schema message. Format is from serialize().
 *
 * @param schemaMessage a Message of type MessageHeader.Schema
 * @return the deserialized Arrow Schema
 */
public static Schema deserializeSchema(Message schemaMessage) {
  return Schema.convertSchema(
      (org.apache.arrow.flatbuf.Schema) schemaMessage.header(new org.apache.arrow.flatbuf.Schema()));
}
/**
 * Serializes a schema object.
 *
 * @param out where to write the schema
 * @param schema the object to serialize to out
 * @return the number of bytes written
 * @throws IOException if something went wrong
 */
public static long serialize(WriteChannel out, Schema schema) throws IOException {
  long start = out.getCurrentPosition();
  assert start % 8 == 0;
  FlatBufferBuilder builder = new FlatBufferBuilder();
  int schemaOffset = schema.getSchema(builder);
  ByteBuffer serializedMessage = serializeMessage(builder, MessageHeader.Schema, schemaOffset, 0);
  int messageLength = serializedMessage.remaining();
  int bytesWritten = writeMessageBuffer(out, messageLength, serializedMessage);
  assert bytesWritten % 8 == 0;
  return bytesWritten;
}
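// Sketch (not part of the source): writing a schema message to a file. Assumes this
// serialize() lives on Arrow's MessageSerializer and that WriteChannel wraps a
// WritableByteChannel, as in Arrow's IPC code.
public static void writeSchemaToFile(Schema schema, File file) throws IOException {
  try (FileOutputStream out = new FileOutputStream(file)) {
    long bytesWritten = MessageSerializer.serialize(new WriteChannel(out.getChannel()), schema);
    // bytesWritten is 8-byte aligned, per the asserts above
  }
}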
    se, context.getOptions(),
    arrowSchema == null ? null : arrowSchema.findField(fieldName),
    schemaHelper.readInt96AsTimeStamp());
columnChunkMetaData = rowGroupMetadata.getColumns().get(
    columnChunkMetadataPositionsInList.get(Arrays.toString(column.getPath())));
schemaElement = schemaElements.get(column.getPath()[0]);
// Prefer the field from the embedded Arrow schema when one is present
Field childArrowField = arrowSchema == null ? null : arrowSchema.findField(schemaElement.getName());
if (childArrowField != null) {
  field = childArrowField;
public Schema start() throws JsonParseException, IOException {
  readToken(START_OBJECT);
  {
    Schema originalSchema = readNextField("schema", Schema.class);
    List<Field> fields = new ArrayList<>();
    dictionaries = new HashMap<>();

    // Convert fields with dictionaries to have the index type
    for (Field field : originalSchema.getFields()) {
      fields.add(DictionaryUtility.toMemoryFormat(field, allocator, dictionaries));
    }
    this.schema = new Schema(fields, originalSchema.getCustomMetadata());

    if (!dictionaries.isEmpty()) {
      nextFieldIs("dictionaries");
      readDictionaryBatches();
    }

    nextFieldIs("batches");
    readToken(START_ARRAY);
    started = true;
    return this.schema;
  }
}
public static TypedFieldId getFieldId(Schema schema, SchemaPath path, boolean isHyper) {
  int i = 0;
  for (Field f : schema.getFields()) {
    TypedFieldId id = getFieldId(f, i, path, isHyper);
    if (id != null) {
      return id;
    }
    i++;
  }
  return null; // no field in the schema matched the path
}
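// Sketch (not part of the source): resolving a simple column reference against a
// schema. SchemaPath.getSimplePath is an assumption, modeled on the Drill-derived
// expression classes used alongside this code.
public static TypedFieldId lookup(Schema schema, String columnName) {
  return getFieldId(schema, SchemaPath.getSimplePath(columnName), false /* not hyper */);
}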
public static BatchSchema deserialize(byte[] bytes) {
  Schema schema = Schema.getRootAsSchema(ByteBuffer.wrap(bytes));
  org.apache.arrow.vector.types.pojo.Schema s =
      org.apache.arrow.vector.types.pojo.Schema.convertSchema(schema);
  return new BatchSchema(SelectionVectorMode.NONE, s.getFields());
}
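// Sketch (not part of the source): bytes produced by the pojo Schema#toByteArray()
// shown earlier can feed this deserializer, since both sides use the flatbuffer
// Schema root.
public static BatchSchema fromPojoSchema(org.apache.arrow.vector.types.pojo.Schema pojoSchema) {
  return BatchSchema.deserialize(pojoSchema.toByteArray());
}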