/**
 * Writes {@code elementNum} rows in which the repeated int32 list field is either
 * populated with a fixed-size list of the values 0..3 or omitted entirely.
 *
 * @param writer     destination parquet writer; closed before this method returns
 * @param elementNum number of rows (groups) to write
 * @param isNull     when {@code true}, every row is written with the list field absent
 * @throws IOException if the underlying writer fails
 */
protected static void writeRepeateListData(ParquetWriter<Group> writer, int elementNum, boolean isNull)
        throws IOException {
    final SimpleGroupFactory groupFactory = new SimpleGroupFactory(schema);
    final int fixedListSize = 4;
    for (int row = 0; row < elementNum; row++) {
        final Group record = groupFactory.newGroup();
        if (!isNull) {
            // Same list content on every row: the values 0..fixedListSize-1.
            for (int element = 0; element < fixedListSize; element++) {
                record.append("list_int32_field_for_repeat_test", element);
            }
        }
        writer.write(record);
    }
    writer.close();
}
/**
 * Creates the record-materialization root for the given schema.
 * The root converter allocates a fresh group at the start of each record;
 * nothing happens at record end because child converters write directly
 * into the current group.
 */
public GroupRecordConverter(MessageType schema) {
    this.simpleGroupFactory = new SimpleGroupFactory(schema);
    this.root = new SimpleGroupConverter(null, 0, schema) {
        @Override
        public void start() {
            // New group per record so each materialized record is independent.
            this.current = simpleGroupFactory.newGroup();
        }

        @Override
        public void end() {
            // Intentionally empty: the finished group is read by the caller.
        }
    };
}
/** Begins a new record: allocate a fresh group to collect this record's values. */
@Override
public void start() {
    this.current = simpleGroupFactory.newGroup();
}
/**
 * Creates the record-materialization root for the given schema.
 * The root converter allocates a fresh group at the start of each record;
 * nothing happens at record end because child converters write directly
 * into the current group.
 */
public GroupRecordConverter(MessageType schema) {
    this.simpleGroupFactory = new SimpleGroupFactory(schema);
    this.root = new SimpleGroupConverter(null, 0, schema) {
        @Override
        public void start() {
            // New group per record so each materialized record is independent.
            this.current = simpleGroupFactory.newGroup();
        }

        @Override
        public void end() {
            // Intentionally empty: the finished group is read by the caller.
        }
    };
}
/** Begins a new record: allocate a fresh group to collect this record's values. */
@Override
public void start() {
    this.current = simpleGroupFactory.newGroup();
}
/**
 * Writes {@code elementNum} rows in which the repeated map field is either
 * populated with four identical key/value pairs (j -> j for j in 0..3) or
 * omitted entirely.
 *
 * @param writer     destination parquet writer; closed before this method returns
 * @param elementNum number of rows (groups) to write
 * @param isNull     when {@code true}, every row is written with the map field absent
 * @throws IOException if the underlying writer fails
 */
protected static void writeRepeateMapData(ParquetWriter<Group> writer, int elementNum, boolean isNull)
        throws IOException {
    final SimpleGroupFactory groupFactory = new SimpleGroupFactory(schema);
    final int fixedMapSize = 4;
    for (int row = 0; row < elementNum; row++) {
        final Group record = groupFactory.newGroup();
        if (!isNull) {
            // Each entry is added as a nested key/value group on the map field.
            for (int entry = 0; entry < fixedMapSize; entry++) {
                record.addGroup("map_int32_for_repeat_test")
                        .append("key", entry)
                        .append("value", entry);
            }
        }
        writer.write(record);
    }
    writer.close();
}
/**
 * Opens an uncompressed parquet writer for {@code outputPath} using the
 * example Group write path, with the class's default page/block sizes and
 * dictionary/validation settings.
 *
 * @param outputPath    destination file path
 * @param schema        parquet message schema for all rows written later
 * @param writerVersion parquet format version (v1/v2) for the writer
 * @throws IOException if the writer cannot be created
 */
private ApacheParquet(String outputPath, MessageType schema, WriterVersion writerVersion) throws IOException {
    this.schema = schema;
    this.outputPath = outputPath;
    Configuration configuration = new Configuration();
    // GroupWriteSupport reads the schema back out of the configuration at write time.
    GroupWriteSupport.setSchema(schema, configuration);
    this.writer = ExampleParquetWriter.builder(new Path(outputPath))
            .withType(schema)
            .withConf(configuration)
            .withPageSize(DEFAULT_PAGE_SIZE)
            .withDictionaryPageSize(DEFAULT_PAGE_SIZE)
            .withDictionaryEncoding(DEFAULT_IS_DICTIONARY_ENABLED)
            .withValidation(DEFAULT_IS_VALIDATING_ENABLED)
            .withWriterVersion(writerVersion)
            .withRowGroupSize(DEFAULT_BLOCK_SIZE) // set Parquet file block size and page size values
            .withCompressionCodec(CompressionCodecName.UNCOMPRESSED) // compression type
            .build();
    this.groupFactory = new SimpleGroupFactory(this.schema);
}
/**
 * Writes one row: each value in {@code evalRow} is appended positionally to
 * the column with the same index, converted according to that column's Java
 * type. Write failures are logged rather than propagated.
 *
 * @param evalRow row values, ordered to match the schema's columns
 */
@Override
public void writeLine(List<Object> evalRow) {
    Group group = groupFactory.newGroup();
    List<ColumnDescriptor> columns = schema.getColumns();
    for (int i = 0; i < evalRow.size(); i++) {
        Object value = evalRow.get(i);
        addValueToGroup(columns.get(i).getType().javaType, group, i, value);
    }
    try {
        writeGroup(group);
    } catch (IOException e) {
        // FIX: was logger.error("", e) — an empty message left no context in the log.
        logger.error("Failed to write parquet row: {}", evalRow, e);
    }
}
/**
 * Writes {@code elementNum} rows exercising repeated (list) fields of every
 * primitive type plus decimal. Row i gets a list of length (i % 4) + 1 unless
 * the row is designated null, in which case all typed list fields are absent.
 * A separate binary list field is always populated with 4 elements so repeated
 * non-null data is present on every row.
 *
 * @param writer               destination parquet writer; closed before returning
 * @param isDictionaryEncoding passed through to the value generators to control cardinality
 * @param elementNum           number of rows (groups) to write
 * @throws IOException if the underlying writer fails
 */
protected static void writeListData(ParquetWriter<Group> writer, boolean isDictionaryEncoding, int elementNum)
        throws IOException {
    SimpleGroupFactory f = new SimpleGroupFactory(schema);
    int listMaxSize = 4;
    // Global element counter: values keep advancing across rows, so list
    // contents differ from row to row.
    int listElementIndex = 0;
    for (int i = 0; i < elementNum; i++) {
        boolean isNull = isNull(i);
        Group group = f.newGroup();
        // List length cycles 1..listMaxSize as i increases.
        int listSize = i % listMaxSize + 1;
        if (!isNull) {
            for (int j = 0; j < listSize; j++) {
                group.append("list_int32_field", getIntValue(isDictionaryEncoding, listElementIndex));
                group.append("list_int64_field", getLongValue(isDictionaryEncoding, listElementIndex));
                group.append("list_double_field", getDoubleValue(isDictionaryEncoding, listElementIndex));
                group.append("list_float_field", getFloatValue(isDictionaryEncoding, listElementIndex));
                group.append("list_boolean_field", getBooleanValue(listElementIndex));
                group.append("list_binary_field", getBinaryValue(isDictionaryEncoding, listElementIndex));
                // Decimals are stored as the unscaled value's raw bytes (scale fixed at 2).
                HiveDecimal hd = getDecimal(isDictionaryEncoding, listElementIndex).setScale(2);
                HiveDecimalWritable hdw = new HiveDecimalWritable(hd);
                group.append("list_decimal_field", Binary.fromConstantByteArray(hdw.getInternalStorage()));
                listElementIndex++;
            }
        }
        // Always-populated repeated field (keyed by the row index, not the
        // element counter) used by repeat-detection tests.
        for (int j = 0; j < listMaxSize; j++) {
            group.append("list_binary_field_for_repeat_test", getBinaryValue(isDictionaryEncoding, i));
        }
        writer.write(group);
    }
    writer.close();
}
.named("Pair"); GroupFactory factory = new SimpleGroupFactory(schema);
/**
 * Writes one row: each value from {@code row} is appended positionally to the
 * column with the same index, converted according to that column's Java type.
 * Write failures are logged rather than propagated.
 *
 * @param row row values, ordered to match the schema's columns
 */
@Override
public void writeLine(Row row) {
    Group group = groupFactory.newGroup();
    List<ColumnDescriptor> columns = schema.getColumns();
    for (int i = 0; i < row.size(); i++) {
        Object value = row.getAs(i);
        // BUG FIX: was addValueToGroup(..., i++, ...). Combined with the loop's
        // own i++, the index advanced twice per iteration — values landed on
        // every other column and half the row was skipped. Pass i unmodified
        // (matching the List<Object> overload of writeLine).
        addValueToGroup(columns.get(i).getType().javaType, group, i, value);
    }
    try {
        writeGroup(group);
    } catch (IOException e) {
        logger.error("Failed to write parquet row", e);
    }
}
protected static void writeMapData(ParquetWriter<Group> writer, boolean isDictionaryEncoding, int elementNum) throws IOException { SimpleGroupFactory f = new SimpleGroupFactory(schema); int mapMaxSize = 4; int mapElementIndex = 0; for (int i = 0; i < elementNum; i++) { boolean isNull = isNull(i); Group group = f.newGroup();
/**
 * Writes one row from a field-name-to-value map. Field names are matched
 * case-insensitively against the schema. A field whose value cannot be
 * converted is skipped and logged once per field name; a row that cannot be
 * written is skipped and logged. Nothing is propagated to the caller.
 *
 * @param evalRow field name -> value for the row to write
 */
@Override
public void writeLine(Map<String, Object> evalRow) {
    // Wrap the row in a case-insensitive map so field-name lookups ignore case.
    Map obj = new org.apache.commons.collections.map.CaseInsensitiveMap(evalRow);
    Group group = groupFactory.newGroup();
    int i = 0;
    for (Type field : schema.getFields()) {
        OriginalType o = field.getOriginalType();
        // MAP-typed fields have no primitive Java type; everything else uses
        // the primitive type's Java class for conversion.
        Class<?> javaType = (o != null && o.name().equals("MAP"))
                ? Map.class
                : field.asPrimitiveType().getPrimitiveTypeName().javaType;
        Object value = obj.get(field.getName());
        try {
            addValueToGroup(javaType, group, i++, value);
        } catch (Exception e) {
            // Log each bad field name only once to avoid flooding the log.
            if (!errField.contains(field.getName())) {
                errField.offer(field.getName());
                logger.warn("错误字段:{}:{} 原因:{} file={}", field.getName(), value, e.getMessage(), outputPath);
            }
        }
    }
    try {
        writeGroup(group);
    } catch (Exception e) {
        logger.warn("错误行:{} err:", evalRow, e);
    }
}
protected static void writeData(ParquetWriter<Group> writer, boolean isDictionaryEncoding) throws IOException { SimpleGroupFactory f = new SimpleGroupFactory(schema); for (int i = 0; i < nElements; i++) { boolean isNull = isNull(i); boolean booleanVal = getBooleanValue(i); Binary binary = getBinaryValue(isDictionaryEncoding, i); Group group = f.newGroup() .append("int32_field", intVal) .append("int64_field", longVal)
/**
 * Writes a small fixed test parquet file with four typed columns plus an int
 * key, containing four hard-coded rows.
 *
 * @param data destination file path
 * @param conf Hadoop configuration; receives the schema for GroupWriteSupport
 * @throws IOException if the writer cannot be created or a row fails to write
 */
private void writeParquetFile(Path data,Configuration conf) throws IOException {
    MessageType schema = parseMessageType(
        "message test { "
            + "required int32 key; "
            + "required int32 column1_i; "
            + "required double column2_d; "
            + "required binary column3_s; "
            + "required boolean column4_b; "
            + "} ");
    GroupWriteSupport.setSchema(schema, conf);
    SimpleGroupFactory f = new SimpleGroupFactory(schema);
    // FIX: try-with-resources — the original leaked the writer if any
    // write() threw before the explicit close().
    try (ParquetWriter<Group> writer = new ParquetWriter<Group>(data, new GroupWriteSupport(),
            UNCOMPRESSED, 1024, 1024, 512, true, false,
            ParquetProperties.WriterVersion.PARQUET_1_0, conf)) {
        writer.write(f.newGroup().append("key", 1).append("column1_i", 3).append("column2_d", 2.3)
            .append("column3_s", "some string").append("column4_b", true));
        writer.write(f.newGroup().append("key", 2).append("column1_i", 5).append("column2_d", 4.5)
            .append("column3_s", "some more").append("column4_b", false));
        writer.write(f.newGroup().append("key", 3).append("column1_i", 7).append("column2_d", 5.6)
            .append("column3_s", "some more and more").append("column4_b", true));
        writer.write(f.newGroup().append("key", 4).append("column1_i", 9).append("column2_d", 10.9)
            .append("column3_s", "some more and alst").append("column4_b", false));
    }
}
}
/**
 * Writes a small fixed test parquet file (string-typed column2_d variant)
 * with four typed columns plus an int key, containing four hard-coded rows.
 *
 * @param data destination file path
 * @param conf Hadoop configuration; receives the schema for GroupWriteSupport
 * @throws IOException if the writer cannot be created or a row fails to write
 */
private void writeParquetFile(Path data,Configuration conf) throws IOException {
    MessageType schema = parseMessageType(
        "message test { "
            + "required int32 key; "
            + "required int32 column1_i_s; "
            + "required binary column2_d; "
            + "required binary column3_s; "
            + "required boolean column4_b; "
            + "} ");
    GroupWriteSupport.setSchema(schema, conf);
    SimpleGroupFactory f = new SimpleGroupFactory(schema);
    // FIX: try-with-resources — the original leaked the writer if any
    // write() threw before the explicit close().
    try (ParquetWriter<Group> writer = new ParquetWriter<Group>(data, new GroupWriteSupport(),
            UNCOMPRESSED, 1024, 1024, 512, true, false,
            ParquetProperties.WriterVersion.PARQUET_1_0, conf)) {
        writer.write(f.newGroup().append("key", 1).append("column1_i_s", 292).append("column2_d", "no type")
            .append("column3_s", "some string").append("column4_b", true));
        writer.write(f.newGroup().append("key", 2).append("column1_i_s", 23).append("column2_d", "no type")
            .append("column3_s", "some more").append("column4_b", false));
        writer.write(f.newGroup().append("key", 3).append("column1_i_s", 32).append("column2_d", "no type")
            .append("column3_s", "some more and more").append("column4_b", true));
        writer.write(f.newGroup().append("key", 4).append("column1_i_s", 22).append("column2_d", "no type")
            .append("column3_s", "some more and alst").append("column4_b", false));
    }
}
}