public static List<IndexedRecord> generateHoodieTestRecords(int from, int limit)
    throws IOException, URISyntaxException {
  List<IndexedRecord> records = generateTestRecords(from, limit);
  String commitTime = HoodieActiveTimeline.createNewCommitTime();
  Schema hoodieFieldsSchema = HoodieAvroUtils.addMetadataFields(getSimpleSchema());
  // Rewrite each record against the metadata-augmented schema, then populate the
  // Hudi metadata fields: record key, partition path and commit time.
  return records.stream()
      .map(s -> HoodieAvroUtils.rewriteRecord((GenericRecord) s, hoodieFieldsSchema))
      .map(p -> {
        p.put(HoodieRecord.RECORD_KEY_METADATA_FIELD, UUID.randomUUID().toString());
        p.put(HoodieRecord.PARTITION_PATH_METADATA_FIELD, "0000/00/00");
        p.put(HoodieRecord.COMMIT_TIME_METADATA_FIELD, commitTime);
        return p;
      })
      .collect(Collectors.toList());
}
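// Minimal usage sketch for the generator above (the assertions below are
// illustrative assumptions, not taken from the original suite): every generated
// record should carry a random record key and the fixed partition path
// "0000/00/00" set by the generator.
List<IndexedRecord> generated = SchemaTestUtil.generateHoodieTestRecords(0, 50);
for (IndexedRecord rec : generated) {
  GenericRecord gr = (GenericRecord) rec;
  assertNotNull(gr.get(HoodieRecord.RECORD_KEY_METADATA_FIELD));
  assertEquals("0000/00/00", gr.get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString());
}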
@SuppressWarnings({"unchecked", "deprecation"}) private static void generateParquetData(Path filePath, boolean isParquetSchemaSimple) throws IOException, URISyntaxException, InterruptedException { Schema schema = (isParquetSchemaSimple ? SchemaTestUtil.getSimpleSchema() : SchemaTestUtil.getEvolvedSchema()); org.apache.parquet.schema.MessageType parquetSchema = new AvroSchemaConverter().convert(schema); BloomFilter filter = new BloomFilter(1000, 0.0001); HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(parquetSchema, schema, filter); ParquetWriter writer = new ParquetWriter(filePath, writeSupport, CompressionCodecName.GZIP, 120 * 1024 * 1024, ParquetWriter.DEFAULT_PAGE_SIZE, ParquetWriter.DEFAULT_PAGE_SIZE, ParquetWriter.DEFAULT_IS_DICTIONARY_ENABLED, ParquetWriter.DEFAULT_IS_VALIDATING_ENABLED, ParquetWriter.DEFAULT_WRITER_VERSION, fileSystem.getConf()); List<IndexedRecord> testRecords = (isParquetSchemaSimple ? SchemaTestUtil .generateTestRecords(0, 100) : SchemaTestUtil.generateEvolvedTestRecords(100, 100)); testRecords.forEach(s -> { try { writer.write(s); } catch (IOException e) { fail("IOException while writing test records as parquet" + e.toString()); } }); writer.close(); }
public static List<HoodieRecord> generateHoodieTestRecords(int from, int limit, Schema schema)
    throws IOException, URISyntaxException {
  List<IndexedRecord> records = generateTestRecords(from, limit);
  return records.stream()
      .map(s -> HoodieAvroUtils.rewriteRecord((GenericRecord) s, schema))
      .map(p -> convertToHoodieRecords(p, UUID.randomUUID().toString(), "0000/00/00"))
      .collect(Collectors.toList());
}
public static List<IndexedRecord> generateEvolvedTestRecords(int from, int limit)
    throws IOException, URISyntaxException {
  return toRecords(getSimpleSchema(), getEvolvedSchema(), from, limit);
}
public static List<IndexedRecord> generateTestRecords(int from, int limit)
    throws IOException, URISyntaxException {
  return toRecords(getSimpleSchema(), getSimpleSchema(), from, limit);
}
@Test(expected = IOException.class)
public void simpleTestWithException() throws IOException, URISyntaxException {
  Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getSimpleSchema());
  String payloadClazz = HoodieAvroPayload.class.getName();
  // 16B max in-memory size, so entries spill to disk almost immediately
  ExternalSpillableMap<String, HoodieRecord<? extends HoodieRecordPayload>> records =
      new ExternalSpillableMap<>(16L, FAILURE_OUTPUT_PATH, new DefaultSizeEstimator(),
          new HoodieRecordSizeEstimator(schema));
  List<IndexedRecord> iRecords = SchemaTestUtil.generateHoodieTestRecords(0, 100);
  List<String> recordKeys = SpillableMapTestUtils.upsertRecords(iRecords, records);
  assert (recordKeys.size() == 100);
  // Simulate a failure on the first iteration; the test expects this IOException.
  Iterator<HoodieRecord<? extends HoodieRecordPayload>> itr = records.iterator();
  while (itr.hasNext()) {
    throw new IOException("Testing failures...");
  }
}
@Test
public void testSimpleUpsert() throws IOException, URISyntaxException {
  Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getSimpleSchema());
  String payloadClazz = HoodieAvroPayload.class.getName();
  // 16B max in-memory size, so entries spill to disk almost immediately
  ExternalSpillableMap<String, HoodieRecord<? extends HoodieRecordPayload>> records =
      new ExternalSpillableMap<>(16L, BASE_OUTPUT_PATH, new DefaultSizeEstimator(),
          new HoodieRecordSizeEstimator(schema));
  List<IndexedRecord> iRecords = SchemaTestUtil.generateHoodieTestRecords(0, 100);
  List<String> recordKeys = SpillableMapTestUtils.upsertRecords(iRecords, records);
  assert (recordKeys.size() == 100);
  // every record read back should carry one of the inserted keys
  Iterator<HoodieRecord<? extends HoodieRecordPayload>> itr = records.iterator();
  while (itr.hasNext()) {
    HoodieRecord<? extends HoodieRecordPayload> rec = itr.next();
    assert recordKeys.contains(rec.getRecordKey());
  }
  // upsert the same keys with fresh values under a new commit time
  List<IndexedRecord> updatedRecords = SchemaTestUtil.updateHoodieTestRecords(recordKeys,
      SchemaTestUtil.generateHoodieTestRecords(0, 100),
      HoodieActiveTimeline.createNewCommitTime());
  SpillableMapTestUtils.upsertRecords(updatedRecords, records);
  assert (records.size() == 100);
}
assertEquals("Hive Schema should match the dataset schema + partition field", hiveClient.getTableSchema().size(), SchemaTestUtil.getSimpleSchema().getFields().size() + 1); assertEquals("Table partitions should match the number of partitions we wrote", 5, hiveClient.scanTablePartitions().size()); SchemaTestUtil.getEvolvedSchema().getFields().size() + 1);
@Test
public void testDataCorrectnessWithoutHoodieMetadata() throws IOException, URISyntaxException {
  Schema schema = SchemaTestUtil.getSimpleSchema();
  String payloadClazz = HoodieAvroPayload.class.getName();
  ExternalSpillableMap<String, HoodieRecord<? extends HoodieRecordPayload>> records =
      new ExternalSpillableMap<>(16L, BASE_OUTPUT_PATH, new DefaultSizeEstimator(),
          new HoodieRecordSizeEstimator(schema));
  List<HoodieRecord> hoodieRecords =
      SchemaTestUtil.generateHoodieTestRecordsWithoutHoodieMetadata(0, 100);
  hoodieRecords.stream().forEach(r -> records.put(r.getRecordKey(), r));
  // update a field on the chosen records and write them back through the map
  SchemaTestUtil.updateHoodieTestRecordsWithoutHoodieMetadata(recordsToUpdate, schema, fieldName,
      newValue);
public static List<HoodieRecord> generateHoodieTestRecordsWithoutHoodieMetadata(int from, int limit)
    throws IOException, URISyntaxException {
  List<IndexedRecord> iRecords = generateTestRecords(from, limit);
  return iRecords.stream()
      .map(r -> new HoodieRecord<>(new HoodieKey(UUID.randomUUID().toString(), "0000/00/00"),
          new HoodieAvroPayload(Optional.of((GenericRecord) r))))
      .collect(Collectors.toList());
}
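// Quick sketch of what the helper above yields (illustrative assumptions only;
// getRecordKey()/getPartitionPath() are the standard HoodieRecord accessors):
List<HoodieRecord> recs = SchemaTestUtil.generateHoodieTestRecordsWithoutHoodieMetadata(0, 10);
for (HoodieRecord r : recs) {
  assertNotNull(r.getRecordKey());                  // random UUID key
  assertEquals("0000/00/00", r.getPartitionPath()); // fixed partition path
}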
@Test
public void simpleInsertTest() throws IOException, URISyntaxException {
  Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getSimpleSchema());
  String payloadClazz = HoodieAvroPayload.class.getName();
  // 16B max in-memory size, so entries spill to disk almost immediately
  ExternalSpillableMap<String, HoodieRecord<? extends HoodieRecordPayload>> records =
      new ExternalSpillableMap<>(16L, BASE_OUTPUT_PATH, new DefaultSizeEstimator(),
          new HoodieRecordSizeEstimator(schema));
  List<IndexedRecord> iRecords = SchemaTestUtil.generateHoodieTestRecords(0, 100);
  List<String> recordKeys = SpillableMapTestUtils.upsertRecords(iRecords, records);
  assert (recordKeys.size() == 100);
  Iterator<HoodieRecord<? extends HoodieRecordPayload>> itr = records.iterator();
  List<HoodieRecord> oRecords = new ArrayList<>();
  while (itr.hasNext()) {
    HoodieRecord<? extends HoodieRecordPayload> rec = itr.next();
    oRecords.add(rec);
    assert recordKeys.contains(rec.getRecordKey());
  }
}
@Test
public void testDataCorrectnessWithUpsertsToDataInMapAndOnDisk()
    throws IOException, URISyntaxException {
  Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getSimpleSchema());
  String payloadClazz = HoodieAvroPayload.class.getName();
  List<IndexedRecord> iRecords = SchemaTestUtil.generateHoodieTestRecords(0, 100);
  recordKeys.addAll(SpillableMapTestUtils.upsertRecords(iRecords, records));
  // collect the keys to update, then rewrite those records under a new commit time
  keysToBeUpdated.add(key);
  updatedRecords = SchemaTestUtil.updateHoodieTestRecords(keysToBeUpdated, recordsToUpdate,
      newCommitTime);
assertEquals("Hive Schema should match the dataset schema + partition field",
    hiveClientRT.getTableSchema().size(), SchemaTestUtil.getSimpleSchema().getFields().size() + 1);
assertEquals("Table partitions should match the number of partitions we wrote", 5,
    hiveClientRT.scanTablePartitions().size());
// after the schema is evolved, the synced Hive schema should match the evolved fields
assertEquals("Hive Schema should match the evolved dataset schema + partition field",
    hiveClientRT.getTableSchema().size(), SchemaTestUtil.getEvolvedSchema().getFields().size() + 1);
@Test
public void testAppendNotSupported() throws IOException, URISyntaxException, InterruptedException {
  // Use an fs like LocalFileSystem, which does not support appends
  Path localPartitionPath = new Path("file://" + partitionPath);
  FileSystem localFs = FSUtils.getFs(localPartitionPath.toString(),
      HoodieTestUtils.getDefaultHadoopConf());
  Path testPath = new Path(localPartitionPath, "append_test");
  localFs.mkdirs(testPath);

  // Write some data and append it twice.
  List<IndexedRecord> records = SchemaTestUtil.generateTestRecords(0, 100);
  Map<HoodieLogBlock.HeaderMetadataType, String> header = Maps.newHashMap();
  header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");
  header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString());
  HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records, header);
  for (int i = 0; i < 2; i++) {
    HoodieLogFormat.newWriterBuilder().onParentPath(testPath)
        .withFileExtension(HoodieArchivedLogFile.ARCHIVE_EXTENSION).withFileId("commits.archive")
        .overBaseCommit("").withFs(localFs).build().appendBlock(dataBlock).close();
  }

  // Since the fs cannot append, each write rolls over to a new log version:
  // ensure there are two log file versions, with the same data.
  FileStatus[] statuses = localFs.listStatus(testPath);
  assertEquals(2, statuses.length);
}
Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath)
    .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withSizeThreshold(1024)
    .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build();
Schema schema = HoodieAvroUtils.addMetadataFields(getSimpleSchema());
Map<HoodieLogBlock.HeaderMetadataType, String> header = Maps.newHashMap();
header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");
header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString());
List<IndexedRecord> records1 = SchemaTestUtil.generateHoodieTestRecords(0, 100);
List<IndexedRecord> copyOfRecords1 = records1.stream()
    .map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema))
    .collect(Collectors.toList());
private static HoodieLogFile generateLogData(Path parquetFilePath, boolean isLogSchemaSimple)
    throws IOException, InterruptedException, URISyntaxException {
  Schema schema = isLogSchemaSimple ? SchemaTestUtil.getSimpleSchema()
      : SchemaTestUtil.getEvolvedSchema();
  HoodieDataFile dataFile = new HoodieDataFile(fileSystem.getFileStatus(parquetFilePath));
  // Write a log file for this parquet file
  Writer logWriter = HoodieLogFormat.newWriterBuilder().onParentPath(parquetFilePath.getParent())
      .withFileExtension(HoodieLogFile.DELTA_EXTENSION)
      .withFileId(dataFile.getFileId())
      .overBaseCommit(dataFile.getCommitTime()).withFs(fileSystem).build();
  List<IndexedRecord> records = isLogSchemaSimple
      ? SchemaTestUtil.generateTestRecords(0, 100)
      : SchemaTestUtil.generateEvolvedTestRecords(100, 100);
  Map<HoodieLogBlock.HeaderMetadataType, String> header = Maps.newHashMap();
  header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, dataFile.getCommitTime());
  header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
  HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records, header);
  logWriter.appendBlock(dataBlock);
  logWriter.close();
  return logWriter.getLogFile();
}
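// The two generators above are typically chained: write a base parquet file,
// then attach a log file of updates to it. A rough sketch, assuming the test
// fixture's `fileSystem` plus a hypothetical partition directory `partitionDir`
// and an illustrative parquet file name (fileId "f1", commit time "100"):
Path parquetPath = new Path(partitionDir, "f1_1-0-1_100.parquet"); // hypothetical name
generateParquetData(parquetPath, true);
HoodieLogFile logFile = generateLogData(parquetPath, true);
assertTrue(fileSystem.exists(logFile.getPath()));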
@Test
public void testBasicAppend() throws IOException, InterruptedException, URISyntaxException {
  Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath)
      .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1")
      .overBaseCommit("100").withFs(fs).build();
  List<IndexedRecord> records = SchemaTestUtil.generateTestRecords(0, 100);
  Map<HoodieLogBlock.HeaderMetadataType, String> header = Maps.newHashMap();
  header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");
  header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString());
  HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records, header);
  writer = writer.appendBlock(dataBlock);
  long size = writer.getCurrentSize();
  assertTrue("We just wrote a block - size should be > 0", size > 0);
  assertEquals("Write should be auto-flushed. The size reported by FileStatus and the writer should match",
      size, fs.getFileStatus(writer.getLogFile().getPath()).getLen());
  writer.close();
}
Schema schema = HoodieAvroUtils.addMetadataFields(getSimpleSchema());
List<IndexedRecord> records1 = SchemaTestUtil.generateHoodieTestRecords(0, 100);
List<IndexedRecord> copyOfRecords1 = records1.stream()
    .map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema))
    .collect(Collectors.toList());
@SuppressWarnings("unchecked") @Test public void testBasicWriteAndScan() throws IOException, URISyntaxException, InterruptedException { Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath) .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1") .overBaseCommit("100").withFs(fs).build(); Schema schema = getSimpleSchema(); List<IndexedRecord> records = SchemaTestUtil.generateTestRecords(0, 100); List<IndexedRecord> copyOfRecords = records.stream().map( record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList()); Map<HoodieLogBlock.HeaderMetadataType, String> header = Maps.newHashMap(); header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100"); header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString()); HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records, header); writer = writer.appendBlock(dataBlock); writer.close(); Reader reader = HoodieLogFormat.newReader(fs, writer.getLogFile(), SchemaTestUtil.getSimpleSchema()); assertTrue("We wrote a block, we should be able to read it", reader.hasNext()); HoodieLogBlock nextBlock = reader.next(); assertEquals("The next block should be a data block", HoodieLogBlockType.AVRO_DATA_BLOCK, nextBlock.getBlockType()); HoodieAvroDataBlock dataBlockRead = (HoodieAvroDataBlock) nextBlock; assertEquals("Read records size should be equal to the written records size", copyOfRecords.size(), dataBlockRead.getRecords().size()); assertEquals("Both records lists should be the same. (ordering guaranteed)", copyOfRecords, dataBlockRead.getRecords()); reader.close(); }