@Test public void testGetNextPartitionCheckpointIsLargerThanPartition() throws InterruptedException, IOException { final Path partition2Path = new Path(RAW_DATA_PATH, PARTITION2); this.fileSystem.mkdirs(new Path(partition2Path, FILE1)); final StringValue val1 = new StringValue(PARTITION2); final HDFSPartitionManager pm = new HDFSPartitionManager(JOBNAME, HDFSTestConstants.BASE_METADATA_PATH, RAW_DATA_PATH, this.fileSystem); final HDFSMetadataManager metadataManager = new HDFSMetadataManager(this.fileSystem, new Path(HDFSTestConstants.BASE_METADATA_PATH, JOBNAME).toString(), new AtomicBoolean(true)); metadataManager.set(MetadataConstants.CHECKPOINT_KEY, val1); metadataManager.saveChanges(); final Path partition1Path = new Path(RAW_DATA_PATH, PARTITION1); this.fileSystem.mkdirs(new Path(partition1Path, FILE1)); // Checkpoint value is greater than the partitions in the data folder so nothing new to process Assert.assertFalse(pm.getNextPartition(getLatestCheckpoint(metadataManager)).isPresent()); }
/**
 * With date partitions 2017-05-01, 2017-06-01 and 2017-07-01, a start date of 2017-05-03 and a
 * saved checkpoint of {@code datestr=2017-06-02}, the next partition is expected to be
 * {@code datestr=2017-07-01}.
 */
@Test
public void testGetNextPartitionWithCheckpointLaterThanStartDate() throws IOException, ParseException {
    for (final String partitionName
        : new String[] {"datestr=2017-05-01", "datestr=2017-06-01", "datestr=2017-07-01"}) {
        this.fs.mkdirs(new Path(RAW_DATA_PATH, partitionName));
    }

    final SimpleDateFormat startDateFormat =
        new SimpleDateFormat(HiveSourceConfiguration.HIVE_START_DATE_FORMAT);
    final Date startDate = startDateFormat.parse("2017-05-03");

    final StringValue checkpointValue = new StringValue("datestr=2017-06-02");

    final HDFSDatePartitionManager pm = new HDFSDatePartitionManager(
        JOBNAME,
        HDFSTestConstants.BASE_METADATA_PATH,
        RAW_DATA_PATH,
        DATESTR,
        Optional.of(startDate),
        this.fs);

    final String jobMetadataDir = new Path(HDFSTestConstants.BASE_METADATA_PATH, JOBNAME).toString();
    final HDFSMetadataManager metadataManager =
        new HDFSMetadataManager(this.fs, jobMetadataDir, new AtomicBoolean(true));

    metadataManager.set(MetadataConstants.CHECKPOINT_KEY, checkpointValue);
    metadataManager.saveChanges();

    final Optional<StringValue> storedCheckpoint = metadataManager.get(MetadataConstants.CHECKPOINT_KEY);
    final Optional<String> nextPartition = pm.getNextPartition(storedCheckpoint);

    Assert.assertTrue(nextPartition.isPresent());
    Assert.assertEquals("datestr=2017-07-01", nextPartition.get());
}
/**
 * With date partitions 2017-05-01, 2017-06-01 and 2017-07-01, a start date of 2017-06-01 and a
 * saved checkpoint of {@code datestr=2017-05-02} (earlier than the start date), the next
 * partition is expected to be {@code datestr=2017-07-01}.
 */
@Test
public void testGetNextPartitionWithCheckpointBeforeThanStartDate() throws IOException, ParseException {
    final String[] partitionNames = {"datestr=2017-05-01", "datestr=2017-06-01", "datestr=2017-07-01"};
    for (int i = 0; i < partitionNames.length; i++) {
        this.fs.mkdirs(new Path(RAW_DATA_PATH, partitionNames[i]));
    }

    final Date startDate =
        new SimpleDateFormat(HiveSourceConfiguration.HIVE_START_DATE_FORMAT).parse("2017-06-01");

    final StringValue checkpointValue = new StringValue("datestr=2017-05-02");

    final HDFSDatePartitionManager pm = new HDFSDatePartitionManager(
        JOBNAME,
        HDFSTestConstants.BASE_METADATA_PATH,
        RAW_DATA_PATH,
        DATESTR,
        Optional.of(startDate),
        this.fs);

    final Path jobMetadataPath = new Path(HDFSTestConstants.BASE_METADATA_PATH, JOBNAME);
    final HDFSMetadataManager metadataManager =
        new HDFSMetadataManager(this.fs, jobMetadataPath.toString(), new AtomicBoolean(true));

    metadataManager.set(MetadataConstants.CHECKPOINT_KEY, checkpointValue);
    metadataManager.saveChanges();

    final Optional<StringValue> storedCheckpoint = metadataManager.get(MetadataConstants.CHECKPOINT_KEY);
    final Optional<String> nextPartition = pm.getNextPartition(storedCheckpoint);

    Assert.assertTrue(nextPartition.isPresent());
    Assert.assertEquals("datestr=2017-07-01", nextPartition.get());
}
/**
 * With files under {@code PARTITION2} and {@code PARTITION3} and a saved checkpoint of
 * {@code PARTITION1}, the next partition is expected to be {@code PARTITION2}.
 */
@Test
public void testGetNextPartitionMultipleDataPartitions() throws IOException, InterruptedException {
    final StringValue val1 = new StringValue(PARTITION1);

    final Path partition2Path = new Path(RAW_DATA_PATH, PARTITION2);
    final Path partition3Path = new Path(RAW_DATA_PATH, PARTITION3);

    // create() hands back an open output stream; close it right away so the test does not leak it.
    this.fileSystem.create(new Path(partition2Path, FILE1)).close();
    this.fileSystem.create(new Path(partition3Path, FILE1)).close();

    final HDFSPartitionManager pm = new HDFSPartitionManager(JOBNAME,
        HDFSTestConstants.BASE_METADATA_PATH,
        RAW_DATA_PATH,
        this.fileSystem);

    final HDFSMetadataManager metadataManager = new HDFSMetadataManager(this.fileSystem,
        new Path(HDFSTestConstants.BASE_METADATA_PATH, JOBNAME).toString(),
        new AtomicBoolean(true));

    metadataManager.set(MetadataConstants.CHECKPOINT_KEY, val1);
    metadataManager.saveChanges();

    final Optional<StringValue> latestCheckpoint = getLatestCheckpoint(metadataManager);

    // Look the partition up once and use assertEquals so a failure reports both values.
    final Optional<String> partition = pm.getNextPartition(latestCheckpoint);
    Assert.assertTrue(partition.isPresent());
    Assert.assertEquals(PARTITION2, partition.get());
}
@Test public void testGetNextPartitionWithMultipleDatePartitionsAndOneCheckpoint() throws IOException { // Job has multiple data partitions, one is less than checkpoint and the other is larger final Path partition1 = new Path(RAW_DATA_PATH, "datestr=2017-05-01"); final Path partition2 = new Path(RAW_DATA_PATH, "datestr=2017-05-03"); this.fs.mkdirs(new Path(partition1, FILE1)); this.fs.mkdirs(new Path(partition2, FILE1)); final StringValue val1 = new StringValue("datestr=2017-05-02"); final HDFSDatePartitionManager pm = new HDFSDatePartitionManager(JOBNAME, HDFSTestConstants.BASE_METADATA_PATH, RAW_DATA_PATH, DATESTR, Optional.absent(), this.fs); final HDFSMetadataManager metadataManager = new HDFSMetadataManager(this.fs, new Path(HDFSTestConstants.BASE_METADATA_PATH, JOBNAME).toString(), new AtomicBoolean(true)); metadataManager.set(MetadataConstants.CHECKPOINT_KEY, val1); metadataManager.saveChanges(); final Optional<StringValue> latestCheckpoint = metadataManager.get(MetadataConstants.CHECKPOINT_KEY); final Optional<String> partition = pm.getNextPartition(latestCheckpoint); Assert.assertTrue(partition.isPresent()); Assert.assertEquals("datestr=2017-05-03", partition.get()); }
@Test public void testGetNextPartitionWithOnlyTempFileCheckpoints() throws InterruptedException, IOException { final Path partitionPath = new Path(RAW_DATA_PATH, PARTITION1); final Path filePath = new Path(partitionPath, FILE1); this.fileSystem.create(filePath); final HDFSPartitionManager pm = new HDFSPartitionManager(JOBNAME, HDFSTestConstants.BASE_METADATA_PATH, RAW_DATA_PATH, this.fileSystem); final HDFSMetadataManager metadataManager = new HDFSMetadataManager(this.fileSystem, new Path(HDFSTestConstants.BASE_METADATA_PATH, JOBNAME).toString(), new AtomicBoolean(true)); // if this metadata was saved successfully we would say there's no partition to process // but this will be in a temp file so it will be ignored metadataManager.set(MetadataConstants.CHECKPOINT_KEY, new StringValue(PARTITION2)); metadataManager.saveChanges(); final Optional<FileStatus> fs = metadataManager.getLatestMetadataFile(); Assert.assertTrue(fs.isPresent()); // move the metadata file back to a temp location this.fileSystem.rename(fs.get().getPath(), new Path(fs.get().getPath().toString() + MetadataConstants.TEMP_FILE_EXTENSION)); final Optional<String> partition = pm.getNextPartition(getLatestCheckpoint(metadataManager)); Assert.assertTrue(partition.isPresent()); Assert.assertEquals(PARTITION1, partition.get()); }
/**
 * A single date partition ({@code datestr=2017-05-02}) exists and the saved checkpoint
 * ({@code datestr=2017-05-01}) is smaller than it, so that partition is expected as the next one.
 */
@Test
public void testGetNextPartitionWithSmallerExistentCheckpoint() throws IOException, InterruptedException {
    final Path dataPartitionDir = new Path(RAW_DATA_PATH, "datestr=2017-05-02");
    this.fs.mkdirs(new Path(dataPartitionDir, FILE1));

    final HDFSDatePartitionManager pm = new HDFSDatePartitionManager(
        JOBNAME,
        HDFSTestConstants.BASE_METADATA_PATH,
        RAW_DATA_PATH,
        DATESTR,
        Optional.absent(),
        this.fs);

    final String jobMetadataDir = new Path(HDFSTestConstants.BASE_METADATA_PATH, JOBNAME).toString();
    final HDFSMetadataManager metadataManager =
        new HDFSMetadataManager(this.fs, jobMetadataDir, new AtomicBoolean(true));

    metadataManager.set(MetadataConstants.CHECKPOINT_KEY, new StringValue("datestr=2017-05-01"));
    metadataManager.saveChanges();

    final Optional<StringValue> storedCheckpoint = metadataManager.get(MetadataConstants.CHECKPOINT_KEY);
    final Optional<String> nextPartition = pm.getNextPartition(storedCheckpoint);

    Assert.assertTrue(nextPartition.isPresent());
    Assert.assertEquals("datestr=2017-05-02", nextPartition.get());
}
@Test public void testGetNextPartitionWithLargerExistentCheckpoint() throws IOException, InterruptedException { // In this case the checkpoint is larger than the data partition so there is no "next" partition final StringValue val1 = new StringValue("datestr=2017-05-02"); final Path partition1 = new Path(RAW_DATA_PATH, "datestr=2017-05-01"); this.fs.mkdirs(new Path(partition1, FILE1)); final HDFSDatePartitionManager pm = new HDFSDatePartitionManager(JOBNAME, HDFSTestConstants.BASE_METADATA_PATH, RAW_DATA_PATH, DATESTR, Optional.absent(), this.fs); final HDFSMetadataManager metadataManager = new HDFSMetadataManager(this.fs, new Path(HDFSTestConstants.BASE_METADATA_PATH, JOBNAME).toString(), new AtomicBoolean(true)); metadataManager.set(MetadataConstants.CHECKPOINT_KEY, val1); metadataManager.saveChanges(); final Optional<StringValue> latestCheckpoint = metadataManager.get(MetadataConstants.CHECKPOINT_KEY); final Optional<String> partition = pm.getNextPartition(latestCheckpoint); Assert.assertFalse(partition.isPresent()); } }
/**
 * With one file under {@code PARTITION2} and a saved checkpoint of {@code PARTITION1}, the next
 * partition is expected to be {@code PARTITION2}.
 */
@Test
public void testGetNextPartitionSinglePartition() throws IOException, InterruptedException {
    final Path partitionPath = new Path(RAW_DATA_PATH, PARTITION2);
    final Path filePath = new Path(partitionPath, FILE1);

    // create() hands back an open output stream; close it right away so the test does not leak it.
    this.fileSystem.create(filePath).close();

    final StringValue val1 = new StringValue(PARTITION1);

    final HDFSPartitionManager pm = new HDFSPartitionManager(JOBNAME,
        HDFSTestConstants.BASE_METADATA_PATH,
        RAW_DATA_PATH,
        this.fileSystem);

    final HDFSMetadataManager metadataManager = new HDFSMetadataManager(this.fileSystem,
        new Path(HDFSTestConstants.BASE_METADATA_PATH, JOBNAME).toString(),
        new AtomicBoolean(true));

    metadataManager.set(MetadataConstants.CHECKPOINT_KEY, val1);
    metadataManager.saveChanges();

    final Optional<StringValue> latestCheckpoint = getLatestCheckpoint(metadataManager);

    // Look the partition up once and use assertEquals so a failure reports both values.
    final Optional<String> partition = pm.getNextPartition(latestCheckpoint);
    Assert.assertTrue(partition.isPresent());
    Assert.assertEquals(PARTITION2, partition.get());
}
/**
 * Sets the checkpoint key twice and checks that the second value ({@code "testVal2"}) is the
 * one read back in memory; then saves the changes, loads the latest metadata file and compares
 * it with the in-memory state via {@code validateDeserializedMapEqualsInMemoryMap}.
 */
@Test
public void testHDFSOverwriteCheckpointValue() throws IOException, InterruptedException {
    final StringValue val1 = new StringValue("testVal");
    this.metadataManager.set(MetadataConstants.CHECKPOINT_KEY, val1);

    final StringValue val2 = new StringValue("testVal2");
    this.metadataManager.set(MetadataConstants.CHECKPOINT_KEY, val2);

    final Optional<StringValue> readValue = this.metadataManager.get(MetadataConstants.CHECKPOINT_KEY);
    Assert.assertTrue(readValue.isPresent());
    // assertEquals reports the expected and actual values when the overwrite did not take effect.
    Assert.assertEquals("testVal2", readValue.get().getValue());

    this.metadataManager.saveChanges();

    final Optional<FileStatus> fs = this.metadataManager.getLatestMetadataFile();
    Assert.assertTrue(fs.isPresent());

    final Map<String, StringValue> loadedMap = this.metadataManager.loadMetadata(fs.get().getPath());
    validateDeserializedMapEqualsInMemoryMap(loadedMap);
}
@Test public void testHDFSReadWriteSingleMetadataFile() throws IOException { // Test in memory final StringValue val = new StringValue("testVal"); this.metadataManager.set(MetadataConstants.CHECKPOINT_KEY, val); final Optional<StringValue> readValue = this.metadataManager.get(MetadataConstants.CHECKPOINT_KEY); Assert.assertTrue(readValue.isPresent()); Assert.assertTrue(readValue.get().getValue().equals("testVal")); this.metadataManager.set("foo", new StringValue("bar")); // Serialize the metadata map to a file this.metadataManager.saveChanges(); final Optional<FileStatus> fs = this.metadataManager.getLatestMetadataFile(); Assert.assertTrue(fs.isPresent()); // Deserialize the metadata map and check contents are the same final Map<String, StringValue> loadedMap = this.metadataManager.loadMetadata(fs.get().getPath()); validateDeserializedMapEqualsInMemoryMap(loadedMap); }
// Fragment of a test whose header is not visible here. It checks that pm reports PARTITION1 as
// the next partition and saves PARTITION1 as the checkpoint through metadataManager, then checks
// that pm2 reports PARTITION2 and saves PARTITION2 through metadataManager2.
// NOTE(review): if Assert is JUnit's, assertEquals expects (expected, actual); the arguments here
// are in the opposite order, which would only affect the failure message — confirm.
Assert.assertEquals(pm.getNextPartition(latestCheckpoint).get(), PARTITION1); metadataManager.set(MetadataConstants.CHECKPOINT_KEY, new StringValue(PARTITION1)); metadataManager.saveChanges(); Assert.assertEquals(pm2.getNextPartition(latestCheckpoint2).get(), PARTITION2); metadataManager2.set(MetadataConstants.CHECKPOINT_KEY, new StringValue(PARTITION2)); metadataManager2.saveChanges();
@Test public void testComputeWorkUnitsWithExistentCheckpoint() throws IOException { this.fileSystem.mkdirs(new Path(this.dataPath, PARTITION_1)); this.fileSystem.mkdirs(new Path(this.dataPath, PARTITION_3)); this.partitionManager = new HDFSPartitionManager(JOB_NAME, this.metadataPath, this.dataPath, this.fileSystem); this.metadataManager = new HDFSMetadataManager(this.fileSystem, new Path(this.metadataPath, JOB_NAME).toString(), new AtomicBoolean(true)); // partition 1 is in effect already processed since the checkpoint is larger final StringValue val1 = new StringValue(PARTITION_2); this.metadataManager.set(MetadataConstants.CHECKPOINT_KEY, val1); this.metadataManager.saveChanges(); final ParquetWorkUnitCalculator calculator = new ParquetWorkUnitCalculator(this.hiveConfig, this.fileSystem); calculator.initPreviousRunState(this.metadataManager); final IWorkUnitCalculator.IWorkUnitCalculatorResult iresult = calculator.computeWorkUnits(); Assert.assertTrue(iresult instanceof ParquetWorkUnitCalculatorResult); final ParquetWorkUnitCalculatorResult result = (ParquetWorkUnitCalculatorResult) iresult; final List<String> workUnits = result.getWorkUnits(); Assert.assertEquals(1, workUnits.size()); Assert.assertEquals(PARTITION_3, workUnits.get(0)); Assert.assertTrue(result.getNextRunState().getPartition().isPresent()); Assert.assertEquals(PARTITION_3, result.getNextRunState().getPartition().get()); }
// Fragment of a test whose header is not visible here. It sets the checkpoint in metadataManager2
// to StringTypes.EMPTY (presumably the empty string — confirm), initializes calc's previous run
// state from that manager, and computes the work units.
metadataManager2.set(MetadataConstants.CHECKPOINT_KEY, new StringValue(StringTypes.EMPTY)); calc.initPreviousRunState(metadataManager2); final IWorkUnitCalculator.IWorkUnitCalculatorResult<String, HiveRunState> iresult = calc.computeWorkUnits();
@Test public void testDeletionIsPropagated() throws Exception { final StringValue val1 = new StringValue("testVal"); this.metadataManager.set(MetadataConstants.CHECKPOINT_KEY, val1); this.metadataManager.saveChanges(); Optional<FileStatus> fs = this.metadataManager.getLatestMetadataFile(); Assert.assertTrue(fs.isPresent()); Map<String, StringValue> loadedMap = this.metadataManager.loadMetadata(fs.get().getPath()); validateDeserializedMapEqualsInMemoryMap(loadedMap); // reload the configuration setupTest(); Assert.assertTrue(this.metadataManager.get(MetadataConstants.CHECKPOINT_KEY).isPresent()); this.metadataManager.remove(MetadataConstants.CHECKPOINT_KEY); Assert.assertFalse(this.metadataManager.get(MetadataConstants.CHECKPOINT_KEY).isPresent()); this.metadataManager.saveChanges(); fs = this.metadataManager.getLatestMetadataFile(); Assert.assertFalse(this.metadataManager.get(MetadataConstants.CHECKPOINT_KEY).isPresent()); loadedMap = this.metadataManager.loadMetadata(fs.get().getPath()); validateDeserializedMapEqualsInMemoryMap(loadedMap); }