@NotEmpty final Optional<StringValue> latestCheckPoint) throws IOException { if (this.isSinglePartition()) { log.info("Next partition: {}", this.rawDataRootPath); return Optional.of(this.rawDataRootPath); final LocalDate startDate = getDefaultStartDate(); final List<LocalDate> existingPartitions = listSortedPartitionsAfterDate(compareDate); if (!existingPartitions.isEmpty()) {
/**
 * Checks whether any existing partition directory name starts with the configured
 * partition key prefix (e.g. {@code "datestr="}).
 *
 * @return true if at least one existing partition name starts with the partition key prefix
 * @throws JobRuntimeException if the existing partitions cannot be listed from HDFS
 */
private boolean hasPartitionKeyNameInPartition() {
    try {
        return getExistingPartitions()
                .stream()
                .anyMatch(partition -> partition.startsWith(this.partitionKeyName));
    } catch (IOException e) {
        // BUG FIX: the original used the SLF4J-style "{}" placeholder inside
        // String.format (which never substitutes it) and dropped the cause.
        // Use %s and chain the IOException so diagnostics are preserved.
        throw new JobRuntimeException(
                String.format("Unable to read existing partitions in the HDFS Path %s",
                        this.rawDataRootPath), e);
    }
}
}
public HDFSDatePartitionManager(@NotEmpty final String metadataKey, @NotEmpty final String genericBaseMetadataPath, @NotEmpty final String genericBaseDataPath, @NotEmpty final String partitionKeyName, @NonNull final Optional<Date> startDate, @NonNull final FileSystem fileSystem) throws IOException { super(metadataKey, genericBaseMetadataPath, genericBaseDataPath, fileSystem); this.partitionKeyName = partitionKeyName + StringTypes.EQUAL; this.hasPartitionKeyInHDFSPartitionPath = hasPartitionKeyNameInPartition(); this.startDate = startDate; log.info("HDFSDatePartitionManager has partitionKey in HDFS path: {}", this.hasPartitionKeyInHDFSPartitionPath); }
@Test
public void testGetNextPartitionWithStartDateAndNoCheckpoint() throws IOException, ParseException {
    // Two date partitions exist: one before and one after the configured start date.
    this.fs.mkdirs(new Path(RAW_DATA_PATH, "datestr=2017-05-01"));
    this.fs.mkdirs(new Path(RAW_DATA_PATH, "datestr=2017-06-01"));
    final Date startDate =
            new SimpleDateFormat(HiveSourceConfiguration.HIVE_START_DATE_FORMAT).parse("2017-05-15");
    final HDFSDatePartitionManager partitionManager = new HDFSDatePartitionManager(JOBNAME,
            HDFSTestConstants.BASE_METADATA_PATH,
            RAW_DATA_PATH,
            DATESTR,
            Optional.of(startDate),
            this.fs);
    final HDFSMetadataManager metadataManager = new HDFSMetadataManager(this.fs,
            new Path(HDFSTestConstants.BASE_METADATA_PATH, JOBNAME).toString(),
            new AtomicBoolean(true));
    // No checkpoint stored — only the start date bounds the search.
    final Optional<StringValue> checkpoint = metadataManager.get(MetadataConstants.CHECKPOINT_KEY);
    final Optional<String> nextPartition = partitionManager.getNextPartition(checkpoint);
    // The partition dated before the start date must be skipped.
    Assert.assertTrue(nextPartition.isPresent());
    Assert.assertEquals("datestr=2017-06-01", nextPartition.get());
}
/**
 * Creates a work unit calculator and internally builds the partition manager that
 * matches the configured partition type.
 *
 * @param hiveConf hive source configuration providing job name, paths and partition settings
 * @param fs       file system used by the partition manager
 * @throws IOException         if the partition manager fails to inspect existing partitions
 * @throws JobRuntimeException if the configured partition type is not supported
 */
public ParquetWorkUnitCalculator(@NonNull final HiveSourceConfiguration hiveConf,
                                 @NonNull final FileSystem fs) throws IOException {
    this.hiveConf = hiveConf;
    final PartitionType partitionType = hiveConf.getPartitionType();
    log.info("Create partition manager with partition type: {}", partitionType);
    // Enums are compared with switch/== rather than equals(): clearer and null-safe.
    switch (partitionType) {
        case NONE:
        case NORMAL:
            // create partition manager internally
            this.partitionManager = new HDFSPartitionManager(hiveConf.getJobName(),
                    hiveConf.getBaseMetadataPath(),
                    hiveConf.getDataPath(),
                    fs);
            break;
        case DATE:
            this.partitionManager = new HDFSDatePartitionManager(hiveConf.getJobName(),
                    hiveConf.getBaseMetadataPath(),
                    hiveConf.getDataPath(),
                    hiveConf.getPartitionKeyName().get(),
                    // consistency fix: use the local hiveConf directly instead of
                    // the original's getHiveConf() accessor round-trip
                    hiveConf.getStartDate(),
                    fs);
            break;
        default:
            throw new JobRuntimeException("Error: Partition type is not supported. Partition type: "
                    + partitionType);
    }
}
// NOTE(review): method name has a typo ("Wit" -> "With"); kept as-is to preserve
// the externally visible test name.
@Test
public void testGetNextPartitionWitMultipleDatePartitionsAndNoCheckpoint() throws IOException {
    // Two partitions, no checkpoint: the earliest partition should be returned.
    this.fs.mkdirs(new Path(RAW_DATA_PATH, "datestr=2017-05-01"));
    this.fs.mkdirs(new Path(RAW_DATA_PATH, "datestr=2017-05-02"));
    final HDFSDatePartitionManager partitionManager = new HDFSDatePartitionManager(JOBNAME,
            HDFSTestConstants.BASE_METADATA_PATH,
            RAW_DATA_PATH,
            DATESTR,
            Optional.absent(),
            this.fs);
    final HDFSMetadataManager metadataManager = new HDFSMetadataManager(this.fs,
            new Path(HDFSTestConstants.BASE_METADATA_PATH, JOBNAME).toString(),
            new AtomicBoolean(true));
    final Optional<StringValue> checkpoint = metadataManager.get(MetadataConstants.CHECKPOINT_KEY);
    final Optional<String> nextPartition = partitionManager.getNextPartition(checkpoint);
    Assert.assertTrue(nextPartition.isPresent());
    Assert.assertEquals("datestr=2017-05-01", nextPartition.get());
}
@Test
public void testGetNextPartitionWithNonexistentCheckpoint() throws IOException {
    // Single partition and no stored checkpoint: that partition is the next one.
    final Path onlyPartition = new Path(RAW_DATA_PATH, "datestr=2017-05-01");
    this.fs.mkdirs(onlyPartition);
    final HDFSDatePartitionManager partitionManager = new HDFSDatePartitionManager(JOBNAME,
            HDFSTestConstants.BASE_METADATA_PATH,
            RAW_DATA_PATH,
            DATESTR,
            Optional.absent(),
            this.fs);
    final HDFSMetadataManager metadataManager = new HDFSMetadataManager(this.fs,
            new Path(HDFSTestConstants.BASE_METADATA_PATH, JOBNAME).toString(),
            new AtomicBoolean(true));
    final Optional<StringValue> checkpoint = metadataManager.get(MetadataConstants.CHECKPOINT_KEY);
    final Optional<String> nextPartition = partitionManager.getNextPartition(checkpoint);
    Assert.assertTrue(nextPartition.isPresent());
    Assert.assertEquals("datestr=2017-05-01", nextPartition.get());
}
/**
 * Returns the existing date partitions, in ascending order, that are strictly after
 * the given date.
 *
 * @param localDate exclusive lower bound; only partitions dated after this day are returned
 * @return ascending-sorted list of partition dates strictly after {@code localDate}
 * @throws IOException if the existing partitions cannot be listed
 */
private List<LocalDate> listSortedPartitionsAfterDate(final LocalDate localDate) throws IOException {
    // "Strictly after" means the earliest acceptable partition is the next day.
    final LocalDate earliest = localDate.plusDays(1);
    return getExistingPartitions()
            .stream()
            // Strip the "<key>=" prefix and parse the remaining date string.
            .map(dt -> DateUtil.convertToUTCDate(dt.replace(this.partitionKeyName, StringTypes.EMPTY)))
            .filter(dt -> dt.compareTo(earliest) >= 0)
            // BUG FIX: the contract promises ascending order, but the original never
            // sorted and silently relied on getExistingPartitions()'s ordering.
            .sorted()
            .collect(Collectors.toList());
}
@Test
public void testGetNextPartitionWithSmallerExistentCheckpoint() throws IOException, InterruptedException {
    // Checkpoint (2017-05-01) is earlier than the on-disk partition (2017-05-02),
    // so that partition is the next one to process.
    final StringValue storedCheckpoint = new StringValue("datestr=2017-05-01");
    final Path newerPartition = new Path(RAW_DATA_PATH, "datestr=2017-05-02");
    this.fs.mkdirs(new Path(newerPartition, FILE1));
    final HDFSDatePartitionManager partitionManager = new HDFSDatePartitionManager(JOBNAME,
            HDFSTestConstants.BASE_METADATA_PATH,
            RAW_DATA_PATH,
            DATESTR,
            Optional.absent(),
            this.fs);
    final HDFSMetadataManager metadataManager = new HDFSMetadataManager(this.fs,
            new Path(HDFSTestConstants.BASE_METADATA_PATH, JOBNAME).toString(),
            new AtomicBoolean(true));
    metadataManager.set(MetadataConstants.CHECKPOINT_KEY, storedCheckpoint);
    metadataManager.saveChanges();
    final Optional<StringValue> checkpoint = metadataManager.get(MetadataConstants.CHECKPOINT_KEY);
    final Optional<String> nextPartition = partitionManager.getNextPartition(checkpoint);
    Assert.assertTrue(nextPartition.isPresent());
    Assert.assertEquals("datestr=2017-05-02", nextPartition.get());
}
@Test public void testGetNextPartitionWithLargerExistentCheckpoint() throws IOException, InterruptedException { // In this case the checkpoint is larger than the data partition so there is no "next" partition final StringValue val1 = new StringValue("datestr=2017-05-02"); final Path partition1 = new Path(RAW_DATA_PATH, "datestr=2017-05-01"); this.fs.mkdirs(new Path(partition1, FILE1)); final HDFSDatePartitionManager pm = new HDFSDatePartitionManager(JOBNAME, HDFSTestConstants.BASE_METADATA_PATH, RAW_DATA_PATH, DATESTR, Optional.absent(), this.fs); final HDFSMetadataManager metadataManager = new HDFSMetadataManager(this.fs, new Path(HDFSTestConstants.BASE_METADATA_PATH, JOBNAME).toString(), new AtomicBoolean(true)); metadataManager.set(MetadataConstants.CHECKPOINT_KEY, val1); metadataManager.saveChanges(); final Optional<StringValue> latestCheckpoint = metadataManager.get(MetadataConstants.CHECKPOINT_KEY); final Optional<String> partition = pm.getNextPartition(latestCheckpoint); Assert.assertFalse(partition.isPresent()); } }
@Test
public void testGetNextPartitionWithCheckpointLaterThanStartDate() throws IOException, ParseException {
    // Checkpoint (2017-06-02) is later than the start date (2017-05-03), so the
    // checkpoint wins: only partitions after it qualify.
    this.fs.mkdirs(new Path(RAW_DATA_PATH, "datestr=2017-05-01"));
    this.fs.mkdirs(new Path(RAW_DATA_PATH, "datestr=2017-06-01"));
    this.fs.mkdirs(new Path(RAW_DATA_PATH, "datestr=2017-07-01"));
    final Date startDate =
            new SimpleDateFormat(HiveSourceConfiguration.HIVE_START_DATE_FORMAT).parse("2017-05-03");
    final StringValue storedCheckpoint = new StringValue("datestr=2017-06-02");
    final HDFSDatePartitionManager partitionManager = new HDFSDatePartitionManager(JOBNAME,
            HDFSTestConstants.BASE_METADATA_PATH,
            RAW_DATA_PATH,
            DATESTR,
            Optional.of(startDate),
            this.fs);
    final HDFSMetadataManager metadataManager = new HDFSMetadataManager(this.fs,
            new Path(HDFSTestConstants.BASE_METADATA_PATH, JOBNAME).toString(),
            new AtomicBoolean(true));
    metadataManager.set(MetadataConstants.CHECKPOINT_KEY, storedCheckpoint);
    metadataManager.saveChanges();
    final Optional<StringValue> checkpoint = metadataManager.get(MetadataConstants.CHECKPOINT_KEY);
    final Optional<String> nextPartition = partitionManager.getNextPartition(checkpoint);
    Assert.assertTrue(nextPartition.isPresent());
    Assert.assertEquals("datestr=2017-07-01", nextPartition.get());
}
@Test
public void testGetNextPartitionWithCheckpointBeforeThanStartDate() throws IOException, ParseException {
    // Checkpoint (2017-05-02) is earlier than the start date (2017-06-01), so the
    // start date wins: the first partition strictly after it is returned.
    this.fs.mkdirs(new Path(RAW_DATA_PATH, "datestr=2017-05-01"));
    this.fs.mkdirs(new Path(RAW_DATA_PATH, "datestr=2017-06-01"));
    this.fs.mkdirs(new Path(RAW_DATA_PATH, "datestr=2017-07-01"));
    final Date startDate =
            new SimpleDateFormat(HiveSourceConfiguration.HIVE_START_DATE_FORMAT).parse("2017-06-01");
    final StringValue storedCheckpoint = new StringValue("datestr=2017-05-02");
    final HDFSDatePartitionManager partitionManager = new HDFSDatePartitionManager(JOBNAME,
            HDFSTestConstants.BASE_METADATA_PATH,
            RAW_DATA_PATH,
            DATESTR,
            Optional.of(startDate),
            this.fs);
    final HDFSMetadataManager metadataManager = new HDFSMetadataManager(this.fs,
            new Path(HDFSTestConstants.BASE_METADATA_PATH, JOBNAME).toString(),
            new AtomicBoolean(true));
    metadataManager.set(MetadataConstants.CHECKPOINT_KEY, storedCheckpoint);
    metadataManager.saveChanges();
    final Optional<StringValue> checkpoint = metadataManager.get(MetadataConstants.CHECKPOINT_KEY);
    final Optional<String> nextPartition = partitionManager.getNextPartition(checkpoint);
    Assert.assertTrue(nextPartition.isPresent());
    Assert.assertEquals("datestr=2017-07-01", nextPartition.get());
}
@Test public void testGetNextPartitionWithMultipleDatePartitionsAndOneCheckpoint() throws IOException { // Job has multiple data partitions, one is less than checkpoint and the other is larger final Path partition1 = new Path(RAW_DATA_PATH, "datestr=2017-05-01"); final Path partition2 = new Path(RAW_DATA_PATH, "datestr=2017-05-03"); this.fs.mkdirs(new Path(partition1, FILE1)); this.fs.mkdirs(new Path(partition2, FILE1)); final StringValue val1 = new StringValue("datestr=2017-05-02"); final HDFSDatePartitionManager pm = new HDFSDatePartitionManager(JOBNAME, HDFSTestConstants.BASE_METADATA_PATH, RAW_DATA_PATH, DATESTR, Optional.absent(), this.fs); final HDFSMetadataManager metadataManager = new HDFSMetadataManager(this.fs, new Path(HDFSTestConstants.BASE_METADATA_PATH, JOBNAME).toString(), new AtomicBoolean(true)); metadataManager.set(MetadataConstants.CHECKPOINT_KEY, val1); metadataManager.saveChanges(); final Optional<StringValue> latestCheckpoint = metadataManager.get(MetadataConstants.CHECKPOINT_KEY); final Optional<String> partition = pm.getNextPartition(latestCheckpoint); Assert.assertTrue(partition.isPresent()); Assert.assertEquals("datestr=2017-05-03", partition.get()); }