/**
 * Returns the complete partition name, i.e. dbName@tableName@partitionName.
 */
@Override
public String datasetURN() {
  return this.hivePartition.getCompleteName();
}
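As a rough illustration of the URN layout documented above (the names here are made up, not from the source), the three components are joined with '@' and can be recovered by splitting:

public class PartitionUrnExample {
  public static void main(String[] args) {
    // Hypothetical URN following the dbName@tableName@partitionName layout
    String urn = "myDb@myTable@datepartition=2016-08-04-00";
    // Split into at most 3 parts so any '@' inside the partition spec is preserved
    String[] parts = urn.split("@", 3);
    System.out.println("db=" + parts[0] + ", table=" + parts[1] + ", partition=" + parts[2]);
  }
}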
@Override
public int compare(Partition o1, Partition o2) {
  return o1.getCompleteName().compareTo(o2.getCompleteName());
}
});
@Override
public boolean shouldCreateWorkunit(Partition sourcePartition, LongWatermark lowWatermark) {
  // If a whitelist is provided, only create workunits for those partitions
  if (!this.partitionsWhitelist.isEmpty()) {
    return this.partitionsWhitelist.contains(sourcePartition.getCompleteName());
  }
  // If no whitelist is set, all partitions of a dataset are backfilled
  return true;
}
@Override
public Void call() throws Exception {
  // Execute validation queries
  log.debug(String.format("Going to execute count validation queries: %s for format: %s and partition %s",
      countValidationQueries, format, sourcePartition.getCompleteName()));
  List<Long> rowCounts = ValidationJob.this.getValidationOutputFromHive(countValidationQueries);
  log.debug(String.format("Going to execute data validation queries: %s for format: %s and partition %s",
      dataValidationQueries, format, sourcePartition.getCompleteName()));
  List<Long> rowDataValidatedCount = ValidationJob.this.getValidationOutputFromHive(dataValidationQueries);
  // Validate and populate report
  validateAndPopulateReport(sourcePartition.getCompleteName(), updateTime, rowCounts, rowDataValidatedCount);
  return null;
}
}));
@Override
public HiveDatasetVersion apply(Partition partition) {
  try {
    return getDatasetVersion(partition);
  } catch (Throwable e) {
    log.warn(String.format("Failed to get DatasetVersion %s. Skipping.", partition.getCompleteName()), e);
    return null;
  }
}
}), Predicates.notNull()));
/**
 * Determine if the {@link Partition} should be validated by checking that its create time
 * lies within the (maxLookBackTime, skipRecentThanTime) window, and that its data location
 * does not match any ignored path token.
 */
private boolean shouldValidate(Partition partition) {
  for (String pathToken : this.ignoreDataPathIdentifierList) {
    if (partition.getDataLocation().toString().toLowerCase().contains(pathToken.toLowerCase())) {
      log.info("Skipping partition " + partition.getCompleteName() + " containing ignored token "
          + pathToken.toLowerCase());
      return false;
    }
  }
  try {
    long createTime = getPartitionCreateTime(partition.getName());
    boolean withinTimeWindow = new DateTime(createTime).isAfter(this.maxLookBackTime)
        && new DateTime(createTime).isBefore(this.skipRecentThanTime);
    if (!withinTimeWindow) {
      log.info("Skipping partition " + partition.getCompleteName() + " as create time "
          + new DateTime(createTime).toString() + " is not within the validation time window");
    } else {
      log.info("Validating partition " + partition.getCompleteName());
      return true;
    }
  } catch (ParseException e) {
    Throwables.propagate(e);
  }
  return false;
}
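A minimal, self-contained sketch of the time-window check above, assuming Joda-Time; the bounds and create times are illustrative only:

import org.joda.time.DateTime;

public class TimeWindowSketch {
  // A create time qualifies only if it falls strictly between the two bounds,
  // mirroring the isAfter/isBefore check in shouldValidate above.
  static boolean withinWindow(long createTime, DateTime maxLookBackTime, DateTime skipRecentThanTime) {
    DateTime created = new DateTime(createTime);
    return created.isAfter(maxLookBackTime) && created.isBefore(skipRecentThanTime);
  }

  public static void main(String[] args) {
    DateTime now = DateTime.now();
    // Created 3 days ago, window is (7 days ago, 1 day ago): validated
    System.out.println(withinWindow(now.minusDays(3).getMillis(), now.minusDays(7), now.minusDays(1)));
    // Created just now: too recent, skipped
    System.out.println(withinWindow(now.getMillis(), now.minusDays(7), now.minusDays(1)));
  }
}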
@BeforeMethod
public void initialize() {
  Mockito.doReturn(PARTITION_NAME).when(this.partition).getCompleteName();
}
/**
 * Currently updates the {@link #HIVE_TABLE_AVRO_SCHEMA_URL} location for new Hive partitions.
 * @param targetTable new table to be registered in Hive
 * @param sourcePartitions source partitions
 * @throws IOException
 */
public static void updatePartitionAttributesIfAvro(Table targetTable, Map<List<String>, Partition> sourcePartitions,
    HiveCopyEntityHelper hiveHelper) throws IOException {
  if (isHiveTableAvroType(targetTable)) {
    for (Map.Entry<List<String>, Partition> partition : sourcePartitions.entrySet()) {
      updateAvroSchemaURL(partition.getValue().getCompleteName(), partition.getValue().getTPartition().getSd(),
          hiveHelper);
    }
  }
}
/**
 * Create a {@link TimestampedHiveDatasetVersion} from a {@link Partition} based on the modification time of the
 * underlying HDFS data location.
 * @throws IllegalArgumentException when the partition is null
 * @throws IllegalArgumentException when the data location of the partition is null
 * @throws IllegalArgumentException when the data location of the partition doesn't exist
 * {@inheritDoc}
 */
@Override
protected TimestampedHiveDatasetVersion getDatasetVersion(Partition partition) {
  try {
    Preconditions.checkArgument(partition != null, "Partition must not be null");
    Path dataLocation = partition.getDataLocation();
    Preconditions
        .checkArgument(dataLocation != null, "Data location is null for partition " + partition.getCompleteName());
    boolean exists = this.fs.exists(dataLocation);
    Preconditions.checkArgument(exists, "Data location doesn't exist for partition " + partition.getCompleteName());
    long modificationTS = this.fs.getFileStatus(dataLocation).getModificationTime();
    return new TimestampedHiveDatasetVersion(new DateTime(modificationTS), partition);
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
}
}
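A minimal sketch of the modification-time lookup that backs this version finder, assuming a local Hadoop FileSystem; the path is hypothetical:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class ModTimeSketch {
  public static void main(String[] args) throws Exception {
    FileSystem fs = FileSystem.getLocal(new Configuration());
    Path dataLocation = new Path("/tmp/example-partition"); // hypothetical data location
    if (fs.exists(dataLocation)) {
      // The file status modification time becomes the version timestamp
      long modificationTS = fs.getFileStatus(dataLocation).getModificationTime();
      System.out.println("Partition version timestamp: " + modificationTS);
    }
  }
}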
@Override
protected Collection<CopyEntity> generateCopyEntities() throws IOException {
  List<CopyEntity> deregisterCopyEntities = Lists.newArrayList();
  int priority = 1;
  for (Partition partition : partitionsToDeregister) {
    try {
      priority = this.helper.addPartitionDeregisterSteps(deregisterCopyEntities, getName(), priority,
          this.helper.getTargetTable(), partition);
    } catch (IOException ioe) {
      log.error("Could not create work unit to deregister partition " + partition.getCompleteName(), ioe);
    }
  }
  return deregisterCopyEntities;
}
}
/**
 * Get the update time of a {@link Partition}.
 *
 * @return the update time if available, 0 otherwise
 *
 * {@inheritDoc}
 * @see HiveUnitUpdateProvider#getUpdateTime(org.apache.hadoop.hive.ql.metadata.Partition)
 */
@Override
public long getUpdateTime(Partition partition) throws UpdateNotFoundException {
  try {
    return getUpdateTime(partition.getDataLocation());
  } catch (IOException e) {
    throw new UpdateNotFoundException(
        String.format("Failed to get update time for %s", partition.getCompleteName()), e);
  }
}
@Override
public void write(QueryBasedHiveConversionEntity hiveConversionEntity) throws IOException {
  List<String> conversionQueries = null;
  try {
    conversionQueries = hiveConversionEntity.getQueries();
    EventWorkunitUtils.setBeginConversionDDLExecuteTimeMetadata(this.workUnit, System.currentTimeMillis());
    this.hiveJdbcConnector.executeStatements(conversionQueries.toArray(new String[conversionQueries.size()]));
    // Adding properties for preserving partitionParams:
    addPropsForPublisher(hiveConversionEntity);
    EventWorkunitUtils.setEndConversionDDLExecuteTimeMetadata(this.workUnit, System.currentTimeMillis());
  } catch (SQLException e) {
    StringBuilder sb = new StringBuilder();
    sb.append(String.format("Failed to execute queries for %s: ",
        hiveConversionEntity.getPartition().isPresent()
            ? hiveConversionEntity.getPartition().get().getCompleteName()
            : hiveConversionEntity.getTable().getCompleteName()));
    for (String conversionQuery : conversionQueries) {
      sb.append("\nConversion query attempted by Hive Query writer: ");
      sb.append(conversionQuery);
    }
    String message = sb.toString();
    log.warn(message);
    throw new IOException(message, e);
  }
}
case PARTITION:
  JSONObject partitionInfo = new JSONObject();
  partitionInfo.put("partitionName", input.getPartition().getCompleteName());
  if ((input.getParents() != null) && (!input.getParents().isEmpty())) {
    partitionInfo.put("partitionParents", input.getParents().toString());
@VisibleForTesting
public static long getCreateTime(Partition partition) {
  // If create time is set, use it.
  // .. it is always set if HiveJDBC or the Hive metastore is used to create the partition.
  // .. it might not be set (i.e. equals 0) if a Thrift API call is used to create the partition.
  if (partition.getTPartition().getCreateTime() > 0) {
    return TimeUnit.MILLISECONDS.convert(partition.getTPartition().getCreateTime(), TimeUnit.SECONDS);
  }
  // Otherwise fall back to the distcp-ng registration generation time if it is available
  else if (partition.getTPartition().isSetParameters()
      && partition.getTPartition().getParameters().containsKey(DISTCP_REGISTRATION_GENERATION_TIME_KEY)) {
    log.debug("Did not find createTime in Hive partition, used distcp registration generation time.");
    return Long.parseLong(partition.getTPartition().getParameters().get(DISTCP_REGISTRATION_GENERATION_TIME_KEY));
  } else {
    log.warn(String.format("Could not find create time for partition %s. Will return createTime as 0",
        partition.getCompleteName()));
    return 0;
  }
}
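Since Hive stores the partition create time in epoch seconds, the conversion above widens it to milliseconds. A tiny standalone sketch of that conversion (the sample value is made up):

import java.util.concurrent.TimeUnit;

public class CreateTimeConversionSketch {
  public static void main(String[] args) {
    // TPartition.getCreateTime() returns epoch seconds; convert to milliseconds
    int createTimeSeconds = 1470268800; // hypothetical create time
    long createTimeMillis = TimeUnit.MILLISECONDS.convert(createTimeSeconds, TimeUnit.SECONDS);
    System.out.println(createTimeMillis); // prints 1470268800000
  }
}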
@Test
public void testTimeStampForVersion() throws IOException {
  Mockito.doReturn(new Path("Invalid Location")).when(this.partition).getDataLocation();
  Mockito.doReturn(true).when(this.fs).exists(Mockito.any(Path.class));
  Mockito.doReturn(this.fileStatus).when(this.fs).getFileStatus(Mockito.any(Path.class));
  Mockito.doReturn(Long.valueOf(TIMESTAMP)).when(this.fileStatus).getModificationTime();

  TimestampedHiveDatasetVersion datasetVersion =
      this.hdfsModifiedTimeHiveVersionFinder.getDatasetVersion(this.partition);

  // Check if the datasetVersion contains the correct partition
  Assert.assertTrue(datasetVersion.getPartition().getCompleteName().equalsIgnoreCase(PARTITION_NAME));
  // Check if the datasetVersion contains the correct modified timestamp of the underlying data location
  Assert.assertTrue(datasetVersion.getDateTime().getMillis() == Long.valueOf(TIMESTAMP));
  System.out.println(datasetVersion);
}
}
@Override
public void clean() throws IOException {
  // Possible empty directories to clean for this partition (version)
  Set<Path> possiblyEmptyDirectories = new HashSet<>();

  try (AutoReturnableObject<IMetaStoreClient> client = cleanableHiveDataset.getClientPool().getClient()) {
    Partition partition = hiveDatasetVersion.getPartition();
    try {
      if (!cleanableHiveDataset.isSimulate()) {
        client.get().dropPartition(partition.getTable().getDbName(), partition.getTable().getTableName(),
            partition.getValues(), false);
        log.info("Successfully dropped partition " + partition.getCompleteName());
      } else {
        log.info("Simulating drop partition " + partition.getCompleteName());
      }
      if (cleanableHiveDataset.isShouldDeleteData()) {
        cleanableHiveDataset.getFsCleanableHelper().clean(hiveDatasetVersion, possiblyEmptyDirectories);
      }
    } catch (TException | IOException e) {
      log.warn(String.format("Failed to completely delete partition %s.", partition.getCompleteName()), e);
      throw new IOException(e);
    }
  }
  cleanableHiveDataset.getFsCleanableHelper().cleanEmptyDirectories(possiblyEmptyDirectories, cleanableHiveDataset);
}
public HivePartitionFileSet(HiveCopyEntityHelper hiveCopyEntityHelper, Partition partition, Properties properties) {
  super(partition.getCompleteName(), hiveCopyEntityHelper.getDataset());
  this.hiveCopyEntityHelper = hiveCopyEntityHelper;
  this.partition = partition;
  this.properties = properties;
  this.existingTargetPartition =
      Optional.fromNullable(this.hiveCopyEntityHelper.getTargetPartitions().get(this.partition.getValues()));
  this.eventSubmitter =
      new EventSubmitter.Builder(this.hiveCopyEntityHelper.getDataset().getMetricContext(), "hive.dataset.copy")
          .addMetadata("Partition", this.partition.getName()).build();
}
@Test
public void testWhitelist() throws Exception {
  BackfillHiveSource backfillHiveSource = new BackfillHiveSource();
  SourceState state = new SourceState();
  state.setProp(BackfillHiveSource.BACKFILL_SOURCE_PARTITION_WHITELIST_KEY,
      "service@logEvent@datepartition=2016-08-04-00,service@logEvent@datepartition=2016-08-05-00");
  backfillHiveSource.initBackfillHiveSource(state);

  Partition pass1 = Mockito.mock(Partition.class, Mockito.RETURNS_SMART_NULLS);
  Mockito.when(pass1.getCompleteName()).thenReturn("service@logEvent@datepartition=2016-08-04-00");
  Partition pass2 = Mockito.mock(Partition.class, Mockito.RETURNS_SMART_NULLS);
  Mockito.when(pass2.getCompleteName()).thenReturn("service@logEvent@datepartition=2016-08-05-00");
  Partition fail = Mockito.mock(Partition.class, Mockito.RETURNS_SMART_NULLS);
  Mockito.when(fail.getCompleteName()).thenReturn("service@logEvent@datepartition=2016-08-06-00");

  Assert.assertTrue(backfillHiveSource.shouldCreateWorkunit(pass1, new LongWatermark(0)));
  Assert.assertTrue(backfillHiveSource.shouldCreateWorkunit(pass2, new LongWatermark(0)));
  Assert.assertFalse(backfillHiveSource.shouldCreateWorkunit(fail, new LongWatermark(0)));
}
}
@Test
public void testGetJSONDependenciesJsonShouldMatch() throws Exception {
  ExplainWork work = mockExplainWork();
  when(work.getDependency()).thenReturn(true);

  // Mock inputs
  HashSet<ReadEntity> inputs = new HashSet<>();

  // One input table
  Table table = mock(Table.class);
  when(table.getCompleteName()).thenReturn("table-name-mock");
  when(table.getTableType()).thenReturn(TableType.EXTERNAL_TABLE);
  ReadEntity input1 = mock(ReadEntity.class);
  when(input1.getType()).thenReturn(Entity.Type.TABLE);
  when(input1.getTable()).thenReturn(table);
  inputs.add(input1);

  // And one partition
  Partition partition = mock(Partition.class);
  when(partition.getCompleteName()).thenReturn("partition-name-mock");
  ReadEntity input2 = mock(ReadEntity.class);
  when(input2.getType()).thenReturn(Entity.Type.PARTITION);
  when(input2.getPartition()).thenReturn(partition);
  inputs.add(input2);

  when(work.getInputs()).thenReturn(inputs);

  JsonNode result = objectMapper.readTree(ExplainTask.getJSONDependencies(work).toString());
  JsonNode expected = objectMapper.readTree("{\"input_partitions\":[{\"partitionName\":"
      + "\"partition-name-mock\"}],\"input_tables\":[{\"tablename\":\"table-name-mock\","
      + "\"tabletype\":\"EXTERNAL_TABLE\"}]}");
  assertEquals(expected, result);
}
    hiveDatasetVersion.getPartition().getCompleteName()), e);
throw new IOException(e);