/**
 * Publishes the given work unit states through the parent publisher first, and then passes the
 * same states on to {@code hivePublisher}.
 *
 * @param states work unit states to publish
 * @throws IOException propagated from either publish call
 */
@Override
public void publish(Collection<? extends WorkUnitState> states) throws IOException {
  super.publish(states);
  hivePublisher.publish(states);
}
/**
 * Publishes the task output data of the given {@link WorkUnitState} for one branch and then adds
 * lineage info for that branch. If output data of other tasks sits in the same folder, that data
 * may be published as well.
 *
 * @param state work unit state whose output is published
 * @param branchId branch whose output is published
 * @param writerOutputPathsMoved writer output paths handed through to {@code publishData}
 * @throws IOException propagated from {@code publishData}
 */
private void publishMultiTaskData(WorkUnitState state, int branchId, Set<Path> writerOutputPathsMoved)
    throws IOException {
  // Multi-task publishing: the single-task flag of publishData stays off.
  final boolean publishSingleTaskData = false;
  publishData(state, branchId, publishSingleTaskData, writerOutputPathsMoved);
  addLineageInfo(state, branchId);
}
mergeMetadataAndCollectPartitionNames(states, partitions); partitions.removeIf(Objects::isNull); String mdOutputPath = getMetadataOutputPathFromState(anyState, branchId); String userSpecifiedPath = getUserSpecifiedOutputPathFromState(anyState, branchId); publishMetadata(getMergedMetadataForPartitionAndBranch(null, branchId), branchId, getMetadataOutputFileForBranch(anyState, branchId)); } else { String metadataFilename = getMetadataFileNameForBranch(anyState, branchId); if (mdOutputPath == null || metadataFilename == null) { LOG.info("Metadata filename not set for branch " + String.valueOf(branchId) + ": not publishing metadata."); publishMetadata(getMergedMetadataForPartitionAndBranch(partition, branchId), branchId, new Path(new Path(mdOutputPath, partition), metadataFilename));
throws IOException { ParallelRunner parallelRunner = this.getParallelRunner(this.writerFileSystemByBranches.get(branchId)); Path publisherOutputDir = getPublisherOutputDir(state, branchId); addSingleTaskWriterOutputToExistingDir(writerOutputDir, publisherOutputDir, state, branchId, parallelRunner); } else { if (writerOutputPathsMoved.contains(writerOutputDir)) { boolean replaceFinalOutputDir = this.getState().getPropAsBoolean(ForkOperatorUtils .getPropertyNameForBranch(ConfigurationKeys.DATA_PUBLISHER_REPLACE_FINAL_DIR, this.numBranches, branchId)); addWriterOutputToExistingDir(writerOutputDir, publisherOutputDir, state, branchId, parallelRunner); writerOutputPathsMoved.add(writerOutputDir); return; movePath(parallelRunner, state, writerOutputDir, publisherOutputDir, branchId); writerOutputPathsMoved.add(writerOutputDir);
final String configBasedMetadata = getMetadataFromWorkUnitState(workUnitState); final int branch = branchId; MetadataMerger<String> mdMerger = metadataMergers.computeIfAbsent(partitionIdentifier, k -> buildMetadataMergerForBranch(configBasedMetadata, branch, getMetadataOutputFileForBranch(workUnitState, branch))); if (shouldPublishWriterMetadataForBranch(branchId)) { String md = getIntermediateMetadataFromState(workUnitState, branchId); mdMerger.update(md); Set<FsWriterMetrics> metricsForPartition =
/**
 * Sets WRITER_METADATA_KEY on a work unit, publishes metadata, and checks that no metadata file
 * was produced for branch 0.
 */
@Test
public void testNoOutputWhenDisabled() throws IOException {
  State jobState = buildDefaultState(1);
  WorkUnitState workUnitState = new WorkUnitState();
  addStateToWorkunit(jobState, workUnitState);
  workUnitState.setProp(ConfigurationKeys.WRITER_METADATA_KEY, "abcdefg");

  new BaseDataPublisher(jobState).publishMetadata(Collections.singletonList(workUnitState));

  Assert.assertFalse(openMetadataFile(jobState, 1, 0).exists(),
      "Internal metadata from writer should not be written out if no merger is set in config");
}
/**
 * Publishing a single task must set the lineage destination for branch 0 only.
 */
@Test
public void testPublishSingleTask() throws IOException {
  WorkUnitState taskState = buildTaskState(1);
  LineageInfo lineage = LineageInfo.getLineageInfo(taskState.getTaskBroker()).get();
  lineage.setSource(new DatasetDescriptor("kafka", "testTopic"), taskState);

  BaseDataPublisher dataPublisher = new BaseDataPublisher(taskState);
  dataPublisher.publishData(taskState);

  // Only one branch was configured, so only branch 0 gets a destination.
  Assert.assertTrue(taskState.contains("gobblin.event.lineage.branch.0.destination"));
  Assert.assertFalse(taskState.contains("gobblin.event.lineage.branch.1.destination"));
}
DatasetDescriptor source = new DatasetDescriptor("kafka", "testTopic"); lineageInfo.setSource(source, state); BaseDataPublisher publisher = new BaseDataPublisher(state); publisher.publish(ImmutableList.of(state));
/**
 * Publish metadata for each branch of a single work unit state. The state is wrapped in a
 * one-element set and handed to the collection-based {@code publishMetadata}. We expect the
 * metadata to be of String format and populated in either the WRITER_MERGED_METADATA_KEY state
 * or the WRITER_METADATA_KEY configuration key.
 *
 * @param state the work unit state whose metadata is published
 * @throws IOException propagated from the collection-based {@code publishMetadata}
 */
@Override
public void publishMetadata(WorkUnitState state) throws IOException {
  final Set<WorkUnitState> singleState = Collections.singleton(state);
  publishMetadata(singleState);
}
/**
 * Creates the publisher output directory for the branch when it does not exist yet, and then
 * delegates to {@link BaseDataPublisher#publishData(WorkUnitState, int, boolean, Set)}.
 * Ensuring the directory exists up front makes tables move one at a time rather than all at once.
 *
 * @param state work unit state being published
 * @param branchId branch whose output is published
 * @param publishSingleTaskData passed through unchanged to the parent implementation
 * @param writerOutputPathsMoved passed through unchanged to the parent implementation
 * @throws IOException propagated from the file system calls or the parent implementation
 */
@Override
protected void publishData(WorkUnitState state, int branchId, boolean publishSingleTaskData,
    Set<Path> writerOutputPathsMoved) throws IOException {
  Path outputDir = getPublisherOutputDir(state, branchId);

  boolean outputDirExists = this.publisherFileSystemByBranches.get(branchId).exists(outputDir);
  if (!outputDirExists) {
    WriterUtils.mkdirsWithRecursivePermissionWithRetry(
        this.publisherFileSystemByBranches.get(branchId),
        outputDir,
        this.permissions.get(branchId),
        this.retrierConfig);
  }

  super.publishData(state, branchId, publishSingleTaskData, writerOutputPathsMoved);
}
private void addLineageInfo(WorkUnitState state, int branchId) { if (!this.lineageInfo.isPresent()) { LOG.info("Will not add lineage info"); return; } // Final dataset descriptor DatasetDescriptor datasetDescriptor = createDestinationDescriptor(state, branchId); List<PartitionDescriptor> partitions = PartitionedDataWriter.getPartitionInfoAndClean(state, branchId); List<Descriptor> descriptors = new ArrayList<>(); if (partitions.size() == 0) { // Report as dataset level lineage descriptors.add(datasetDescriptor); } else { // Report as partition level lineage for (PartitionDescriptor partition : partitions) { descriptors.add(partition.copyWithNewDataset(datasetDescriptor)); } } this.lineageInfo.get().putDestination(descriptors, branchId, state); }
throws IOException { ParallelRunner parallelRunner = this.getParallelRunner(this.writerFileSystemByBranches.get(branchId)); Path publisherOutputDir = getPublisherOutputDir(state, branchId); addSingleTaskWriterOutputToExistingDir(writerOutputDir, publisherOutputDir, state, branchId, parallelRunner); } else { if (writerOutputPathsMoved.contains(writerOutputDir)) { boolean replaceFinalOutputDir = this.getState().getPropAsBoolean(ForkOperatorUtils .getPropertyNameForBranch(ConfigurationKeys.DATA_PUBLISHER_REPLACE_FINAL_DIR, this.numBranches, branchId)); addWriterOutputToExistingDir(writerOutputDir, publisherOutputDir, state, branchId, parallelRunner); writerOutputPathsMoved.add(writerOutputDir); return; movePath(parallelRunner, state, writerOutputDir, publisherOutputDir, branchId); writerOutputPathsMoved.add(writerOutputDir);
final String configBasedMetadata = getMetadataFromWorkUnitState(workUnitState); final int branch = branchId; MetadataMerger<String> mdMerger = metadataMergers.computeIfAbsent(partitionIdentifier, k -> buildMetadataMergerForBranch(configBasedMetadata, branch, getMetadataOutputFileForBranch(workUnitState, branch))); if (shouldPublishWriterMetadataForBranch(branchId)) { String md = getIntermediateMetadataFromState(workUnitState, branchId); mdMerger.update(md); Set<FsWriterMetrics> metricsForPartition =
/**
 * Test DATA_PUBLISHER_METADATA_STR with a single branch: an arbitrary metadata string placed in
 * the configuration should be written out verbatim to the metadata file.
 */
@Test
public void testMetadataStrOneBranch() throws IOException {
  State jobState = buildDefaultState(1);
  WorkUnitState workUnitState = new WorkUnitState();
  workUnitState.setProp(ConfigurationKeys.DATA_PUBLISHER_METADATA_STR, "foobar");
  addStateToWorkunit(jobState, workUnitState);

  BaseDataPublisher dataPublisher = new BaseDataPublisher(jobState);
  dataPublisher.publishMetadata(workUnitState);

  // Read the published file back and compare it with the configured string.
  File metadataFile = openMetadataFile(jobState, 1, 0);
  try (InputStream in = new FileInputStream(metadataFile)) {
    String contents = IOUtils.toString(in, StandardCharsets.UTF_8);
    Assert.assertEquals(contents, "foobar", "Expected to read back metadata from string");
  }
}
/**
 * Publishing several tasks at once must set the lineage destination for both branches on every
 * task state.
 */
@Test
public void testPublishMultiTasks() throws IOException {
  WorkUnitState state1 = buildTaskState(2);
  WorkUnitState state2 = buildTaskState(2);
  List<WorkUnitState> taskStates = ImmutableList.of(state1, state2);

  LineageInfo lineage = LineageInfo.getLineageInfo(state1.getTaskBroker()).get();
  DatasetDescriptor kafkaSource = new DatasetDescriptor("kafka", "testTopic");
  lineage.setSource(kafkaSource, state1);
  lineage.setSource(kafkaSource, state2);

  BaseDataPublisher dataPublisher = new BaseDataPublisher(state1);
  dataPublisher.publishData(taskStates);

  // Every state must carry a destination for branch 0 and branch 1.
  for (WorkUnitState taskState : taskStates) {
    Assert.assertTrue(taskState.contains("gobblin.event.lineage.branch.0.destination"));
    Assert.assertTrue(taskState.contains("gobblin.event.lineage.branch.1.destination"));
  }
}
/**
 * Publish metadata for each branch of a single work unit state. The state is wrapped in a
 * one-element set and handed to the collection-based {@code publishMetadata}. We expect the
 * metadata to be of String format and populated in either the WRITER_MERGED_METADATA_KEY state
 * or the WRITER_METADATA_KEY configuration key.
 *
 * @param state the work unit state whose metadata is published
 * @throws IOException propagated from the collection-based {@code publishMetadata}
 */
@Override
public void publishMetadata(WorkUnitState state) throws IOException {
  final Set<WorkUnitState> singleState = Collections.singleton(state);
  publishMetadata(singleState);
}
/**
 * Creates the publisher output directory for the branch when it does not exist yet, and then
 * delegates to {@link BaseDataPublisher#publishData(WorkUnitState, int, boolean, Set)}.
 * Ensuring the directory exists up front makes tables move one at a time rather than all at once.
 *
 * @param state work unit state being published
 * @param branchId branch whose output is published
 * @param publishSingleTaskData passed through unchanged to the parent implementation
 * @param writerOutputPathsMoved passed through unchanged to the parent implementation
 * @throws IOException propagated from the file system calls or the parent implementation
 */
@Override
protected void publishData(WorkUnitState state, int branchId, boolean publishSingleTaskData,
    Set<Path> writerOutputPathsMoved) throws IOException {
  Path outputDir = getPublisherOutputDir(state, branchId);

  boolean outputDirExists = this.publisherFileSystemByBranches.get(branchId).exists(outputDir);
  if (!outputDirExists) {
    WriterUtils.mkdirsWithRecursivePermissionWithRetry(
        this.publisherFileSystemByBranches.get(branchId),
        outputDir,
        this.permissions.get(branchId),
        this.retrierConfig);
  }

  super.publishData(state, branchId, publishSingleTaskData, writerOutputPathsMoved);
}
private void addLineageInfo(WorkUnitState state, int branchId) { if (!this.lineageInfo.isPresent()) { LOG.info("Will not add lineage info"); return; } // Final dataset descriptor DatasetDescriptor datasetDescriptor = createDestinationDescriptor(state, branchId); List<PartitionDescriptor> partitions = PartitionedDataWriter.getPartitionInfoAndClean(state, branchId); List<Descriptor> descriptors = new ArrayList<>(); if (partitions.size() == 0) { // Report as dataset level lineage descriptors.add(datasetDescriptor); } else { // Report as partition level lineage for (PartitionDescriptor partition : partitions) { descriptors.add(partition.copyWithNewDataset(datasetDescriptor)); } } this.lineageInfo.get().putDestination(descriptors, branchId, state); }
mergeMetadataAndCollectPartitionNames(states, partitions); partitions.removeIf(Objects::isNull); String mdOutputPath = getMetadataOutputPathFromState(anyState, branchId); String userSpecifiedPath = getUserSpecifiedOutputPathFromState(anyState, branchId); publishMetadata(getMergedMetadataForPartitionAndBranch(null, branchId), branchId, getMetadataOutputFileForBranch(anyState, branchId)); } else { String metadataFilename = getMetadataFileNameForBranch(anyState, branchId); if (mdOutputPath == null || metadataFilename == null) { LOG.info("Metadata filename not set for branch " + String.valueOf(branchId) + ": not publishing metadata."); publishMetadata(getMergedMetadataForPartitionAndBranch(partition, branchId), branchId, new Path(new Path(mdOutputPath, partition), metadataFilename));
/**
 * Test DATA_PUBLISHER_METADATA_STR across several branches: one work unit per branch is given
 * the same metadata string, and every branch's metadata file must contain exactly that string.
 */
@Test
public void testMetadataStrMultipleWorkUnitsAndBranches() throws IOException {
  final int numBranches = 3;
  State jobState = buildDefaultState(numBranches);

  // Build one work unit state per branch, each carrying the configured metadata string.
  List<WorkUnitState> workUnitStates = new ArrayList<>();
  for (int created = 0; created < numBranches; created++) {
    WorkUnitState workUnitState = new WorkUnitState();
    workUnitState.setProp(ConfigurationKeys.DATA_PUBLISHER_METADATA_STR, "foobar");
    addStateToWorkunit(jobState, workUnitState);
    workUnitStates.add(workUnitState);
  }

  BaseDataPublisher dataPublisher = new BaseDataPublisher(jobState);
  dataPublisher.publishMetadata(workUnitStates);

  // Each branch must have its own metadata file holding the string.
  for (int branchId = 0; branchId < numBranches; branchId++) {
    File metadataFile = openMetadataFile(jobState, numBranches, branchId);
    try (InputStream in = new FileInputStream(metadataFile)) {
      String contents = IOUtils.toString(in, StandardCharsets.UTF_8);
      Assert.assertEquals(contents, "foobar", "Expected to read back metadata from string");
    }
  }
}