/**
 * Publishes the output data belonging to exactly one task, identified by the given
 * {@link WorkUnitState}, and then records lineage info for that state and branch.
 *
 * <p>Data written by other tasks is not published, even when it sits in the same folder.
 * A fresh, empty set of already-moved writer output paths is used for every call.
 *
 * @param state the task state whose output should be published
 * @param branchId the branch to publish
 * @throws IOException propagated from the underlying publish call
 */
private void publishSingleTaskData(WorkUnitState state, int branchId) throws IOException {
  // Single-task publishing never shares "already moved" bookkeeping with other calls.
  Set<Path> freshMovedPaths = new HashSet<>();
  publishData(state, branchId, /* publishSingleTaskData= */ true, freshMovedPaths);
  addLineageInfo(state, branchId);
}
/**
 * Publishes the task output data for the given {@link WorkUnitState} and then records
 * lineage info for that state and branch.
 *
 * <p>Unlike single-task publishing, if output data of other tasks lives in the same folder,
 * that data may be published as well.
 *
 * @param state the task state whose output should be published
 * @param branchId the branch to publish
 * @param writerOutputPathsMoved the set of writer output paths handed through to the publish call
 * @throws IOException propagated from the underlying publish call
 */
private void publishMultiTaskData(WorkUnitState state, int branchId, Set<Path> writerOutputPathsMoved)
    throws IOException {
  final boolean singleTaskOnly = false;
  publishData(state, branchId, singleTaskOnly, writerOutputPathsMoved);
  addLineageInfo(state, branchId);
}
/**
 * Ensures the publisher output directory exists, then delegates to
 * {@link BaseDataPublisher#publishData(WorkUnitState, int, boolean, Set)}.
 *
 * <p>Creating the directory up front means tables are moved one at a time rather than all at once.
 *
 * @param state the task state being published
 * @param branchId the branch being published; also selects the file system and permissions used
 * @param publishSingleTaskData passed through unchanged to the parent implementation
 * @param writerOutputPathsMoved passed through unchanged to the parent implementation
 * @throws IOException if the existence check, directory creation, or parent publish fails
 */
@Override
protected void publishData(WorkUnitState state, int branchId, boolean publishSingleTaskData,
    Set<Path> writerOutputPathsMoved) throws IOException {
  Path targetDir = getPublisherOutputDir(state, branchId);

  boolean targetDirMissing = !this.publisherFileSystemByBranches.get(branchId).exists(targetDir);
  if (targetDirMissing) {
    // Create the output directory (with the branch's permissions) before any data is moved.
    WriterUtils.mkdirsWithRecursivePermissionWithRetry(
        this.publisherFileSystemByBranches.get(branchId),
        targetDir,
        this.permissions.get(branchId),
        this.retrierConfig);
  }

  super.publishData(state, branchId, publishSingleTaskData, writerOutputPathsMoved);
}
/**
 * Verifies that publishing a single task state sets the lineage destination for branch 0
 * and does not set one for branch 1.
 */
@Test
public void testPublishSingleTask() throws IOException {
  final String branch0Destination = "gobblin.event.lineage.branch.0.destination";
  final String branch1Destination = "gobblin.event.lineage.branch.1.destination";

  WorkUnitState taskState = buildTaskState(1);

  // Register the lineage source on the state before publishing.
  LineageInfo.getLineageInfo(taskState.getTaskBroker())
      .get()
      .setSource(new DatasetDescriptor("kafka", "testTopic"), taskState);

  new BaseDataPublisher(taskState).publishData(taskState);

  Assert.assertTrue(taskState.contains(branch0Destination));
  Assert.assertFalse(taskState.contains(branch1Destination));
}
/**
 * Verifies that publishing several task states together sets the lineage destination
 * for both branch 0 and branch 1 on every state.
 */
@Test
public void testPublishMultiTasks() throws IOException {
  WorkUnitState firstState = buildTaskState(2);
  WorkUnitState secondState = buildTaskState(2);

  // Both states share the same source descriptor; the lineage info is obtained from the first state's broker.
  LineageInfo lineage = LineageInfo.getLineageInfo(firstState.getTaskBroker()).get();
  DatasetDescriptor kafkaSource = new DatasetDescriptor("kafka", "testTopic");
  lineage.setSource(kafkaSource, firstState);
  lineage.setSource(kafkaSource, secondState);

  BaseDataPublisher publisher = new BaseDataPublisher(firstState);
  ImmutableList<WorkUnitState> publishedStates = ImmutableList.of(firstState, secondState);
  publisher.publishData(publishedStates);

  String[] expectedDestinationKeys = {
      "gobblin.event.lineage.branch.0.destination",
      "gobblin.event.lineage.branch.1.destination"
  };
  for (WorkUnitState published : publishedStates) {
    for (String destinationKey : expectedDestinationKeys) {
      Assert.assertTrue(published.contains(destinationKey));
    }
  }
}
/**
 * Publishes output data for one task only, as described by the given {@link WorkUnitState},
 * and afterwards adds lineage info for the same state and branch.
 *
 * <p>Output data from other tasks won't be published even if it is in the same folder.
 *
 * @param state the task state whose output should be published
 * @param branchId the branch to publish
 * @throws IOException propagated from the underlying publish call
 */
private void publishSingleTaskData(WorkUnitState state, int branchId) throws IOException {
  final boolean singleTaskOnly = true;
  // Start from an empty set of moved paths on each invocation.
  Set<Path> movedPaths = new HashSet<>();
  publishData(state, branchId, singleTaskOnly, movedPaths);
  addLineageInfo(state, branchId);
}
/**
 * Publishes task output data for the given {@link WorkUnitState}, then adds lineage info
 * for the same state and branch.
 *
 * <p>If the same folder also holds output data of other tasks, that data may be published too.
 *
 * @param state the task state whose output should be published
 * @param branchId the branch to publish
 * @param writerOutputPathsMoved the set of writer output paths forwarded to the publish call
 * @throws IOException propagated from the underlying publish call
 */
private void publishMultiTaskData(WorkUnitState state, int branchId, Set<Path> writerOutputPathsMoved)
    throws IOException {
  publishData(state, branchId, /* publishSingleTaskData= */ false, writerOutputPathsMoved);
  addLineageInfo(state, branchId);
}
/**
 * Makes sure the publisher output directory is present before handing off to
 * {@link BaseDataPublisher#publishData(WorkUnitState, int, boolean, Set)}, so that tables
 * are moved one at a time rather than all at once.
 *
 * @param state the task state being published
 * @param branchId the branch being published; also selects the file system and permissions used
 * @param publishSingleTaskData forwarded as-is to the parent implementation
 * @param writerOutputPathsMoved forwarded as-is to the parent implementation
 * @throws IOException if the existence check, directory creation, or parent publish fails
 */
@Override
protected void publishData(WorkUnitState state, int branchId, boolean publishSingleTaskData,
    Set<Path> writerOutputPathsMoved) throws IOException {
  Path outputDir = getPublisherOutputDir(state, branchId);
  boolean outputDirExists = this.publisherFileSystemByBranches.get(branchId).exists(outputDir);

  if (!outputDirExists) {
    // Directory is absent: create it with the permissions configured for this branch.
    WriterUtils.mkdirsWithRecursivePermissionWithRetry(
        this.publisherFileSystemByBranches.get(branchId),
        outputDir,
        this.permissions.get(branchId),
        this.retrierConfig);
  }

  super.publishData(state, branchId, publishSingleTaskData, writerOutputPathsMoved);
}