private static boolean shouldObtainTablePropsFromConfigStore(SourceState state) {
  return state.getPropAsBoolean(SOURCE_OBTAIN_TABLE_PROPS_FROM_CONFIG_STORE,
      DEFAULT_SOURCE_OBTAIN_TABLE_PROPS_FROM_CONFIG_STORE);
}
if (state.getPropAsBoolean(HAS_USER_SPECIFIED_PARTITIONS)) {
  return createUserSpecifiedPartitions();
}

boolean allowEqualBoundary = state.getPropAsBoolean(ALLOW_EQUAL_WATERMARK_BOUNDARY, false);
LOG.info("Single partition with LWM = HWM and allowEqualBoundary=" + allowEqualBoundary);
if (!allowEqualBoundary) {
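For context, a minimal sketch (hypothetical, not Gobblin's actual Partitioner) of how an equal-boundary guard like this typically plays out: with LWM == HWM and equal boundaries disallowed, no partition is emitted; otherwise a single degenerate partition covers the boundary value.

import java.util.Collections;
import java.util.List;

// Hypothetical illustration of the equal-watermark-boundary rule above.
public class EqualBoundarySketch {

  // Returns [low, high] partitions; empty when LWM == HWM and equal boundaries are disallowed.
  static List<long[]> partitionsForEqualWatermarks(long lowWatermark, long highWatermark,
      boolean allowEqualBoundary) {
    if (lowWatermark == highWatermark && !allowEqualBoundary) {
      return Collections.emptyList(); // nothing to pull; skip partition creation
    }
    return Collections.singletonList(new long[] {lowWatermark, highWatermark});
  }

  public static void main(String[] args) {
    System.out.println(partitionsForEqualWatermarks(100L, 100L, false).size()); // 0
    System.out.println(partitionsForEqualWatermarks(100L, 100L, true).size());  // 1
  }
}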
@Override
public void init(SourceState state) {
  String regexPattern = state.getProp(PartitionedFileSourceBase.DATE_PARTITIONED_SOURCE_PARTITION_PATTERN);
  Preconditions.checkNotNull(regexPattern,
      "Must specify a regex pattern in " + PartitionedFileSourceBase.DATE_PARTITIONED_SOURCE_PARTITION_PATTERN);

  this.leadTime = PartitionAwareFileRetrieverUtils.getLeadTimeDurationFromConfig(state);
  this.pattern = Pattern.compile(regexPattern);
  this.helper = new HadoopFsHelper(state);
  this.sourceDir = new Path(state.getProp(ConfigurationKeys.SOURCE_FILEBASED_DATA_DIRECTORY));

  this.schemaInSourceDir = state.getPropAsBoolean(ConfigurationKeys.SCHEMA_IN_SOURCE_DIR,
      ConfigurationKeys.DEFAULT_SCHEMA_IN_SOURCE_DIR);
  this.schemaFile = this.schemaInSourceDir
      ? state.getProp(ConfigurationKeys.SCHEMA_FILENAME, ConfigurationKeys.DEFAULT_SCHEMA_FILENAME)
      : "";
}
  result.put(entry.getKey(), entry.getValue());
} else if (tablesWithNoUpdatesOnPreviousRun.contains(entry.getKey())
    && state.getPropAsBoolean(ConfigurationKeys.SOURCE_QUERYBASED_RESET_EMPTY_PARTITION_WATERMARK,
        ConfigurationKeys.DEFAULT_SOURCE_QUERYBASED_RESET_EMPTY_PARTITION_WATERMARK)) {
  log.info("Resetting low watermark to {} because previous run processed no data.", entry.getValue());
@Override
public List<WorkUnit> getWorkunits(SourceState state) {
  try {
    this.beginGetWorkunitsTime = System.currentTimeMillis();
    initialize(state);

    EventSubmitter.submit(Optional.of(this.eventSubmitter), EventConstants.CONVERSION_FIND_HIVE_TABLES_EVENT);
    Iterator<HiveDataset> iterator = this.datasetFinder.getDatasetsIterator();

    while (iterator.hasNext()) {
      HiveDataset hiveDataset = iterator.next();
      try (AutoReturnableObject<IMetaStoreClient> client = hiveDataset.getClientPool().getClient()) {
        log.debug(String.format("Processing dataset: %s", hiveDataset));

        // Create workunits for partitions
        if (HiveUtils.isPartitioned(hiveDataset.getTable())
            && state.getPropAsBoolean(HIVE_SOURCE_CREATE_WORKUNITS_FOR_PARTITIONS,
                DEFAULT_HIVE_SOURCE_CREATE_WORKUNITS_FOR_PARTITIONS)) {
          createWorkunitsForPartitionedTable(hiveDataset, client);
        } else {
          createWorkunitForNonPartitionedTable(hiveDataset);
        }
      }
    }
  } catch (IOException e) {
    throw new RuntimeException(e);
  }

  int realWorkunits = this.workunits.size();
  this.watermarker.onGetWorkunitsEnd(this.workunits);
  log.info(String.format("Created %s real workunits and %s watermark workunits", realWorkunits,
      (this.workunits.size() - realWorkunits)));
  return this.workunits;
}
public void testSkipWorkUnitPersistence(SourceState state) {
  if (!state.getPropAsBoolean(TEST_WORKUNIT_PERSISTENCE)) {
    return;
  }
  int skipCount = 0;
  for (WorkUnitState workUnitState : state.getPreviousWorkUnitStates()) {
    if (workUnitState.getWorkingState() == WorkUnitState.WorkingState.SKIPPED) {
      skipCount++;
    }
  }
  Assert.assertEquals(skipCount, NUMBER_OF_SKIP_WORKUNITS,
      "Not all skipped work units were persisted in the state store");
}
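To see the counting loop in isolation, here is a minimal, hypothetical sketch; it assumes WorkUnitState's no-arg constructor and setWorkingState from Gobblin's configuration API, and it is not the project's own test code.

import java.util.Arrays;
import java.util.List;
import org.apache.gobblin.configuration.WorkUnitState;

// Counts WorkUnitStates that were marked SKIPPED, mirroring the assertion above.
public class SkippedCountSketch {

  static long countSkipped(List<WorkUnitState> previousStates) {
    return previousStates.stream()
        .filter(s -> s.getWorkingState() == WorkUnitState.WorkingState.SKIPPED)
        .count();
  }

  public static void main(String[] args) {
    WorkUnitState skipped = new WorkUnitState();
    skipped.setWorkingState(WorkUnitState.WorkingState.SKIPPED);

    WorkUnitState committed = new WorkUnitState();
    committed.setWorkingState(WorkUnitState.WorkingState.COMMITTED);

    System.out.println(countSkipped(Arrays.asList(skipped, committed))); // 1
  }
}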
if (previousWorkunits.get(0).getWorkunit().contains(ConfigurationKeys.SOURCE_FILEBASED_FS_SNAPSHOT)) {
  prevFsSnapshot.addAll(
      previousWorkunits.get(0).getWorkunit().getPropAsSet(ConfigurationKeys.SOURCE_FILEBASED_FS_SNAPSHOT));
} else if (state.getPropAsBoolean(ConfigurationKeys.SOURCE_FILEBASED_FS_PRIOR_SNAPSHOT_REQUIRED,
    ConfigurationKeys.DEFAULT_SOURCE_FILEBASED_FS_PRIOR_SNAPSHOT_REQUIRED)) {
  // A prior snapshot is required but none exists; error handling elided in this snippet.
}

workUnit.setProp(ConfigurationKeys.SOURCE_FILEBASED_FILES_TO_PULL, StringUtils.join(partitionFilesToPull, ","));
if (state.getPropAsBoolean(ConfigurationKeys.SOURCE_FILEBASED_PRESERVE_FILE_NAME, false)) {
  if (partitionFilesToPull.size() != 1) {
    throw new RuntimeException("Cannot preserve the file name if a workunit is given multiple files");
  }
}
boolean isEarlyStopped = state.getPropAsBoolean(IS_EARLY_STOPPED);
} else {
  boolean retryFailedWorkUnits = state.getPropAsBoolean(ConfigurationKeys.WORK_UNIT_RETRY_ENABLED_KEY, true);
  workUnitRetryPolicy = retryFailedWorkUnits ? WorkUnitRetryPolicy.ALWAYS : WorkUnitRetryPolicy.NEVER;
}

if (state.getPropAsBoolean(ConfigurationKeys.OVERWRITE_CONFIGS_IN_STATESTORE,
    ConfigurationKeys.DEFAULT_OVERWRITE_CONFIGS_IN_STATESTORE)) {
if (state.getPropAsBoolean(ConfigurationKeys.EXTRACT_IS_FULL_KEY)
    && !state.contains(ConfigurationKeys.EXTRACT_FULL_RUN_TIME_KEY)) {
  super.setProp(ConfigurationKeys.EXTRACT_FULL_RUN_TIME_KEY, System.currentTimeMillis());
}
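The set-once pattern here is easy to exercise standalone; a minimal sketch with a plain State and hypothetical key strings (not the production configuration flow):

import org.apache.gobblin.configuration.State;

// Stamps a full-run timestamp only when absent, so reruns keep the original value.
public class FullRunTimeSketch {
  public static void main(String[] args) {
    State state = new State();
    state.setProp("extract.is.full", Boolean.TRUE.toString()); // hypothetical key strings

    if (state.getPropAsBoolean("extract.is.full") && !state.contains("extract.full.run.time")) {
      state.setProp("extract.full.run.time", System.currentTimeMillis());
    }
    System.out.println(state.getProp("extract.full.run.time"));
  }
}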
@Override
public void init(SourceState state) {
  DateTimeZone.setDefault(DateTimeZone.forID(
      state.getProp(ConfigurationKeys.SOURCE_TIMEZONE, ConfigurationKeys.DEFAULT_SOURCE_TIMEZONE)));
  initDatePartition(state);

  this.sourcePartitionPrefix =
      state.getProp(PartitionedFileSourceBase.DATE_PARTITIONED_SOURCE_PARTITION_PREFIX, StringUtils.EMPTY);
  this.sourcePartitionSuffix =
      state.getProp(PartitionedFileSourceBase.DATE_PARTITIONED_SOURCE_PARTITION_SUFFIX, StringUtils.EMPTY);
  this.sourceDir = new Path(state.getProp(ConfigurationKeys.SOURCE_FILEBASED_DATA_DIRECTORY));
  this.leadTimeDuration = PartitionAwareFileRetrieverUtils.getLeadTimeDurationFromConfig(state);
  this.helper = new HadoopFsHelper(state);

  this.schemaInSourceDir = state.getPropAsBoolean(ConfigurationKeys.SCHEMA_IN_SOURCE_DIR,
      ConfigurationKeys.DEFAULT_SCHEMA_IN_SOURCE_DIR);
  this.schemaFile = this.schemaInSourceDir
      ? state.getProp(ConfigurationKeys.SCHEMA_FILENAME, ConfigurationKeys.DEFAULT_SCHEMA_FILENAME)
      : "";
}
/**
 * Generate the histogram.
 */
private Histogram getHistogram(String entity, String watermarkColumn, SourceState state, Partition partition) {
  SalesforceConnector connector = getConnector(state);
  try {
    if (!connector.connect()) {
      throw new RuntimeException("Failed to connect.");
    }
  } catch (RestApiConnectionException e) {
    throw new RuntimeException("Failed to connect.", e);
  }

  Histogram histogram = getHistogramByDayBucketing(connector, entity, watermarkColumn, partition);

  // Exchange the first histogram group key with the global low watermark to ensure that
  // the low watermark is captured in the range of generated partitions.
  HistogramGroup firstGroup = histogram.get(0);
  Date lwmDate = Utils.toDate(partition.getLowWatermark(), Partitioner.WATERMARKTIMEFORMAT);
  histogram.getGroups().set(0,
      new HistogramGroup(Utils.epochToDate(lwmDate.getTime(), SECONDS_FORMAT), firstGroup.getCount()));

  // Refine the histogram
  if (state.getPropAsBoolean(ENABLE_DYNAMIC_PROBING)) {
    histogram = getRefinedHistogram(connector, entity, watermarkColumn, state, partition, histogram);
  }
  return histogram;
}
if (state.getPropAsBoolean(KafkaSource.GOBBLIN_KAFKA_EXTRACT_ALLOW_TABLE_TYPE_NAMESPACE_CUSTOMIZATION)) {
  String tableTypeStr = state.getProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY,
      KafkaSource.DEFAULT_TABLE_TYPE.toString());
  extractNamespace = KafkaSource.DEFAULT_NAMESPACE_NAME;
  isFullExtract = state.getPropAsBoolean(ConfigurationKeys.EXTRACT_IS_FULL_KEY);
  kafkaBrokers = state.getProp(ConfigurationKeys.KAFKA_BROKERS, "");
  this.shouldEnableDatasetStateStore = state.getPropAsBoolean(GOBBLIN_KAFKA_SHOULD_ENABLE_DATASET_STATESTORE,
      DEFAULT_GOBBLIN_KAFKA_SHOULD_ENABLE_DATASET_STATESTORE);

  // NOTE: the returned ExecutorService is unused as excerpted here.
  Executors.newFixedThreadPool(numOfThreads, ExecutorsUtils.newThreadFactory(Optional.of(LOG)));

  if (state.getPropAsBoolean(ConfigurationKeys.KAFKA_SOURCE_SHARE_CONSUMER_CLIENT,
      ConfigurationKeys.DEFAULT_KAFKA_SOURCE_SHARE_CONSUMER_CLIENT)) {
    this.sharedKafkaConsumerClient = this.kafkaConsumerClient.get();
  }
}
singleWorkUnit.setProp(ConfigurationKeys.WORK_UNIT_DATE_PARTITION_KEY, file.getWatermarkMsSinceEpoch());
if (this.sourceState.getPropAsBoolean(ConfigurationKeys.SCHEMA_IN_SOURCE_DIR,
    ConfigurationKeys.DEFAULT_SCHEMA_IN_SOURCE_DIR)) {
  addSchemaFile(file, singleWorkUnit);
}
if (state.contains(SIMULATE) && state.getPropAsBoolean(SIMULATE)) {
  log.info("Simulate mode enabled. Will not execute the copy.");
  for (Map.Entry<FileSet<CopyEntity>, Collection<WorkUnit>> entry : workUnitsMap.asMap().entrySet()) {
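As an aside, guarding with contains() before getPropAsBoolean() is equivalent to a defaulted read; a minimal sketch (the key string is illustrative, not necessarily the real SIMULATE constant):

import org.apache.gobblin.configuration.State;

// Shows that contains() + getPropAsBoolean() collapses to getPropAsBoolean(key, false).
public class SimulateFlagSketch {
  public static void main(String[] args) {
    State state = new State();
    String simulateKey = "copy.simulate"; // hypothetical key for illustration

    boolean guarded = state.contains(simulateKey) && state.getPropAsBoolean(simulateKey);
    boolean defaulted = state.getPropAsBoolean(simulateKey, false);
    System.out.println(guarded == defaulted); // true: both are false while the key is unset
  }
}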
@Override
public List<WorkUnit> getWorkunits(SourceState state) {
  String nameSpace = state.getProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY);
  Extract extract1 = createExtract(TableType.SNAPSHOT_ONLY, nameSpace, "TestTable1");
  Extract extract2 = createExtract(TableType.SNAPSHOT_ONLY, nameSpace, "TestTable2");

  String sourceFileList = state.getProp(SOURCE_FILE_LIST_KEY);
  List<String> list = SPLITTER.splitToList(sourceFileList);
  List<WorkUnit> workUnits = Lists.newArrayList();
  for (int i = 0; i < list.size(); i++) {
    // Alternate files between the two extracts.
    WorkUnit workUnit = WorkUnit.create(i % 2 == 0 ? extract1 : extract2);
    workUnit.setProp(SOURCE_FILE_KEY, list.get(i));
    workUnits.add(workUnit);
  }

  if (state.getPropAsBoolean("use.multiworkunit", false)) {
    // Bundle all work units into a single MultiWorkUnit.
    MultiWorkUnit multiWorkUnit = MultiWorkUnit.createEmpty();
    multiWorkUnit.addWorkUnits(workUnits);
    workUnits.clear();
    workUnits.add(multiWorkUnit);
  }
  return workUnits;
}
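Downstream code that receives such a bundle can flatten it back into plain work units; a sketch assuming MultiWorkUnit's getWorkUnits() accessor (present in Gobblin, though this helper itself is hypothetical):

import java.util.ArrayList;
import java.util.List;
import org.apache.gobblin.source.workunit.MultiWorkUnit;
import org.apache.gobblin.source.workunit.WorkUnit;

// Recursively flattens MultiWorkUnits into plain WorkUnits.
public class WorkUnitFlattener {
  public static List<WorkUnit> flatten(List<WorkUnit> workUnits) {
    List<WorkUnit> flat = new ArrayList<>();
    for (WorkUnit workUnit : workUnits) {
      if (workUnit instanceof MultiWorkUnit) {
        // A MultiWorkUnit is a container; descend into its children.
        flat.addAll(flatten(((MultiWorkUnit) workUnit).getWorkUnits()));
      } else {
        flat.add(workUnit);
      }
    }
    return flat;
  }
}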
String policyClass = state.getProp(ComplianceConfigurationKeys.PURGE_POLICY_CLASS, HivePurgerPolicy.class.getName());
this.policy = GobblinConstructorUtils.invokeConstructor(PurgePolicy.class, policyClass, this.lowWatermark);
this.shouldProxy = state.getPropAsBoolean(ComplianceConfigurationKeys.GOBBLIN_COMPLIANCE_SHOULD_PROXY,
    ComplianceConfigurationKeys.GOBBLIN_COMPLIANCE_DEFAULT_SHOULD_PROXY);
if (!this.shouldProxy) {
if (watermarkType == WatermarkType.SIMPLE || Strings.isNullOrEmpty(watermarkColumn)
    || !state.getPropAsBoolean(ENABLE_DYNAMIC_PARTITIONING) || maxPartitions <= 1) {
  return super.generateWorkUnits(sourceEntity, state, previousWatermark);
}