public SourceState(State properties, Map<String, ? extends SourceState> previousDatasetStatesByUrns,
    Iterable<WorkUnitState> previousWorkUnitStates) {
  super(properties, previousDatasetStatesByUrns, adaptWorkUnitStates(previousWorkUnitStates));
}
state.setProp(SlaEventKeys.SOURCE_URI, sourceFs.getUri());
state.setProp(SlaEventKeys.DESTINATION_URI, targetFs.getUri());

// Bin-packing limits: cap each multi-work-unit bin by total size and by work-unit count.
long maxSizePerBin = state.getPropAsLong(MAX_SIZE_MULTI_WORKUNITS, 0);
long maxWorkUnitsPerMultiWorkUnit = state.getPropAsLong(MAX_WORK_UNITS_PER_BIN, 50);
final long minWorkUnitWeight = Math.max(1, maxSizePerBin / maxWorkUnitsPerMultiWorkUnit);
final Optional<CopyableFileWatermarkGenerator> watermarkGenerator =
    CopyableFileWatermarkHelper.getCopyableFileWatermarkGenerator(state);
int maxThreads = state.getPropAsInt(MAX_CONCURRENT_LISTING_SERVICES, DEFAULT_MAX_CONCURRENT_LISTING_SERVICES);
final CopyConfiguration copyConfiguration = CopyConfiguration.builder(targetFs, state.getProperties()).build();

DatasetsFinder<CopyableDatasetBase> datasetFinder = DatasetUtils.instantiateDatasetFinder(state.getProperties(),
    sourceFs, DEFAULT_DATASET_PROFILE_CLASS_KEY,
    new EventSubmitter.Builder(this.metricContext, CopyConfiguration.COPY_PREFIX).build(), state);

if (state.contains(SIMULATE) && state.getPropAsBoolean(SIMULATE)) {
  log.info("Simulate mode enabled. Will not execute the copy.");
  // Simulate mode only reports the planned file sets; the loop body and early return
  // below are a minimal reconstruction of the truncated original.
  for (Map.Entry<FileSet<CopyEntity>, Collection<WorkUnit>> entry : workUnitsMap.asMap().entrySet()) {
    log.info("File set {}: {} work units", entry.getKey().getName(), entry.getValue().size());
  }
  return Lists.newArrayList();
}
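// Worked example (hypothetical numbers) for the bin-weight floor above: each work
// unit contributes at least maxSizePerBin / maxWorkUnitsPerMultiWorkUnit to its
// bin's weight, so even zero-byte files cannot pack more than the per-bin count
// cap into a single bin.
long maxSizePerBin = 1024L * 1024 * 1024;  // 1 GiB size cap per bin
long maxWorkUnitsPerMultiWorkUnit = 50;    // at most 50 work units per bin
long minWorkUnitWeight = Math.max(1, maxSizePerBin / maxWorkUnitsPerMultiWorkUnit); // 21474836 bytes (~20 MiB)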
@VisibleForTesting
public void initialize(SourceState state) throws IOException {
  this.updateProvider = UpdateProviderFactory.create(state);
  this.metricContext = Instrumented.getMetricContext(state, HiveSource.class);
  this.eventSubmitter = new EventSubmitter.Builder(this.metricContext, EventConstants.CONVERSION_NAMESPACE).build();
  this.avroSchemaManager = new AvroSchemaManager(getSourceFs(state), state);
  this.workunits = Lists.newArrayList();

  // Watermarker and dataset finder classes are configurable, with sensible defaults.
  this.watermarker = GobblinConstructorUtils.invokeConstructor(HiveSourceWatermarkerFactory.class,
      state.getProp(HIVE_SOURCE_WATERMARKER_FACTORY_CLASS_KEY, DEFAULT_HIVE_SOURCE_WATERMARKER_FACTORY_CLASS))
      .createFromState(state);

  EventSubmitter.submit(Optional.of(this.eventSubmitter), EventConstants.CONVERSION_SETUP_EVENT);

  this.datasetFinder = GobblinConstructorUtils.invokeConstructor(HiveDatasetFinder.class,
      state.getProp(HIVE_SOURCE_DATASET_FINDER_CLASS_KEY, DEFAULT_HIVE_SOURCE_DATASET_FINDER_CLASS),
      getSourceFs(state), state.getProperties(), this.eventSubmitter);

  int maxLookBackDays = state.getPropAsInt(HIVE_SOURCE_MAXIMUM_LOOKBACK_DAYS_KEY, DEFAULT_HIVE_SOURCE_MAXIMUM_LOOKBACK_DAYS);
  this.maxLookBackTime = new DateTime().minusDays(maxLookBackDays).getMillis();
  this.ignoreDataPathIdentifierList = COMMA_BASED_SPLITTER.splitToList(
      state.getProp(HIVE_SOURCE_IGNORE_DATA_PATH_IDENTIFIER_KEY, DEFAULT_HIVE_SOURCE_IGNORE_DATA_PATH_IDENTIFIER));

  silenceHiveLoggers();
}
@Override
public List<WorkUnit> getWorkunits(SourceState state) {
  List<WorkUnit> workUnits = Lists.newArrayList();

  if (!state.contains(ConfigurationKeys.SOURCE_FILEBASED_FILES_TO_PULL)) {
    return workUnits;
  }

  // Create a single snapshot-type extract for all files
  Extract extract = new Extract(Extract.TableType.SNAPSHOT_ONLY,
      state.getProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY, "ExampleNamespace"), "ExampleTable");

  String filesToPull = state.getProp(ConfigurationKeys.SOURCE_FILEBASED_FILES_TO_PULL);
  for (String file : Splitter.on(',').omitEmptyStrings().split(filesToPull)) {
    // Create one work unit for each file to pull
    WorkUnit workUnit = WorkUnit.create(extract);
    workUnit.setProp(SOURCE_FILE_KEY, file);
    workUnits.add(workUnit);
  }

  return workUnits;
}
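// Usage sketch (hypothetical driver, not part of the source above): with a
// comma-separated file list, the file-based getWorkunits() above yields one work
// unit per file, all sharing a single snapshot extract. ExampleFileSource is a
// placeholder name for the enclosing class.
SourceState state = new SourceState();
state.setProp(ConfigurationKeys.SOURCE_FILEBASED_FILES_TO_PULL, "/data/part-0.txt,/data/part-1.txt");
List<WorkUnit> workUnits = new ExampleFileSource().getWorkunits(state); // two work units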
@Override
public List<WorkUnit> getWorkunits(SourceState state) {
  // Fall back to conversion-specific defaults when the job config does not set them.
  if (!state.contains(HIVE_SOURCE_DATASET_FINDER_CLASS_KEY)) {
    state.setProp(HIVE_SOURCE_DATASET_FINDER_CLASS_KEY, ConvertibleHiveDatasetFinder.class.getName());
  }
  if (!state.contains(HiveDatasetFinder.HIVE_DATASET_CONFIG_PREFIX_KEY)) {
    state.setProp(HiveDatasetFinder.HIVE_DATASET_CONFIG_PREFIX_KEY, "hive.conversion.avro");
  }
  return super.getWorkunits(state);
}
super.setProp(ConfigurationKeys.EXTRACT_EXTRACT_ID_KEY, extractId);

// Look for a previous extract of the same namespace and table.
for (WorkUnitState pre : state.getPreviousWorkUnitStates()) {
  Extract previousExtract = pre.getWorkunit().getExtract();
  if (previousExtract.getNamespace().equals(namespace) && previousExtract.getTable().equals(table)) {
    // For a full extract with no explicit full-run timestamp, default to the current time.
    if (state.getPropAsBoolean(ConfigurationKeys.EXTRACT_IS_FULL_KEY)
        && !state.contains(ConfigurationKeys.EXTRACT_FULL_RUN_TIME_KEY)) {
      super.setProp(ConfigurationKeys.EXTRACT_FULL_RUN_TIME_KEY, System.currentTimeMillis());
    }
  }
}
/**
 * Create a temporary job directory based on the job id or (if not available) a random UUID.
 */
private void initJobDir(SourceState state) throws IOException {
  String tmpBase = state.getProp(MRCompactor.COMPACTION_TMP_DEST_DIR, MRCompactor.DEFAULT_COMPACTION_TMP_DEST_DIR);
  String jobId;

  if (state instanceof JobState) {
    jobId = ((JobState) state).getJobId();
  } else {
    jobId = UUID.randomUUID().toString();
  }

  this.tmpJobDir = new Path(tmpBase, jobId);
  this.fs.mkdirs(this.tmpJobDir);
  state.setProp(MRCompactor.COMPACTION_JOB_DIR, this.tmpJobDir.toString());
  log.info("Job dir is created under {}", this.tmpJobDir);
}
@Override
public List<WorkUnit> getWorkunits(SourceState state) {
  Map<String, Iterable<WorkUnitState>> previousWorkUnits = state.getPreviousWorkUnitStatesByDatasetUrns();
  List<String> titles = new LinkedList<>(Splitter.on(",").omitEmptyStrings()
      .splitToList(state.getProp(WikipediaExtractor.SOURCE_PAGE_TITLES)));

  Extract extract = new Extract(Extract.TableType.SNAPSHOT_ONLY,
      state.getProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY), "WikipediaOutput");
  List<WorkUnit> workUnits = Lists.newArrayList();

  // Completion sketch: one work unit per page title, keyed by the title as dataset URN.
  // The original per-title watermark handling (via previousWorkUnits) is elided here.
  for (String title : titles) {
    WorkUnit workUnit = WorkUnit.create(extract);
    workUnit.setProp(ConfigurationKeys.DATASET_URN_KEY, title);
    workUnits.add(workUnit);
  }
  return workUnits;
}
public void run() {
  try {
    Stopwatch stopwatch = Stopwatch.createStarted();
    int threads = this.state.getPropAsInt(CompactionVerifier.COMPACTION_VERIFICATION_THREADS, 5);
    long timeOutInMinute = this.state.getPropAsLong(CompactionVerifier.COMPACTION_VERIFICATION_TIMEOUT_MINUTES, 30);
    long iterationCountLimit = this.state.getPropAsLong(CompactionVerifier.COMPACTION_VERIFICATION_ITERATION_COUNT_LIMIT,
        Integer.MAX_VALUE);
    long iteration = 0;

    // Keep re-verifying the remaining datasets until all pass or the iteration limit is hit.
    while (datasets.size() > 0 && iteration++ < iterationCountLimit) {
      // Per-iteration verification of the remaining datasets (elided in this excerpt).
    }
  } catch (Exception e) {
    // Minimal completion: the original catch/cleanup logic is elided in this excerpt.
    throw new RuntimeException(e);
  }
}
@Override
public List<WorkUnit> getWorkunits(SourceState state) {
  Config rootCfg = ConfigUtils.propertiesToConfig(state.getProperties());
  Config cfg = rootCfg.hasPath(CONFIG_NAMESPACE) ? rootCfg.getConfig(CONFIG_NAMESPACE) : ConfigFactory.empty();
  int numHellos = cfg.hasPath(NUM_HELLOS_KEY) ? cfg.getInt(NUM_HELLOS_KEY) : DEFAULT_NUM_HELLOS;

  Extract extract = new Extract(TableType.APPEND_ONLY,
      HelloWorldSource.class.getPackage().getName(), HelloWorldSource.class.getSimpleName());
  List<WorkUnit> wus = new ArrayList<>(numHellos);
  for (int i = 1; i <= numHellos; ++i) {
    WorkUnit wu = new WorkUnit(extract);
    wu.setProp(HELLO_ID_FULL_KEY, i);
    wus.add(wu);
  }
  return wus;
}
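// Usage sketch (hypothetical driver, assuming CONFIG_NAMESPACE and NUM_HELLOS_KEY
// are accessible constants): the work-unit count is driven by the namespaced config
// key; when it is absent, DEFAULT_NUM_HELLOS applies.
SourceState state = new SourceState();
state.setProp(HelloWorldSource.CONFIG_NAMESPACE + "." + HelloWorldSource.NUM_HELLOS_KEY, 3);
List<WorkUnit> wus = new HelloWorldSource().getWorkunits(state); // three work units, hello ids 1..3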
@VisibleForTesting
public void initBackfillHiveSource(SourceState state) {
  this.partitionsWhitelist = Sets.newHashSet(Splitter.on(",").omitEmptyStrings().trimResults()
      .split(state.getProp(BACKFILL_SOURCE_PARTITION_WHITELIST_KEY, StringUtils.EMPTY)));
}
@Override
public List<WorkUnit> getWorkunits(SourceState state) {
  List<WorkUnit> workUnits = Lists.newArrayList();
  for (int i = 0; i < state.getPropAsInt(NUM_WORK_UNITS, 1); i++) {
    workUnits.add(new WorkUnit());
  }
  return workUnits;
}
public TableLevelWatermarker(State state) {
  this.tableWatermarks = Maps.newHashMap();

  // Load previous watermarks in case of a SourceState
  if (state instanceof SourceState) {
    SourceState sourceState = (SourceState) state;
    for (Map.Entry<String, Iterable<WorkUnitState>> datasetWorkUnitStates : sourceState
        .getPreviousWorkUnitStatesByDatasetUrns().entrySet()) {

      // Use the minimum of all previous watermarks for this dataset
      List<LongWatermark> previousWatermarks = FluentIterable.from(datasetWorkUnitStates.getValue())
          .filter(Predicates.not(PartitionLevelWatermarker.WATERMARK_WORKUNIT_PREDICATE))
          .transform(new Function<WorkUnitState, LongWatermark>() {
            @Override
            public LongWatermark apply(WorkUnitState w) {
              return w.getActualHighWatermark(LongWatermark.class);
            }
          }).toList();

      if (!previousWatermarks.isEmpty()) {
        this.tableWatermarks.put(datasetWorkUnitStates.getKey(), Collections.min(previousWatermarks));
      }
    }
    log.debug("Loaded table watermarks from previous state " + this.tableWatermarks);
  }
}
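// Minimal sketch (hypothetical values) of the min-of-previous-watermarks rule above:
// a dataset whose previous run left watermarks 40, 25, and 60 resumes from 25, the
// lowest fully-committed point across its work units.
List<LongWatermark> previous = ImmutableList.of(
    new LongWatermark(40L), new LongWatermark(25L), new LongWatermark(60L));
LongWatermark tableWatermark = Collections.min(previous); // LongWatermark(25)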
@Override
public List<WorkUnit> getWorkunits(SourceState state) {
  try {
    this.beginGetWorkunitsTime = System.currentTimeMillis();

    initialize(state);

    EventSubmitter.submit(Optional.of(this.eventSubmitter), EventConstants.CONVERSION_FIND_HIVE_TABLES_EVENT);
    Iterator<HiveDataset> iterator = this.datasetFinder.getDatasetsIterator();

    while (iterator.hasNext()) {
      HiveDataset hiveDataset = iterator.next();
      try (AutoReturnableObject<IMetaStoreClient> client = hiveDataset.getClientPool().getClient()) {
        log.debug(String.format("Processing dataset: %s", hiveDataset));

        // Create workunits for partitions
        if (HiveUtils.isPartitioned(hiveDataset.getTable()) && state.getPropAsBoolean(
            HIVE_SOURCE_CREATE_WORKUNITS_FOR_PARTITIONS, DEFAULT_HIVE_SOURCE_CREATE_WORKUNITS_FOR_PARTITIONS)) {
          createWorkunitsForPartitionedTable(hiveDataset, client);
        } else {
          createWorkunitForNonPartitionedTable(hiveDataset);
        }
      }
    }
  } catch (IOException e) {
    throw new RuntimeException(e);
  }

  int realWorkunits = this.workunits.size();
  this.watermarker.onGetWorkunitsEnd(this.workunits);

  log.info(String.format("Created %s real workunits and %s watermark workunits", realWorkunits,
      (this.workunits.size() - realWorkunits)));

  return this.workunits;
}
@Override
public WorkUnitStream getWorkunitStream(SourceState state) {
  try {
    fs = getSourceFileSystem(state);
    suite = CompactionSuiteUtils.getCompactionSuiteFactory(state).createSuite(state);

    initRequestAllocator(state);
    initJobDir(state);
    copyJarDependencies(state);

    DatasetsFinder finder = DatasetUtils.instantiateDatasetFinder(state.getProperties(),
        getSourceFileSystem(state), DefaultFileSystemGlobFinder.class.getName());
    List<Dataset> datasets = finder.findDatasets();
    CompactionWorkUnitIterator workUnitIterator = new CompactionWorkUnitIterator();

    // Spawn a single thread to create work units
    new Thread(new SingleWorkUnitGeneratorService(state, prioritize(datasets, state), workUnitIterator),
        "SingleWorkUnitGeneratorService").start();
    return new BasicWorkUnitStream.Builder(workUnitIterator).build();
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
}
public SourceState(State properties, Iterable<WorkUnitState> prevWorkUnitStates) {
  super(properties, adaptWorkUnitStates(prevWorkUnitStates));
}