/**
 * If a config store is enabled, the topics that pass the blacklist/whitelist filters are further
 * intersected with the topics obtained from the config store.
 */
private List<KafkaTopic> getFilteredTopics(SourceState state) {
  List<Pattern> blacklist = DatasetFilterUtils.getPatternList(state, TOPIC_BLACKLIST);
  List<Pattern> whitelist = DatasetFilterUtils.getPatternList(state, TOPIC_WHITELIST);
  List<KafkaTopic> topics = this.kafkaConsumerClient.get().getFilteredTopics(blacklist, whitelist);
  Optional<String> configStoreUri = ConfigStoreUtils.getConfigStoreUri(state.getProperties());
  if (configStoreUri.isPresent()) {
    List<KafkaTopic> topicsFromConfigStore = ConfigStoreUtils
        .getTopicsFromConfigStore(state.getProperties(), configStoreUri.get(), this.kafkaConsumerClient.get());
    // Keep only the whitelisted/non-blacklisted topics whose names also appear (case-insensitively) in the config store.
    return topics.stream()
        .filter((KafkaTopic p) -> topicsFromConfigStore.stream()
            .anyMatch((KafkaTopic q) -> q.getName().equalsIgnoreCase(p.getName())))
        .collect(toList());
  }
  return topics;
}
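As a standalone illustration of the intersection step above, the following minimal sketch (plain Java, hypothetical topic names) applies the same case-insensitive match-by-name filtering to simple strings:

import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;

public class TopicIntersectionSketch {
  public static void main(String[] args) {
    // Topics that already passed the blacklist/whitelist filters (hypothetical names).
    List<String> filteredTopics = Arrays.asList("PageViewEvent", "clickEvent", "AdImpression");
    // Topics listed in the config store (hypothetical names).
    List<String> configStoreTopics = Arrays.asList("pageviewevent", "AdImpression");

    // Keep only topics whose name also appears, case-insensitively, in the config store.
    List<String> intersection = filteredTopics.stream()
        .filter(t -> configStoreTopics.stream().anyMatch(c -> c.equalsIgnoreCase(t)))
        .collect(Collectors.toList());

    System.out.println(intersection); // [PageViewEvent, AdImpression]
  }
}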
/**
 * Can be overridden to specify a non-pluggable {@link org.apache.gobblin.dataset.DatasetsFinder}.
 * @throws IOException
 */
protected IterableDatasetFinder createDatasetsFinder(SourceState state) throws IOException {
  return DatasetUtils.instantiateIterableDatasetFinder(state.getProperties(),
      HadoopUtils.getSourceFileSystem(state), null);
}
@Override
public List<WorkUnit> getWorkunits(SourceState state) {
  Config rootCfg = ConfigUtils.propertiesToConfig(state.getProperties());
  Config cfg = rootCfg.hasPath(CONFIG_NAMESPACE) ? rootCfg.getConfig(CONFIG_NAMESPACE) : ConfigFactory.empty();
  int numHellos = cfg.hasPath(NUM_HELLOS_KEY) ? cfg.getInt(NUM_HELLOS_KEY) : DEFAULT_NUM_HELLOS;

  Extract extract = new Extract(TableType.APPEND_ONLY,
      HelloWorldSource.class.getPackage().getName(),
      HelloWorldSource.class.getSimpleName());
  List<WorkUnit> wus = new ArrayList<>(numHellos);
  for (int i = 1; i <= numHellos; ++i) {
    WorkUnit wu = new WorkUnit(extract);
    wu.setProp(HELLO_ID_FULL_KEY, i);
    wus.add(wu);
  }
  return wus;
}
/**
 * Create a work unit for each configuration defined, or a single work unit if no configurations are defined.
 * @param state see {@link org.apache.gobblin.configuration.SourceState}
 * @return list of workunits
 */
@Override
public List<WorkUnit> getWorkunits(SourceState state) {
  List<WorkUnit> workUnits = Lists.newArrayList();
  Config config = ConfigUtils.propertiesToConfig(state.getProperties());
  Config sourceConfig = ConfigUtils.getConfigOrEmpty(config, DATASET_CLEANER_SOURCE_PREFIX);
  List<String> configurationNames = ConfigUtils.getStringList(config, DATASET_CLEANER_CONFIGURATIONS);

  // use a dummy configuration name if none set
  if (configurationNames.isEmpty()) {
    configurationNames = ImmutableList.of("DummyConfig");
  }

  for (String configurationName : configurationNames) {
    WorkUnit workUnit = WorkUnit.createEmpty();
    // Configuration prefixed by the configuration name has precedence over the source-specific configuration,
    // and the source-specific configuration has precedence over the general configuration.
    Config wuConfig = ConfigUtils.getConfigOrEmpty(sourceConfig, configurationName)
        .withFallback(sourceConfig)
        .withFallback(config);
    workUnit.setProps(ConfigUtils.configToProperties(wuConfig), new Properties());
    TaskUtils.setTaskFactoryClass(workUnit, DatasetCleanerTaskFactory.class);
    workUnits.add(workUnit);
  }
  return workUnits;
}
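The precedence chain above relies on Typesafe Config's withFallback semantics: the config consulted first wins, and each fallback only fills in keys that are still missing. A minimal sketch with hypothetical keys and values:

import java.util.Map;
import com.google.common.collect.ImmutableMap;
import com.typesafe.config.Config;
import com.typesafe.config.ConfigFactory;

public class FallbackPrecedenceSketch {
  public static void main(String[] args) {
    // General job-level settings (hypothetical keys and values).
    Map<String, Object> general = ImmutableMap.of("retention.days", 30, "dry.run", true);
    // Source-level settings override the general ones.
    Map<String, Object> source = ImmutableMap.of("retention.days", 14);
    // Settings for one named cleaner configuration override both.
    Map<String, Object> named = ImmutableMap.of("retention.days", 7);

    Config resolved = ConfigFactory.parseMap(named)
        .withFallback(ConfigFactory.parseMap(source))
        .withFallback(ConfigFactory.parseMap(general));

    System.out.println(resolved.getInt("retention.days")); // 7 -> the named configuration wins
    System.out.println(resolved.getBoolean("dry.run"));    // true -> filled in from the general configuration
  }
}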
@Override
public List<WorkUnit> getWorkunits(SourceState state) {
  Config config = ConfigUtils.propertiesToConfig(state.getProperties());
  Consumer<String, byte[]> consumer = getKafkaConsumer(config);
  LOG.debug("Consumer is {}", consumer);

  String topic = ConfigUtils.getString(config, TOPIC_WHITELIST, StringUtils.EMPTY);

  // TODO: fix this to use the new API when KafkaWrapper is fixed
  List<WorkUnit> workUnits = new ArrayList<>();
  List<PartitionInfo> topicPartitions = consumer.partitionsFor(topic);
  LOG.info("Partition count is {}", topicPartitions.size());

  for (PartitionInfo topicPartition : topicPartitions) {
    Extract extract = this.createExtract(DEFAULT_TABLE_TYPE, DEFAULT_NAMESPACE_NAME, topicPartition.topic());
    LOG.info("Partition info is {}", topicPartition);
    WorkUnit workUnit = WorkUnit.create(extract);
    setTopicNameInState(workUnit, topicPartition.topic());
    workUnit.setProp(ConfigurationKeys.EXTRACT_TABLE_NAME_KEY, topicPartition.topic());
    setPartitionId(workUnit, topicPartition.partition());
    workUnits.add(workUnit);
  }
  return workUnits;
}
@VisibleForTesting
public void initialize(SourceState state) throws IOException {
  this.updateProvider = UpdateProviderFactory.create(state);
  this.metricContext = Instrumented.getMetricContext(state, HiveSource.class);
  this.eventSubmitter = new EventSubmitter.Builder(this.metricContext, EventConstants.CONVERSION_NAMESPACE).build();
  this.avroSchemaManager = new AvroSchemaManager(getSourceFs(state), state);
  this.workunits = Lists.newArrayList();

  this.watermarker = GobblinConstructorUtils.invokeConstructor(HiveSourceWatermarkerFactory.class,
      state.getProp(HIVE_SOURCE_WATERMARKER_FACTORY_CLASS_KEY, DEFAULT_HIVE_SOURCE_WATERMARKER_FACTORY_CLASS))
      .createFromState(state);

  EventSubmitter.submit(Optional.of(this.eventSubmitter), EventConstants.CONVERSION_SETUP_EVENT);

  this.datasetFinder = GobblinConstructorUtils.invokeConstructor(HiveDatasetFinder.class,
      state.getProp(HIVE_SOURCE_DATASET_FINDER_CLASS_KEY, DEFAULT_HIVE_SOURCE_DATASET_FINDER_CLASS),
      getSourceFs(state), state.getProperties(), this.eventSubmitter);

  int maxLookBackDays = state.getPropAsInt(HIVE_SOURCE_MAXIMUM_LOOKBACK_DAYS_KEY, DEFAULT_HIVE_SOURCE_MAXIMUM_LOOKBACK_DAYS);
  this.maxLookBackTime = new DateTime().minusDays(maxLookBackDays).getMillis();

  this.ignoreDataPathIdentifierList = COMMA_BASED_SPLITTER.splitToList(
      state.getProp(HIVE_SOURCE_IGNORE_DATA_PATH_IDENTIFIER_KEY, DEFAULT_HIVE_SOURCE_IGNORE_DATA_PATH_IDENTIFIER));

  silenceHiveLoggers();
}
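The lookback window set up above is just a millisecond cutoff computed with Joda-Time; the source uses it when deciding whether older data is still worth creating work units for. A minimal sketch of that arithmetic, with a hypothetical partition timestamp:

import org.joda.time.DateTime;

public class LookbackCutoffSketch {
  public static void main(String[] args) {
    int maxLookBackDays = 3; // plays the role of the configured lookback setting
    long maxLookBackTime = new DateTime().minusDays(maxLookBackDays).getMillis();

    // Hypothetical partition creation time: five days ago, i.e. outside the lookback window.
    long partitionCreateTime = new DateTime().minusDays(5).getMillis();

    boolean withinLookback = partitionCreateTime >= maxLookBackTime;
    System.out.println(withinLookback); // false -> such a partition falls outside the window
  }
}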
int maxThreads = state.getPropAsInt(MAX_CONCURRENT_LISTING_SERVICES, DEFAULT_MAX_CONCURRENT_LISTING_SERVICES);

final CopyConfiguration copyConfiguration = CopyConfiguration.builder(targetFs, state.getProperties()).build();

// Assumed assignment: the excerpt is missing the receiver of this call; in CopySource the dataset finder
// is obtained from DatasetUtils.
DatasetsFinder<CopyableDatasetBase> datasetFinder = DatasetUtils
    .instantiateDatasetFinder(state.getProperties(), sourceFs, DEFAULT_DATASET_PROFILE_CLASS_KEY,
        this.eventSubmitter, state);
@Override
public List<WorkUnit> getWorkunits(SourceState state) {
  try {
    FileSystem fs = HadoopUtils.getSourceFileSystem(state);
    Config config = ConfigUtils.propertiesToConfig(state.getProperties());

    if (state.contains(COPY_TABLE_KEY)) {
      HiveDataset dataset = getHiveDataset(state.getProp(COPY_TABLE_KEY), fs, state);
      WorkUnit workUnit = HiveMaterializer.tableCopyWorkUnit(dataset,
          new StageableTableMetadata(config.getConfig(HIVE_MATERIALIZER_SOURCE_PREFIX), dataset.getTable()), null);
      HiveTask.disableHiveWatermarker(workUnit);
      return Lists.newArrayList(workUnit);
    } else if (state.contains(MATERIALIZE_VIEW)) {
      HiveDataset dataset = getHiveDataset(state.getProp(MATERIALIZE_VIEW), fs, state);
      WorkUnit workUnit = HiveMaterializer.viewMaterializationWorkUnit(dataset, getOutputStorageFormat(state),
          new StageableTableMetadata(config.getConfig(HIVE_MATERIALIZER_SOURCE_PREFIX), dataset.getTable()), null);
      HiveTask.disableHiveWatermarker(workUnit);
      return Lists.newArrayList(workUnit);
    } else if (state.contains(MATERIALIZE_QUERY)) {
      String query = state.getProp(MATERIALIZE_QUERY);
      WorkUnit workUnit = HiveMaterializer.queryResultMaterializationWorkUnit(query, getOutputStorageFormat(state),
          new StageableTableMetadata(config.getConfig(HIVE_MATERIALIZER_SOURCE_PREFIX), null));
      HiveTask.disableHiveWatermarker(workUnit);
      return Lists.newArrayList(workUnit);
    }
  } catch (IOException ioe) {
    throw new RuntimeException(ioe);
  }
  throw new RuntimeException(String.format("Must specify either %s, %s, or %s.",
      COPY_TABLE_KEY, MATERIALIZE_QUERY, MATERIALIZE_VIEW));
}
Config config = ConfigUtils.propertiesToConfig(state.getProperties());
// The argument to resolveClass() is truncated in this excerpt; the completion below assumes the factory
// class name comes from a job property with a default fallback (these constant names are assumptions).
GobblinKafkaConsumerClientFactory kafkaConsumerClientFactory = kafkaConsumerClientResolver
    .resolveClass(state.getProp(GOBBLIN_KAFKA_CONSUMER_CLIENT_FACTORY_CLASS,
        DEFAULT_GOBBLIN_KAFKA_CONSUMER_CLIENT_FACTORY_CLASS))
    .newInstance();
@Override
public WorkUnitStream getWorkunitStream(SourceState state) {
  try {
    fs = getSourceFileSystem(state);
    state.setProp(COMPACTION_INIT_TIME, DateTimeUtils.currentTimeMillis());
    suite = CompactionSuiteUtils.getCompactionSuiteFactory(state).createSuite(state);

    initRequestAllocator(state);
    initJobDir(state);
    copyJarDependencies(state);

    DatasetsFinder finder = DatasetUtils.instantiateDatasetFinder(state.getProperties(),
        getSourceFileSystem(state), DefaultFileSystemGlobFinder.class.getName());
    List<Dataset> datasets = finder.findDatasets();
    CompactionWorkUnitIterator workUnitIterator = new CompactionWorkUnitIterator();

    // Spawn a single thread to create work units
    new Thread(new SingleWorkUnitGeneratorService(state, prioritize(datasets, state), workUnitIterator),
        "SingleWorkUnitGeneratorService").start();
    return new BasicWorkUnitStream.Builder(workUnitIterator).build();
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
}
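The hand-off above, where a background generator thread feeds an iterator-backed WorkUnitStream, lets downstream processing start before dataset discovery finishes. The sketch below shows the general pattern with a plain BlockingQueue and a sentinel value; it is a generic illustration of the technique, not Gobblin's actual CompactionWorkUnitIterator.

import java.util.Iterator;
import java.util.NoSuchElementException;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;

/** Producer/consumer iterator sketch: a background thread pushes items, a consumer iterates lazily. */
public class StreamingIteratorSketch {
  private static final String POISON_PILL = "__DONE__"; // sentinel marking the end of the stream

  public static void main(String[] args) {
    BlockingQueue<String> queue = new LinkedBlockingQueue<>();

    // Producer: analogous to SingleWorkUnitGeneratorService creating work units one by one.
    new Thread(() -> {
      for (int i = 1; i <= 3; i++) {
        queue.add("workunit-" + i);
      }
      queue.add(POISON_PILL);
    }, "generator").start();

    // Consumer side: an iterator that blocks until the next item is available.
    Iterator<String> it = new Iterator<String>() {
      private String next;

      @Override
      public boolean hasNext() {
        if (next == null) {
          try {
            next = queue.take(); // blocks until the producer supplies the next item
          } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            return false;
          }
        }
        return !POISON_PILL.equals(next);
      }

      @Override
      public String next() {
        if (!hasNext()) {
          throw new NoSuchElementException();
        }
        String result = next;
        next = null;
        return result;
      }
    };

    while (it.hasNext()) {
      System.out.println(it.next()); // workunit-1, workunit-2, workunit-3
    }
  }
}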
@Override
public List<WorkUnit> getWorkunits(SourceState state) {
  configureIfNeeded(ConfigFactory.parseProperties(state.getProperties()));
  final List<WorkUnitState> previousWorkUnitStates = state.getPreviousWorkUnitStates();
  if (!previousWorkUnitStates.isEmpty())