/**
 * Get all tables in db with given table pattern.
 *
 * @return all (db, table) pairs accepted by the configured whitelist/blacklist
 * @throws IOException if the metastore cannot be queried
 */
public Collection<DbAndTable> getTables() throws IOException {
  List<DbAndTable> tables = Lists.newArrayList();

  // Borrow a metastore client from the pool; try-with-resources returns it on exit.
  try (AutoReturnableObject<IMetaStoreClient> client = this.clientPool.getClient()) {
    for (String db : client.get().getAllDatabases()) {
      // Skip databases rejected by the whitelist/blacklist.
      if (!HiveDatasetFinder.this.whitelistBlacklist.acceptDb(db)) {
        continue;
      }
      for (String tableName : client.get().getAllTables(db)) {
        // Keep only tables accepted for this database.
        if (HiveDatasetFinder.this.whitelistBlacklist.acceptTable(db, tableName)) {
          tables.add(new DbAndTable(db, tableName));
        }
      }
    }
  } catch (Exception exc) {
    // Normalize any metastore failure to IOException per the method contract.
    throw new IOException(exc);
  }
  return tables;
}
/**
 * Returns the next {@link HiveDataset} built from the remaining (db, table) pairs,
 * or signals end of iteration via {@code endOfData()}.
 *
 * Tables whose resolved dataset config marks them as blacklisted are skipped.
 * A failure on a single table is logged (and reported as a DATASET_ERROR SLA event
 * when an event submitter is configured) and iteration continues with the next
 * table — except for IllegalArgumentException, which is rethrown to the caller.
 */
@Override
protected HiveDataset computeNext() {
  while (this.tables.hasNext()) {
    DbAndTable dbAndTable = this.tables.next();

    // Borrow a metastore client for this single lookup; returned to the pool on close.
    try (AutoReturnableObject<IMetaStoreClient> client = HiveDatasetFinder.this.clientPool.getClient()) {
      Table table = client.get().getTable(dbAndTable.getDb(), dbAndTable.getTable());
      Config datasetConfig = getDatasetConfig(table);

      // Skip tables explicitly blacklisted through their dataset config.
      if (ConfigUtils.getBoolean(datasetConfig, HIVE_DATASET_IS_BLACKLISTED_KEY,
          DEFAULT_HIVE_DATASET_IS_BLACKLISTED_KEY)) {
        continue;
      }

      // Emit a DATASET_FOUND SLA event if an event submitter is configured.
      if (HiveDatasetFinder.this.eventSubmitter.isPresent()) {
        SlaEventSubmitter.builder().datasetUrn(dbAndTable.toString())
            .eventSubmitter(HiveDatasetFinder.this.eventSubmitter.get()).eventName(DATASET_FOUND).build().submit();
      }

      return createHiveDataset(table, datasetConfig);
    } catch (IllegalArgumentException e) {
      // Argument/config errors are propagated instead of being skipped.
      Throwables.propagate(e);
    } catch (Throwable t) {
      // Any other failure (note: including Errors) is logged and reported, then the
      // iterator moves on to the next table.
      log.error(String.format("Failed to create HiveDataset for table %s.%s", dbAndTable.getDb(),
          dbAndTable.getTable()), t);

      if (HiveDatasetFinder.this.eventSubmitter.isPresent()) {
        SlaEventSubmitter.builder().datasetUrn(dbAndTable.toString())
            .eventSubmitter(HiveDatasetFinder.this.eventSubmitter.get()).eventName(DATASET_ERROR)
            .additionalMetadata(FAILURE_CONTEXT, t.toString()).build().submit();
      }
    }
  }
  return endOfData();
}
}; // end of anonymous iterator
// Substitute the db/table tokens (both physical and logical) in the raw config value,
// then collect the resolved value.
String resolvedValue = StringUtils.replaceEach(rawValue,
    new String[] { DATABASE_TOKEN, TABLE_TOKEN, LOGICAL_DB_TOKEN, LOGICAL_TABLE_TOKEN },
    new String[] { realDbAndTable.getDb(), realDbAndTable.getTable(),
        logicalDbAndTable.getDb(), logicalDbAndTable.getTable() });
resolvedValueList.add(resolvedValue);

// NOTE(review): this second declaration of `resolvedValue` appears to come from a
// different method/scope (this chunk is a fragment) — the two statements cannot be
// siblings in one block. Same token substitution, applied to a config entry and
// written back into resolvedProperties.
String resolvedValue = StringUtils.replaceEach(resolvedConfig.getString(entry.getKey()),
    new String[] { DATABASE_TOKEN, TABLE_TOKEN, LOGICAL_DB_TOKEN, LOGICAL_TABLE_TOKEN },
    new String[] { realDbAndTable.getDb(), realDbAndTable.getTable(),
        logicalDbAndTable.getDb(), logicalDbAndTable.getTable() });
resolvedProperties.setProperty(entry.getKey(), resolvedValue);
/**
 * Get all tables in db with given table pattern.
 *
 * @return all (db, table) pairs accepted by the configured whitelist/blacklist
 * @throws IOException if the metastore cannot be queried
 */
public Collection<DbAndTable> getTables() throws IOException {
  List<DbAndTable> tables = Lists.newArrayList();

  // Borrow a metastore client from the pool; returned automatically on close.
  try (AutoReturnableObject<IMetaStoreClient> client = this.clientPool.getClient()) {
    // Lazily filter databases through the whitelist/blacklist.
    Iterable<String> databases = Iterables.filter(client.get().getAllDatabases(), new Predicate<String>() {
      @Override
      public boolean apply(String db) {
        return HiveDatasetFinder.this.whitelistBlacklist.acceptDb(db);
      }
    });
    for (final String db : databases) {
      // Likewise filter each accepted database's tables.
      Iterable<String> tableNames = Iterables.filter(client.get().getAllTables(db), new Predicate<String>() {
        @Override
        public boolean apply(String table) {
          return HiveDatasetFinder.this.whitelistBlacklist.acceptTable(db, table);
        }
      });
      for (String tableName : tableNames) {
        tables.add(new DbAndTable(db, tableName));
      }
    }
  } catch (Exception exc) {
    // Normalize any metastore failure to IOException per the method contract.
    throw new IOException(exc);
  }
  return tables;
}
/**
 * Parses the logical database and table names out of a physical {@link DbAndTable},
 * using a dataset name pattern that embeds the logical tokens.
 *
 * Eg.
 *   Dataset Name Pattern        : prod_$LOGICAL_DB_linkedin.prod_$LOGICAL_TABLE_linkedin
 *   Source DB and Table         : prod_dbName_linkedin.prod_tableName_linkedin
 *   Logical DB Token            : $LOGICAL_DB
 *   Logical Table Token         : $LOGICAL_TABLE
 *   Parsed Logical DB and Table : dbName.tableName
 *
 * @param datasetNamePattern Dataset name pattern.
 * @param dbAndTable Source DB and Table.
 * @param logicalDbToken Logical DB token.
 * @param logicalTableToken Logical Table token.
 * @return Parsed logical DB and Table.
 */
@VisibleForTesting
protected static DbAndTable parseLogicalDbAndTable(String datasetNamePattern, DbAndTable dbAndTable,
    String logicalDbToken, String logicalTableToken) {
  Preconditions.checkArgument(StringUtils.isNotBlank(datasetNamePattern), "Dataset name pattern must not be empty.");

  // The pattern must be exactly "<dbPattern>.<tablePattern>".
  List<String> patternParts = Lists.newArrayList(SPLIT_ON_DOT.split(datasetNamePattern));
  Preconditions.checkArgument(patternParts.size() == 2, "Dataset name pattern must of the format: "
      + "dbPrefix_$LOGICAL_DB_dbPostfix.tablePrefix_$LOGICAL_TABLE_tablePostfix (prefix / postfix are optional)");

  // Extract the token value from each physical name using its pattern half.
  String logicalDb = extractTokenValueFromEntity(dbAndTable.getDb(), patternParts.get(0), logicalDbToken);
  String logicalTable = extractTokenValueFromEntity(dbAndTable.getTable(), patternParts.get(1), logicalTableToken);

  return new DbAndTable(logicalDb, logicalTable);
}
/**
 * Computes the next {@link HiveDataset} from the remaining (db, table) pairs, or
 * ends iteration via {@code endOfData()}.
 *
 * Blacklisted tables (per their resolved dataset config) are skipped. A failure on
 * one table is logged and reported (DATASET_ERROR SLA event, when configured) and
 * iteration continues; IllegalArgumentException alone is rethrown.
 */
@Override
protected HiveDataset computeNext() {
  while (this.tables.hasNext()) {
    DbAndTable dbAndTable = this.tables.next();

    // One pooled metastore client per table lookup; returned on close.
    try (AutoReturnableObject<IMetaStoreClient> client = HiveDatasetFinder.this.clientPool.getClient()) {
      Table table = client.get().getTable(dbAndTable.getDb(), dbAndTable.getTable());
      Config datasetConfig = getDatasetConfig(table);

      // Honor per-dataset blacklisting from config.
      if (ConfigUtils.getBoolean(datasetConfig, HIVE_DATASET_IS_BLACKLISTED_KEY,
          DEFAULT_HIVE_DATASET_IS_BLACKLISTED_KEY)) {
        continue;
      }

      // Report discovery of the dataset when an event submitter exists.
      if (HiveDatasetFinder.this.eventSubmitter.isPresent()) {
        SlaEventSubmitter.builder().datasetUrn(dbAndTable.toString())
            .eventSubmitter(HiveDatasetFinder.this.eventSubmitter.get()).eventName(DATASET_FOUND).build().submit();
      }

      return createHiveDataset(table, datasetConfig);
    } catch (IllegalArgumentException e) {
      // Rethrow argument errors rather than skipping the table.
      Throwables.propagate(e);
    } catch (Throwable t) {
      // Log and report every other failure (including Errors), then continue.
      log.error(String.format("Failed to create HiveDataset for table %s.%s", dbAndTable.getDb(),
          dbAndTable.getTable()), t);

      if (HiveDatasetFinder.this.eventSubmitter.isPresent()) {
        SlaEventSubmitter.builder().datasetUrn(dbAndTable.toString())
            .eventSubmitter(HiveDatasetFinder.this.eventSubmitter.get()).eventName(DATASET_ERROR)
            .additionalMetadata(FAILURE_CONTEXT, t.toString()).build().submit();
      }
    }
  }
  return endOfData();
}
}; // end of anonymous iterator
/**
 * Generates a CTAS statement to dump the contents of a table / partition into a new table.
 *
 * @param outputDbAndTable output db and table where contents should be written.
 * @param sourceEntity source table / partition.
 * @param partitionDMLInfo map of partition values.
 * @param storageFormat format of output table.
 * @param outputTableLocation location where files of output table should be written.
 */
public static String generateStagingCTASStatementFromSelectStar(HiveDatasetFinder.DbAndTable outputDbAndTable,
    HiveDatasetFinder.DbAndTable sourceEntity, Map<String, String> partitionDMLInfo, StorageFormat storageFormat,
    String outputTableLocation) {
  StringBuilder selectStarQuery = new StringBuilder()
      .append("SELECT * FROM `").append(sourceEntity.getDb())
      .append("`.`").append(sourceEntity.getTable()).append("`");

  // Restrict to the requested partition(s), if any, with an AND-joined WHERE clause.
  if (partitionDMLInfo != null && !partitionDMLInfo.isEmpty()) {
    selectStarQuery.append(" WHERE ");
    String separator = "";
    for (Map.Entry<String, String> partition : partitionDMLInfo.entrySet()) {
      selectStarQuery.append(separator)
          .append("`").append(partition.getKey()).append("`='")
          .append(partition.getValue()).append("'");
      separator = " AND ";
    }
  }

  return generateStagingCTASStatement(outputDbAndTable, selectStarQuery.toString(), storageFormat,
      outputTableLocation);
}
/**
 * Constructs a HiveDataset wrapping the given metastore table.
 *
 * Derives the logical db/table names from {@code DATASET_NAME_PATTERN_KEY} when
 * present (falling back to the physical names) and resolves db/table tokens in the
 * dataset config against both.
 */
public HiveDataset(FileSystem fs, HiveMetastoreClientPool clientPool, Table table, Properties properties,
    Config datasetConfig) {
  this.fs = fs;
  this.clientPool = clientPool;
  this.table = table;
  this.properties = properties;

  // A glob data location cannot serve as a concrete root path.
  this.tableRootPath = PathUtils.isGlob(this.table.getDataLocation()) ? Optional.<Path> absent()
      : Optional.fromNullable(this.table.getDataLocation());

  this.tableIdentifier = this.table.getDbName() + "." + this.table.getTableName();

  // Optional pattern used to derive logical db/table names from the physical ones.
  this.datasetNamePattern = Optional.fromNullable(ConfigUtils.getString(datasetConfig, DATASET_NAME_PATTERN_KEY, null));
  this.dbAndTable = new DbAndTable(table.getDbName(), table.getTableName());
  if (this.datasetNamePattern.isPresent()) {
    this.logicalDbAndTable =
        parseLogicalDbAndTable(this.datasetNamePattern.get(), this.dbAndTable, LOGICAL_DB_TOKEN, LOGICAL_TABLE_TOKEN);
  } else {
    // Without a pattern, the logical names are just the physical names.
    this.logicalDbAndTable = this.dbAndTable;
  }
  // Must run after logicalDbAndTable is set: token resolution uses both names.
  this.datasetConfig = resolveConfig(datasetConfig, dbAndTable, logicalDbAndTable);

  this.metricContext = Instrumented.getMetricContext(new State(properties), HiveDataset.class,
      Lists.<Tag<?>> newArrayList(new Tag<>(DATABASE, table.getDbName()), new Tag<>(TABLE, table.getTableName())));
}
/**
 * Builds the {@link HiveProcessingEntity} (dataset, table and optional partition)
 * for the given work unit by looking the table up in the metastore.
 *
 * @throws TException on metastore communication failure
 * @throws HiveException if the partition cannot be constructed
 */
private HiveProcessingEntity getConversionEntity(HiveWorkUnit hiveWorkUnit)
    throws IOException, TException, HiveException {
  // Borrow a metastore client; try-with-resources returns it to the pool.
  try (AutoReturnableObject<IMetaStoreClient> client = this.pool.getClient()) {
    HiveDataset dataset = hiveWorkUnit.getHiveDataset();
    HiveDatasetFinder.DbAndTable dbAndTable = dataset.getDbAndTable();
    String dbName = dbAndTable.getDb();
    String tableName = dbAndTable.getTable();

    Table table = new Table(client.get().getTable(dbName, tableName));

    // Fetch the partition only when the work unit names one.
    Partition partition = null;
    if (hiveWorkUnit.getPartitionName().isPresent()) {
      String partitionName = hiveWorkUnit.getPartitionName().get();
      partition = new Partition(table, client.get().getPartition(dbName, tableName, partitionName));
    }

    return new HiveProcessingEntity(dataset, table, Optional.fromNullable(partition));
  }
}
/**
 * Builds a {@link HiveDataset} for the given "db.table" identifier by looking the
 * table up in the Hive metastore.
 *
 * @param tableString table identifier of the form {@code <db>.<table>}
 * @param fs filesystem the dataset files live on
 * @param state state providing job properties (including the optional metastore URI)
 * @throws IOException if the table cannot be fetched from the metastore
 */
private HiveDataset getHiveDataset(String tableString, FileSystem fs, State state) throws IOException {
  try {
    HiveMetastoreClientPool pool = HiveMetastoreClientPool.get(state.getProperties(),
        Optional.fromNullable(state.getProp(HIVE_METASTORE_URI_KEY)));

    // tableString is expected to be "<db>.<table>".
    List<String> tokens = Splitter.on(".").splitToList(tableString);
    DbAndTable sourceDbAndTable = new DbAndTable(tokens.get(0), tokens.get(1));

    try (AutoReturnableObject<IMetaStoreClient> client = pool.getClient()) {
      Table sourceTable = new Table(client.get().getTable(sourceDbAndTable.getDb(), sourceDbAndTable.getTable()));
      return new HiveDataset(fs, pool, sourceTable, ConfigUtils.propertiesToConfig(state.getProperties()));
    }
  } catch (TException exc) {
    // Surface metastore failures as IOException, matching the declared contract.
    // (Previously wrapped in RuntimeException, which bypassed callers' IOException
    // handling even though this method declares `throws IOException`.)
    throw new IOException(exc);
  }
}
/**
 * Generates a CTAS statement to dump the results of a query into a new table.
 *
 * @param outputDbAndTable output db and table where contents should be written.
 * @param sourceQuery query to materialize.
 * @param storageFormat format of output table.
 * @param outputTableLocation location where files of output table should be written.
 */
public static String generateStagingCTASStatement(HiveDatasetFinder.DbAndTable outputDbAndTable, String sourceQuery,
    StorageFormat storageFormat, String outputTableLocation) {
  // Both halves of the output identifier must be non-empty.
  boolean hasDb = !Strings.isNullOrEmpty(outputDbAndTable.getDb());
  boolean hasTable = !Strings.isNullOrEmpty(outputDbAndTable.getTable());
  Preconditions.checkArgument(hasDb && hasTable, "Invalid output db and table " + outputDbAndTable);

  return String.format("CREATE TEMPORARY TABLE `%s`.`%s` STORED AS %s LOCATION '%s' AS %s",
      outputDbAndTable.getDb(), outputDbAndTable.getTable(), storageFormat.getHiveName(), outputTableLocation,
      sourceQuery);
}
/**
 * Generates the single CTAS query that copies "SELECT *" of the input table
 * (optionally restricted to the configured partitions) into the staging table.
 */
@Override
public List<String> generateQueries() {
  // Staging data is written under a new path; make sure its parent exists first.
  ensureParentOfStagingPathExists();
  return Lists.newArrayList(HiveConverterUtils.generateStagingCTASStatementFromSelectStar(
      new HiveDatasetFinder.DbAndTable(this.outputDatabaseName, this.stagingTableName),
      new HiveDatasetFinder.DbAndTable(this.inputDbName, this.inputTableName),
      this.partitionsDMLInfo, this.storageFormat, this.stagingDataLocation));
}
} // end of enclosing class
/**
 * Initializes the extractor from the work unit state: resolves the Hive work unit,
 * its dataset, db/table names, and a metastore client pool.
 *
 * Watermark work units carry no extractable data, so initialization is skipped
 * entirely for them.
 */
public HiveBaseExtractor(WorkUnitState state) throws IOException {
  // getPropAsBoolean already returns a primitive boolean; the previous
  // Boolean.valueOf(...) wrapper was redundant boxing.
  if (state.getPropAsBoolean(PartitionLevelWatermarker.IS_WATERMARK_WORKUNIT_KEY)) {
    return;
  }
  this.hiveWorkUnit = new HiveWorkUnit(state.getWorkunit());
  this.hiveDataset = hiveWorkUnit.getHiveDataset();
  this.dbName = hiveDataset.getDbAndTable().getDb();
  this.tableName = hiveDataset.getDbAndTable().getTable();
  this.pool = HiveMetastoreClientPool.get(state.getJobState().getProperties(),
      Optional.fromNullable(state.getJobState().getProp(HiveDatasetFinder.HIVE_METASTORE_URI_KEY)));
}
/**
 * Generates the single CTAS query that materializes {@code sourceQuery} into the
 * staging table at the staging data location.
 */
@Override
public List<String> generateQueries() {
  // Staging data is written under a new path; make sure its parent exists first.
  ensureParentOfStagingPathExists();
  return Lists.newArrayList(HiveConverterUtils.generateStagingCTASStatement(
      new HiveDatasetFinder.DbAndTable(this.outputDatabaseName, this.stagingTableName),
      this.sourceQuery, this.storageFormat, this.stagingDataLocation));
}
// NOTE(review): this chunk is a fragment of a test method — it begins mid-call
// (the enclosing parseLogicalDbAndTable(...) invocation starts outside this view)
// and ends inside an unfinished try block.
// Case: prefix and postfix on both db and table.
    new HiveDatasetFinder.DbAndTable("dbPrefix_myDB_dbPostfix", "tablePrefix_myTable_tablePostfix"),
    HiveDataset.LOGICAL_DB_TOKEN, HiveDataset.LOGICAL_TABLE_TOKEN);
Assert.assertEquals(logicalDbAndTable.getDb(), "myDB", "DB name not parsed correctly");
Assert.assertEquals(logicalDbAndTable.getTable(), "myTable", "Table name not parsed correctly");
// Case: postfix only.
    new HiveDatasetFinder.DbAndTable("myDB_dbPostfix", "myTable_tablePostfix"),
    HiveDataset.LOGICAL_DB_TOKEN, HiveDataset.LOGICAL_TABLE_TOKEN);
Assert.assertEquals(logicalDbAndTable.getDb(), "myDB", "DB name not parsed correctly");
Assert.assertEquals(logicalDbAndTable.getTable(), "myTable", "Table name not parsed correctly");
// Case: prefix only.
    new HiveDatasetFinder.DbAndTable("dbPrefix_myDB", "tablePrefix_myTable"),
    HiveDataset.LOGICAL_DB_TOKEN, HiveDataset.LOGICAL_TABLE_TOKEN);
Assert.assertEquals(logicalDbAndTable.getDb(), "myDB", "DB name not parsed correctly");
Assert.assertEquals(logicalDbAndTable.getTable(), "myTable", "Table name not parsed correctly");
// Case: no prefix or postfix — names pass through unchanged.
    new HiveDatasetFinder.DbAndTable("myDB", "myTable"),
    HiveDataset.LOGICAL_DB_TOKEN, HiveDataset.LOGICAL_TABLE_TOKEN);
Assert.assertEquals(logicalDbAndTable.getDb(), "myDB", "DB name not parsed correctly");
Assert.assertEquals(logicalDbAndTable.getTable(), "myTable", "Table name not parsed correctly");
// Negative case: parsing must fail when the dataset name pattern is missing.
try {
  logicalDbAndTable = HiveDataset.parseLogicalDbAndTable(datasetNamePattern,
      new HiveDatasetFinder.DbAndTable("dbPrefix_myDB_dbPostfix", "tablePrefix_myTable_tablePostfix"),
      HiveDataset.LOGICAL_DB_TOKEN, HiveDataset.LOGICAL_TABLE_TOKEN);
  Assert.fail("Dataset name pattern is missing, code should have thrown exception");
// NOTE(review): the catch for the try above is outside this view; the next try
// is also truncated mid-argument-list.
try {
  logicalDbAndTable = HiveDataset.parseLogicalDbAndTable(datasetNamePattern,
      new HiveDatasetFinder.DbAndTable("dbPrefix_myDB_dbPostfix", "tablePrefix_myTable_tablePostfix"),
// Substitute the db/table tokens (both physical and logical) in the raw config value,
// then collect the resolved value.
String resolvedValue = StringUtils.replaceEach(rawValue,
    new String[] { DATABASE_TOKEN, TABLE_TOKEN, LOGICAL_DB_TOKEN, LOGICAL_TABLE_TOKEN },
    new String[] { realDbAndTable.getDb(), realDbAndTable.getTable(),
        logicalDbAndTable.getDb(), logicalDbAndTable.getTable() });
resolvedValueList.add(resolvedValue);

// NOTE(review): this second declaration of `resolvedValue` appears to come from a
// different method/scope (this chunk is a fragment) — the two statements cannot be
// siblings in one block. Same token substitution, applied to a config entry and
// written back into resolvedProperties.
String resolvedValue = StringUtils.replaceEach(resolvedConfig.getString(entry.getKey()),
    new String[] { DATABASE_TOKEN, TABLE_TOKEN, LOGICAL_DB_TOKEN, LOGICAL_TABLE_TOKEN },
    new String[] { realDbAndTable.getDb(), realDbAndTable.getTable(),
        logicalDbAndTable.getDb(), logicalDbAndTable.getTable() });
resolvedProperties.setProperty(entry.getKey(), resolvedValue);
/**
 * Parse logical Database and Table name from a given DbAndTable object.
 *
 * Eg.
 *   Dataset Name Pattern        : prod_$LOGICAL_DB_linkedin.prod_$LOGICAL_TABLE_linkedin
 *   Source DB and Table         : prod_dbName_linkedin.prod_tableName_linkedin
 *   Logical DB Token            : $LOGICAL_DB
 *   Logical Table Token         : $LOGICAL_TABLE
 *   Parsed Logical DB and Table : dbName.tableName
 *
 * @param datasetNamePattern Dataset name pattern.
 * @param dbAndTable Source DB and Table.
 * @param logicalDbToken Logical DB token.
 * @param logicalTableToken Logical Table token.
 * @return Parsed logical DB and Table.
 */
@VisibleForTesting
protected static DbAndTable parseLogicalDbAndTable(String datasetNamePattern, DbAndTable dbAndTable,
    String logicalDbToken, String logicalTableToken) {
  Preconditions.checkArgument(StringUtils.isNotBlank(datasetNamePattern), "Dataset name pattern must not be empty.");

  // The pattern must split into exactly "<dbPattern>.<tablePattern>".
  List<String> datasetNameSplit = Lists.newArrayList(SPLIT_ON_DOT.split(datasetNamePattern));
  Preconditions.checkArgument(datasetNameSplit.size() == 2, "Dataset name pattern must of the format: "
      + "dbPrefix_$LOGICAL_DB_dbPostfix.tablePrefix_$LOGICAL_TABLE_tablePostfix (prefix / postfix are optional)");

  String dbNamePattern = datasetNameSplit.get(0);
  String tableNamePattern = datasetNameSplit.get(1);

  // Pull the token value (the logical name) out of each physical entity name.
  String logicalDb = extractTokenValueFromEntity(dbAndTable.getDb(), dbNamePattern, logicalDbToken);
  String logicalTable = extractTokenValueFromEntity(dbAndTable.getTable(), tableNamePattern, logicalTableToken);

  return new DbAndTable(logicalDb, logicalTable);
}
/**
 * Builds a mocked {@link HiveMetastoreClientPool} whose client reports exactly the
 * given databases and tables (each table gets a dummy storage descriptor at /tmp/test).
 */
private HiveMetastoreClientPool getTestPool(List<HiveDatasetFinder.DbAndTable> dbAndTables) throws Exception {
  // Group table names by database.
  SetMultimap<String, String> entities = HashMultimap.create();
  for (HiveDatasetFinder.DbAndTable dbAndTable : dbAndTables) {
    entities.put(dbAndTable.getDb(), dbAndTable.getTable());
  }

  HiveMetastoreClientPool pool = Mockito.mock(HiveMetastoreClientPool.class);
  IMetaStoreClient client = Mockito.mock(IMetaStoreClient.class);
  Mockito.when(client.getAllDatabases()).thenReturn(Lists.newArrayList(entities.keySet()));
  for (String db : entities.keySet()) {
    // doReturn(..).when(..) form: stubs without invoking the mocked method.
    Mockito.doReturn(Lists.newArrayList(entities.get(db))).when(client).getAllTables(db);
  }
  for (HiveDatasetFinder.DbAndTable dbAndTable : dbAndTables) {
    Table table = new Table();
    table.setDbName(dbAndTable.getDb());
    table.setTableName(dbAndTable.getTable());
    // Minimal storage descriptor so callers reading the data location don't NPE.
    StorageDescriptor sd = new StorageDescriptor();
    sd.setLocation("/tmp/test");
    table.setSd(sd);
    Mockito.doReturn(table).when(client).getTable(dbAndTable.getDb(), dbAndTable.getTable());
  }

  @SuppressWarnings("unchecked")
  AutoReturnableObject<IMetaStoreClient> aro = Mockito.mock(AutoReturnableObject.class);
  Mockito.when(aro.get()).thenReturn(client);

  Mockito.when(pool.getHiveRegProps()).thenReturn(null);
  Mockito.when(pool.getClient()).thenReturn(aro);
  return pool;
}