private DrillIndexDescriptor buildIndexDescriptor(String tableName, IndexDesc desc) throws InvalidIndexDefinitionException { if (desc.isExternal()) { List<LogicalExpression> indexFields = field2SchemaPath(desc.getIndexedFields()); List<LogicalExpression> coveringFields = field2SchemaPath(desc.getIncludedFields()); coveringFields.add(SchemaPath.getSimplePath("_id")); CollationContext collationContext = null; if (!desc.isHashed()) { // hash index has no collation property List<RelFieldCollation> indexFieldCollations = getFieldCollations(desc, desc.getIndexedFields()); collationContext = buildCollationContext(indexFields, indexFieldCollations); coveringFields, null, desc.getIndexName(), tableName, idxType, desc, this.getOriginalScan(), desc.getMissingAndNullOrdering() == MissingAndNullOrdering.MissingAndNullFirst ? NullDirection.FIRST : (desc.getMissingAndNullOrdering() == MissingAndNullOrdering.MissingAndNullLast ? NullDirection.LAST : NullDirection.UNSPECIFIED));
/**
 * Computes the hash as the plain integer sum of the path hash, the hash of the index
 * descriptor's FID (0 when there is no index descriptor) and the UGI hash.
 */
public int hashCode() {
  int indexFidHash = 0;
  if (indexDesc != null) {
    indexFidHash = indexDesc.getIndexFid().hashCode();
  }
  final int pathHash = path.hashCode();
  final int ugiHash = ugi.hashCode();
  return pathHash + indexFidHash + ugiHash;
}
/** * Get the estimated average rowsize. DO NOT call this API directly. * Call the stats API instead which modifies the counts based on preference options. * @param index, to use for generating the estimate * @return row count post filtering */ public MapRDBStatisticsPayload getAverageRowSizeStats(IndexDescriptor index) { IndexDesc indexDesc = null; double avgRowSize = AVG_ROWSIZE_UNKNOWN; if (index != null) { indexDesc = (IndexDesc)((MapRDBIndexDescriptor)index).getOriginalDesc(); } // If no index is specified, get it from the primary table if (indexDesc == null && scanSpec.isSecondaryIndex()) { throw new UnsupportedOperationException("getAverageRowSizeStats should be invoked on primary table"); } // Get the index table or primary table and use the DB API to get the estimated number of rows. For size estimates, // we assume that all the columns would be read from the disk. final Table table = this.formatPlugin.getJsonTableCache().getTable(scanSpec.getTableName(), indexDesc, getUserName()); if (table != null) { final MetaTable metaTable = table.getMetaTable(); if (metaTable != null) { avgRowSize = metaTable.getAverageRowSize(); } } logger.debug("index_plan_info: getEstimatedRowCount obtained from DB Client for {}: indexName: {}, indexInfo: {}, " + "avgRowSize: {}, estimatedSize {}", this, (indexDesc == null ? "null" : indexDesc.getIndexName()), (indexDesc == null ? "null" : indexDesc.getIndexInfo()), avgRowSize); return new MapRDBStatisticsPayload(ROWCOUNT_UNKNOWN, ROWCOUNT_UNKNOWN, avgRowSize); }
private ScanStats indexScanStats() { if (!this.getIndexHint().equals("") && this.getIndexHint().equals(getIndexDesc().getIndexName())) { logger.debug("JsonIndexGroupScan:{} forcing index {} by making tiny cost", this, this.getIndexHint()); return new ScanStats(GroupScanProperty.NO_EXACT_ROW_COUNT, 1,1, 0); boolean filterPushed = (scanSpec.getSerializedFilter() != null); if (scanSpec != null && scanSpec.getIndexDesc() != null) { totalColNum = scanSpec.getIndexDesc().getIncludedFields().size() + scanSpec.getIndexDesc().getIndexedFields().size() + 1; String idxIdentifier = stats.buildUniqueIndexIdentifier(scanSpec.getIndexDesc().getPrimaryTablePath(), scanSpec.getIndexDesc().getIndexName()); double rowCount = stats.getRowCount(scanSpec.getCondition(), idxIdentifier); rowCount = (filterPushed ? 0.0005f : 0.001f) * fullTableRowCount / scanSpec.getIndexDesc().getIndexedFields().size(); double diskCost = numBlocks * pluginCostModel.getSequentialBlockReadCost(this); logger.debug("index_plan_info: JsonIndexGroupScan:{} - indexName:{}: rowCount:{}, avgRowSize:{}, blocks:{}, totalBlocks:{}, rowsFromDisk {}, diskCost:{}", System.identityHashCode(this), scanSpec.getIndexDesc().getIndexName(), rowCount, avgRowSize, numBlocks, totalBlocks, rowsFromDisk, diskCost); return new ScanStats(GroupScanProperty.NO_EXACT_ROW_COUNT, rowCount, 1, diskCost);
/**
 * Returns the name reported by the index descriptor.
 *
 * @return the index name, or {@code null} when no index descriptor is set
 */
@JsonIgnore
public String getIndexName() {
  if (this.indexDesc == null) {
    return null;
  }
  return this.indexDesc.getIndexName();
}
final TreeMap<TabletFragmentInfo, String> regionsToScan = new TreeMap<>(); if (isIndexScan()) { String idxIdentifier = stats.buildUniqueIndexIdentifier(scanSpec.getIndexDesc().getPrimaryTablePath(), scanSpec.getIndexDesc().getIndexName()); if (stats.isStatsAvailable()) { estimatedRowCount = stats.getRowCount(scanSpec.getCondition(), idxIdentifier);
/**
 * Builds a {@link Path} from the primary table path reported by the index descriptor.
 *
 * @return a new {@code Path} for the primary table, or {@code null} when no index descriptor is set
 */
@JsonIgnore
public Path getPrimaryTablePath() {
  if (this.indexDesc == null) {
    return null;
  }
  return new Path(this.indexDesc.getPrimaryTablePath());
}
private ScanStats indexScanStats() { if (!this.getIndexHint().equals("") && this.getIndexHint().equals(getIndexDesc().getIndexName())) { logger.debug("JsonIndexGroupScan:{} forcing index {} by making tiny cost", this, this.getIndexHint()); return new ScanStats(GroupScanProperty.NO_EXACT_ROW_COUNT, 1,1, 0); boolean filterPushed = (scanSpec.getSerializedFilter() != null); if (scanSpec != null && scanSpec.getIndexDesc() != null) { totalColNum = scanSpec.getIndexDesc().getIncludedFields().size() + scanSpec.getIndexDesc().getIndexedFields().size() + 1; String idxIdentifier = stats.buildUniqueIndexIdentifier(scanSpec.getIndexDesc().getPrimaryTablePath(), scanSpec.getIndexDesc().getIndexName()); double rowCount = stats.getRowCount(scanSpec.getCondition(), idxIdentifier); rowCount = (filterPushed ? 0.0005f : 0.001f) * fullTableRowCount / scanSpec.getIndexDesc().getIndexedFields().size(); double diskCost = numBlocks * pluginCostModel.getSequentialBlockReadCost(this); logger.debug("index_plan_info: JsonIndexGroupScan:{} - indexName:{}: rowCount:{}, avgRowSize:{}, blocks:{}, totalBlocks:{}, rowsFromDisk {}, diskCost:{}", System.identityHashCode(this), scanSpec.getIndexDesc().getIndexName(), rowCount, avgRowSize, numBlocks, totalBlocks, rowsFromDisk, diskCost); return new ScanStats(GroupScanProperty.NO_EXACT_ROW_COUNT, rowCount, 1, diskCost);
@Override public Table load(final MapRDBTableCache.Key key) throws Exception { // getTable is already calling tableCache.get in correct user UGI context, so should be fine here. // key.Left is Path. key.Right is indexDesc. Table table = (key.indexDesc == null ? MapRDBImpl.getTable(key.path) : MapRDBImpl.getIndexTable(key.indexDesc)); logger.debug("time {} opened the table for tablePath {} tableHandle {} index {} userName {}", System.nanoTime(), key.path == null ? "null" : key.path, table == null ? "null" : table, key.indexDesc == null ? "null" : key.indexDesc.getIndexName(), key.ugi.getUserName() == null ? "null" : key.ugi.getUserName()); return table; } });
final TreeMap<TabletFragmentInfo, String> regionsToScan = new TreeMap<>(); if (isIndexScan()) { String idxIdentifier = stats.buildUniqueIndexIdentifier(scanSpec.getIndexDesc().getPrimaryTablePath(), scanSpec.getIndexDesc().getIndexName()); if (stats.isStatsAvailable()) { estimatedRowCount = stats.getRowCount(scanSpec.getCondition(), idxIdentifier);
/** * Get the row count after applying the {@link RexNode} condition * @param condition, filter to apply * @return row count post filtering */ @Override @JsonIgnore public double getRowCount(RexNode condition, RelNode scanRel) { // Do not use statistics if row count is forced. Forced rowcounts take precedence over stats double rowcount; if (forcedRowCountMap.get(condition) != null) { return forcedRowCountMap.get(condition); } if (scanSpec.getIndexDesc() != null) { String idxIdentifier = stats.buildUniqueIndexIdentifier(scanSpec.getIndexDesc().getPrimaryTablePath(), scanSpec.getIndexName()); rowcount = stats.getRowCount(condition, idxIdentifier, scanRel); } else { rowcount = stats.getRowCount(condition, null, scanRel); } // Stats might NOT have the full rows (e.g. table is newly populated and DB stats APIs return it after // 15 mins). Use the table rows as populated using the (expensive but accurate) Hbase API if needed. if (condition == null && (rowcount == 0 || rowcount == ROWCOUNT_UNKNOWN)) { rowcount = fullTableRowCount; logger.debug("getRowCount: Stats not available yet! Use Admin APIs full table rowcount {}", fullTableRowCount); } return rowcount; }
private DrillIndexDescriptor buildIndexDescriptor(String tableName, IndexDesc desc) throws InvalidIndexDefinitionException { if (desc.isExternal()) { List<LogicalExpression> indexFields = field2SchemaPath(desc.getIndexedFields()); List<LogicalExpression> coveringFields = field2SchemaPath(desc.getIncludedFields()); coveringFields.add(SchemaPath.getSimplePath("_id")); CollationContext collationContext = null; if (!desc.isHashed()) { // hash index has no collation property List<RelFieldCollation> indexFieldCollations = getFieldCollations(desc, desc.getIndexedFields()); collationContext = buildCollationContext(indexFields, indexFieldCollations); coveringFields, null, desc.getIndexName(), tableName, idxType, desc, this.getOriginalScan(), desc.getMissingAndNullOrdering() == MissingAndNullOrdering.MissingAndNullFirst ? NullDirection.FIRST : (desc.getMissingAndNullOrdering() == MissingAndNullOrdering.MissingAndNullLast ? NullDirection.LAST : NullDirection.UNSPECIFIED));
"condition: {} rowCount: {}, avgRowSize: {}, estimatedSize {}, tabletCount {}, totalTabletCount {}, " + "scalingFactor {}", this, (index == null ? "null" : index.getIndexName()), (index == null ? "null" : index.getIndexInfo()), (condition == null ? "null" : condition.toString()), stats.getEstimatedNumRows(), (stats.getEstimatedNumRows() == 0 ? 0 : stats.getEstimatedSize()/stats.getEstimatedNumRows()), } else { logger.info("index_plan_info: getEstimatedRowCount: {} indexName: {}, indexInfo: {}, " + "condition: {} rowCount: UNKNOWN, avgRowSize: UNKNOWN", this, (index == null ? "null" : index.getIndexName()), (index == null ? "null" : index.getIndexInfo()), (condition == null ? "null" : condition.toString())); return new MapRDBStatisticsPayload(ROWCOUNT_UNKNOWN, ROWCOUNT_UNKNOWN, AVG_ROWSIZE_UNKNOWN);
public void onRemoval(RemovalNotification<MapRDBTableCache.Key, Table> removal) { Table table = removal.getValue(); MapRDBTableCache.Key key = removal.getKey(); logger.debug("time {} closing the tablePath {} tableHandle {} index {} userName {}", System.nanoTime(), key.path == null ? "null" : key.path, table == null ? "null" : table, key.indexDesc == null ? "null" : key.indexDesc.getIndexName(), key.ugi.getUserName() == null ? "null" : key.ugi.getUserName()); table.close(); // close the table } };
@Override public boolean isDistributed() { // getMaxParallelizationWidth gets information about all regions to scan and is expensive. // This option is meant to be used only for unit tests. boolean useNumRegions = storagePlugin.getContext().getConfig().getBoolean(PluginConstants.JSON_TABLE_USE_NUM_REGIONS_FOR_DISTRIBUTION_PLANNING); double fullTableSize; if (useNumRegions) { return getMaxParallelizationWidth() > 1 ? true: false; } // This function gets called multiple times during planning. To avoid performance // bottleneck, estimate degree of parallelization using stats instead of actually getting information // about all regions. double rowCount, rowSize; double scanRangeSize = storagePlugin.getContext().getConfig().getInt(PluginConstants.JSON_TABLE_SCAN_SIZE_MB) * 1024 * 1024; if (scanSpec.getIndexDesc() != null) { String idxIdentifier = stats.buildUniqueIndexIdentifier(scanSpec.getIndexDesc().getPrimaryTablePath(), scanSpec.getIndexName()); rowCount = stats.getRowCount(scanSpec.getCondition(), idxIdentifier); rowSize = stats.getAvgRowSize(idxIdentifier, false); } else { rowCount = stats.getRowCount(scanSpec.getCondition(), null); rowSize = stats.getAvgRowSize(null, false); } if (rowCount == ROWCOUNT_UNKNOWN || rowCount == 0 || rowSize == AVG_ROWSIZE_UNKNOWN || rowSize == 0) { fullTableSize = (scanSpec.getSerializedFilter() != null ? .5 : 1) * this.fullTableEstimatedSize; } else { fullTableSize = rowCount * rowSize; } return (long) fullTableSize / scanRangeSize > 1 ? true : false; }
/**
 * Computes the hash as the plain integer sum of the path hash, the hash of the index
 * descriptor's FID (0 when there is no index descriptor) and the UGI hash.
 */
public int hashCode() {
  int indexFidHash = 0;
  if (indexDesc != null) {
    indexFidHash = indexDesc.getIndexFid().hashCode();
  }
  final int pathHash = path.hashCode();
  final int ugiHash = ugi.hashCode();
  return pathHash + indexFidHash + ugiHash;
}
/** * Get the estimated average rowsize. DO NOT call this API directly. * Call the stats API instead which modifies the counts based on preference options. * @param index, to use for generating the estimate * @return row count post filtering */ public MapRDBStatisticsPayload getAverageRowSizeStats(IndexDescriptor index) { IndexDesc indexDesc = null; double avgRowSize = AVG_ROWSIZE_UNKNOWN; if (index != null) { indexDesc = (IndexDesc)((MapRDBIndexDescriptor)index).getOriginalDesc(); } // If no index is specified, get it from the primary table if (indexDesc == null && scanSpec.isSecondaryIndex()) { throw new UnsupportedOperationException("getAverageRowSizeStats should be invoked on primary table"); } // Get the index table or primary table and use the DB API to get the estimated number of rows. For size estimates, // we assume that all the columns would be read from the disk. final Table table = this.formatPlugin.getJsonTableCache().getTable(scanSpec.getTableName(), indexDesc, getUserName()); if (table != null) { final MetaTable metaTable = table.getMetaTable(); if (metaTable != null) { avgRowSize = metaTable.getAverageRowSize(); } } logger.debug("index_plan_info: getEstimatedRowCount obtained from DB Client for {}: indexName: {}, indexInfo: {}, " + "avgRowSize: {}, estimatedSize {}", this, (indexDesc == null ? "null" : indexDesc.getIndexName()), (indexDesc == null ? "null" : indexDesc.getIndexInfo()), avgRowSize); return new MapRDBStatisticsPayload(ROWCOUNT_UNKNOWN, ROWCOUNT_UNKNOWN, avgRowSize); }
/**
 * Obtains the table handle for {@code tablePath} / {@code indexDesc}.
 * With table caching enabled the handle comes from {@code tableCache} and the lookup is
 * trace-logged; otherwise the table (or index table, when an index descriptor is present)
 * is opened directly.
 *
 * @return the table handle
 */
public Table run() throws Exception {
  if (logger.isTraceEnabled()) {
    logger.trace("Getting MaprDB Table handle for proxy user: " + UserGroupInformation.getCurrentUser());
  }

  // Caching disabled: open the handle directly.
  if (!tableCachingEnabled) {
    if (indexDesc == null) {
      return MapRDBImpl.getTable(tablePath);
    }
    return MapRDBImpl.getIndexTable(indexDesc);
  }

  final Table cached = tableCache.get(new MapRDBTableCache.Key(tablePath, indexDesc));
  logger.trace("time {} get the tablePath {} tableHandle {} index {} userName {} currentUser {}",
      System.nanoTime(),
      tablePath == null ? "null" : tablePath,
      cached == null ? "null" : cached,
      indexDesc == null ? "null" : indexDesc.getIndexName(),
      userName == null ? "null" : userName,
      UserGroupInformation.getCurrentUser() == null ? "null" : UserGroupInformation.getCurrentUser());
  return cached;
}
});
/**
 * Builds a {@link Path} from the primary table path reported by the index descriptor.
 *
 * @return a new {@code Path} for the primary table, or {@code null} when no index descriptor is set
 */
@JsonIgnore
public Path getPrimaryTablePath() {
  if (this.indexDesc == null) {
    return null;
  }
  return new Path(this.indexDesc.getPrimaryTablePath());
}
"condition: {} rowCount: {}, avgRowSize: {}, estimatedSize {}, tabletCount {}, totalTabletCount {}, " + "scalingFactor {}", this, (index == null ? "null" : index.getIndexName()), (index == null ? "null" : index.getIndexInfo()), (condition == null ? "null" : condition.toString()), stats.getEstimatedNumRows(), (stats.getEstimatedNumRows() == 0 ? 0 : stats.getEstimatedSize()/stats.getEstimatedNumRows()), } else { logger.info("index_plan_info: getEstimatedRowCount: {} indexName: {}, indexInfo: {}, " + "condition: {} rowCount: UNKNOWN, avgRowSize: UNKNOWN", this, (index == null ? "null" : index.getIndexName()), (index == null ? "null" : index.getIndexInfo()), (condition == null ? "null" : condition.toString())); return new MapRDBStatisticsPayload(ROWCOUNT_UNKNOWN, ROWCOUNT_UNKNOWN, AVG_ROWSIZE_UNKNOWN);