/**
 * Tests the number of InputSplits an MR job gets when a number of mappers per region
 * is specified for TableInputFormat. This test does not run the MR job.
 */
protected void testNumOfSplits(int splitsPerRegion, int expectedNumOfSplits)
    throws IOException, InterruptedException, ClassNotFoundException {
  String jobName = "TestJobForNumOfSplits";
  LOG.info("Before map/reduce startup - job " + jobName);
  Configuration c = new Configuration(TEST_UTIL.getConfiguration());
  Scan scan = new Scan();
  scan.addFamily(INPUT_FAMILYS[0]);
  scan.addFamily(INPUT_FAMILYS[1]);
  c.setInt("hbase.mapreduce.tableinput.mappers.per.region", splitsPerRegion);
  c.set(KEY_STARTROW, "");
  c.set(KEY_LASTROW, "");
  Job job = Job.getInstance(c, jobName);
  TableMapReduceUtil.initTableMapperJob(TABLE_NAME.getNameAsString(), scan, ScanMapper.class,
      ImmutableBytesWritable.class, ImmutableBytesWritable.class, job);
  TableInputFormat tif = new TableInputFormat();
  tif.setConf(job.getConfiguration());
  Assert.assertEquals(TABLE_NAME, table.getName());
  List<InputSplit> splits = tif.getSplits(job);
  Assert.assertEquals(expectedNumOfSplits, splits.size());
}
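The same property can be set from a plain MR driver. A minimal sketch, assuming a table named "myTable" and an existing MyMapper that extends TableMapper; the string key is copied from the test above rather than a public constant, and imports/error handling are elided as in the surrounding snippets.

// Driver-side configuration sketch; "myTable" and MyMapper are hypothetical.
Configuration conf = HBaseConfiguration.create();
conf.setInt("hbase.mapreduce.tableinput.mappers.per.region", 2); // two splits per region
Job job = Job.getInstance(conf, "scan-with-extra-mappers");
TableMapReduceUtil.initTableMapperJob("myTable", new Scan(), MyMapper.class,
    ImmutableBytesWritable.class, Result.class, job);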
@Override
public Configuration getConf() {
  return tableInputFormat.getConf();
}
}
addColumns(scan, conf.get(SCAN_COLUMNS));
/**
 * Sets the configuration. This is used to set the details for the table to be scanned.
 *
 * @param configuration The configuration to set.
 * @see org.apache.hadoop.conf.Configurable#setConf(org.apache.hadoop.conf.Configuration)
 */
@Override
@edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "REC_CATCH_EXCEPTION",
    justification = "Intentional")
public void setConf(Configuration configuration) {
  this.conf = configuration;
  Scan scan = null;
  if (conf.get(SCAN) != null) {
    try {
      scan = TableMapReduceUtil.convertStringToScan(conf.get(SCAN));
    } catch (IOException e) {
      LOG.error("An error occurred.", e);
    }
  } else {
    try {
      scan = createScanFromConfiguration(conf);
    } catch (Exception e) {
      LOG.error(StringUtils.stringifyException(e));
    }
  }
  setScan(scan);
}
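A minimal sketch of how a serialized Scan typically lands in the configuration that setConf reads back. convertScanToString is the standard TableMapReduceUtil helper (it throws IOException, not handled here); the table name and column family are illustrative assumptions.

Scan scan = new Scan();
scan.addFamily(Bytes.toBytes("cf")); // illustrative column family
Configuration conf = HBaseConfiguration.create();
conf.set(TableInputFormat.INPUT_TABLE, "myTable"); // assumed table name
conf.set(TableInputFormat.SCAN, TableMapReduceUtil.convertScanToString(scan));
TableInputFormat tif = new TableInputFormat();
tif.setConf(conf); // decodes the Scan via convertStringToScan, as in the method above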
@Test
public void testNonSuccessiveSplitsAreNotMerged() throws IOException {
  JobContext context = mock(JobContext.class);
  Configuration conf = HBaseConfiguration.create();
  conf.set(ClusterConnection.HBASE_CLIENT_CONNECTION_IMPL,
      ConnectionForMergeTesting.class.getName());
  conf.set(TableInputFormat.INPUT_TABLE, "testTable");
  conf.setBoolean(TableInputFormatBase.MAPREDUCE_INPUT_AUTOBALANCE, true);
  when(context.getConfiguration()).thenReturn(conf);

  TableInputFormat tifExclude = new TableInputFormatForMergeTesting();
  tifExclude.setConf(conf);
  // split["b", "c"] is excluded, split["o", "p"] and split["p", "q"] are merged,
  // but split["a", "b"] and split["c", "d"] are not merged.
  assertEquals(ConnectionForMergeTesting.START_KEYS.length - 1 - 1,
      tifExclude.getSplits(context).size());
}
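Outside of tests, the auto-balance switch exercised above is just a boolean on the job configuration; a minimal sketch, with the table name assumed.

Configuration conf = HBaseConfiguration.create();
conf.set(TableInputFormat.INPUT_TABLE, "myTable"); // assumed table name
conf.setBoolean(TableInputFormatBase.MAPREDUCE_INPUT_AUTOBALANCE, true); // let TIF merge/split uneven regions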
@Test
public void testTableInputFormatBaseReverseDNSForIPv6() throws UnknownHostException {
  String address = "ipv6.google.com";
  String localhost = null;
  InetAddress addr = null;
  TableInputFormat inputFormat = new TableInputFormat();
  try {
    localhost = InetAddress.getByName(address).getCanonicalHostName();
    addr = Inet6Address.getByName(address);
  } catch (UnknownHostException e) {
    // google.com is down, we can probably forgive this test.
    return;
  }
  System.out.println("Should return the hostname for this host " + localhost + " addr : " + addr);
  String actualHostName = inputFormat.reverseDNS(addr);
  assertEquals("Should return the hostname for this host. Expected : " + localhost
      + " Actual : " + actualHostName, localhost, actualHostName);
}
@Override
public List<InputSplit> getSplits(final JobContext jobContext)
    throws IOException, InterruptedException {
  return this.tableInputFormat.getSplits(jobContext);
}
this.tableInputFormat.setConf(config);
private TableInputFormat getDelegate(Configuration conf) throws IOException {
  TableInputFormat delegate = new TableInputFormat();
  String tableName = HBaseMetadataProvider.getTableName(dataset.getName());
  conf.set(TableInputFormat.INPUT_TABLE, tableName);
  if (view != null) {
    // Serialize the view's Scan by letting TableMapReduceUtil write it into a
    // throwaway Job, then copy the resulting SCAN property into the real conf.
    Job tempJob = new Job();
    Scan scan = ((BaseEntityScanner) view.newEntityScanner()).getScan();
    TableMapReduceUtil.initTableMapperJob(tableName, scan, TableMapper.class, null, null,
        tempJob);
    Configuration tempConf = Hadoop.JobContext.getConfiguration.invoke(tempJob);
    conf.set(SCAN, tempConf.get(SCAN));
  }
  delegate.setConf(conf);
  return delegate;
}
String tableName = conf.get(INPUT_TABLE);
try {
  setHTable(new HTable(new Configuration(conf), tableName));
} catch (Exception e) {
  LOG.error(StringUtils.stringifyException(e));
}

addColumns(scan, conf.get(SCAN_COLUMNS));
setScan(scan);
@Override
public RecordReader<ImmutableBytesWritable, ResultWritable> getRecordReader(
    InputSplit split, JobConf job, Reporter reporter) throws IOException {
  String jobString = job.get(HCatConstants.HCAT_KEY_JOB_INFO);
  InputJobInfo inputJobInfo = (InputJobInfo) HCatUtil.deserialize(jobString);
  String tableName = job.get(TableInputFormat.INPUT_TABLE);
  TableSplit tSplit = (TableSplit) split;
  HbaseSnapshotRecordReader recordReader = new HbaseSnapshotRecordReader(inputJobInfo, job);
  inputFormat.setConf(job);
  Scan inputScan = inputFormat.getScan();
  // TODO: Make the caching configurable by the user
  inputScan.setCaching(200);
  inputScan.setCacheBlocks(false);
  Scan sc = new Scan(inputScan);
  sc.setStartRow(tSplit.getStartRow());
  sc.setStopRow(tSplit.getEndRow());
  recordReader.setScan(sc);
  recordReader.setHTable(new HTable(job, tableName));
  recordReader.init();
  return recordReader;
}
addColumns(scan, conf.get(SCAN_COLUMNS));
setScan(scan);
/**
 * Executes the HbaseSplit for a query against an HBase table.
 * <p>
 * Does a whole bunch of fun stuff! Splitting on row ID ranges, applying secondary indexes,
 * column pruning, all sorts of sweet optimizations. What you have here is an important method.
 *
 * @param session Current session
 * @param split HbaseSplit
 * @param columnHandles List of HbaseColumnHandle
 * @return RecordReader<ImmutableBytesWritable, Result> for
 *         {@link org.apache.hadoop.mapreduce.RecordReader}
 */
public RecordReader<ImmutableBytesWritable, Result> execSplit(ConnectorSession session,
    HbaseSplit split, List<HbaseColumnHandle> columnHandles)
    throws IllegalAccessException, NoSuchFieldException, IOException, InterruptedException {
  TableName tableName = TableName.valueOf(split.getSchema(), split.getTable());
  Scan scan = TabletSplitMetadata.convertStringToScan(split.getSplitMetadata().getScan());
  buildScan(scan, session, columnHandles);

  TableInputFormat tableInputFormat = getNewTableInputFormat(connection, tableName);
  tableInputFormat.setScan(scan);

  RecordReader<ImmutableBytesWritable, Result> resultRecordReader =
      tableInputFormat.createRecordReader(
          new TableSplit(
              TableName.valueOf(split.getSplitMetadata().getTableName()), scan,
              split.getSplitMetadata().getStartRow(), split.getSplitMetadata().getEndRow(),
              split.getSplitMetadata().getRegionLocation(),
              split.getSplitMetadata().getLength()),
          null);
  resultRecordReader.initialize(null, null);
  return resultRecordReader;
}
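A minimal consumption sketch for the reader returned above; nextKeyValue, getCurrentValue and close are the standard org.apache.hadoop.mapreduce.RecordReader methods, and the loop body is illustrative.

RecordReader<ImmutableBytesWritable, Result> reader = execSplit(session, split, columnHandles);
while (reader.nextKeyValue()) {
  Result row = reader.getCurrentValue(); // one HBase row per iteration
  // process row ...
}
reader.close();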
/**
 * Tests the auto-balance split calculation used when setting the number of mappers for TIF.
 * This does not run a real MR job.
 */
protected void testAutobalanceNumOfSplit() throws IOException {
  // Set up splits for testing: five regions of 10, 20, 20, 40 and 60 MB
  // (150 MB total, for an average region size of 30 MB).
  List<InputSplit> splits = new ArrayList<>(5);
  int[] regionLen = { 10, 20, 20, 40, 60 };
  for (int i = 0; i < 5; i++) {
    InputSplit split = new TableSplit(TABLE_NAME, new Scan(), Bytes.toBytes(i),
        Bytes.toBytes(i + 1), "", "", regionLen[i] * 1048576);
    splits.add(split);
  }
  TableInputFormat tif = new TableInputFormat();
  List<InputSplit> res = tif.calculateAutoBalancedSplits(splits, 1073741824);

  // Reading the assertions: the small 10 MB and 20 MB splits merge (the first
  // result ends at key 2) while the 60 MB region appears to be broken up, so
  // the total count stays at 5.
  assertEquals("Saw the wrong number of splits", 5, res.size());
  TableSplit ts1 = (TableSplit) res.get(0);
  assertEquals("The first split end key should be", 2, Bytes.toInt(ts1.getEndRow()));
  TableSplit ts2 = (TableSplit) res.get(1);
  assertEquals("The second split regionsize should be", 20 * 1048576, ts2.getLength());
  TableSplit ts3 = (TableSplit) res.get(2);
  assertEquals("The third split start key should be", 3, Bytes.toInt(ts3.getStartRow()));
  TableSplit ts4 = (TableSplit) res.get(4);
  assertNotEquals("The fifth split start key should not be", 4, Bytes.toInt(ts4.getStartRow()));
}
}
@Override
public RecordReader<StaticBuffer, Iterable<Entry>> createRecordReader(final InputSplit inputSplit,
    final TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
  tableReader =
      (TableRecordReader) tableInputFormat.createRecordReader(inputSplit, taskAttemptContext);
  titanRecordReader = new HBaseBinaryRecordReader(tableReader, inputCFBytes);
  return titanRecordReader;
}
@Override
protected void initialize(JobContext context) throws IOException {
  // Do we have to worry about mis-matches between the Configuration from setConf and the one
  // in this context?
  TableName tableName = TableName.valueOf(conf.get(INPUT_TABLE));
  try {
    initializeTable(ConnectionFactory.createConnection(new Configuration(conf)), tableName);
  } catch (Exception e) {
    LOG.error(StringUtils.stringifyException(e));
  }
}
TableInputFormat.configureSplitTable(job, TableName.valueOf(dstTableName));
public HBaseInputFormat() { inputFormat = new TableInputFormat(); }
@Override
public InputFormat getInputFormat() {
  TableInputFormat inputFormat = new HBaseTableIFBuilder()
      .withLimit(limit_)
      .withGt(gt_)
      .withGte(gte_)
      .withLt(lt_)
      .withLte(lte_)
      .withConf(m_conf)
      .build();
  inputFormat.setScan(scan);
  return inputFormat;
}