/**
 * Tests the number of InputSplits an MR job gets when a number of mappers per region
 * is specified for TableInputFormat. This test does not run the MR job.
 */
protected void testNumOfSplits(int splitsPerRegion, int expectedNumOfSplits)
    throws IOException, InterruptedException, ClassNotFoundException {
  String jobName = "TestJobForNumOfSplits";
  LOG.info("Before map/reduce startup - job " + jobName);
  Configuration c = new Configuration(TEST_UTIL.getConfiguration());
  Scan scan = new Scan();
  scan.addFamily(INPUT_FAMILYS[0]);
  scan.addFamily(INPUT_FAMILYS[1]);
  c.setInt("hbase.mapreduce.tableinput.mappers.per.region", splitsPerRegion);
  c.set(KEY_STARTROW, "");
  c.set(KEY_LASTROW, "");
  Job job = Job.getInstance(c, jobName);
  TableMapReduceUtil.initTableMapperJob(TABLE_NAME.getNameAsString(), scan, ScanMapper.class,
      ImmutableBytesWritable.class, ImmutableBytesWritable.class, job);
  TableInputFormat tif = new TableInputFormat();
  tif.setConf(job.getConfiguration());
  Assert.assertEquals(TABLE_NAME, table.getName());
  List<InputSplit> splits = tif.getSplits(job);
  Assert.assertEquals(expectedNumOfSplits, splits.size());
}
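The same property can be set from a plain MR driver. A minimal sketch, assuming a table named "myTable" and an existing MyMapper that extends TableMapper; the string key is copied from the test above rather than a public constant, and imports/error handling are elided as in the surrounding snippets.

// Driver-side configuration sketch; "myTable" and MyMapper are hypothetical.
Configuration conf = HBaseConfiguration.create();
conf.setInt("hbase.mapreduce.tableinput.mappers.per.region", 2); // two splits per region
Job job = Job.getInstance(conf, "scan-with-extra-mappers");
TableMapReduceUtil.initTableMapperJob("myTable", new Scan(), MyMapper.class,
    ImmutableBytesWritable.class, Result.class, job);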
@Override
public Configuration getConf() {
  return tableInputFormat.getConf();
}
}
addColumns(scan, conf.get(SCAN_COLUMNS));
/**
 * Sets the configuration. This is used to set the details for the table to be scanned.
 *
 * @param configuration The configuration to set.
 * @see org.apache.hadoop.conf.Configurable#setConf(org.apache.hadoop.conf.Configuration)
 */
@Override
@edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "REC_CATCH_EXCEPTION",
    justification = "Intentional")
public void setConf(Configuration configuration) {
  this.conf = configuration;
  Scan scan = null;
  if (conf.get(SCAN) != null) {
    try {
      scan = TableMapReduceUtil.convertStringToScan(conf.get(SCAN));
    } catch (IOException e) {
      LOG.error("An error occurred.", e);
    }
  } else {
    try {
      scan = createScanFromConfiguration(conf);
    } catch (Exception e) {
      LOG.error(StringUtils.stringifyException(e));
    }
  }
  setScan(scan);
}
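A minimal sketch of how a serialized Scan typically lands in the configuration that setConf reads back. convertScanToString is the standard TableMapReduceUtil helper (it throws IOException, not handled here); the table name and column family are illustrative assumptions.

Scan scan = new Scan();
scan.addFamily(Bytes.toBytes("cf")); // illustrative column family
Configuration conf = HBaseConfiguration.create();
conf.set(TableInputFormat.INPUT_TABLE, "myTable"); // assumed table name
conf.set(TableInputFormat.SCAN, TableMapReduceUtil.convertScanToString(scan));
TableInputFormat tif = new TableInputFormat();
tif.setConf(conf); // decodes the Scan via convertStringToScan, as in the method above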
@Test
public void testNonSuccessiveSplitsAreNotMerged() throws IOException {
  JobContext context = mock(JobContext.class);
  Configuration conf = HBaseConfiguration.create();
  conf.set(ClusterConnection.HBASE_CLIENT_CONNECTION_IMPL,
      ConnectionForMergeTesting.class.getName());
  conf.set(TableInputFormat.INPUT_TABLE, "testTable");
  conf.setBoolean(TableInputFormatBase.MAPREDUCE_INPUT_AUTOBALANCE, true);
  when(context.getConfiguration()).thenReturn(conf);

  TableInputFormat tifExclude = new TableInputFormatForMergeTesting();
  tifExclude.setConf(conf);
  // split["b", "c"] is excluded, split["o", "p"] and split["p", "q"] are merged,
  // but split["a", "b"] and split["c", "d"] are not merged.
  assertEquals(ConnectionForMergeTesting.START_KEYS.length - 1 - 1,
      tifExclude.getSplits(context).size());
}
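Outside of tests, the auto-balance switch exercised above is just a boolean on the job configuration; a minimal sketch, with the table name assumed.

Configuration conf = HBaseConfiguration.create();
conf.set(TableInputFormat.INPUT_TABLE, "myTable"); // assumed table name
conf.setBoolean(TableInputFormatBase.MAPREDUCE_INPUT_AUTOBALANCE, true); // let TIF merge/split uneven regions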
@Test
public void testTableInputFormatBaseReverseDNSForIPv6() throws UnknownHostException {
  String address = "ipv6.google.com";
  String localhost = null;
  InetAddress addr = null;
  TableInputFormat inputFormat = new TableInputFormat();
  try {
    localhost = InetAddress.getByName(address).getCanonicalHostName();
    addr = Inet6Address.getByName(address);
  } catch (UnknownHostException e) {
    // google.com is down, we can probably forgive this test.
    return;
  }
  System.out.println("Should return the hostname for this host " + localhost + " addr : " + addr);
  String actualHostName = inputFormat.reverseDNS(addr);
  assertEquals("Should return the hostname for this host. Expected : " + localhost
      + " Actual : " + actualHostName, localhost, actualHostName);
}
@Override
public List<InputSplit> getSplits(final JobContext jobContext)
    throws IOException, InterruptedException {
  return this.tableInputFormat.getSplits(jobContext);
}
this.tableInputFormat.setConf(config);
private TableInputFormat getDelegate(Configuration conf) throws IOException {
  TableInputFormat delegate = new TableInputFormat();
  String tableName = HBaseMetadataProvider.getTableName(dataset.getName());
  conf.set(TableInputFormat.INPUT_TABLE, tableName);
  if (view != null) {
    // Serialize the view's Scan by letting TableMapReduceUtil write it into a
    // throwaway Job, then copy the resulting SCAN property into the real conf.
    Job tempJob = new Job();
    Scan scan = ((BaseEntityScanner) view.newEntityScanner()).getScan();
    TableMapReduceUtil.initTableMapperJob(tableName, scan, TableMapper.class, null, null,
        tempJob);
    Configuration tempConf = Hadoop.JobContext.getConfiguration.invoke(tempJob);
    conf.set(SCAN, tempConf.get(SCAN));
  }
  delegate.setConf(conf);
  return delegate;
}
String tableName = conf.get(INPUT_TABLE);
try {
  setHTable(new HTable(new Configuration(conf), tableName));
} catch (Exception e) {
  LOG.error(StringUtils.stringifyException(e));
}

addColumns(scan, conf.get(SCAN_COLUMNS));
setScan(scan);
@Override
public RecordReader<ImmutableBytesWritable, ResultWritable> getRecordReader(
    InputSplit split, JobConf job, Reporter reporter) throws IOException {
  String jobString = job.get(HCatConstants.HCAT_KEY_JOB_INFO);
  InputJobInfo inputJobInfo = (InputJobInfo) HCatUtil.deserialize(jobString);
  String tableName = job.get(TableInputFormat.INPUT_TABLE);
  TableSplit tSplit = (TableSplit) split;
  HbaseSnapshotRecordReader recordReader = new HbaseSnapshotRecordReader(inputJobInfo, job);
  inputFormat.setConf(job);
  Scan inputScan = inputFormat.getScan();
  // TODO: Make the caching configurable by the user
  inputScan.setCaching(200);
  inputScan.setCacheBlocks(false);
  Scan sc = new Scan(inputScan);
  sc.setStartRow(tSplit.getStartRow());
  sc.setStopRow(tSplit.getEndRow());
  recordReader.setScan(sc);
  recordReader.setHTable(new HTable(job, tableName));
  recordReader.init();
  return recordReader;
}
addColumns(scan, conf.get(SCAN_COLUMNS));
setScan(scan);
/**
 * Executes the HbaseSplit for a query against an HBase table.
 * <p>
 * Does a whole bunch of fun stuff! Splitting on row ID ranges, applying secondary indexes,
 * column pruning, all sorts of sweet optimizations. What you have here is an important method.
 *
 * @param session Current session
 * @param split HbaseSplit
 * @param columnHandles List of HbaseColumnHandle
 * @return RecordReader<ImmutableBytesWritable, Result> for
 *         {@link org.apache.hadoop.mapreduce.RecordReader}
 */
public RecordReader<ImmutableBytesWritable, Result> execSplit(ConnectorSession session,
    HbaseSplit split, List<HbaseColumnHandle> columnHandles)
    throws IllegalAccessException, NoSuchFieldException, IOException, InterruptedException {
  TableName tableName = TableName.valueOf(split.getSchema(), split.getTable());
  Scan scan = TabletSplitMetadata.convertStringToScan(split.getSplitMetadata().getScan());
  buildScan(scan, session, columnHandles);

  TableInputFormat tableInputFormat = getNewTableInputFormat(connection, tableName);
  tableInputFormat.setScan(scan);

  RecordReader<ImmutableBytesWritable, Result> resultRecordReader =
      tableInputFormat.createRecordReader(
          new TableSplit(
              TableName.valueOf(split.getSplitMetadata().getTableName()), scan,
              split.getSplitMetadata().getStartRow(), split.getSplitMetadata().getEndRow(),
              split.getSplitMetadata().getRegionLocation(),
              split.getSplitMetadata().getLength()),
          null);
  resultRecordReader.initialize(null, null);
  return resultRecordReader;
}
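A minimal consumption sketch for the reader returned above; nextKeyValue, getCurrentValue and close are the standard org.apache.hadoop.mapreduce.RecordReader methods, and the loop body is illustrative.

RecordReader<ImmutableBytesWritable, Result> reader = execSplit(session, split, columnHandles);
while (reader.nextKeyValue()) {
  Result row = reader.getCurrentValue(); // one HBase row per iteration
  // process row ...
}
reader.close();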
/**
 * Tests the auto-balance split calculation used when setting the number of mappers for TIF.
 * This does not run a real MR job.
 */
protected void testAutobalanceNumOfSplit() throws IOException {
  // Set up splits for testing: five regions of 10, 20, 20, 40 and 60 MB
  // (150 MB total, for an average region size of 30 MB).
  List<InputSplit> splits = new ArrayList<>(5);
  int[] regionLen = { 10, 20, 20, 40, 60 };
  for (int i = 0; i < 5; i++) {
    InputSplit split = new TableSplit(TABLE_NAME, new Scan(), Bytes.toBytes(i),
        Bytes.toBytes(i + 1), "", "", regionLen[i] * 1048576);
    splits.add(split);
  }
  TableInputFormat tif = new TableInputFormat();
  List<InputSplit> res = tif.calculateAutoBalancedSplits(splits, 1073741824);

  // Reading the assertions: the small 10 MB and 20 MB splits merge (the first
  // result ends at key 2) while the 60 MB region appears to be broken up, so
  // the total count stays at 5.
  assertEquals("Saw the wrong number of splits", 5, res.size());
  TableSplit ts1 = (TableSplit) res.get(0);
  assertEquals("The first split end key should be", 2, Bytes.toInt(ts1.getEndRow()));
  TableSplit ts2 = (TableSplit) res.get(1);
  assertEquals("The second split regionsize should be", 20 * 1048576, ts2.getLength());
  TableSplit ts3 = (TableSplit) res.get(2);
  assertEquals("The third split start key should be", 3, Bytes.toInt(ts3.getStartRow()));
  TableSplit ts4 = (TableSplit) res.get(4);
  assertNotEquals("The fifth split start key should not be", 4, Bytes.toInt(ts4.getStartRow()));
}
}
@Override
public RecordReader<StaticBuffer, Iterable<Entry>> createRecordReader(final InputSplit inputSplit,
    final TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
  tableReader =
      (TableRecordReader) tableInputFormat.createRecordReader(inputSplit, taskAttemptContext);
  titanRecordReader = new HBaseBinaryRecordReader(tableReader, inputCFBytes);
  return titanRecordReader;
}
@Override
protected void initialize(JobContext context) throws IOException {
  // Do we have to worry about mis-matches between the Configuration from setConf and the one
  // in this context?
  TableName tableName = TableName.valueOf(conf.get(INPUT_TABLE));
  try {
    initializeTable(ConnectionFactory.createConnection(new Configuration(conf)), tableName);
  } catch (Exception e) {
    LOG.error(StringUtils.stringifyException(e));
  }
}
TableInputFormat.configureSplitTable(job, TableName.valueOf(dstTableName));
public HBaseInputFormat() { inputFormat = new TableInputFormat(); }
@Override
public InputFormat getInputFormat() {
  TableInputFormat inputFormat = new HBaseTableIFBuilder()
      .withLimit(limit_)
      .withGt(gt_)
      .withGte(gte_)
      .withLt(lt_)
      .withLte(lte_)
      .withConf(m_conf)
      .build();
  inputFormat.setScan(scan);
  return inputFormat;
}