/**
 * Initializes the input with a null filter.
 * See {@link #setInput(org.apache.hadoop.conf.Configuration, String, String, String)}
 */
public static HCatInputFormat setInput(
    Configuration conf, String dbName, String tableName) throws IOException {
  return setInput(conf, dbName, tableName, null);
}
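A minimal usage sketch of the Configuration-based overload above; the metastore URI and the database/table names are placeholders, not taken from the snippets.

// Hypothetical usage: read the whole table (null filter) and fetch its schema
// from the same Configuration. "default"/"mytable" are placeholder names.
Configuration conf = new Configuration();
conf.set("hive.metastore.uris", "thrift://localhost:9083"); // placeholder metastore
HCatInputFormat.setInput(conf, "default", "mytable");       // filter defaults to null
HCatSchema schema = HCatInputFormat.getTableSchema(conf);   // schema of "mytable"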
@Override
public Iterator<HCatRecord> read() throws HCatException {
  HCatInputFormat inpFmt = new HCatInputFormat();
  RecordReader<WritableComparable, HCatRecord> rr;
  try {
    TaskAttemptContext cntxt = ShimLoader.getHadoopShims().getHCatShim()
        .createTaskAttemptContext(conf, new TaskAttemptID());
    rr = inpFmt.createRecordReader(split, cntxt);
    rr.initialize(split, cntxt);
  } catch (IOException e) {
    throw new HCatException(ErrorType.ERROR_NOT_INITIALIZED, e);
  } catch (InterruptedException e) {
    throw new HCatException(ErrorType.ERROR_NOT_INITIALIZED, e);
  }
  return new HCatRecordItr(rr);
}
@Override
public ReaderContext prepareRead() throws HCatException {
  try {
    Job job = new Job(conf);
    HCatInputFormat hcif = HCatInputFormat.setInput(
        job, re.getDbName(), re.getTableName(), re.getFilterString());
    ReaderContextImpl cntxt = new ReaderContextImpl();
    cntxt.setInputSplits(hcif.getSplits(
        ShimLoader.getHadoopShims().getHCatShim().createJobContext(job.getConfiguration(), null)));
    cntxt.setConf(job.getConfiguration());
    return cntxt;
  } catch (IOException e) {
    throw new HCatException(ErrorType.ERROR_NOT_INITIALIZED, e);
  } catch (InterruptedException e) {
    throw new HCatException(ErrorType.ERROR_NOT_INITIALIZED, e);
  }
}
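For context, prepareRead() and read() above back HCatalog's data-transfer reader API. A rough end-to-end sketch of that flow, with placeholder database/table names, config map, and slave index, might look like:

// Sketch of the reader-side data transfer flow (placeholders throughout).
ReadEntity entity = new ReadEntity.Builder()
    .withDatabase("default")
    .withTable("mytable")
    .build();
Map<String, String> config = new HashMap<String, String>();
HCatReader masterReader = DataTransferFactory.getHCatReader(entity, config);
ReaderContext cntxt = masterReader.prepareRead();   // master side: plans the input splits

int slaveNumber = 0;                                 // placeholder index of this slave/task
HCatReader splitReader = DataTransferFactory.getHCatReader(cntxt, slaveNumber);
Iterator<HCatRecord> records = splitReader.read();   // slave side: iterate one split
while (records.hasNext()) {
  HCatRecord record = records.next();
  // process the record
}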
/**
 * Creates a HCatInputFormat for the given database, table, and
 * {@link org.apache.hadoop.conf.Configuration}.
 * By default, the InputFormat returns {@link org.apache.hive.hcatalog.data.HCatRecord}.
 * The return type of the InputFormat can be changed to Flink-native tuples by calling
 * {@link HCatInputFormatBase#asFlinkTuples()}.
 *
 * @param database The name of the database to read from.
 * @param table The name of the table to read.
 * @param config The Configuration for the InputFormat.
 * @throws java.io.IOException
 */
public HCatInputFormatBase(String database, String table, Configuration config) throws IOException {
  super();
  this.configuration = config;
  HadoopUtils.mergeHadoopConf(this.configuration);

  this.hCatInputFormat =
      org.apache.hive.hcatalog.mapreduce.HCatInputFormat.setInput(this.configuration, database, table);
  this.outputSchema =
      org.apache.hive.hcatalog.mapreduce.HCatInputFormat.getTableSchema(this.configuration);

  // configure output schema of HCatFormat
  configuration.set("mapreduce.lib.hcat.output.schema", HCatUtil.serialize(outputSchema));
  // set type information
  this.resultType = new WritableTypeInfo(DefaultHCatRecord.class);
}
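A sketch of how this constructor is typically reached from a Flink batch job, assuming the concrete org.apache.flink.hcatalog.java.HCatInputFormat subclass and a placeholder table name:

// Hypothetical Flink usage; "default"/"mytable" are placeholders.
ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
HCatInputFormat<HCatRecord> hcatFormat =
    new HCatInputFormat<HCatRecord>("default", "mytable", new Configuration());
DataSet<HCatRecord> records = env.createInput(hcatFormat);
records.print();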
// Fragment from a loader's setLocation(): both paths wrap the HCatInputFormat calls
// in a try/catch that rethrows as IOException. The enclosing method and the code
// between the two blocks are elided in the original fragment.
try {
  HCatInputFormat.setInput(job, dbName, tableName, getPartitionFilterString());
  HCatInputFormat.setOutputSchema(job, outputSchema);
} catch (Exception e) {
  throw new IOException(e);
}
// ... (intervening code elided in the original fragment)
try {
  // reuse the table schema cached in the UDF properties as the output schema
  HCatSchema hcatTableSchema = (HCatSchema) udfProps.get(HCatConstants.HCAT_TABLE_SCHEMA);
  outputSchema = hcatTableSchema;
  HCatInputFormat.setOutputSchema(job, outputSchema);
} catch (Exception e) {
  throw new IOException(e);
}
@Override
public InputFormat<?, ?> getInputFormat() throws IOException {
  if (hcatInputFormat == null) {
    hcatInputFormat = new HCatInputFormat();
  }
  return hcatInputFormat;
}
@Test
public void testGetPartitionAndDataColumns() throws Exception {
  Configuration conf = new Configuration();
  Job myJob = new Job(conf, "hcatTest");

  HCatInputFormat.setInput(myJob, "default", "testHCIFMethods");

  HCatSchema cols = HCatInputFormat.getDataColumns(myJob.getConfiguration());
  Assert.assertTrue(cols.getFields() != null);
  Assert.assertEquals(cols.getFields().size(), 2);
  Assert.assertTrue(cols.getFields().get(0).getName().equals("a"));
  Assert.assertTrue(cols.getFields().get(1).getName().equals("b"));
  Assert.assertTrue(cols.getFields().get(0).getType().equals(HCatFieldSchema.Type.STRING));
  Assert.assertTrue(cols.getFields().get(1).getType().equals(HCatFieldSchema.Type.INT));

  HCatSchema pcols = HCatInputFormat.getPartitionColumns(myJob.getConfiguration());
  Assert.assertTrue(pcols.getFields() != null);
  Assert.assertEquals(pcols.getFields().size(), 2);
  Assert.assertTrue(pcols.getFields().get(0).getName().equals("x"));
  Assert.assertTrue(pcols.getFields().get(1).getName().equals("y"));
  Assert.assertTrue(pcols.getFields().get(0).getType().equals(HCatFieldSchema.Type.STRING));
  Assert.assertTrue(pcols.getFields().get(1).getType().equals(HCatFieldSchema.Type.STRING));
}
@Override
protected void setup(Context context) throws IOException {
  super.publishConfiguration(context.getConfiguration());
  schema = HCatInputFormat.getTableSchema(context.getConfiguration());
  columnSize = schema.getFields().size();
}
private org.apache.hadoop.mapred.RecordReader createBaseRecordReader(HCatSplit hcatSplit,
    HiveStorageHandler storageHandler, TaskAttemptContext taskContext) throws IOException {
  JobConf jobConf = HCatUtil.getJobConfFromContext(taskContext);
  HCatUtil.copyJobPropertiesToJobConf(hcatSplit.getPartitionInfo().getJobProperties(), jobConf);
  org.apache.hadoop.mapred.InputFormat inputFormat =
      HCatInputFormat.getMapRedInputFormat(jobConf, storageHandler.getInputFormatClass());
  return inputFormat.getRecordReader(hcatSplit.getBaseSplit(), jobConf,
      InternalUtil.createReporter(taskContext));
}
@Override
public HadoopInputSplit[] createInputSplits(int minNumSplits) throws IOException {
  configuration.setInt("mapreduce.input.fileinputformat.split.minsize", minNumSplits);

  JobContext jobContext = new JobContextImpl(configuration, new JobID());

  List<InputSplit> splits;
  try {
    splits = this.hCatInputFormat.getSplits(jobContext);
  } catch (InterruptedException e) {
    throw new IOException("Could not get Splits.", e);
  }

  HadoopInputSplit[] hadoopInputSplits = new HadoopInputSplit[splits.size()];
  for (int i = 0; i < hadoopInputSplits.length; i++) {
    hadoopInputSplits[i] = new HadoopInputSplit(i, splits.get(i), jobContext);
  }
  return hadoopInputSplits;
}
@Override
public void open(HadoopInputSplit split) throws IOException {
  TaskAttemptContext context = new TaskAttemptContextImpl(configuration, new TaskAttemptID());

  try {
    this.recordReader = this.hCatInputFormat
        .createRecordReader(split.getHadoopInputSplit(), context);
    this.recordReader.initialize(split.getHadoopInputSplit(), context);
  } catch (InterruptedException e) {
    throw new IOException("Could not create RecordReader.", e);
  } finally {
    this.fetched = false;
  }
}
protected HCatSchema getTableSchema() throws Exception {
  Configuration conf = new Configuration();
  Job job = new Job(conf, "hcat mapreduce read schema test");
  job.setJarByClass(this.getClass());

  // input/output settings
  job.setInputFormatClass(HCatInputFormat.class);
  job.setOutputFormatClass(TextOutputFormat.class);

  HCatInputFormat.setInput(job, dbName, tableName);
  return HCatInputFormat.getTableSchema(job.getConfiguration());
}
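Given the helper above, the returned schema can be inspected field by field; a purely illustrative sketch using the getters that also appear in the test snippet earlier:

// Illustrative only: print each column's name and HCatalog type.
HCatSchema tableSchema = getTableSchema();
for (HCatFieldSchema field : tableSchema.getFields()) {
  System.out.println(field.getName() + " : " + field.getType());
}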
@SuppressWarnings("unchecked") private void readObject(ObjectInputStream in) throws IOException, ClassNotFoundException { this.fieldNames = new String[in.readInt()]; for (int i = 0; i < this.fieldNames.length; i++) { this.fieldNames[i] = in.readUTF(); } Configuration configuration = new Configuration(); configuration.readFields(in); if (this.configuration == null) { this.configuration = configuration; } this.hCatInputFormat = new org.apache.hive.hcatalog.mapreduce.HCatInputFormat(); this.outputSchema = (HCatSchema) HCatUtil.deserialize(this.configuration.get("mapreduce.lib.hcat.output.schema")); }
/**
 * Initializes the input with a null filter.
 * See {@link #setInput(org.apache.hadoop.conf.Configuration, String, String, String)}
 */
public static HCatInputFormat setInput(
    Job job, String dbName, String tableName) throws IOException {
  return setInput(job.getConfiguration(), dbName, tableName, null);
}
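The Job-based overload is commonly paired with setOutputSchema, as in the loader fragment earlier, to prune the columns actually read. A hedged sketch with placeholder database, table, and column subset:

// Hypothetical projection example; "default"/"mytable" and the chosen subset are placeholders.
Job job = Job.getInstance(new Configuration());
HCatInputFormat.setInput(job, "default", "mytable");
HCatSchema tableSchema = HCatInputFormat.getTableSchema(job.getConfiguration());
// keep only the first column of the table as the output schema
HCatSchema projection = new HCatSchema(tableSchema.getFields().subList(0, 1));
HCatInputFormat.setOutputSchema(job, projection);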
List<HCatFieldSchema> columns;
try {
  HCatInputFormat.setInput(conf, database == null ? "default" : database, table, partitionFilter);
  HCatSchema tableSchema = HCatInputFormat.getTableSchema(conf);
  columns = tableSchema.getFields();
} catch (IOException exc) {
  // the original fragment ends here; presumably the exception is wrapped and rethrown
  throw new RuntimeException(exc);
}