    configuration, file, cacheKey, range(split.getStart(), split.getEnd()), cacheTag);
MessageType fileSchema = footer.getFileMetaData().getSchema();
FilterCompat.Filter filter = getFilter(configuration);
blocks = filterRowGroups(filter, footer.getBlocks(), fileSchema);
} else {
public FilterCompat.Filter setFilter(final JobConf conf, MessageType schema) {
  SearchArgument sarg = ConvertAstToSearchArg.createFromConf(conf);
  if (sarg == null) {
    return null;
  }

  // Create the Parquet FilterPredicate without including columns that do not exist
  // on the schema (such as partition columns).
  FilterPredicate p = ParquetFilterPredicateConverter.toFilterPredicate(sarg, schema);
  if (p != null) {
    // Filter may have sensitive information. Do not send to debug.
    LOG.debug("PARQUET predicate push down generated.");
    ParquetInputFormat.setFilterPredicate(conf, p);
    return FilterCompat.get(p);
  } else {
    // Filter may have sensitive information. Do not send to debug.
    LOG.debug("No PARQUET predicate push down is generated.");
    return null;
  }
}
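The Hive helper above converts a SearchArgument into a Parquet FilterPredicate before registering it. Outside of Hive, the same push-down hook is usually fed directly through the filter2 API; a minimal sketch, assuming a Parquet file with an int32 column named age (the column and class names here are illustrative, not taken from the snippet's project):

import org.apache.hadoop.conf.Configuration;
import org.apache.parquet.filter2.predicate.FilterApi;
import org.apache.parquet.filter2.predicate.FilterPredicate;
import org.apache.parquet.filter2.predicate.Operators.IntColumn;
import org.apache.parquet.hadoop.ParquetInputFormat;

public class PredicatePushDownExample {
  public static void configure(Configuration conf) {
    // Hypothetical column: push "age > 18" down to the Parquet readers. Row groups
    // whose min/max statistics prove no row can match are skipped entirely.
    IntColumn age = FilterApi.intColumn("age");
    FilterPredicate onlyAdults = FilterApi.gt(age, 18);
    ParquetInputFormat.setFilterPredicate(conf, onlyAdults);
  }
}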
protected static FileSplit getFileSplit(Job vectorJob) throws IOException, InterruptedException {
  ParquetInputFormat parquetInputFormat = new ParquetInputFormat(GroupReadSupport.class);
  InputSplit split = (InputSplit) parquetInputFormat.getSplits(vectorJob).get(0);
  FileSplit fsplit = new FileSplit(file, 0L, split.getLength(), split.getLocations());
  return fsplit;
}
@SuppressWarnings("rawtypes") @Override public void sourceConfInit(FlowProcess<? extends JobConf> fp, Tap<JobConf, RecordReader, OutputCollector> tap, JobConf jobConf) { if (filterPredicate != null) { ParquetInputFormat.setFilterPredicate(jobConf, filterPredicate); } jobConf.setInputFormat(DeprecatedParquetInputFormat.class); ParquetInputFormat.setReadSupportClass(jobConf, TupleReadSupport.class); TupleReadSupport.setRequestedFields(jobConf, getSourceFields()); }
conf.setBoolean("parquet.avro.compatible", false); ParquetInputFormat.setReadSupportClass(job, GroupReadSupport.class);
/**
 * {@inheritDoc}
 */
@Override
public RecordReader<Void, T> createRecordReader(
    InputSplit inputSplit,
    TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
  Configuration conf = ContextUtil.getConfiguration(taskAttemptContext);
  ReadSupport<T> readSupport = getReadSupport(conf);
  return new ParquetRecordReader<T>(readSupport, getFilter(conf));
}
public RecordReaderWrapper(
    InputSplit oldSplit, JobConf oldJobConf, Reporter reporter) throws IOException {
  splitLen = oldSplit.getLength();

  try {
    realReader = new ParquetRecordReader<V>(
        ParquetInputFormat.<V>getReadSupportInstance(oldJobConf),
        ParquetInputFormat.getFilter(oldJobConf));

    if (oldSplit instanceof ParquetInputSplitWrapper) {
      realReader.initialize(((ParquetInputSplitWrapper) oldSplit).realSplit, oldJobConf, reporter);
    } else if (oldSplit instanceof FileSplit) {
      realReader.initialize((FileSplit) oldSplit, oldJobConf, reporter);
    } else {
      throw new IllegalArgumentException(
          "Invalid split (not a FileSplit or ParquetInputSplitWrapper): " + oldSplit);
    }

    // read once to gain access to key and value objects
    if (realReader.nextKeyValue()) {
      firstRecord = true;
      valueContainer = new Container<V>();
      valueContainer.set(realReader.getCurrentValue());
    } else {
      eof = true;
    }
  } catch (InterruptedException e) {
    Thread.interrupted();
    throw new IOException(e);
  }
}
/**
 * {@inheritDoc}
 */
@Override
public List<InputSplit> getSplits(JobContext jobContext) throws IOException {
  Configuration configuration = ContextUtil.getConfiguration(jobContext);
  List<InputSplit> splits = new ArrayList<InputSplit>();

  if (isTaskSideMetaData(configuration)) {
    // Although not required by the API, some clients may depend on always
    // receiving ParquetInputSplit. Translation is required at some point.
    for (InputSplit split : super.getSplits(jobContext)) {
      Preconditions.checkArgument(split instanceof FileSplit,
          "Cannot wrap non-FileSplit: " + split);
      splits.add(ParquetInputSplit.from((FileSplit) split));
    }
    return splits;
  } else {
    splits.addAll(getSplits(configuration, getFooters(jobContext)));
  }

  return splits;
}
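Which branch of getSplits runs is decided by the parquet.task.side.metadata switch. A minimal sketch of toggling it on a mapreduce Job (the wrapper class name is illustrative):

import org.apache.hadoop.mapreduce.Job;
import org.apache.parquet.hadoop.ParquetInputFormat;

public class MetadataModeExample {
  static void configure(Job job) {
    // Task-side metadata: getSplits() produces plain FileSplits (wrapped as
    // ParquetInputSplits above) and each task reads only the footers it needs.
    ParquetInputFormat.setTaskSideMetaData(job, true);

    // Client-side metadata instead: the job client reads every footer up front
    // and builds row-group-aligned splits (the else branch above).
    // ParquetInputFormat.setTaskSideMetaData(job, false);
  }
}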
public PentahoParquetInputFormat() throws Exception {
  logger.info( "We are initializing parquet input format" );
  inClassloader( () -> {
    ConfigurationProxy conf = new ConfigurationProxy();
    job = Job.getInstance( conf );
    nativeParquetInputFormat = new ParquetInputFormat<>();
    ParquetInputFormat.setReadSupportClass( job, PentahoParquetReadSupport.class );
    ParquetInputFormat.setTaskSideMetaData( job, false );
  } );
}
job.setNumReduceTasks(0);
job.setInputFormatClass(ParquetInputFormat.class);
ParquetInputFormat.setReadSupportClass(job, ParquetReadSupport.class);
ParquetInputFormat.setInputPaths(job, inputDir);
@Override
public List<IPentahoInputSplit> getSplits() throws Exception {
  return inClassloader( () -> {
    List<InputSplit> splits = nativeParquetInputFormat.getSplits( job );
    return splits.stream().map( PentahoInputSplitImpl::new ).collect( Collectors.toList() );
  } );
}
@Override
public void setInputFile( String file ) throws Exception {
  inClassloader( () -> {
    S3NCredentialUtils.applyS3CredentialsToHadoopConfigurationIfNecessary( file, job.getConfiguration() );
    Path filePath = new Path( S3NCredentialUtils.scrubFilePathIfNecessary( file ) );
    FileSystem fs = FileSystem.get( filePath.toUri(), job.getConfiguration() );
    if ( !fs.exists( filePath ) ) {
      throw new NoSuchFileException( file );
    }
    if ( fs.getFileStatus( filePath ).isDirectory() ) { // directory
      ParquetInputFormat.setInputPaths( job, filePath );
      ParquetInputFormat.setInputDirRecursive( job, true );
    } else { // file
      ParquetInputFormat.setInputPaths( job, filePath.getParent() );
      ParquetInputFormat.setInputDirRecursive( job, false );
      ParquetInputFormat.setInputPathFilter( job, ReadFileFilter.class );
      job.getConfiguration().set( ReadFileFilter.FILTER_DIR, filePath.getParent().toString() );
      job.getConfiguration().set( ReadFileFilter.FILTER_FILE, filePath.toString() );
    }
  } );
}
public static VectorizedParquetRecordReader createTestParquetReader(String schemaString, Configuration conf)
    throws IOException, InterruptedException, HiveException {
  conf.set(PARQUET_READ_SCHEMA, schemaString);
  HiveConf.setBoolVar(conf, HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED, true);
  HiveConf.setVar(conf, HiveConf.ConfVars.PLAN, "//tmp");

  Job vectorJob = new Job(conf, "read vector");
  ParquetInputFormat.setInputPaths(vectorJob, file);
  initialVectorizedRowBatchCtx(conf);
  return new VectorizedParquetRecordReader(getFileSplit(vectorJob), new JobConf(conf));
}
public MapredParquetInputFormat() {
  this(new ParquetInputFormat<ArrayWritable>(DataWritableReadSupport.class));
}
@Override
protected List<FileStatus> listStatus(JobContext jobContext) throws IOException {
  return getAllFileRecursively(super.listStatus(jobContext),
      ContextUtil.getConfiguration(jobContext));
}
@SuppressWarnings("rawtypes") @Override public void sourceConfInit(FlowProcess<JobConf> fp, Tap<JobConf, RecordReader, OutputCollector> tap, JobConf jobConf) { if (filterPredicate != null) { ParquetInputFormat.setFilterPredicate(jobConf, filterPredicate); } jobConf.setInputFormat(DeprecatedParquetInputFormat.class); ParquetInputFormat.setReadSupportClass(jobConf, TupleReadSupport.class); TupleReadSupport.setRequestedFields(jobConf, getSourceFields()); }
@Override
public void sourceConfInit(FlowProcess<? extends JobConf> fp,
    Tap<JobConf, RecordReader, OutputCollector> tap, JobConf jobConf) {
  super.sourceConfInit(fp, tap, jobConf);
  jobConf.setInputFormat(DeprecatedParquetInputFormat.class);
  ParquetInputFormat.setReadSupportClass(jobConf, ThriftReadSupport.class);
  ThriftReadSupport.setRecordConverterClass(jobConf, TBaseRecordConverter.class);
}