    configuration, file, cacheKey, range(split.getStart(), split.getEnd()), cacheTag);
MessageType fileSchema = footer.getFileMetaData().getSchema();
FilterCompat.Filter filter = getFilter(configuration);
blocks = filterRowGroups(filter, footer.getBlocks(), fileSchema);
} else {
public FilterCompat.Filter setFilter(final JobConf conf, MessageType schema) {
  SearchArgument sarg = ConvertAstToSearchArg.createFromConf(conf);
  if (sarg == null) {
    return null;
  }

  // Create the Parquet FilterPredicate without including columns that do not exist
  // on the schema (such as partition columns).
  FilterPredicate p = ParquetFilterPredicateConverter.toFilterPredicate(sarg, schema);
  if (p != null) {
    // Filter may have sensitive information. Do not send to debug.
    LOG.debug("PARQUET predicate push down generated.");
    ParquetInputFormat.setFilterPredicate(conf, p);
    return FilterCompat.get(p);
  } else {
    // Filter may have sensitive information. Do not send to debug.
    LOG.debug("No PARQUET predicate push down is generated.");
    return null;
  }
}
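The Hive helper above converts a SearchArgument into a Parquet FilterPredicate before registering it. Outside of Hive, the same push-down hook is usually fed directly through the filter2 API; a minimal sketch, assuming a Parquet file with an int32 column named age (the column and class names here are illustrative, not taken from the snippet's project):

import org.apache.hadoop.conf.Configuration;
import org.apache.parquet.filter2.predicate.FilterApi;
import org.apache.parquet.filter2.predicate.FilterPredicate;
import org.apache.parquet.filter2.predicate.Operators.IntColumn;
import org.apache.parquet.hadoop.ParquetInputFormat;

public class PredicatePushDownExample {
  public static void configure(Configuration conf) {
    // Hypothetical column: push "age > 18" down to the Parquet readers. Row groups
    // whose min/max statistics prove no row can match are skipped entirely.
    IntColumn age = FilterApi.intColumn("age");
    FilterPredicate onlyAdults = FilterApi.gt(age, 18);
    ParquetInputFormat.setFilterPredicate(conf, onlyAdults);
  }
}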
protected static FileSplit getFileSplit(Job vectorJob) throws IOException, InterruptedException {
  ParquetInputFormat parquetInputFormat = new ParquetInputFormat(GroupReadSupport.class);
  InputSplit split = (InputSplit) parquetInputFormat.getSplits(vectorJob).get(0);
  FileSplit fsplit = new FileSplit(file, 0L, split.getLength(), split.getLocations());
  return fsplit;
}
@SuppressWarnings("rawtypes") @Override public void sourceConfInit(FlowProcess<? extends JobConf> fp, Tap<JobConf, RecordReader, OutputCollector> tap, JobConf jobConf) { if (filterPredicate != null) { ParquetInputFormat.setFilterPredicate(jobConf, filterPredicate); } jobConf.setInputFormat(DeprecatedParquetInputFormat.class); ParquetInputFormat.setReadSupportClass(jobConf, TupleReadSupport.class); TupleReadSupport.setRequestedFields(jobConf, getSourceFields()); }
conf.setBoolean("parquet.avro.compatible", false); ParquetInputFormat.setReadSupportClass(job, GroupReadSupport.class);
/**
 * {@inheritDoc}
 */
@Override
public RecordReader<Void, T> createRecordReader(
    InputSplit inputSplit,
    TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
  Configuration conf = ContextUtil.getConfiguration(taskAttemptContext);
  ReadSupport<T> readSupport = getReadSupport(conf);
  return new ParquetRecordReader<T>(readSupport, getFilter(conf));
}
public RecordReaderWrapper(
    InputSplit oldSplit, JobConf oldJobConf, Reporter reporter) throws IOException {
  splitLen = oldSplit.getLength();

  try {
    realReader = new ParquetRecordReader<V>(
        ParquetInputFormat.<V>getReadSupportInstance(oldJobConf),
        ParquetInputFormat.getFilter(oldJobConf));

    if (oldSplit instanceof ParquetInputSplitWrapper) {
      realReader.initialize(((ParquetInputSplitWrapper) oldSplit).realSplit, oldJobConf, reporter);
    } else if (oldSplit instanceof FileSplit) {
      realReader.initialize((FileSplit) oldSplit, oldJobConf, reporter);
    } else {
      throw new IllegalArgumentException(
          "Invalid split (not a FileSplit or ParquetInputSplitWrapper): " + oldSplit);
    }

    // read once to gain access to key and value objects
    if (realReader.nextKeyValue()) {
      firstRecord = true;
      valueContainer = new Container<V>();
      valueContainer.set(realReader.getCurrentValue());
    } else {
      eof = true;
    }
  } catch (InterruptedException e) {
    Thread.interrupted();
    throw new IOException(e);
  }
}
/**
 * {@inheritDoc}
 */
@Override
public List<InputSplit> getSplits(JobContext jobContext) throws IOException {
  Configuration configuration = ContextUtil.getConfiguration(jobContext);
  List<InputSplit> splits = new ArrayList<InputSplit>();

  if (isTaskSideMetaData(configuration)) {
    // Although not required by the API, some clients may depend on always
    // receiving ParquetInputSplit. Translation is required at some point.
    for (InputSplit split : super.getSplits(jobContext)) {
      Preconditions.checkArgument(split instanceof FileSplit,
          "Cannot wrap non-FileSplit: " + split);
      splits.add(ParquetInputSplit.from((FileSplit) split));
    }
    return splits;
  } else {
    splits.addAll(getSplits(configuration, getFooters(jobContext)));
  }

  return splits;
}
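Which branch of getSplits runs is decided by the parquet.task.side.metadata switch. A minimal sketch of toggling it on a mapreduce Job (the wrapper class name is illustrative):

import org.apache.hadoop.mapreduce.Job;
import org.apache.parquet.hadoop.ParquetInputFormat;

public class MetadataModeExample {
  static void configure(Job job) {
    // Task-side metadata: getSplits() produces plain FileSplits (wrapped as
    // ParquetInputSplits above) and each task reads only the footers it needs.
    ParquetInputFormat.setTaskSideMetaData(job, true);

    // Client-side metadata instead: the job client reads every footer up front
    // and builds row-group-aligned splits (the else branch above).
    // ParquetInputFormat.setTaskSideMetaData(job, false);
  }
}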
public PentahoParquetInputFormat() throws Exception {
  logger.info( "We are initializing parquet input format" );
  inClassloader( () -> {
    ConfigurationProxy conf = new ConfigurationProxy();
    job = Job.getInstance( conf );
    nativeParquetInputFormat = new ParquetInputFormat<>();
    ParquetInputFormat.setReadSupportClass( job, PentahoParquetReadSupport.class );
    ParquetInputFormat.setTaskSideMetaData( job, false );
  } );
}
job.setNumReduceTasks(0);
job.setInputFormatClass(ParquetInputFormat.class);
ParquetInputFormat.setReadSupportClass(job, ParquetReadSupport.class);
ParquetInputFormat.setInputPaths(job, inputDir);
@Override
public List<IPentahoInputSplit> getSplits() throws Exception {
  return inClassloader( () -> {
    List<InputSplit> splits = nativeParquetInputFormat.getSplits( job );
    return splits.stream().map( PentahoInputSplitImpl::new ).collect( Collectors.toList() );
  } );
}
@Override
public void setInputFile( String file ) throws Exception {
  inClassloader( () -> {
    S3NCredentialUtils.applyS3CredentialsToHadoopConfigurationIfNecessary( file, job.getConfiguration() );
    Path filePath = new Path( S3NCredentialUtils.scrubFilePathIfNecessary( file ) );
    FileSystem fs = FileSystem.get( filePath.toUri(), job.getConfiguration() );
    if ( !fs.exists( filePath ) ) {
      throw new NoSuchFileException( file );
    }
    if ( fs.getFileStatus( filePath ).isDirectory() ) { // directory
      ParquetInputFormat.setInputPaths( job, filePath );
      ParquetInputFormat.setInputDirRecursive( job, true );
    } else { // file
      ParquetInputFormat.setInputPaths( job, filePath.getParent() );
      ParquetInputFormat.setInputDirRecursive( job, false );
      ParquetInputFormat.setInputPathFilter( job, ReadFileFilter.class );
      job.getConfiguration().set( ReadFileFilter.FILTER_DIR, filePath.getParent().toString() );
      job.getConfiguration().set( ReadFileFilter.FILTER_FILE, filePath.toString() );
    }
  } );
}
public static VectorizedParquetRecordReader createTestParquetReader(String schemaString, Configuration conf)
    throws IOException, InterruptedException, HiveException {
  conf.set(PARQUET_READ_SCHEMA, schemaString);
  HiveConf.setBoolVar(conf, HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED, true);
  HiveConf.setVar(conf, HiveConf.ConfVars.PLAN, "//tmp");

  Job vectorJob = new Job(conf, "read vector");
  ParquetInputFormat.setInputPaths(vectorJob, file);
  initialVectorizedRowBatchCtx(conf);
  return new VectorizedParquetRecordReader(getFileSplit(vectorJob), new JobConf(conf));
}
public MapredParquetInputFormat() {
  this(new ParquetInputFormat<ArrayWritable>(DataWritableReadSupport.class));
}
@Override
protected List<FileStatus> listStatus(JobContext jobContext) throws IOException {
  return getAllFileRecursively(super.listStatus(jobContext),
      ContextUtil.getConfiguration(jobContext));
}
@SuppressWarnings("rawtypes") @Override public void sourceConfInit(FlowProcess<JobConf> fp, Tap<JobConf, RecordReader, OutputCollector> tap, JobConf jobConf) { if (filterPredicate != null) { ParquetInputFormat.setFilterPredicate(jobConf, filterPredicate); } jobConf.setInputFormat(DeprecatedParquetInputFormat.class); ParquetInputFormat.setReadSupportClass(jobConf, TupleReadSupport.class); TupleReadSupport.setRequestedFields(jobConf, getSourceFields()); }
@Override
public void sourceConfInit(FlowProcess<? extends JobConf> fp,
    Tap<JobConf, RecordReader, OutputCollector> tap, JobConf jobConf) {
  super.sourceConfInit(fp, tap, jobConf);
  jobConf.setInputFormat(DeprecatedParquetInputFormat.class);
  ParquetInputFormat.setReadSupportClass(jobConf, ThriftReadSupport.class);
  ThriftReadSupport.setRecordConverterClass(jobConf, TBaseRecordConverter.class);
}