public static VectorizedParquetRecordReader createTestParquetReader(String schemaString, Configuration conf)
    throws IOException, InterruptedException, HiveException {
  conf.set(PARQUET_READ_SCHEMA, schemaString);
  HiveConf.setBoolVar(conf, HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED, true);
  HiveConf.setVar(conf, HiveConf.ConfVars.PLAN, "//tmp");

  Job vectorJob = new Job(conf, "read vector");
  ParquetInputFormat.setInputPaths(vectorJob, file);
  initialVectorizedRowBatchCtx(conf);
  return new VectorizedParquetRecordReader(getFileSplit(vectorJob), new JobConf(conf));
}
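A minimal usage sketch for the helper above, assuming it sits in the same test base class so that file, initialVectorizedRowBatchCtx and getFileSplit are in scope; the projection settings and the loop body are illustrative, not taken from the original test.

Configuration conf = new Configuration();
conf.set(IOConstants.COLUMNS, "int32_field");
conf.set(IOConstants.COLUMNS_TYPES, "int");
conf.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false);
conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0");

VectorizedParquetRecordReader reader =
    createTestParquetReader("message test { required int32 int32_field; }", conf);
VectorizedRowBatch batch = reader.createValue();
try {
  while (reader.next(NullWritable.get(), batch)) {
    // batch.cols[0] now holds the decoded int32_field values for this batch
  }
} finally {
  reader.close();
}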
@Test
public void testNullSplitForParquetReader() throws Exception {
  Configuration conf = new Configuration();
  conf.set(IOConstants.COLUMNS, "int32_field");
  conf.set(IOConstants.COLUMNS_TYPES, "int");
  conf.setBoolean(ColumnProjectionUtils.READ_ALL_COLUMNS, false);
  conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, "0");
  conf.set(PARQUET_READ_SCHEMA, "message test { required int32 int32_field;}");
  HiveConf.setBoolVar(conf, HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED, true);
  HiveConf.setVar(conf, HiveConf.ConfVars.PLAN, "//tmp");

  Job vectorJob = new Job(conf, "read vector");
  ParquetInputFormat.setInputPaths(vectorJob, file);
  initialVectorizedRowBatchCtx(conf);

  FileSplit fsplit = getFileSplit(vectorJob);
  JobConf jobConf = new JobConf(conf);
  TestVectorizedParquetRecordReader testReader = new TestVectorizedParquetRecordReader(fsplit, jobConf);
  Assert.assertNull("Test should return null split from getSplit() method",
      testReader.getSplit(fsplit, jobConf));
}
}
@Override
public void setInputFile( String file ) throws Exception {
  inClassloader( () -> {
    S3NCredentialUtils.applyS3CredentialsToHadoopConfigurationIfNecessary( file, job.getConfiguration() );
    Path filePath = new Path( S3NCredentialUtils.scrubFilePathIfNecessary( file ) );
    FileSystem fs = FileSystem.get( filePath.toUri(), job.getConfiguration() );
    if ( !fs.exists( filePath ) ) {
      throw new NoSuchFileException( file );
    }
    if ( fs.getFileStatus( filePath ).isDirectory() ) { // directory
      ParquetInputFormat.setInputPaths( job, filePath );
      ParquetInputFormat.setInputDirRecursive( job, true );
    } else { // file
      ParquetInputFormat.setInputPaths( job, filePath.getParent() );
      ParquetInputFormat.setInputDirRecursive( job, false );
      ParquetInputFormat.setInputPathFilter( job, ReadFileFilter.class );
      job.getConfiguration().set( ReadFileFilter.FILTER_DIR, filePath.getParent().toString() );
      job.getConfiguration().set( ReadFileFilter.FILTER_FILE, filePath.toString() );
    }
  } );
}
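ReadFileFilter is referenced above but not shown. A rough sketch of what such a filter could look like follows, assuming it is instantiated by FileInputFormat through ReflectionUtils (hence Configurable) and compares candidate paths against the FILTER_DIR / FILTER_FILE entries set above; the class name and configuration keys here are placeholders, not the actual Pentaho implementation.

import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

public class ReadFileFilterSketch implements PathFilter, Configurable {
  // Hypothetical configuration keys; the real ReadFileFilter defines its own.
  public static final String FILTER_DIR = "parquet.read.filter.dir";
  public static final String FILTER_FILE = "parquet.read.filter.file";

  private Configuration conf;

  @Override
  public void setConf( Configuration conf ) {
    this.conf = conf;
  }

  @Override
  public Configuration getConf() {
    return conf;
  }

  @Override
  public boolean accept( Path path ) {
    // Accept the parent directory (so listing can descend into it) and the one
    // requested file; reject every other sibling in that directory.
    return path.toString().equals( conf.get( FILTER_DIR ) )
        || path.toString().equals( conf.get( FILTER_FILE ) );
  }
}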
job.setInputFormatClass(ParquetInputFormat.class);
ParquetInputFormat.setReadSupportClass(job, ParquetReadSupport.class);
ParquetInputFormat.setInputPaths(job, inputDir);
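For context, a self-contained driver sketch showing where these three calls usually sit in a full MapReduce job setup; it substitutes parquet-hadoop's example GroupReadSupport for the ParquetReadSupport class used above, and EchoMapper plus the argument-driven input/output paths are made up for illustration.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.hadoop.ParquetInputFormat;
import org.apache.parquet.hadoop.example.GroupReadSupport;

import java.io.IOException;

public class ParquetReadDriver {

  // Map-only job: echoes every Parquet record as its string representation.
  public static class EchoMapper extends Mapper<Void, Group, NullWritable, Text> {
    @Override
    protected void map(Void key, Group value, Context context)
        throws IOException, InterruptedException {
      context.write(NullWritable.get(), new Text(value.toString()));
    }
  }

  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "parquet read example");
    job.setJarByClass(ParquetReadDriver.class);

    // The three calls from the snippet above, with GroupReadSupport standing in
    // for ParquetReadSupport.
    job.setInputFormatClass(ParquetInputFormat.class);
    ParquetInputFormat.setReadSupportClass(job, GroupReadSupport.class);
    ParquetInputFormat.setInputPaths(job, new Path(args[0]));

    job.setMapperClass(EchoMapper.class);
    job.setNumReduceTasks(0); // map-only
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}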