public ParquetReadOptions build() {
  return new ParquetReadOptions(
      useSignedStringMinMax, useStatsFilter, useDictionaryFilter, useRecordFilter,
      useColumnIndexFilter, recordFilter, metadataFilter, codecFactory, allocator,
      maxAllocationSize, properties);
}
}
/**
 * @param f file to read the blocks from
 * @param offset offset in the file to start reading at
 * @param length total number of bytes to read
 * @return the ByteBuffer blocks
 * @throws IOException if there is an error while reading from the stream
 */
List<ByteBuffer> readBlocks(SeekableInputStream f, long offset, int length) throws IOException {
  f.seek(offset);

  int fullAllocations = length / options.getMaxAllocationSize();
  int lastAllocationSize = length % options.getMaxAllocationSize();

  int numAllocations = fullAllocations + (lastAllocationSize > 0 ? 1 : 0);
  List<ByteBuffer> buffers = new ArrayList<>(numAllocations);

  for (int i = 0; i < fullAllocations; i++) {
    buffers.add(options.getAllocator().allocate(options.getMaxAllocationSize()));
  }

  if (lastAllocationSize > 0) {
    buffers.add(options.getAllocator().allocate(lastAllocationSize));
  }

  for (ByteBuffer buffer : buffers) {
    f.readFully(buffer);
    buffer.flip();
  }

  return buffers;
}
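Since the split arithmetic above is easy to get wrong by one, here is a minimal standalone sketch of the same computation with hypothetical values (an 8 MB allocation cap and a 20 MB read); the numbers and class name are illustrative only, not part of the reader:

// Hypothetical illustration of the readBlocks() buffer-splitting arithmetic.
public class AllocationSplitDemo {
  public static void main(String[] args) {
    int maxAllocationSize = 8 * 1024 * 1024;   // assumed 8 MB cap per buffer
    int length = 20 * 1024 * 1024;             // assumed 20 MB chunk to read

    int fullAllocations = length / maxAllocationSize;     // 2 full buffers
    int lastAllocationSize = length % maxAllocationSize;  // 4 MB remainder
    int numAllocations = fullAllocations + (lastAllocationSize > 0 ? 1 : 0);

    // prints: 3 buffers = 2 x 8388608 bytes + 4194304 bytes
    System.out.println(numAllocations + " buffers = " + fullAllocations
        + " x " + maxAllocationSize + " bytes + " + lastAllocationSize + " bytes");
  }
}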
private static float getFloat(ParquetReadOptions options, String key, float defaultValue) {
  String value = options.getProperty(key);
  if (value != null) {
    return Float.parseFloat(value);
  } else {
    return defaultValue;
  }
}
}
private List<BlockMetaData> filterRowGroups(List<BlockMetaData> blocks) throws IOException {
  // set up data filters based on configured levels
  List<RowGroupFilter.FilterLevel> levels = new ArrayList<>();

  if (options.useStatsFilter()) {
    levels.add(STATISTICS);
  }

  if (options.useDictionaryFilter()) {
    levels.add(DICTIONARY);
  }

  FilterCompat.Filter recordFilter = options.getRecordFilter();
  if (recordFilter != null) {
    return RowGroupFilter.filterRowGroups(levels, recordFilter, blocks, this);
  }

  return blocks;
}
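As a usage sketch, the levels consulted above are controlled when the options are built. This assumes the ParquetReadOptions builder methods seen elsewhere in this section plus the standard FilterApi; the "id" column is hypothetical:

import org.apache.parquet.ParquetReadOptions;
import org.apache.parquet.filter2.compat.FilterCompat;
import static org.apache.parquet.filter2.predicate.FilterApi.eq;
import static org.apache.parquet.filter2.predicate.FilterApi.longColumn;

public class FilterLevelsDemo {
  public static void main(String[] args) {
    ParquetReadOptions options = ParquetReadOptions.builder()
        .useStatsFilter(true)        // keep STATISTICS-level row group filtering
        .useDictionaryFilter(false)  // skip DICTIONARY-level filtering
        // record filter on a hypothetical long column named "id"
        .withRecordFilter(FilterCompat.get(eq(longColumn("id"), 42L)))
        .build();
    System.out.println("record filter set: " + (options.getRecordFilter() != null));
  }
}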
public void initialize(ParquetFileReader reader, ParquetReadOptions options) {
  // copy custom configuration to the Configuration passed to the ReadSupport
  Configuration conf = new Configuration();
  if (options instanceof HadoopReadOptions) {
    conf = ((HadoopReadOptions) options).getConf();
  }
  for (String property : options.getPropertyNames()) {
    conf.set(property, options.getProperty(property));
  }

  // initialize a ReadContext for this file
  this.reader = reader;
  FileMetaData parquetFileMetadata = reader.getFooter().getFileMetaData();
  this.fileSchema = parquetFileMetadata.getSchema();
  Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
  ReadSupport.ReadContext readContext = readSupport.init(new InitContext(conf, toSetMultiMap(fileMetadata), fileSchema));
  this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
  this.requestedSchema = readContext.getRequestedSchema();
  this.columnCount = requestedSchema.getPaths().size();
  this.recordConverter = readSupport.prepareForRead(conf, fileMetadata, fileSchema, readContext);
  this.strictTypeChecking = options.isEnabled(STRICT_TYPE_CHECKING, true);
  this.total = reader.getRecordCount();
  this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(options, total);
  this.filterRecords = options.useRecordFilter();
  reader.setRequestedSchema(requestedSchema);
  LOG.info("RecordReader initialized will read a total of {} records.", total);
}
public ParquetReadOptions toReadOptions() {
  return ParquetReadOptions.builder()
      .useSignedStringMinMax(enableStringsSignedMinMax)
      .build();
}
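A brief usage sketch of the same builder call, assuming the boolean overload of useSignedStringMinMax passed through to the build() method above:

public class ReadOptionsDemo {
  public static void main(String[] args) {
    // Enable signed comparison of string min/max statistics (off by default).
    org.apache.parquet.ParquetReadOptions options =
        org.apache.parquet.ParquetReadOptions.builder()
            .useSignedStringMinMax(true)
            .build();
    System.out.println(options.useSignedStringMinMax()); // prints true
  }
}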
@Override
public void close() throws IOException {
  try {
    if (f != null) {
      f.close();
    }
  } finally {
    options.getCodecFactory().release();
  }
}
private RowRanges getRowRanges(int blockIndex) {
  RowRanges rowRanges = blockRowRanges.get(blockIndex);
  if (rowRanges == null) {
    rowRanges = ColumnIndexFilter.calculateRowRanges(options.getRecordFilter(),
        getColumnIndexStore(blockIndex), paths.keySet(), blocks.get(blockIndex).getRowCount());
    blockRowRanges.set(blockIndex, rowRanges);
  }
  return rowRanges;
}
private static final ParquetMetadata readFooter(InputFile file, ParquetReadOptions options,
    SeekableInputStream f, ParquetMetadataConverter converter) throws IOException {
  long fileLen = file.getLength();
  String filePath = file.toString();

  LOG.debug("File length {}", fileLen);

  int FOOTER_LENGTH_SIZE = 4;
  if (fileLen < MAGIC.length + FOOTER_LENGTH_SIZE + MAGIC.length) { // MAGIC + data + footer + footerIndex + MAGIC
    throw new RuntimeException(filePath + " is not a Parquet file (too small length: " + fileLen + ")");
  }

  long footerLengthIndex = fileLen - FOOTER_LENGTH_SIZE - MAGIC.length;
  LOG.debug("reading footer index at {}", footerLengthIndex);

  f.seek(footerLengthIndex);
  int footerLength = readIntLittleEndian(f);
  byte[] magic = new byte[MAGIC.length];
  f.readFully(magic);
  if (!Arrays.equals(MAGIC, magic)) {
    throw new RuntimeException(filePath + " is not a Parquet file. expected magic number at tail "
        + Arrays.toString(MAGIC) + " but found " + Arrays.toString(magic));
  }

  long footerIndex = footerLengthIndex - footerLength;
  LOG.debug("read footer length: {}, footer index: {}", footerLength, footerIndex);
  if (footerIndex < MAGIC.length || footerIndex >= footerLengthIndex) {
    throw new RuntimeException("corrupted file: the footer index is not within the file: " + footerIndex);
  }

  f.seek(footerIndex);
  return converter.readParquetMetadata(f, options.getMetadataFilter());
}
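The index arithmetic in readFooter() follows the Parquet tail layout: MAGIC, data, footer, a 4-byte little-endian footer length, then MAGIC again. A minimal standalone sketch with assumed sizes, separate from the reader itself:

// Hypothetical illustration of the tail layout validated by readFooter():
// [MAGIC][data ...][footer][4-byte footer length][MAGIC]
public class FooterIndexDemo {
  public static void main(String[] args) {
    byte[] MAGIC = {'P', 'A', 'R', '1'};
    int FOOTER_LENGTH_SIZE = 4;

    long fileLen = 1024;     // assumed total file size in bytes
    int footerLength = 300;  // assumed value read at footerLengthIndex

    long footerLengthIndex = fileLen - FOOTER_LENGTH_SIZE - MAGIC.length; // 1016
    long footerIndex = footerLengthIndex - footerLength;                  // 716

    boolean valid = footerIndex >= MAGIC.length && footerIndex < footerLengthIndex;
    System.out.println("footer starts at " + footerIndex + ", valid = " + valid);
  }
}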
public void initialize(ParquetFileReader reader, ParquetReadOptions options) {
  // copy custom configuration to the Configuration passed to the ReadSupport
  Configuration conf = new Configuration();
  if (options instanceof HadoopReadOptions) {
    conf = ((HadoopReadOptions) options).getConf();
  }
  for (String property : options.getPropertyNames()) {
    conf.set(property, options.getProperty(property));
  }

  // initialize a ReadContext for this file
  this.reader = reader;
  FileMetaData parquetFileMetadata = reader.getFooter().getFileMetaData();
  this.fileSchema = parquetFileMetadata.getSchema();
  Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
  ReadSupport.ReadContext readContext = readSupport.init(new InitContext(conf, toSetMultiMap(fileMetadata), fileSchema));
  this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
  this.requestedSchema = readContext.getRequestedSchema();
  this.columnCount = requestedSchema.getPaths().size();
  this.recordConverter = readSupport.prepareForRead(conf, fileMetadata, fileSchema, readContext);
  this.strictTypeChecking = options.isEnabled(STRICT_TYPE_CHECKING, true);
  this.total = reader.getFilteredRecordCount();
  this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(options, total);
  this.filterRecords = options.useRecordFilter();
  reader.setRequestedSchema(requestedSchema);
  LOG.info("RecordReader initialized will read a total of {} records.", total);
}
/**
 * Open a {@link InputFile file}.
 *
 * @param file an input file
 * @return an open ParquetFileReader
 * @throws IOException if there is an error while opening the file
 */
public static ParquetFileReader open(InputFile file) throws IOException {
  return new ParquetFileReader(file, ParquetReadOptions.builder().build());
}
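A usage sketch for the no-options overload above, assuming parquet-hadoop's HadoopInputFile; the file path is a placeholder:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.util.HadoopInputFile;
import org.apache.parquet.io.InputFile;

public class OpenDemo {
  public static void main(String[] args) throws Exception {
    // "data.parquet" is a hypothetical path used only for illustration.
    InputFile file = HadoopInputFile.fromPath(new Path("data.parquet"), new Configuration());
    try (ParquetFileReader reader = ParquetFileReader.open(file)) {
      System.out.println("row count: " + reader.getRecordCount());
    }
  }
}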
private void initReader() throws IOException {
  if (reader != null) {
    reader.close();
    reader = null;
  }

  if (filesIterator.hasNext()) {
    InputFile file = filesIterator.next();

    ParquetFileReader fileReader = ParquetFileReader.open(file, options);

    reader = new InternalParquetRecordReader<>(readSupport, options.getRecordFilter());
    reader.initialize(fileReader, options);
  }
}
/**
 * Reads and decompresses a dictionary page for the given column chunk.
 * <p>
 * Returns null if the given column chunk has no dictionary page.
 *
 * @param meta a column's ColumnChunkMetaData to read the dictionary from
 * @return an uncompressed DictionaryPage or null
 * @throws IOException if there is an error while reading the dictionary
 */
DictionaryPage readDictionary(ColumnChunkMetaData meta) throws IOException {
  if (!meta.getEncodings().contains(Encoding.PLAIN_DICTIONARY) &&
      !meta.getEncodings().contains(Encoding.RLE_DICTIONARY)) {
    return null;
  }

  // TODO: this should use getDictionaryPageOffset() but it isn't reliable.
  if (f.getPos() != meta.getStartingPos()) {
    f.seek(meta.getStartingPos());
  }

  PageHeader pageHeader = Util.readPageHeader(f);
  if (!pageHeader.isSetDictionary_page_header()) {
    return null; // TODO: should this complain?
  }

  DictionaryPage compressedPage = readCompressedDictionary(pageHeader, f);
  BytesInputDecompressor decompressor = options.getCodecFactory().getDecompressor(meta.getCodec());

  return new DictionaryPage(
      decompressor.decompress(compressedPage.getBytes(), compressedPage.getUncompressedSize()),
      compressedPage.getDictionarySize(),
      compressedPage.getEncoding());
}
@Override
public String getProperty(String property) {
  String value = super.getProperty(property);
  if (value != null) {
    return value;
  }
  return conf.get(property);
}
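A minimal standalone sketch of the lookup order this override implements: option-level properties win, and the Hadoop Configuration is consulted only as a fallback. The maps stand in for the ParquetReadOptions properties and the Configuration; the key is hypothetical:

import java.util.HashMap;
import java.util.Map;

public class PropertyFallbackDemo {
  static Map<String, String> optionProps = new HashMap<>(); // stands in for the options' properties
  static Map<String, String> conf = new HashMap<>();        // stands in for the Hadoop Configuration

  static String getProperty(String property) {
    String value = optionProps.get(property);          // options take precedence
    return value != null ? value : conf.get(property); // fall back to the conf
  }

  public static void main(String[] args) {
    conf.put("parquet.example.key", "from-conf");
    System.out.println(getProperty("parquet.example.key")); // prints from-conf
    optionProps.put("parquet.example.key", "from-options");
    System.out.println(getProperty("parquet.example.key")); // prints from-options
  }
}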