org.apache.parquet.io.SeekableInputStream java code examples

long footerLengthIndex = stat.getLen()
  - ParquetFooterInputFromCache.FOOTER_LENGTH_SIZE - ParquetFileWriter.MAGIC.length;
stream.seek(footerLengthIndex);
int footerLength = BytesUtils.readIntLittleEndian(stream);
stream.seek(footerLengthIndex - footerLength);
if (LOG.isInfoEnabled()) {
 LOG.info("Caching the footer of length " + footerLength + " for " + cacheKey);

/**
 * Locates the footer at the tail of a Parquet file and decodes it.
 *
 * The last bytes of the file are read as a 4-byte little-endian footer length followed
 * by the magic bytes; the footer is taken to start {@code footerLength} bytes before the
 * length field.
 *
 * @param file the input file; supplies the total length and the name used in error messages
 * @param options read options; supplies the metadata filter
 * @param f an open stream over {@code file}; this method repositions it
 * @param converter converter used to decode the footer bytes
 * @return the metadata decoded from the footer
 * @throws IOException if seeking or reading the stream fails
 * @throws RuntimeException if the file is too short, the trailing magic bytes do not
 *         match, or the computed footer position falls outside the file
 */
private static final ParquetMetadata readFooter(InputFile file, ParquetReadOptions options, SeekableInputStream f, ParquetMetadataConverter converter) throws IOException {
 long totalLength = file.getLength();
 String description = file.toString();
 LOG.debug("File length {}", totalLength);

 int FOOTER_LENGTH_SIZE = 4;
 // Smallest possible layout: MAGIC + data + footer + footerIndex + MAGIC
 int minimumLength = MAGIC.length + FOOTER_LENGTH_SIZE + MAGIC.length;
 if (totalLength < minimumLength) {
  String tooSmall = description + " is not a Parquet file (too small length: " + totalLength + ")";
  throw new RuntimeException(tooSmall);
 }

 // The footer length field sits right before the trailing magic bytes.
 long lengthFieldPos = totalLength - FOOTER_LENGTH_SIZE - MAGIC.length;
 LOG.debug("reading footer index at {}", lengthFieldPos);
 f.seek(lengthFieldPos);
 int footerSize = readIntLittleEndian(f);

 byte[] tail = new byte[MAGIC.length];
 f.readFully(tail);
 boolean magicMatches = Arrays.equals(MAGIC, tail);
 if (!magicMatches) {
  String badMagic = description + " is not a Parquet file. expected magic number at tail " + Arrays.toString(MAGIC) + " but found " + Arrays.toString(tail);
  throw new RuntimeException(badMagic);
 }

 long footerStart = lengthFieldPos - footerSize;
 LOG.debug("read footer length: {}, footer index: {}", footerSize, footerStart);
 // The footer must begin after the leading magic and before its own length field.
 boolean footerInsideFile = footerStart >= MAGIC.length && footerStart < lengthFieldPos;
 if (!footerInsideFile) {
  throw new RuntimeException("corrupted file: the footer index is not within the file: " + footerStart);
 }

 f.seek(footerStart);
 return converter.readParquetMetadata(f, options.getMetadataFilter());
}

/**
 * Copy from a FS input stream to an output stream. Thread-safe
 *
 * The scratch buffer used for the transfer is obtained from {@code COPY_BUFFER.get()}.
 *
 * @param from a {@link SeekableInputStream}
 * @param to any {@link PositionOutputStream}
 * @param start where in the from stream to start copying
 * @param length the number of bytes to copy
 * @throws IOException if there is an error while reading or writing
 * @throws IllegalArgumentException if the input ends before {@code length} bytes were copied
 */
private static void copy(SeekableInputStream from, PositionOutputStream to,
             long start, long length) throws IOException{
 LOG.debug("Copying {} bytes at {} to {}" ,length , start , to.getPos());
 from.seek(start);
 long bytesCopied = 0;
 byte[] buffer = COPY_BUFFER.get();
 while (bytesCopied < length) {
  long bytesLeft = length - bytesCopied;
  // Never request more than the buffer holds or more than is still needed.
  int bytesRead = from.read(buffer, 0, (int) Math.min(buffer.length, bytesLeft));
  if (bytesRead < 0) {
   // Parenthesized so the absolute position is summed numerically rather than
   // the two numbers being concatenated as text.
   throw new IllegalArgumentException(
     "Unexpected end of input file at " + (start + bytesCopied));
  }
  to.write(buffer, 0, bytesRead);
  bytesCopied += bytesRead;
 }
}

/**
 * Reads the dictionary page of a column chunk and returns it decompressed.
 *
 * Yields {@code null} when the chunk's encodings include neither
 * {@code PLAIN_DICTIONARY} nor {@code RLE_DICTIONARY}, or when the first page header
 * found at the chunk's starting position carries no dictionary page header.
 *
 * @param meta a column's ColumnChunkMetaData to read the dictionary from
 * @return an uncompressed DictionaryPage or null
 * @throws IOException if there is an error while reading the dictionary
 */
DictionaryPage readDictionary(ColumnChunkMetaData meta) throws IOException {
 boolean usesDictionary = meta.getEncodings().contains(Encoding.PLAIN_DICTIONARY)
   || meta.getEncodings().contains(Encoding.RLE_DICTIONARY);
 if (!usesDictionary) {
  return null;
 }

 // TODO: this should use getDictionaryPageOffset() but it isn't reliable.
 boolean alreadyPositioned = f.getPos() == meta.getStartingPos();
 if (!alreadyPositioned) {
  f.seek(meta.getStartingPos());
 }

 PageHeader header = Util.readPageHeader(f);
 if (header.isSetDictionary_page_header()) {
  DictionaryPage rawPage = readCompressedDictionary(header, f);
  BytesInputDecompressor codec = options.getCodecFactory().getDecompressor(meta.getCodec());
  return new DictionaryPage(
    codec.decompress(rawPage.getBytes(), rawPage.getUncompressedSize()),
    rawPage.getDictionarySize(),
    rawPage.getEncoding());
 }
 return null; // TODO: should this complain?
}

/**
 * Returns the next {@code size} bytes of the chunk as a {@link BytesInput}.
 *
 * When {@code stream} has fewer than {@code size} bytes available, the remainder is
 * read directly from {@code f} and appended; otherwise the call is delegated to the
 * superclass.
 *
 * @param size the number of bytes requested
 * @return a BytesInput over the requested bytes
 * @throws IOException if reading from either source fails
 */
public BytesInput readAsBytesInput(int size) throws IOException {
 int available = stream.available();
 if (size > available) {
  // this is to workaround a bug where the compressedLength
  // of the chunk is missing the size of the header of the dictionary
  // to allow reading older files (using dictionary) we need this.
  // usually 13 to 19 bytes are missing
  int missingBytes = size - available;
  LOG.info("completed the column chunk with {} bytes", missingBytes);
  List<ByteBuffer> buffers = new ArrayList<>();
  buffers.addAll(stream.sliceBuffers(available));
  ByteBuffer lastBuffer = ByteBuffer.allocate(missingBytes);
  f.readFully(lastBuffer);
  // Flip after filling, as readBlocks does, so the bytes just read are
  // exposed between position and limit instead of the buffer appearing empty.
  lastBuffer.flip();
  buffers.add(lastBuffer);
  return BytesInput.from(buffers);
 }
 return super.readAsBytesInput(size);
}

/**
 * Loads {@code pageLength} bytes from {@code input} into {@code target}.
 *
 * The target is cleared first and, once filled, its writer index is set to
 * {@code pageLength}.
 *
 * @param target buffer that receives the page bytes
 * @param pageLength number of bytes to load
 * @throws IOException if the stream cannot supply {@code pageLength} bytes or reading fails
 */
public void loadPage(DrillBuf target, int pageLength) throws IOException {
 target.clear();
 // A single read() may return fewer bytes than requested; readFully keeps reading
 // until the buffer is full, so the writer index set below matches the data present.
 HadoopStreams.wrap(input).readFully(target.nioBuffer(0, pageLength));
 target.writerIndex(pageLength);
}

/**
 * Closes the underlying stream when one is present and always releases the codec
 * factory, even if closing the stream throws.
 *
 * @throws IOException if closing the stream fails
 */
@Override
public void close() throws IOException {
 try {
  if (f == null) {
   // Nothing to close; the finally clause still releases the codec factory.
   return;
  }
  f.close();
 } finally {
  options.getCodecFactory().release();
 }
}

/**
 * Locates the footer at the tail of a Parquet file and decodes it.
 *
 * The last bytes of the file are read as a 4-byte little-endian footer length followed
 * by the magic bytes; the footer is taken to start {@code footerLength} bytes before the
 * length field.
 *
 * @param file the input file; supplies the total length and the name used in error messages
 * @param options read options; supplies the metadata filter
 * @param f an open stream over {@code file}; this method repositions it
 * @param converter converter used to decode the footer bytes
 * @return the metadata decoded from the footer
 * @throws IOException if seeking or reading the stream fails
 * @throws RuntimeException if the file is too short, the trailing magic bytes do not
 *         match, or the computed footer position falls outside the file
 */
private static final ParquetMetadata readFooter(InputFile file, ParquetReadOptions options, SeekableInputStream f, ParquetMetadataConverter converter) throws IOException {
 long totalLength = file.getLength();
 String description = file.toString();
 LOG.debug("File length {}", totalLength);

 int FOOTER_LENGTH_SIZE = 4;
 // Smallest possible layout: MAGIC + data + footer + footerIndex + MAGIC
 int minimumLength = MAGIC.length + FOOTER_LENGTH_SIZE + MAGIC.length;
 if (totalLength < minimumLength) {
  String tooSmall = description + " is not a Parquet file (too small length: " + totalLength + ")";
  throw new RuntimeException(tooSmall);
 }

 // The footer length field sits right before the trailing magic bytes.
 long lengthFieldPos = totalLength - FOOTER_LENGTH_SIZE - MAGIC.length;
 LOG.debug("reading footer index at {}", lengthFieldPos);
 f.seek(lengthFieldPos);
 int footerSize = readIntLittleEndian(f);

 byte[] tail = new byte[MAGIC.length];
 f.readFully(tail);
 boolean magicMatches = Arrays.equals(MAGIC, tail);
 if (!magicMatches) {
  String badMagic = description + " is not a Parquet file. expected magic number at tail " + Arrays.toString(MAGIC) + " but found " + Arrays.toString(tail);
  throw new RuntimeException(badMagic);
 }

 long footerStart = lengthFieldPos - footerSize;
 LOG.debug("read footer length: {}, footer index: {}", footerSize, footerStart);
 // The footer must begin after the leading magic and before its own length field.
 boolean footerInsideFile = footerStart >= MAGIC.length && footerStart < lengthFieldPos;
 if (!footerInsideFile) {
  throw new RuntimeException("corrupted file: the footer index is not within the file: " + footerStart);
 }

 f.seek(footerStart);
 return converter.readParquetMetadata(f, options.getMetadataFilter());
}

/**
 * Copy from a FS input stream to an output stream. Thread-safe
 *
 * The scratch buffer used for the transfer is obtained from {@code COPY_BUFFER.get()}.
 *
 * @param from a {@link SeekableInputStream}
 * @param to any {@link PositionOutputStream}
 * @param start where in the from stream to start copying
 * @param length the number of bytes to copy
 * @throws IOException if there is an error while reading or writing
 * @throws IllegalArgumentException if the input ends before {@code length} bytes were copied
 */
private static void copy(SeekableInputStream from, PositionOutputStream to,
             long start, long length) throws IOException{
 LOG.debug("Copying {} bytes at {} to {}" ,length , start , to.getPos());
 from.seek(start);
 long bytesCopied = 0;
 byte[] buffer = COPY_BUFFER.get();
 while (bytesCopied < length) {
  long bytesLeft = length - bytesCopied;
  // Never request more than the buffer holds or more than is still needed.
  int bytesRead = from.read(buffer, 0, (int) Math.min(buffer.length, bytesLeft));
  if (bytesRead < 0) {
   // Parenthesized so the absolute position is summed numerically rather than
   // the two numbers being concatenated as text.
   throw new IllegalArgumentException(
     "Unexpected end of input file at " + (start + bytesCopied));
  }
  to.write(buffer, 0, bytesRead);
  bytesCopied += bytesRead;
 }
}

/**
 * Reads the dictionary page of a column chunk and returns it decompressed.
 *
 * Yields {@code null} when the chunk's encodings include neither
 * {@code PLAIN_DICTIONARY} nor {@code RLE_DICTIONARY}, or when the first page header
 * found at the chunk's starting position carries no dictionary page header.
 *
 * @param meta a column's ColumnChunkMetaData to read the dictionary from
 * @return an uncompressed DictionaryPage or null
 * @throws IOException if there is an error while reading the dictionary
 */
DictionaryPage readDictionary(ColumnChunkMetaData meta) throws IOException {
 boolean usesDictionary = meta.getEncodings().contains(Encoding.PLAIN_DICTIONARY)
   || meta.getEncodings().contains(Encoding.RLE_DICTIONARY);
 if (!usesDictionary) {
  return null;
 }

 // TODO: this should use getDictionaryPageOffset() but it isn't reliable.
 boolean alreadyPositioned = f.getPos() == meta.getStartingPos();
 if (!alreadyPositioned) {
  f.seek(meta.getStartingPos());
 }

 PageHeader header = Util.readPageHeader(f);
 if (header.isSetDictionary_page_header()) {
  DictionaryPage rawPage = readCompressedDictionary(header, f);
  BytesInputDecompressor codec = options.getCodecFactory().getDecompressor(meta.getCodec());
  return new DictionaryPage(
    codec.decompress(rawPage.getBytes(), rawPage.getUncompressedSize()),
    rawPage.getDictionarySize(),
    rawPage.getEncoding());
 }
 return null; // TODO: should this complain?
}

/**
 * Returns the next {@code size} bytes of the chunk as a {@link BytesInput}.
 *
 * When {@code stream} has fewer than {@code size} bytes available, the remainder is
 * read directly from {@code f} and appended; otherwise the call is delegated to the
 * superclass.
 *
 * @param size the number of bytes requested
 * @return a BytesInput over the requested bytes
 * @throws IOException if reading from either source fails
 */
public BytesInput readAsBytesInput(int size) throws IOException {
 int available = stream.available();
 if (size > available) {
  // this is to workaround a bug where the compressedLength
  // of the chunk is missing the size of the header of the dictionary
  // to allow reading older files (using dictionary) we need this.
  // usually 13 to 19 bytes are missing
  int missingBytes = size - available;
  LOG.info("completed the column chunk with {} bytes", missingBytes);
  List<ByteBuffer> buffers = new ArrayList<>();
  buffers.addAll(stream.sliceBuffers(available));
  ByteBuffer lastBuffer = ByteBuffer.allocate(missingBytes);
  f.readFully(lastBuffer);
  // Flip after filling, as readBlocks does, so the bytes just read are
  // exposed between position and limit instead of the buffer appearing empty.
  lastBuffer.flip();
  buffers.add(lastBuffer);
  return BytesInput.from(buffers);
 }
 return super.readAsBytesInput(size);
}

/**
 * Reads {@code len} bytes from the stream returned by {@code getInputStream()} into
 * the NIO view {@code buf.nioBuffer(0, len)}, looping until the full amount has arrived.
 *
 * The buffer is cleared before reading and its writer index is set to {@code len}
 * on success. Each underlying read is timed and reported at trace level.
 *
 * @param buf destination buffer
 * @param off not used by this implementation
 * @param len number of bytes to read
 * @return {@code len} on success, or the negative value returned by the underlying
 *         read if it reports one before {@code len} bytes were read
 * @throws IOException if the underlying read fails
 */
public synchronized int read(DrillBuf buf, int off, int len) throws IOException {
 buf.clear();
 ByteBuffer destination = buf.nioBuffer(0, len);
 SeekableInputStream in = HadoopStreams.wrap(getInputStream());
 int remaining = len;
 while (remaining > 0) {
  if (logger.isTraceEnabled()) {
   logger.trace("PERF: Disk read start. {}, StartOffset: {}, TotalByteSize: {}", this.streamId, this.startOffset, this.totalByteSize);
  }
  Stopwatch watch = Stopwatch.createStarted();
  int count = in.read(destination);
  if (count < 0) {
   // Hand the negative result straight back to the caller.
   return count;
  }
  remaining -= count;
  if (!logger.isTraceEnabled()) {
   continue;
  }
  double elapsedMillis = watch.elapsed(TimeUnit.MICROSECONDS) / 1000.0;
  logger.trace(
    "PERF: Disk read complete. {}, StartOffset: {}, TotalByteSize: {}, BytesRead: {}, Time: {} ms",
    this.streamId, this.startOffset, this.totalByteSize, count,
    elapsedMillis);
 }
 buf.writerIndex(len);
 return len;
}

/**
 * Closes the underlying stream when one is present and always releases the codec
 * factory, even if closing the stream throws.
 *
 * @throws IOException if closing the stream fails
 */
@Override
public void close() throws IOException {
 try {
  if (f == null) {
   // Nothing to close; the finally clause still releases the codec factory.
   return;
  }
  f.close();
 } finally {
  options.getCodecFactory().release();
 }
}

/**
 * Reads {@code length} bytes starting at {@code offset} into a list of buffers, none
 * larger than the maximum allocation size configured in {@code options}.
 *
 * All buffers are allocated up front, then filled in order and flipped.
 *
 * @param f file to read the blocks from
 * @param offset position in {@code f} at which reading starts
 * @param length total number of bytes to read
 * @return the ByteBuffer blocks
 * @throws IOException if there is an error while reading from the stream
 */
List<ByteBuffer> readBlocks(SeekableInputStream f, long offset, int length) throws IOException {
 f.seek(offset);

 int maxAllocation = options.getMaxAllocationSize();
 int wholeChunks = length / maxAllocation;
 int remainder = length % maxAllocation;
 boolean hasRemainder = remainder > 0;

 List<ByteBuffer> buffers = new ArrayList<>(hasRemainder ? wholeChunks + 1 : wholeChunks);
 int allocated = 0;
 while (allocated < wholeChunks) {
  buffers.add(options.getAllocator().allocate(maxAllocation));
  allocated++;
 }
 if (hasRemainder) {
  // A final, smaller buffer holds whatever does not fill a whole chunk.
  buffers.add(options.getAllocator().allocate(remainder));
 }

 for (int i = 0; i < buffers.size(); i++) {
  ByteBuffer block = buffers.get(i);
  f.readFully(block);
  block.flip();
 }
 return buffers;
}

/**
 * Reads the still-compressed bytes of a dictionary page from the stream.
 *
 * Exactly {@code pageHeader.getCompressed_page_size()} bytes are read from the
 * current position of {@code fin}.
 *
 * @param pageHeader header describing the dictionary page
 * @param fin stream to read the page bytes from
 * @return a DictionaryPage wrapping the bytes as read, with the sizes, value count and
 *         encoding taken from the header
 * @throws IOException if the page bytes cannot be read in full
 */
private DictionaryPage readCompressedDictionary(
  PageHeader pageHeader, SeekableInputStream fin) throws IOException {
 DictionaryPageHeader header = pageHeader.getDictionary_page_header();
 int uncompressedSize = pageHeader.getUncompressed_page_size();

 byte[] payload = new byte[pageHeader.getCompressed_page_size()];
 fin.readFully(payload);

 return new DictionaryPage(
   BytesInput.from(payload),
   uncompressedSize,
   header.getNum_values(),
   converter.getEncoding(header.getEncoding()));
}

/**
 * Reads the offset index of a column chunk, if the chunk references one.
 *
 * @param column
 *          the column chunk which the offset index is to be returned for
 * @return the offset index for the specified column chunk or {@code null} if there is no index
 * @throws IOException
 *           if any I/O error occurs during reading the file
 */
@Private
public OffsetIndex readOffsetIndex(ColumnChunkMetaData column) throws IOException {
 IndexReference reference = column.getOffsetIndexReference();
 if (reference != null) {
  // Jump to where the index is stored and decode it from there.
  f.seek(reference.getOffset());
  return ParquetMetadataConverter.fromParquetOffsetIndex(Util.readOffsetIndex(f));
 }
 return null;
}

if (bytesToRead > 0) {
 try {
  nBytes = HadoopStreams.wrap(getInputStream()).read(directBuffer);
 } catch (Exception e) {
  logger.error("Error reading from stream {}. Error was : {}", this.streamId, e.getMessage());

/**
 * Opens a stream on {@code file}, reads its footer and prepares the row-group state
 * derived from it.
 *
 * If reading the footer fails, the newly opened stream is closed before the failure
 * is rethrown.
 *
 * @param file the file to read
 * @param options options used for the metadata converter and the footer read
 * @throws IOException if the stream cannot be opened or the footer cannot be read
 */
public ParquetFileReader(InputFile file, ParquetReadOptions options) throws IOException {
 this.converter = new ParquetMetadataConverter(options);
 this.file = file;
 this.f = file.newStream();
 this.options = options;
 try {
  this.footer = readFooter(file, options, f, converter);
 } catch (Exception e) {
  // In case that reading footer throws an exception in the constructor, the new stream
  // should be closed. Otherwise, there's no way to close this outside.
  try {
   f.close();
  } catch (Exception closeFailure) {
   // Keep the footer failure as the primary error; a failing close must not replace it.
   e.addSuppressed(closeFailure);
  }
  throw e;
 }
 this.fileMetaData = footer.getFileMetaData();
 this.blocks = filterRowGroups(footer.getBlocks());
 this.blockIndexStores = listWithNulls(this.blocks.size());
 this.blockRowRanges = listWithNulls(this.blocks.size());
 // Index every schema column by its path.
 for (ColumnDescriptor col : footer.getFileMetaData().getSchema().getColumns()) {
  paths.put(ColumnPath.get(col.getPath()), col);
 }
}

// Move the stream to offset, read into buffer from there, then flip the buffer so
// the bytes just written can be read back starting at its beginning.
f.seek(offset);
 f.readFully(buffer);
 buffer.flip();

/**
 * Reads the still-compressed bytes of a dictionary page from the stream.
 *
 * Exactly {@code pageHeader.getCompressed_page_size()} bytes are read from the
 * current position of {@code fin}.
 *
 * @param pageHeader header describing the dictionary page
 * @param fin stream to read the page bytes from
 * @return a DictionaryPage wrapping the bytes as read, with the sizes, value count and
 *         encoding taken from the header
 * @throws IOException if the page bytes cannot be read in full
 */
private DictionaryPage readCompressedDictionary(
  PageHeader pageHeader, SeekableInputStream fin) throws IOException {
 DictionaryPageHeader header = pageHeader.getDictionary_page_header();
 int uncompressedSize = pageHeader.getUncompressed_page_size();

 byte[] payload = new byte[pageHeader.getCompressed_page_size()];
 fin.readFully(payload);

 return new DictionaryPage(
   BytesInput.from(payload),
   uncompressedSize,
   header.getNum_values(),
   converter.getEncoding(header.getEncoding()));
}

Javadoc

SeekableInputStream is an interface with the methods needed by Parquet to read data from a file or Hadoop data stream.

Most used methods

read
readFully
Read len bytes of data into an array, at position start. This method will block until len bytes are
seek
Seek to a new position in the InputStream.
close
getPos
Return the current position in the InputStream.

Popular in Java

Updating database using SQL prepared statement
getSystemService (Context)
onCreateOptionsMenu (Activity)
startActivity (Activity)
FileWriter (java.io)
A specialized Writer that writes to a file in the file system. All write requests made by calling me
SecureRandom (java.security)
This class generates cryptographically secure pseudo-random numbers. It is best to invoke SecureRand
NumberFormat (java.text)
The abstract base class for all number formats. This class provides the interface for formatting and
ArrayList (java.util)
ArrayList is an implementation of List, backed by an array. All optional operations including adding
Scanner (java.util)
A parser that parses a text string of primitive types and strings with the help of regular expressio
Rectangle (java.awt)
A Rectangle specifies an area in a coordinate space that is enclosed by the Rectangle object's top-
Best plugins for Eclipse

How to use SeekableInputStream in org.apache.parquet.io

Best Java code snippets using org.apache.parquet.io.SeekableInputStream (Showing top 20 results out of 315)

How to use
SeekableInputStream
in
org.apache.parquet.io