/**
 * Returns the ColumnarBatch object that will be used for all rows returned by this reader.
 * This object is reused. Calling this enables the vectorized reader. This should be called
 * before any calls to nextKeyValue/nextBatch.
 */
public ColumnarBatch resultBatch() {
  if (columnarBatch == null) initBatch();
  return columnarBatch;
}
@Override
public boolean nextKeyValue() throws IOException {
  resultBatch();

  if (returnColumnarBatch) return nextBatch();

  if (batchIdx >= numBatched) {
    if (!nextBatch()) return false;
  }
  ++batchIdx;
  return true;
}
/**
 * Utility API that will read all the data in path. This circumvents the need to create Hadoop
 * objects to use this class. `columns` can contain the list of columns to project.
 */
@Override
public void initialize(String path, List<String> columns) throws IOException,
    UnsupportedOperationException {
  super.initialize(path, columns);
  initializeInternal();
}
/**
 * Advances to the next batch of rows. Returns false if there are no more.
 */
public boolean nextBatch() throws IOException {
  for (WritableColumnVector vector : columnVectors) {
    vector.reset();
  }
  columnarBatch.setNumRows(0);
  if (rowsReturned >= totalRowCount) return false;
  checkEndOfRowGroup();

  int num = (int) Math.min((long) capacity, totalCountLoadedSoFar - rowsReturned);
  for (int i = 0; i < columnReaders.length; ++i) {
    if (columnReaders[i] == null) continue;
    columnReaders[i].readBatch(num, columnVectors[i]);
  }
  rowsReturned += num;
  columnarBatch.setNumRows(num);
  numBatched = num;
  batchIdx = 0;
  return true;
}
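Taken together, the methods above form a simple read loop: initialize the reader with a path and projection, grab the reused batch once via resultBatch(), then call nextBatch() until it returns false. The sketch below is illustrative only; the enclosing class is assumed to be Spark's VectorizedParquetRecordReader, the no-arg constructor and the ColumnarBatch package vary across Spark versions, and the file path and column names are made up.

import java.util.Arrays;
// In newer Spark versions ColumnarBatch lives in org.apache.spark.sql.vectorized instead.
import org.apache.spark.sql.execution.vectorized.ColumnarBatch;
import org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader;

public class VectorizedReadSketch {
  public static void main(String[] args) throws Exception {
    // Constructor arguments differ between Spark versions; a no-arg form is assumed here.
    VectorizedParquetRecordReader reader = new VectorizedParquetRecordReader();
    try {
      // The utility initialize(path, columns) avoids Hadoop InputSplit/TaskAttemptContext setup.
      reader.initialize("/tmp/example.parquet", Arrays.asList("id", "name"));
      // Enables the vectorized path; must be called before nextKeyValue/nextBatch.
      ColumnarBatch batch = reader.resultBatch();
      // Each call refills the same reused batch with the next chunk of rows.
      while (reader.nextBatch()) {
        System.out.println("rows in this batch: " + batch.numRows());
      }
    } finally {
      reader.close();
    }
  }
}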
/**
 * Implementation of RecordReader API.
 */
@Override
public void initialize(InputSplit inputSplit, TaskAttemptContext taskAttemptContext)
    throws IOException, InterruptedException, UnsupportedOperationException {
  super.initialize(inputSplit, taskAttemptContext);
  initializeInternal();
}
/**
 * Advances to the next batch of rows. Returns false if there are no more.
 */
public boolean nextBatch() throws IOException {
  columnarBatch.reset();
  if (rowsReturned >= totalRowCount) return false;
  checkEndOfRowGroup();

  int num = (int) Math.min((long) columnarBatch.capacity(), totalCountLoadedSoFar - rowsReturned);
  for (int i = 0; i < columnReaders.length; ++i) {
    if (columnReaders[i] == null) continue;
    columnReaders[i].readBatch(num, columnarBatch.column(i));
  }
  rowsReturned += num;
  columnarBatch.setNumRows(num);
  numBatched = num;
  batchIdx = 0;
  return true;
}
@Override
public boolean nextKeyValue() throws IOException, InterruptedException {
  resultBatch();

  if (returnColumnarBatch) return nextBatch();

  if (batchIdx >= numBatched) {
    if (!nextBatch()) return false;
  }
  ++batchIdx;
  return true;
}
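The RecordReader-style initialize() above is the entry point task execution would use instead of the path-based utility. Below is a rough, hedged sketch of that wiring; the FileSplit/TaskAttemptContextImpl setup, the split length, and the getCurrentValue() call are assumptions from the generic Hadoop RecordReader contract rather than anything shown in these snippets, and the no-arg reader constructor is again version-dependent.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;
import org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader;

public class RecordReaderPathSketch {
  public static void main(String[] args) throws Exception {
    // Hadoop objects the RecordReader-style initialize() expects; the values are illustrative.
    Configuration conf = new Configuration();
    long splitLength = 1024 * 1024; // illustrative; normally the actual file length
    FileSplit split =
        new FileSplit(new Path("/tmp/example.parquet"), 0, splitLength, new String[0]);
    TaskAttemptContextImpl context = new TaskAttemptContextImpl(conf, new TaskAttemptID());

    // Constructor arguments differ between Spark versions; a no-arg form is assumed here.
    VectorizedParquetRecordReader reader = new VectorizedParquetRecordReader();
    reader.initialize(split, context);
    try {
      // Row-at-a-time consumption through the generic RecordReader contract.
      while (reader.nextKeyValue()) {
        Object value = reader.getCurrentValue(); // a row, or the whole batch in columnar mode
      }
    } finally {
      reader.close();
    }
  }
}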
// Creates the reused batch with the configured memory mode and no partition columns.
private void initBatch() {
  initBatch(MEMORY_MODE, null, null);
}
// Returns the reused ColumnarBatch, creating it lazily on first call.
public ColumnarBatch resultBatch() {
  if (columnarBatch == null) initBatch();
  return columnarBatch;
}
// Creates the reused batch with the default memory mode and no partition columns.
public void initBatch() {
  initBatch(DEFAULT_MEMORY_MODE, null, null);
}
// Creates the reused batch with the given partition columns appended and filled with the
// constant partition values.
public void initBatch(StructType partitionColumns, InternalRow partitionValues) {
  initBatch(DEFAULT_MEMORY_MODE, partitionColumns, partitionValues);
}
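The last overload lets a caller append partition columns whose values are constant for the whole file. The helper below is a hypothetical sketch of supplying that schema and value row; the reader type name is assumed as in the earlier sketches, GenericInternalRow and UTF8String are assumptions about the internal-row representation expected here, and the column name `dt` is invented.

import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow;
import org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;
import org.apache.spark.unsafe.types.UTF8String;

public class PartitionBatchSketch {
  // Hypothetical helper: attach one constant partition column `dt` to an already-created reader.
  static void attachPartitionColumn(VectorizedParquetRecordReader reader, String dtValue) {
    StructType partitionSchema = new StructType().add("dt", DataTypes.StringType);
    InternalRow partitionValues =
        new GenericInternalRow(new Object[] { UTF8String.fromString(dtValue) });
    // initBatch appends `dt` to the batch schema and fills it with the constant value.
    reader.initBatch(partitionSchema, partitionValues);
  }
}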