@Override
public void close() throws IOException {
  if (columnarBatch != null) {
    columnarBatch.close();
    columnarBatch = null;
  }
  super.close();
}
/**
 * Implementation of RecordReader API.
 */
@Override
public void initialize(InputSplit inputSplit, TaskAttemptContext taskAttemptContext)
    throws IOException, InterruptedException, UnsupportedOperationException {
  super.initialize(inputSplit, taskAttemptContext);
  initializeInternal();
}
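For context, a minimal sketch of how a reader implementing this API is typically driven. It relies only on the standard org.apache.hadoop.mapreduce.RecordReader contract (initialize, nextKeyValue, getCurrentValue, close); the helper class and method names below are illustrative, not taken from the code in this section.

import java.io.IOException;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

public class RecordReaderDriver {
  // Drains every record from one split using the RecordReader contract.
  public static <K, V> long readAll(RecordReader<K, V> reader, InputSplit split,
      TaskAttemptContext context) throws IOException, InterruptedException {
    long count = 0;
    try {
      reader.initialize(split, context);    // the overload documented above
      while (reader.nextKeyValue()) {       // advance to the next record
        V value = reader.getCurrentValue(); // a row or a columnar batch, depending on the reader
        count++;
      }
    } finally {
      reader.close();                       // frees the columnar batch (see close() above)
    }
    return count;
  }
}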
/**
 * Returns the list of files at 'path' recursively. This skips files that are ignored normally
 * by MapReduce.
 */
public static List<String> listDirectory(File path) throws IOException {
  List<String> result = new ArrayList<>();
  if (path.isDirectory()) {
    for (File f: path.listFiles()) {
      result.addAll(listDirectory(f));
    }
  } else {
    char c = path.getName().charAt(0);
    if (c != '.' && c != '_') {
      result.add(path.getAbsolutePath());
    }
  }
  return result;
}
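A brief usage sketch for listDirectory under a hypothetical directory layout: it recurses into subdirectories and keeps only leaf files whose names do not start with '.' or '_', the prefixes MapReduce normally treats as hidden or metadata files (for example _SUCCESS or .crc files). The enclosing class is not shown above, so ReaderUtils below is a placeholder name.

import java.io.File;
import java.util.List;

public class ListDirectoryExample {
  public static void main(String[] args) throws Exception {
    // Hypothetical layout under /tmp/data:
    //   part-00000.parquet        -> returned
    //   _SUCCESS                  -> skipped (leading '_')
    //   .part-00000.parquet.crc   -> skipped (leading '.')
    //   sub/part-00001.parquet    -> returned (found by recursion)
    List<String> files = ReaderUtils.listDirectory(new File("/tmp/data"));
    for (String f : files) {
      System.out.println(f);  // absolute paths of the files that were not ignored
    }
  }
}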
ReadSupport<T> readSupport = getReadSupportInstance(getReadSupportClass(configuration));
ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
    taskAttemptContext.getConfiguration(), toSetMultiMap(fileMetadata), fileSchema));
this.requestedSchema = readContext.getRequestedSchema();
String sparkRequestedSchemaString =
private void readPageV2(DataPageV2 page) throws IOException {
  this.pageValueCount = page.getValueCount();
  this.repetitionLevelColumn = createRLEIterator(descriptor.getMaxRepetitionLevel(),
      page.getRepetitionLevels(), descriptor);
  int bitWidth = BytesUtils.getWidthFromMaxInt(descriptor.getMaxDefinitionLevel());
  this.defColumn = new VectorizedRleValuesReader(bitWidth);
  this.definitionLevelColumn = new ValuesReaderIntIterator(this.defColumn);
  this.defColumn.initFromBuffer(
      this.pageValueCount, page.getDefinitionLevels().toByteArray());
  try {
    initDataReader(page.getDataEncoding(), page.getData().toByteArray(), 0);
  } catch (IOException e) {
    throw new IOException("could not read page " + page + " in col " + descriptor, e);
  }
}
private void readPageV2(DataPageV2 page) throws IOException {
  this.pageValueCount = page.getValueCount();
  this.repetitionLevelColumn = createRLEIterator(descriptor.getMaxRepetitionLevel(),
      page.getRepetitionLevels(), descriptor);
  int bitWidth = BytesUtils.getWidthFromMaxInt(descriptor.getMaxDefinitionLevel());
  // do not read the length from the stream. v2 pages handle dividing the page bytes.
  this.defColumn = new VectorizedRleValuesReader(bitWidth, false);
  this.definitionLevelColumn = new ValuesReaderIntIterator(this.defColumn);
  this.defColumn.initFromPage(
      this.pageValueCount, page.getDefinitionLevels().toInputStream());
  try {
    initDataReader(page.getDataEncoding(), page.getData().toInputStream());
  } catch (IOException e) {
    throw new IOException("could not read page " + page + " in col " + descriptor, e);
  }
}
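Both readPageV2 variants size the definition-level reader with BytesUtils.getWidthFromMaxInt(maxDefinitionLevel). As a standalone illustration (assuming the usual org.apache.parquet.bytes.BytesUtils from parquet-mr), the returned width is simply the number of bits needed to represent the largest possible level:

import org.apache.parquet.bytes.BytesUtils;

public class BitWidthExample {
  public static void main(String[] args) {
    // Bits needed to encode definition levels in the range [0, maxLevel].
    for (int maxLevel : new int[] {0, 1, 2, 3, 7, 8}) {
      System.out.println(maxLevel + " -> " + BytesUtils.getWidthFromMaxInt(maxLevel));
    }
    // Prints: 0 -> 0, 1 -> 1, 2 -> 2, 3 -> 2, 7 -> 3, 8 -> 4.
    // A required-only column (max definition level 0) therefore needs no definition-level bits.
  }
}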
/**
 * Utility API that will read all the data in path. This circumvents the need to create Hadoop
 * objects to use this class. `columns` can contain the list of columns to project.
 */
@Override
public void initialize(String path, List<String> columns)
    throws IOException, UnsupportedOperationException {
  super.initialize(path, columns);
  initializeInternal();
}
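A hedged usage sketch of this utility overload: initialize the reader with a file path and a column projection, then iterate with the RecordReader methods shown elsewhere in this section. VectorizedReader is a placeholder for the concrete class (not named here), and the path and column names are made up.

import java.util.Arrays;

public class DirectReadExample {
  public static void main(String[] args) throws Exception {
    VectorizedReader reader = new VectorizedReader();  // placeholder for the actual reader class
    try {
      // No Hadoop InputSplit or TaskAttemptContext required; project just two columns.
      reader.initialize("/tmp/data/part-00000.parquet", Arrays.asList("id", "name"));
      while (reader.nextKeyValue()) {          // standard RecordReader iteration
        Object record = reader.getCurrentValue();
        // consume `record` (a row or a columnar batch, depending on the reader's mode)
      }
    } finally {
      reader.close();                          // releases the columnar batch
    }
  }
}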
/**
 * Returns the list of files at 'path' recursively. This skips files that are ignored normally
 * by MapReduce.
 */
public static List<String> listDirectory(File path) {
  List<String> result = new ArrayList<>();
  if (path.isDirectory()) {
    for (File f: path.listFiles()) {
      result.addAll(listDirectory(f));
    }
  } else {
    char c = path.getName().charAt(0);
    if (c != '.' && c != '_') {
      result.add(path.getAbsolutePath());
    }
  }
  return result;
}