/**
 * Utility API that will read all the data in path. This circumvents the need to create Hadoop
 * objects to use this class. `columns` can contain the list of columns to project.
 */
@Override
public void initialize(String path, List<String> columns)
    throws IOException, UnsupportedOperationException {
  super.initialize(path, columns);
  initializeInternal();
}
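// A minimal usage sketch for the path-based entry point (an assumption, not from the source).
// The no-arg constructor, the file path, and the projected column names are hypothetical;
// resultBatch() and nextBatch() are the batch API shown later in this section. Imports are
// elided to match the surrounding snippets.
static void scan() throws IOException {
  VectorizedParquetRecordReader reader = new VectorizedParquetRecordReader();  // assumed ctor
  try {
    reader.initialize("/tmp/example.parquet", Arrays.asList("id", "value"));
    ColumnarBatch batch = reader.resultBatch();  // reused across all nextBatch() calls
    while (reader.nextBatch()) {
      System.out.println("rows in batch: " + batch.numRows());
    }
  } finally {
    reader.close();
  }
}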
public VectorizedRleValuesReader(int bitWidth) {
  this.fixedWidth = true;
  // A non-zero bit width implies a 4-byte length header precedes the encoded data.
  this.readLength = bitWidth != 0;
  init(bitWidth);
}
/**
 * Returns the ColumnarBatch object that will be used for all rows returned by this reader.
 * This object is reused. Calling this enables the vectorized reader. This should be called
 * before any calls to nextKeyValue/nextBatch.
 */
public ColumnarBatch resultBatch() {
  if (columnarBatch == null) initBatch();
  return columnarBatch;
}
@Override
public boolean nextKeyValue() throws IOException {
  resultBatch();
  if (returnColumnarBatch) return nextBatch();
  if (batchIdx >= numBatched) {
    if (!nextBatch()) return false;
  }
  ++batchIdx;
  return true;
}
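// A hedged sketch of row-at-a-time consumption, assuming batch returns are not enabled: each
// nextKeyValue() advances batchIdx within the current batch and transparently loads the next
// batch when the previous one is exhausted. That getCurrentValue() hands back a row view at
// batchIdx - 1 is an assumption about this API, not confirmed by the snippet above.
static void consumeRows(VectorizedParquetRecordReader reader) throws IOException {
  while (reader.nextKeyValue()) {
    Object row = reader.getCurrentValue();  // assumed: a row view into the reused batch
    // Consume `row` before the next call; the backing batch memory is recycled.
  }
}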
/**
 * For all the read*Batch functions, reads `num` values from this columnReader into column. It
 * is guaranteed that num is smaller than the number of values left in the current page.
 */
private void readBooleanBatch(int rowId, int num, WritableColumnVector column)
    throws IOException {
  if (column.dataType() != DataTypes.BooleanType) {
    throw constructConvertNotSupportedException(descriptor, column);
  }
  defColumn.readBooleans(
      num, column, rowId, maxDefLevel, (VectorizedValuesReader) dataColumn);
}
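// A sketch of how a caller can honor the page-boundary contract above. The helper names
// (valuesRead, endOfPageValueCount, readPage) are assumptions for illustration; the point is
// that each request is capped at the number of values remaining in the current page.
private void readBatch(int total, WritableColumnVector column) throws IOException {
  int rowId = 0;
  while (total > 0) {
    if (valuesRead >= endOfPageValueCount) readPage();  // assumed helper: load the next page
    int leftInPage = (int) (endOfPageValueCount - valuesRead);
    int num = Math.min(total, leftInPage);  // never read past the current page
    readBooleanBatch(rowId, num, column);
    valuesRead += num;
    rowId += num;
    total -= num;
  }
}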
private void readFloatBatch(int rowId, int num, WritableColumnVector column) throws IOException {
  // This is where we implement support for the valid type conversions.
  // TODO: support implicit cast to double?
  if (column.dataType() == DataTypes.FloatType) {
    defColumn.readFloats(
        num, column, rowId, maxDefLevel, (VectorizedValuesReader) dataColumn);
  } else {
    throw constructConvertNotSupportedException(descriptor, column);
  }
}
private void readDoubleBatch(int rowId, int num, WritableColumnVector column) throws IOException {
  // This is where we implement support for the valid type conversions.
  // TODO: implement remaining type conversions
  if (column.dataType() == DataTypes.DoubleType) {
    defColumn.readDoubles(
        num, column, rowId, maxDefLevel, (VectorizedValuesReader) dataColumn);
  } else {
    throw constructConvertNotSupportedException(descriptor, column);
  }
}
@Override
public int readValueDictionaryId() {
  // Dictionary ids use the same RLE/bit-packed integer encoding as plain integer values.
  return readInteger();
}
/**
 * Implementation of RecordReader API.
 */
@Override
public void initialize(InputSplit inputSplit, TaskAttemptContext taskAttemptContext)
    throws IOException, InterruptedException, UnsupportedOperationException {
  super.initialize(inputSplit, taskAttemptContext);
  initializeInternal();
}
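// A hedged sketch of the standard Hadoop wiring for this RecordReader entry point (assumed,
// not from the source); compare with the string-path utility initialize earlier in this
// section, which avoids constructing these objects. The file path is hypothetical and the
// reader constructor arguments are elided as version-dependent.
Configuration conf = new Configuration();
Path file = new Path("/tmp/example.parquet");  // hypothetical input
long length = FileSystem.get(conf).getFileStatus(file).getLen();
FileSplit split = new FileSplit(file, 0, length, null);
TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
VectorizedParquetRecordReader reader = new VectorizedParquetRecordReader();  // assumed ctor
reader.initialize(split, context);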
// Unlike the variant above, this constructor leaves readLength unset.
public VectorizedRleValuesReader(int bitWidth) {
  fixedWidth = true;
  init(bitWidth);
}
@Override
public boolean readBoolean() {
  // Booleans are RLE/bit-packed as 0 or 1, so decode through the integer path.
  return this.readInteger() != 0;
}
// Same as the variant above, but declaring InterruptedException to match the Hadoop
// RecordReader contract.
@Override
public boolean nextKeyValue() throws IOException, InterruptedException {
  resultBatch();
  if (returnColumnarBatch) return nextBatch();
  if (batchIdx >= numBatched) {
    if (!nextBatch()) return false;
  }
  ++batchIdx;
  return true;
}
public VectorizedRleValuesReader(int bitWidth, boolean readLength) {
  this.fixedWidth = true;
  // If true, a 4-byte length header is consumed before the encoded data when a page is read.
  this.readLength = readLength;
  init(bitWidth);
}