/**
 * Converts an iterator of rows into a single ColumnarBatch.
 */
public static ColumnarBatch toBatch(
    StructType schema, MemoryMode memMode, Iterator<Row> rows) {
  ColumnarBatch batch = ColumnarBatch.allocate(schema, memMode);
  int n = 0;
  while (rows.hasNext()) {
    Row r = rows.next();
    for (int i = 0; i < schema.fields().length; i++) {
      appendValue(batch.column(i), schema.fields()[i].dataType(), r, i);
    }
    n++;
  }
  batch.setNumRows(n);
  return batch;
}
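/*
 * Usage sketch (not part of the original source): builds a two-row batch
 * through toBatch. The schema, row values, and memory mode are illustrative
 * assumptions; ColumnarBatch is the Spark 2.x
 * org.apache.spark.sql.execution.vectorized implementation used above.
 */
import java.util.Arrays;
import java.util.Iterator;
import org.apache.spark.memory.MemoryMode;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;

static ColumnarBatch exampleToBatch() {
  StructType schema = new StructType().add("id", DataTypes.LongType);
  Iterator<Row> rows =
      Arrays.asList(RowFactory.create(1L), RowFactory.create(2L)).iterator();
  // toBatch appends every field of every row into the batch's column vectors
  // and records the final row count via setNumRows.
  return toBatch(schema, MemoryMode.ON_HEAP, rows);
}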
/**
 * Advances to the next batch of rows. Returns false if there are no more.
 */
public boolean nextBatch() throws IOException {
  columnarBatch.reset();
  if (rowsReturned >= totalRowCount) return false;
  checkEndOfRowGroup();

  int num = (int) Math.min(
      (long) columnarBatch.capacity(), totalCountLoadedSoFar - rowsReturned);
  for (int i = 0; i < columnReaders.length; ++i) {
    if (columnReaders[i] == null) continue;
    columnReaders[i].readBatch(num, columnarBatch.column(i));
  }
  rowsReturned += num;
  columnarBatch.setNumRows(num);
  numBatched = num;
  batchIdx = 0;
  return true;
}
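/*
 * Consumer sketch (illustrative, not from the original source): drains the
 * reader batch by batch and sums a long column. Assumes it lives in the same
 * reader class as nextBatch(), and that column 0 of columnarBatch holds longs.
 */
long sumFirstColumn() throws IOException {
  long sum = 0;
  while (nextBatch()) {
    // Each successful nextBatch() call refills columnarBatch with up to
    // capacity() rows from the current row group and sets the row count.
    for (int r = 0; r < columnarBatch.numRows(); r++) {
      sum += columnarBatch.column(0).getLong(r);
    }
  }
  return sum;
}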
@Override
public void close() throws IOException {
  if (columnarBatch != null) {
    columnarBatch.close();
    columnarBatch = null;
  }
  super.close();
}
columnarBatch = ColumnarBatch.allocate(batchSchema, memMode);
if (partitionColumns != null) {
  int partitionIdx = sparkSchema.fields().length;
  for (int i = 0; i < partitionColumns.fields().length; i++) {
    ColumnVectorUtils.populate(columnarBatch.column(i + partitionIdx), partitionValues, i);
    columnarBatch.column(i + partitionIdx).setIsConstant();
  }
}

// Initialize missing columns (requested but absent from the file) with nulls.
for (int i = 0; i < missingColumns.length; i++) {
  if (missingColumns[i]) {
    columnarBatch.column(i).putNulls(0, columnarBatch.capacity());
    columnarBatch.column(i).setIsConstant();
  }
}
/**
 * Adapter class which handles the columnar vector reading of CarbonData
 * based on the Spark ColumnVector and ColumnarBatch API. This proxy class
 * hides the complexity of the Spark 2.3 API changes, since the Spark
 * ColumnVector and ColumnarBatch interfaces are still evolving.
 *
 * @param memMode      whether the vectors are allocated on-heap or off-heap.
 * @param outputSchema current schema of the table being read.
 * @param rowNum       number of rows per vector batch.
 * @param useLazyLoad  whether to use lazy loading while getting the data.
 */
public CarbonVectorProxy(MemoryMode memMode, StructType outputSchema, int rowNum,
    boolean useLazyLoad) {
  columnarBatch = ColumnarBatch.allocate(outputSchema, memMode, rowNum);
  columnVectorProxies = new ColumnVectorProxy[columnarBatch.numCols()];
  for (int i = 0; i < columnVectorProxies.length; i++) {
    if (useLazyLoad) {
      columnVectorProxies[i] =
          new ColumnVectorProxyWithLazyLoad(columnarBatch.column(i), rowNum, memMode);
    } else {
      columnVectorProxies[i] =
          new ColumnVectorProxy(columnarBatch.column(i), rowNum, memMode);
    }
  }
  updateColumnVectors();
}
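/*
 * Construction sketch (illustrative): an on-heap proxy over a hypothetical
 * two-column schema with eager loading and 4096 rows per batch. All values
 * here are assumptions, not defaults from the CarbonData source.
 */
StructType outputSchema = new StructType()
    .add("id", DataTypes.LongType)
    .add("name", DataTypes.StringType);
CarbonVectorProxy proxy =
    new CarbonVectorProxy(MemoryMode.ON_HEAP, outputSchema, 4096, false);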
private boolean equals(int idx, long key1) {
  return batch.column(0).getLong(buckets[idx]) == key1;
}
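/*
 * Probe sketch (illustrative, not the map's actual lookup code): shows how
 * equals(idx, key) participates in a bounded linear probe over buckets.
 * hash(key) is a hypothetical stand-in for the map's long-hashing function,
 * assumed to return a non-negative value.
 */
private int findBucket(long key) {
  int idx = hash(key) % numBuckets;
  for (int step = 0; step < maxSteps; step++) {
    if (buckets[idx] == -1) return idx;   // empty slot: key not present yet
    if (equals(idx, key)) return idx;     // occupied slot already holds key
    idx = (idx + 1) % numBuckets;         // advance the linear probe
  }
  return -1;                              // give up after maxSteps probes
}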
private Row(ColumnarBatch parent) {
  this.parent = parent;
  this.fixedLenRowSize = UnsafeRow.calculateFixedPortionByteSize(parent.numCols());
  this.columns = parent.columns;
}
@Override
public Object getCurrentValue() throws IOException, InterruptedException {
  if (returnColumnarBatch) return columnarBatch;
  return columnarBatch.getRow(batchIdx - 1);
}
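/*
 * Caller sketch (illustrative): because getCurrentValue() returns either a
 * whole batch or a single row depending on returnColumnarBatch, callers
 * branch on the runtime type. process/processRow are hypothetical helpers.
 */
Object value = reader.getCurrentValue();
if (value instanceof ColumnarBatch) {
  process((ColumnarBatch) value);          // vectorized, whole-batch path
} else {
  processRow((ColumnarBatch.Row) value);   // row-at-a-time fallback
}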
public static ColumnarBatch allocate(StructType schema, MemoryMode memMode) {
  return new ColumnarBatch(schema, DEFAULT_BATCH_SIZE, memMode);
}
public AggregateHashMap(StructType schema, int capacity, double loadFactor, int maxSteps) {
  // We currently only support a single key-value pair, both of type long.
  assert (schema.size() == 2 && schema.fields()[0].dataType() == LongType &&
      schema.fields()[1].dataType() == LongType);

  // Capacity must be a power of 2.
  assert (capacity > 0 && ((capacity & (capacity - 1)) == 0));

  this.maxSteps = maxSteps;
  numBuckets = (int) (capacity / loadFactor);
  batch = ColumnarBatch.allocate(schema, MemoryMode.ON_HEAP, capacity);
  buckets = new int[numBuckets];
  Arrays.fill(buckets, -1);
}
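/*
 * Sizing sketch (illustrative numbers): with capacity = 1024 (a power of
 * two) and loadFactor = 0.5, numBuckets = (int) (1024 / 0.5) = 2048, so the
 * bucket array can stay at most half full. maxSteps = 2 is an arbitrary
 * probe bound chosen for this example.
 */
StructType schema = new StructType()
    .add("key", DataTypes.LongType)
    .add("count", DataTypes.LongType);
AggregateHashMap map = new AggregateHashMap(schema, 1024, 0.5, 2);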
/**
 * Returns the number of rows available for reading, including filtered rows.
 */
public int numRows() {
  return columnarBatch.capacity();
}
/**
 * Sets the number of rows in this batch.
 */
public void setNumRows(int numRows) {
  columnarBatch.setNumRows(numRows);
}
/**
 * Marks this row as being filtered out. This means a subsequent iteration
 * over the rows in this batch will not include this row.
 */
public void markFiltered() {
  parent.markFiltered(rowId);
}
// Fragment of ColumnarBatch#rowIterator(): sets up an iterator over the live
// rows of this batch; the anonymous Iterator advances past rows that have
// been marked as filtered.
final int maxRows = ColumnarBatch.this.numRows();
final Row row = new Row(this);
return new Iterator<Row>() {
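/*
 * Iteration sketch (illustrative): rows marked through Row.markFiltered()
 * are skipped by the iterator that rowIterator() returns. Assumes column 0
 * of the batch holds longs.
 */
Iterator<ColumnarBatch.Row> it = batch.rowIterator();
while (it.hasNext()) {
  ColumnarBatch.Row row = it.next();
  if (row.getLong(0) < 0) {
    row.markFiltered();  // hidden from any later iteration over this batch
  }
}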
/**
 * Returns the ColumnVector at the given ordinal from this batch of column
 * vectors.
 *
 * @param ordinal column index within the batch
 * @return the column vector at that ordinal
 */
public ColumnVector column(int ordinal) {
  return columnarBatch.column(ordinal);
}
/**
 * Resets the batch for writing.
 */
public void reset() {
  for (int i = 0; i < numCols(); ++i) {
    columns[i].reset();
  }
  if (this.numRowsFiltered > 0) {
    Arrays.fill(filteredRows, false);
  }
  this.numRows = 0;
  this.numRowsFiltered = 0;
}
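/*
 * Reuse sketch (illustrative): a batch is typically allocated once, reset()
 * before every refill, and closed at the end. reset() clears row counts and
 * filtered-row flags without reallocating the column vectors. schema,
 * numInputBatches, and fillBatch are hypothetical inputs.
 */
ColumnarBatch batch = ColumnarBatch.allocate(schema, MemoryMode.ON_HEAP);
for (int b = 0; b < numInputBatches; b++) {
  batch.reset();           // cheap: reuses the existing vectors
  fillBatch(batch, b);     // hypothetical: appends rows and sets numRows
}
batch.close();             // releases the underlying memory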