/**
 * Converts an iterator of rows into a single ColumnarBatch.
 */
public static ColumnarBatch toBatch(
    StructType schema, MemoryMode memMode, Iterator<Row> rows) {
  ColumnarBatch batch = ColumnarBatch.allocate(schema, memMode);
  int n = 0;
  while (rows.hasNext()) {
    Row r = rows.next();
    for (int i = 0; i < schema.fields().length; i++) {
      appendValue(batch.column(i), schema.fields()[i].dataType(), r, i);
    }
    n++;
  }
  batch.setNumRows(n);
  return batch;
}
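/*
 * Usage sketch (not part of the original source): builds a two-row batch
 * through toBatch. The schema, row values, and memory mode are illustrative
 * assumptions; ColumnarBatch is the Spark 2.x
 * org.apache.spark.sql.execution.vectorized implementation used above.
 */
import java.util.Arrays;
import java.util.Iterator;
import org.apache.spark.memory.MemoryMode;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;

static ColumnarBatch exampleToBatch() {
  StructType schema = new StructType().add("id", DataTypes.LongType);
  Iterator<Row> rows =
      Arrays.asList(RowFactory.create(1L), RowFactory.create(2L)).iterator();
  // toBatch appends every field of every row into the batch's column vectors
  // and records the final row count via setNumRows.
  return toBatch(schema, MemoryMode.ON_HEAP, rows);
}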
/**
 * Advances to the next batch of rows. Returns false if there are no more.
 */
public boolean nextBatch() throws IOException {
  columnarBatch.reset();
  if (rowsReturned >= totalRowCount) return false;
  checkEndOfRowGroup();

  int num = (int) Math.min(
      (long) columnarBatch.capacity(), totalCountLoadedSoFar - rowsReturned);
  for (int i = 0; i < columnReaders.length; ++i) {
    if (columnReaders[i] == null) continue;
    columnReaders[i].readBatch(num, columnarBatch.column(i));
  }
  rowsReturned += num;
  columnarBatch.setNumRows(num);
  numBatched = num;
  batchIdx = 0;
  return true;
}
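/*
 * Consumer sketch (illustrative, not from the original source): drains the
 * reader batch by batch and sums a long column. Assumes it lives in the same
 * reader class as nextBatch(), and that column 0 of columnarBatch holds longs.
 */
long sumFirstColumn() throws IOException {
  long sum = 0;
  while (nextBatch()) {
    // Each successful nextBatch() call refills columnarBatch with up to
    // capacity() rows from the current row group and sets the row count.
    for (int r = 0; r < columnarBatch.numRows(); r++) {
      sum += columnarBatch.column(0).getLong(r);
    }
  }
  return sum;
}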
@Override
public void close() throws IOException {
  if (columnarBatch != null) {
    columnarBatch.close();
    columnarBatch = null;
  }
  super.close();
}
columnarBatch = ColumnarBatch.allocate(batchSchema, memMode);
if (partitionColumns != null) {
  int partitionIdx = sparkSchema.fields().length;
  for (int i = 0; i < partitionColumns.fields().length; i++) {
    ColumnVectorUtils.populate(columnarBatch.column(i + partitionIdx), partitionValues, i);
    columnarBatch.column(i + partitionIdx).setIsConstant();
  }
}

// Initialize missing columns (requested but absent from the file) with nulls.
for (int i = 0; i < missingColumns.length; i++) {
  if (missingColumns[i]) {
    columnarBatch.column(i).putNulls(0, columnarBatch.capacity());
    columnarBatch.column(i).setIsConstant();
  }
}
/**
 * Adapter class which handles the columnar vector reading of CarbonData
 * based on the Spark ColumnVector and ColumnarBatch API. This proxy class
 * hides the complexity of the Spark 2.3 API changes, since the Spark
 * ColumnVector and ColumnarBatch interfaces are still evolving.
 *
 * @param memMode      whether the vectors are allocated on-heap or off-heap.
 * @param outputSchema current schema of the table being read.
 * @param rowNum       number of rows per vector batch.
 * @param useLazyLoad  whether to use lazy loading while getting the data.
 */
public CarbonVectorProxy(MemoryMode memMode, StructType outputSchema, int rowNum,
    boolean useLazyLoad) {
  columnarBatch = ColumnarBatch.allocate(outputSchema, memMode, rowNum);
  columnVectorProxies = new ColumnVectorProxy[columnarBatch.numCols()];
  for (int i = 0; i < columnVectorProxies.length; i++) {
    if (useLazyLoad) {
      columnVectorProxies[i] =
          new ColumnVectorProxyWithLazyLoad(columnarBatch.column(i), rowNum, memMode);
    } else {
      columnVectorProxies[i] =
          new ColumnVectorProxy(columnarBatch.column(i), rowNum, memMode);
    }
  }
  updateColumnVectors();
}
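/*
 * Construction sketch (illustrative): an on-heap proxy over a hypothetical
 * two-column schema with eager loading and 4096 rows per batch. All values
 * here are assumptions, not defaults from the CarbonData source.
 */
StructType outputSchema = new StructType()
    .add("id", DataTypes.LongType)
    .add("name", DataTypes.StringType);
CarbonVectorProxy proxy =
    new CarbonVectorProxy(MemoryMode.ON_HEAP, outputSchema, 4096, false);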
private boolean equals(int idx, long key1) {
  return batch.column(0).getLong(buckets[idx]) == key1;
}
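/*
 * Probe sketch (illustrative, not the map's actual lookup code): shows how
 * equals(idx, key) participates in a bounded linear probe over buckets.
 * hash(key) is a hypothetical stand-in for the map's long-hashing function,
 * assumed to return a non-negative value.
 */
private int findBucket(long key) {
  int idx = hash(key) % numBuckets;
  for (int step = 0; step < maxSteps; step++) {
    if (buckets[idx] == -1) return idx;   // empty slot: key not present yet
    if (equals(idx, key)) return idx;     // occupied slot already holds key
    idx = (idx + 1) % numBuckets;         // advance the linear probe
  }
  return -1;                              // give up after maxSteps probes
}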
private Row(ColumnarBatch parent) {
  this.parent = parent;
  this.fixedLenRowSize = UnsafeRow.calculateFixedPortionByteSize(parent.numCols());
  this.columns = parent.columns;
}
@Override
public Object getCurrentValue() throws IOException, InterruptedException {
  if (returnColumnarBatch) return columnarBatch;
  return columnarBatch.getRow(batchIdx - 1);
}
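/*
 * Caller sketch (illustrative): because getCurrentValue() returns either a
 * whole batch or a single row depending on returnColumnarBatch, callers
 * branch on the runtime type. process/processRow are hypothetical helpers.
 */
Object value = reader.getCurrentValue();
if (value instanceof ColumnarBatch) {
  process((ColumnarBatch) value);          // vectorized, whole-batch path
} else {
  processRow((ColumnarBatch.Row) value);   // row-at-a-time fallback
}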
public static ColumnarBatch allocate(StructType schema, MemoryMode memMode) {
  return new ColumnarBatch(schema, DEFAULT_BATCH_SIZE, memMode);
}
public AggregateHashMap(StructType schema, int capacity, double loadFactor, int maxSteps) {
  // We currently only support a single key-value pair, both of type long.
  assert (schema.size() == 2 && schema.fields()[0].dataType() == LongType &&
      schema.fields()[1].dataType() == LongType);

  // Capacity must be a power of 2.
  assert (capacity > 0 && ((capacity & (capacity - 1)) == 0));

  this.maxSteps = maxSteps;
  numBuckets = (int) (capacity / loadFactor);
  batch = ColumnarBatch.allocate(schema, MemoryMode.ON_HEAP, capacity);
  buckets = new int[numBuckets];
  Arrays.fill(buckets, -1);
}
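/*
 * Sizing sketch (illustrative numbers): with capacity = 1024 (a power of
 * two) and loadFactor = 0.5, numBuckets = (int) (1024 / 0.5) = 2048, so the
 * bucket array can stay at most half full. maxSteps = 2 is an arbitrary
 * probe bound chosen for this example.
 */
StructType schema = new StructType()
    .add("key", DataTypes.LongType)
    .add("count", DataTypes.LongType);
AggregateHashMap map = new AggregateHashMap(schema, 1024, 0.5, 2);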
/**
 * Returns the number of rows available for reading, including filtered rows.
 */
public int numRows() {
  return columnarBatch.capacity();
}
/**
 * Sets the number of rows in this batch.
 */
public void setNumRows(int numRows) {
  columnarBatch.setNumRows(numRows);
}
/**
 * Marks this row as being filtered out. This means a subsequent iteration
 * over the rows in this batch will not include this row.
 */
public void markFiltered() {
  parent.markFiltered(rowId);
}
// Fragment of ColumnarBatch#rowIterator(): sets up an iterator over the live
// rows of this batch; the anonymous Iterator advances past rows that have
// been marked as filtered.
final int maxRows = ColumnarBatch.this.numRows();
final Row row = new Row(this);
return new Iterator<Row>() {
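/*
 * Iteration sketch (illustrative): rows marked through Row.markFiltered()
 * are skipped by the iterator that rowIterator() returns. Assumes column 0
 * of the batch holds longs.
 */
Iterator<ColumnarBatch.Row> it = batch.rowIterator();
while (it.hasNext()) {
  ColumnarBatch.Row row = it.next();
  if (row.getLong(0) < 0) {
    row.markFiltered();  // hidden from any later iteration over this batch
  }
}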
/**
 * Returns the ColumnVector at the given ordinal from this batch of column
 * vectors.
 *
 * @param ordinal column index within the batch
 * @return the column vector at that ordinal
 */
public ColumnVector column(int ordinal) {
  return columnarBatch.column(ordinal);
}
/**
 * Resets the batch for writing.
 */
public void reset() {
  for (int i = 0; i < numCols(); ++i) {
    columns[i].reset();
  }
  if (this.numRowsFiltered > 0) {
    Arrays.fill(filteredRows, false);
  }
  this.numRows = 0;
  this.numRowsFiltered = 0;
}
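/*
 * Reuse sketch (illustrative): a batch is typically allocated once, reset()
 * before every refill, and closed at the end. reset() clears row counts and
 * filtered-row flags without reallocating the column vectors. schema,
 * numInputBatches, and fillBatch are hypothetical inputs.
 */
ColumnarBatch batch = ColumnarBatch.allocate(schema, MemoryMode.ON_HEAP);
for (int b = 0; b < numInputBatches; b++) {
  batch.reset();           // cheap: reuses the existing vectors
  fillBatch(batch, b);     // hypothetical: appends rows and sets numRows
}
batch.close();             // releases the underlying memory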