@Override
public Tuple2<org.apache.hadoop.io.Text, org.apache.hadoop.io.Text> call(
        Tuple2<ByteArray, Object[]> tuple2) throws Exception {
    if (initialized == false) {
        synchronized (SparkCubingByLayer.class) {
            if (initialized == false) {
                KylinConfig kylinConfig = AbstractHadoopJob.loadKylinConfigFromHdfs(sConf, metaUrl);
                try (KylinConfig.SetAndUnsetThreadLocalConfig autoUnset = KylinConfig
                        .setAndUnsetThreadLocalConfig(kylinConfig)) {
                    CubeDesc desc = CubeDescManager.getInstance(kylinConfig).getCubeDesc(cubeName);
                    codec = new BufferedMeasureCodec(desc.getMeasures());
                    initialized = true;
                }
            }
        }
    }
    ByteBuffer valueBuf = codec.encode(tuple2._2());
    org.apache.hadoop.io.Text textResult = new org.apache.hadoop.io.Text();
    textResult.set(valueBuf.array(), 0, valueBuf.position());
    return new Tuple2<>(new org.apache.hadoop.io.Text(tuple2._1().array()), textResult);
}
}).saveAsNewAPIHadoopDataset(job.getConfiguration());
@Override
public void doReduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
    aggs.reset();
    for (Text value : values) {
        if (vcounter++ % BatchConstants.NORMAL_RECORD_LOG_THRESHOLD == 0) {
            logger.info("Handling value with ordinal (This is not KV number!): " + vcounter);
        }
        codec.decode(ByteBuffer.wrap(value.getBytes(), 0, value.getLength()), input);
        aggs.aggregate(input, needAggrMeasures);
    }
    aggs.collectStates(result);
    ByteBuffer valueBuf = codec.encode(result);
    outputValue.set(valueBuf.array(), 0, valueBuf.position());
    context.write(key, outputValue);
}
}
public BufferedMeasureCodec createMeasureCodec() {
    DataType[] metricTypes = new DataType[aggrMetrics.trueBitCount()];
    for (int i = 0; i < metricTypes.length; i++) {
        metricTypes[i] = info.getColumnType(aggrMetrics.trueBitAt(i));
    }
    BufferedMeasureCodec codec = new BufferedMeasureCodec(metricTypes);
    codec.setBufferSize(info.getMaxColumnLength(aggrMetrics));
    return codec;
}
@Override
protected GTRecord finalizeResult(GTRecord record, Object[] aggStates) {
    // 1. load dimensions
    for (int c : dimensions) {
        returnRecord.cols[c] = record.cols[c];
    }
    // 2. serialize metrics
    byte[] bytes = measureCodec.encode(aggStates).array();
    int[] sizes = measureCodec.getMeasureSizes();
    // 3. load metrics
    int offset = 0;
    for (int i = 0; i < metrics.trueBitCount(); i++) {
        int c = metrics.trueBitAt(i);
        returnRecord.cols[c].reset(bytes, offset, sizes[i]);
        offset += sizes[i];
    }
    return returnRecord;
}
}
public BaseCuboidBuilder(KylinConfig kylinConfig, CubeDesc cubeDesc, CubeSegment cubeSegment,
        CubeJoinedFlatTableEnrich intermediateTableDesc, AbstractRowKeyEncoder rowKeyEncoder,
        MeasureIngester<?>[] aggrIngesters, Map<TblColRef, Dictionary<String>> dictionaryMap) {
    this.kylinConfig = kylinConfig;
    this.cubeDesc = cubeDesc;
    this.cubeSegment = cubeSegment;
    this.intermediateTableDesc = intermediateTableDesc;
    this.dictionaryMap = dictionaryMap;
    this.rowKeyEncoder = rowKeyEncoder;
    this.aggrIngesters = aggrIngesters;

    measureDescList = cubeDesc.getMeasures();
    measureCodec = new BufferedMeasureCodec(measureDescList);
    kvBuilder = new KeyValueBuilder(intermediateTableDesc);
}
@Test
public void basicTest() {
    MeasureDesc[] descs = new MeasureDesc[] { measure("double"), measure("long"), measure("decimal"),
            measure("HLLC16"), measure("bitmap") };
    BufferedMeasureCodec codec = new BufferedMeasureCodec(descs);

    Double d = new Double(1.0);
    Long l = new Long(2);
    BigDecimal b = new BigDecimal("333.1234");
    HLLCounter hllc = new HLLCounter(16);
    hllc.add("1234567");
    hllc.add("abcdefg");
    BitmapCounter bitmap = RoaringBitmapCounterFactory.INSTANCE.newBitmap();
    bitmap.add(123);
    bitmap.add(45678);
    bitmap.add(Integer.MAX_VALUE - 10);
    Object[] values = new Object[] { d, l, b, hllc, bitmap };

    ByteBuffer buf = codec.encode(values);
    buf.flip();
    System.out.println("size: " + buf.limit());

    Object[] copy = new Object[values.length];
    codec.decode(buf, copy);

    for (int i = 0; i < values.length; i++) {
        Object x = values[i];
        Object y = copy[i];
        assertEquals(x, y);
    }
}
public KeyValue create(byte[] keyBytes, int keyOffset, int keyLength, Object[] measureValues) {
    for (int i = 0; i < colValues.length; i++) {
        colValues[i] = measureValues[refIndex[i]];
    }
    ByteBuffer valueBuf = codec.encode(colValues);
    return create(keyBytes, keyOffset, keyLength, valueBuf.array(), 0, valueBuf.position());
}
private void enqueueFromDump(int index) {
    if (dumpIterators.get(index) != null && dumpIterators.get(index).hasNext()) {
        Pair<byte[], byte[]> pair = dumpIterators.get(index).next();
        minHeap.offer(new SimpleEntry(pair.getFirst(), index));
        Object[] metricValues = new Object[metrics.trueBitCount()];
        measureCodec.decode(ByteBuffer.wrap(pair.getSecond()), metricValues);
        dumpCurrentValues.set(index, metricValues);
    }
}
public ByteBuffer encode(Object[] values) {
    if (buf == null) {
        setBufferSize(DEFAULT_BUFFER_SIZE);
    }

    assert values.length == codec.getMeasuresCount();

    while (true) {
        try {
            buf.clear();
            for (int i = 0, pos = 0; i < codec.getMeasuresCount(); i++) {
                codec.encode(i, values[i], buf);
                // record the encoded length of each individual measure
                measureSizes[i] = buf.position() - pos;
                pos = buf.position();
            }
            return buf;
        } catch (BufferOverflowException boe) {
            // double the buffer and retry, up to MAX_BUFFER_SIZE
            if (buf.capacity() >= MAX_BUFFER_SIZE)
                throw boe;
            setBufferSize(buf.capacity() * 2);
        }
    }
}
}
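// A minimal usage sketch, not taken from the Kylin sources: `measures` and `values` are assumed
// to be the cube's List<MeasureDesc> and a matching Object[] of measure values. encode() returns
// the codec's internal buffer positioned at the end of the encoded bytes, so callers copy the
// bytes out (as the Spark and MapReduce snippets above do) before encoding the next row.
BufferedMeasureCodec codec = new BufferedMeasureCodec(measures);
ByteBuffer buf = codec.encode(values);
byte[] encoded = new byte[buf.position()];
System.arraycopy(buf.array(), 0, encoded, 0, buf.position());

Object[] decoded = new Object[measures.size()];
codec.decode(ByteBuffer.wrap(encoded), decoded); // round-trips back to the measure objects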
@Override
protected void doSetup(Context context) throws IOException {
    super.bindCurrentConfiguration(context.getConfiguration());
    KylinConfig config = AbstractHadoopJob.loadKylinPropsAndMetadata();

    String cubeName = context.getConfiguration().get(BatchConstants.CFG_CUBE_NAME).toUpperCase(Locale.ROOT);
    CubeInstance cube = CubeManager.getInstance(config).getCube(cubeName);
    CubeDesc cubeDesc = cube.getDescriptor();
    List<MeasureDesc> measuresDescs = cubeDesc.getMeasures();

    codec = new BufferedMeasureCodec(measuresDescs);
    aggs = new MeasureAggregators(measuresDescs);

    input = new Object[measuresDescs.size()];
    result = new Object[measuresDescs.size()];
    outputKey = new Text();
    outputValue = new Text();
}
public ByteBuffer buildValue(String[] flatRow) {
    return measureCodec.encode(buildValueObjects(flatRow));
}
void load(byte[] key, MeasureAggregator[] value) {
    int offset = 0;
    for (int i = 0; i < dimensions.trueBitCount(); i++) {
        int c = dimensions.trueBitAt(i);
        final int columnLength = info.codeSystem.maxCodeLength(c);
        record.cols[c].reset(key, offset, columnLength);
        offset += columnLength;
    }
    for (int i = 0; i < value.length; i++) {
        tmpValues[i] = value[i].getState();
    }
    byte[] bytes = measureCodec.encode(tmpValues).array();
    int[] sizes = measureCodec.getMeasureSizes();
    offset = 0;
    for (int i = 0; i < value.length; i++) {
        int col = metrics.trueBitAt(i);
        record.cols[col].reset(bytes, offset, sizes[i]);
        offset += sizes[i];
    }
}
}
/**
 * Re-encode with measures in Object[] format.
 * @param key
 * @param value
 * @return
 * @throws IOException
 */
public Pair<Text, Object[]> reEncode2(Text key, Text value) throws IOException {
    if (initialized == false) {
        throw new IllegalStateException("Not initialized");
    }

    Object[] measureObjs = new Object[measureDescs.size()];
    codec.decode(ByteBuffer.wrap(value.getBytes(), 0, value.getLength()), measureObjs);

    // re-encode measures if dictionary is used
    if (dictMeasures.size() > 0) {
        for (Pair<Integer, MeasureIngester> pair : dictMeasures) {
            int i = pair.getFirst();
            MeasureIngester ingester = pair.getSecond();
            measureObjs[i] = ingester.reEncodeDictionary(measureObjs[i], measureDescs.get(i), oldDicts, newDicts);
        }
    }
    return Pair.newPair(processKey(key), measureObjs);
}
@Override
public Tuple2<org.apache.hadoop.io.Text, org.apache.hadoop.io.Text> call(Tuple2<Text, Object[]> tuple2)
        throws Exception {
    if (initialized == false) {
        synchronized (SparkCubingMerge.class) {
            if (initialized == false) {
                KylinConfig kylinConfig = AbstractHadoopJob.loadKylinConfigFromHdfs(sConf, metaUrl);
                try (KylinConfig.SetAndUnsetThreadLocalConfig autoUnset = KylinConfig
                        .setAndUnsetThreadLocalConfig(kylinConfig)) {
                    CubeDesc desc = CubeDescManager.getInstance(kylinConfig).getCubeDesc(cubeName);
                    codec = new BufferedMeasureCodec(desc.getMeasures());
                    initialized = true;
                }
            }
        }
    }
    ByteBuffer valueBuf = codec.encode(tuple2._2());
    byte[] encodedBytes = new byte[valueBuf.position()];
    System.arraycopy(valueBuf.array(), 0, encodedBytes, 0, valueBuf.position());
    return new Tuple2<>(tuple2._1(), new org.apache.hadoop.io.Text(encodedBytes));
}
};
@Override
protected void doSetup(Context context) throws IOException {
    super.bindCurrentConfiguration(context.getConfiguration());
    cubeName = context.getConfiguration().get(BatchConstants.CFG_CUBE_NAME).toUpperCase(Locale.ROOT);
    // only used in Build job, not in Merge job
    cuboidLevel = context.getConfiguration().getInt(BatchConstants.CFG_CUBE_CUBOID_LEVEL, 0);

    KylinConfig config = AbstractHadoopJob.loadKylinPropsAndMetadata();
    cubeDesc = CubeManager.getInstance(config).getCube(cubeName).getDescriptor();
    measuresDescs = cubeDesc.getMeasures();

    codec = new BufferedMeasureCodec(measuresDescs);
    aggs = new MeasureAggregators(measuresDescs);

    input = new Object[measuresDescs.size()];
    result = new Object[measuresDescs.size()];

    List<Integer> needAggMeasuresList = Lists.newArrayList();
    for (int i = 0; i < measuresDescs.size(); i++) {
        if (cuboidLevel == 0) {
            needAggMeasuresList.add(i);
        } else {
            if (!measuresDescs.get(i).getFunction().getMeasureType().onlyAggrInBaseCuboid()) {
                needAggMeasuresList.add(i);
            }
        }
    }

    needAggrMeasures = new int[needAggMeasuresList.size()];
    for (int i = 0; i < needAggMeasuresList.size(); i++) {
        needAggrMeasures[i] = needAggMeasuresList.get(i);
    }
}
@Override
public void doReduce(ByteArrayWritable key, Iterable<ByteArrayWritable> values, Context context)
        throws IOException, InterruptedException {
    aggs.reset();
    for (ByteArrayWritable value : values) {
        if (vcounter++ % BatchConstants.NORMAL_RECORD_LOG_THRESHOLD == 0) {
            logger.info("Handling value with ordinal (This is not KV number!): " + vcounter);
        }
        codec.decode(value.asBuffer(), input);
        aggs.aggregate(input);
    }
    aggs.collectStates(result);

    // output key
    outputKey.set(key.array(), key.offset(), key.length());

    // output value
    ByteBuffer valueBuf = codec.encode(result);
    outputValue.set(valueBuf.array(), 0, valueBuf.position());

    context.write(outputKey, outputValue);
}
public void flush() throws IOException {
    logger.info("AggregationCache(size={} est_mem_size={} threshold={}) will spill to {}", buffMap.size(),
            estMemSize, spillThreshold, dumpedFile.getAbsolutePath());

    ByteArrayOutputStream baos = new ByteArrayOutputStream(MAX_BUFFER_SIZE);
    if (buffMap != null) {
        DataOutputStream bos = new DataOutputStream(baos);
        Object[] aggrResult = null;
        try {
            bos.writeInt(buffMap.size());
            for (Entry<byte[], MeasureAggregator[]> entry : buffMap.entrySet()) {
                MeasureAggregators aggs = new MeasureAggregators(entry.getValue());
                aggrResult = new Object[metrics.trueBitCount()];
                aggs.collectStates(aggrResult);
                ByteBuffer metricsBuf = measureCodec.encode(aggrResult);
                bos.writeInt(entry.getKey().length);
                bos.write(entry.getKey());
                bos.writeInt(metricsBuf.position());
                bos.write(metricsBuf.array(), 0, metricsBuf.position());
            }
        } finally {
            buffMap = null;
            IOUtils.closeQuietly(bos);
        }
    }
    spillBuffer = baos.toByteArray();
    IOUtils.closeQuietly(baos);
    logger.info("Accurately spill data size = {}", spillBuffer.length);
}
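// A hypothetical reader for the spill layout written by flush() above (entry count, then per entry:
// key length, key bytes, value length, encoded measure bytes). This is only a sketch; names such as
// spillBuffer, measureCodec and metrics are reused from the snippet, and Kylin's actual dump iterator
// may differ. The decode step mirrors enqueueFromDump() shown earlier.
DataInputStream dis = new DataInputStream(new ByteArrayInputStream(spillBuffer));
int entryCount = dis.readInt();
for (int i = 0; i < entryCount; i++) {
    byte[] key = new byte[dis.readInt()];
    dis.readFully(key);
    byte[] encodedMetrics = new byte[dis.readInt()];
    dis.readFully(encodedMetrics);
    Object[] metricValues = new Object[metrics.trueBitCount()];
    measureCodec.decode(ByteBuffer.wrap(encodedMetrics), metricValues);
    // ... hand (key, metricValues) to the merge logic
}
dis.close();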