/**
 * Computes the mean of the {@link SinkStat#AVG_RECORD_SIZE} values held in the stat history queue.
 *
 * <p>Only history entries that actually carry an average-record-size value contribute to the
 * mean; entries without one are skipped.
 *
 * @return the truncated (integer-division) mean of the recorded sizes, or 0 when no entry has one
 * @throws NumberFormatException if a stored value cannot be parsed as a {@code long}
 */
public long getAvgRecordSize() {
    long total = 0;
    long samples = 0;
    for (final SinkStat entry : this.sinkStatQ) {
        final Optional<String> recorded = entry.get(SinkStat.AVG_RECORD_SIZE);
        if (!recorded.isPresent()) {
            continue;
        }
        total += Long.parseLong(recorded.get());
        samples++;
    }
    // Clamp the divisor to 1 so that an empty history yields 0 instead of dividing by zero.
    return total / Math.max(samples, 1);
}
/**
 * Passes this node's job metrics and data-feed metrics on to the wrapped sink.
 *
 * <p>Both metrics holders are unwrapped with {@code get()} without a presence check, so they
 * must already be set when this method runs.
 */
private void setupMetrics() {
    this.sink.setJobMetrics(getJobMetrics().get());
    this.sink.setDataFeedMetrics(getDataFeedMetrics().get());
}
}
public HoodieSink(@NonNull final HoodieConfiguration hoodieConf, @NonNull final HoodieSinkDataConverter hoodieSinkDataConverter, @NonNull final JavaSparkContext jsc, @NonNull final HoodieSinkOp op, @NonNull final IMetadataManager metadataMgr, final boolean shouldSaveChangesInFuture) { this.hoodieConf = hoodieConf; this.hoodieSinkDataConverter = hoodieSinkDataConverter; this.jsc = jsc; this.op = op; this.metadataMgr = metadataMgr; this.sinkStatMgr = new SinkStatManager(this.hoodieConf.getTableName(), this.metadataMgr); this.sinkStatMgr.init(); this.shouldSaveChangesInFuture = shouldSaveChangesInFuture; }
@Test public void testMaxStatHistory() { final String tableName = "testTable"; final IMetadataManager metadataManager = new MemoryMetadataManager(); final SinkStatManager sinkStatManager1 = new SinkStatManager(tableName, metadataManager); sinkStatManager1.init(); final int initialValue = SinkStatManager.MAX_HISTORY_SIZE * 2; sinkStatManager1.getCurrentStat().put(SinkStat.AVG_RECORD_SIZE, Integer.toString(initialValue)); sinkStatManager1.persist(); final int targetValue = 1; for (int i = 0; i < SinkStatManager.MAX_HISTORY_SIZE; i++) { final SinkStatManager tempSinkStatManager = new SinkStatManager(tableName, metadataManager); tempSinkStatManager.init(); tempSinkStatManager.getCurrentStat().put(SinkStat.AVG_RECORD_SIZE, Integer.toString(targetValue)); Assert.assertNotEquals(targetValue, tempSinkStatManager.getAvgRecordSize()); tempSinkStatManager.persist(); } // After SinkStatManager.MAX_HISTORY_SIZE runs very first stat should get dropped. final SinkStatManager sinkStatManager2 = new SinkStatManager(tableName, metadataManager); sinkStatManager2.init(); Assert.assertEquals(targetValue, sinkStatManager2.getAvgRecordSize()); } }
/**
 * {@link #updateSinkStat(Optional)} will compute {@link SinkStat} and persist changes into
 * {@link IMetadataManager}.
 * As a part of {@link SinkStat} computation, it will compute the avg record size for the current run.
 *
 * <p>For every write status that reports both written bytes and inserted records, the per-status
 * value {@code totalWriteBytes / (numWrites - numUpdateWrites)} is added to an accumulator; the
 * accumulator's average becomes the run's avg record size. The stat manager is persisted even
 * when no write statuses are supplied.
 *
 * @param writesStatuses write statuses of the current run; when absent, no avg record size is
 *                       computed and only the stat manager's persist step runs
 */
private void updateSinkStat(final Optional<JavaRDD<WriteStatus>> writesStatuses) {
    if (writesStatuses.isPresent()) {
        final JavaRDD<WriteStatus> statuses = writesStatuses.get();
        final LongAccumulator avgRecordSizeCounter = statuses.rdd().sparkContext().longAccumulator();
        statuses.foreach(
            writeStatus -> {
                final long writeBytes = writeStatus.getStat().getTotalWriteBytes();
                final long numInserts =
                    writeStatus.getStat().getNumWrites() - writeStatus.getStat().getNumUpdateWrites();
                // Skip statuses with nothing inserted; this also avoids dividing by zero.
                if (writeBytes > 0 && numInserts > 0) {
                    avgRecordSizeCounter.add(writeBytes / numInserts);
                }
            }
        );
        // Truncate the accumulator's double average straight to long; narrowing through int
        // first would clamp large averages at Integer.MAX_VALUE.
        final long avgRecordSize = (long) avgRecordSizeCounter.avg();
        if (avgRecordSize > 0) {
            log.info("Updating Sink Stat manager : avgRecordSize : {}", avgRecordSize);
            this.sinkStatMgr.getCurrentStat().put(SinkStat.AVG_RECORD_SIZE, Long.toString(avgRecordSize));
        }
    }
    this.sinkStatMgr.persist();
}
/**
 * Writes the stat history, including the current stat when it is non-empty, to the metadata
 * manager under {@code getMetakey()}.
 *
 * <p>The history is first trimmed from the head until it holds at most
 * {@code MAX_HISTORY_SIZE} entries. The remaining entries are then serialized under the keys
 * "0", "1", ... in queue order. Note that the in-memory queue is drained by this call and is
 * empty afterwards.
 */
public void persist() {
    if (!this.currentStat.isEmpty()) {
        this.sinkStatQ.add(this.currentStat);
    }

    // Drop the oldest entries until the history fits the cap.
    while (this.sinkStatQ.size() > MAX_HISTORY_SIZE) {
        this.sinkStatQ.poll();
    }

    final Map<String, String> serializedHistory = new HashMap<>();
    int position = 0;
    while (!this.sinkStatQ.isEmpty()) {
        serializedHistory.put(Integer.toString(position), SinkStat.serialize(this.sinkStatQ.poll()));
        position++;
    }

    this.metadataManager.set(getMetakey(), new StringValue(MapUtil.serializeMap(serializedHistory)));
}
/**
 * Loads the stat history stored in the metadata manager under {@code getMetakey()} into the
 * in-memory history queue.
 *
 * <p>Entries are looked up by the keys "0", "1", ... up to the size of the stored map and are
 * appended to the queue in that order. Nothing happens when no history has been stored yet.
 */
public void init() {
    final Optional<StringValue> stored = this.metadataManager.get(getMetakey());
    if (!stored.isPresent()) {
        return;
    }

    final Map<String, String> history = MapUtil.deserializeMap(stored.get().getValue());
    final int entryCount = history.size();
    for (int index = 0; index < entryCount; index++) {
        final String serializedEntry = history.get(Integer.toString(index));
        this.sinkStatQ.add(SinkStat.deserialize(serializedEntry));
    }
}
/**
 * Estimates the bulk-insert parallelism needed to write {@code numRecords} records.
 *
 * <p>The estimate is {@code ceil(numRecords * avgRecordSize / targetFileSize)}, where the average
 * record size comes from the sink stat manager and the target file size (clamped to at least 1)
 * from the Hoodie configuration. The currently configured parallelism is only logged; it does
 * not affect the result.
 *
 * @param numRecords number of records about to be written
 * @return the estimated parallelism; 0 when {@code numRecords} or the average record size is 0
 */
@VisibleForTesting
protected int calculateNewBulkInsertParallelism(final long numRecords) {
    final long avgRecordSize = this.sinkStatMgr.getAvgRecordSize();
    final long targetFileSize = this.hoodieConf.getTargetFileSize();
    // Multiply in floating point: a long product of record count and record size could
    // silently overflow before it is ever converted to double.
    final double estimatedBytes = (double) numRecords * avgRecordSize;
    final int newParallelism = (int) Math.ceil(estimatedBytes / Math.max(1, targetFileSize));
    final int currentParallelism = this.hoodieConf.getBulkInsertParallelism();
    log.info(
        "StatsManager:targetFileSize:{}:avgRecordSize:{}:numRecords:{}:"
            + "newBulkInsertParallelism:{}:currentBulkInsertParallelism:{}",
        targetFileSize, avgRecordSize, numRecords, newParallelism, currentParallelism);
    return newParallelism;
}
/**
 * Rebuilds a {@link SinkStat} from its serialized form.
 *
 * @param serializedStat serialized stat map, in the format understood by
 *                       {@code MapUtil.deserializeMap}
 * @return a new {@link SinkStat} holding every key/value pair decoded from the input
 */
public static SinkStat deserialize(@NonNull final String serializedStat) {
    final SinkStat restored = new SinkStat();
    final Map<String, String> decoded = MapUtil.deserializeMap(serializedStat);
    restored.stats.putAll(decoded);
    return restored;
}
@Override protected void executeNode(@NonNull final Optional<IPayload> data) { Preconditions.checkState(data.isPresent() && (data.get() instanceof DagPayload), "Invalid payload :" + (data.isPresent() ? data.get().getClass() : null)); // setup job and topic metrics. setupMetrics(); this.sink.write(((DagPayload) data.get()).getData()); }
@Test public void testSerDser() { final String tableName = "testTable"; final IMetadataManager metadataManager = new MemoryMetadataManager(); final SinkStatManager sinkStatManager1 = new SinkStatManager(tableName, metadataManager); // Initially nothing will be found; it should not crash. sinkStatManager1.init(); Assert.assertEquals(0, sinkStatManager1.getAvgRecordSize()); Assert.assertFalse(sinkStatManager1.isStatHistoryAvailable()); final int avgRecordSize1 = 30; sinkStatManager1.getCurrentStat().put(SinkStat.AVG_RECORD_SIZE, Integer.toString(avgRecordSize1)); // nothing is saved to metadata manager before persist. Assert.assertEquals(0, metadataManager.getAllKeys().size()); sinkStatManager1.persist(); Assert.assertEquals(1, metadataManager.getAllKeys().size()); final SinkStatManager sinkStatManager2 = new SinkStatManager(tableName, metadataManager); sinkStatManager2.init(); Assert.assertEquals(avgRecordSize1, sinkStatManager2.getAvgRecordSize()); final int avgRecordSize2 = 20; sinkStatManager2.getCurrentStat().put(SinkStat.AVG_RECORD_SIZE, Integer.toString(avgRecordSize2)); sinkStatManager2.persist(); final SinkStatManager sinkStatManager3 = new SinkStatManager(tableName, metadataManager); sinkStatManager3.init(); Assert.assertEquals((avgRecordSize1 + avgRecordSize2) / 2, sinkStatManager3.getAvgRecordSize()); }