/**
 * Creates a new costs object using the given values for the network and storage cost.
 * The CPU cost is not set by this constructor; use the three-argument variant to set it.
 *
 * @param networkCost The network cost, in bytes to be transferred.
 * @param diskCost The cost for disk, in bytes to be written and read.
 */
public Costs(double networkCost, double diskCost) {
	setNetworkCost(networkCost);
	setDiskCost(diskCost);
}
/**
 * Creates a new optimizer instance that uses the statistics object to determine properties about the input.
 * Given those statistics, the optimizer can make better choices for the execution strategies.
 * Uses the {@link DefaultCostEstimator} to weigh candidate plans.
 *
 * @param stats
 *        The statistics to be used to determine the input properties.
 * @param config
 *        The configuration passed on to the delegate constructor.
 */
public Optimizer(DataStatistics stats, Configuration config) {
	this(stats, new DefaultCostEstimator(), config);
}
/**
 * Creates a new costs object using the given values for the network, storage, and CPU cost.
 *
 * @param networkCost The network cost, in bytes to be transferred.
 * @param diskCost The cost for disk, in bytes to be written and read.
 * @param cpuCost The cost for CPU operations.
 */
public Costs(double networkCost, double diskCost, double cpuCost) {
	setNetworkCost(networkCost);
	setDiskCost(diskCost);
	setCpuCost(cpuCost);
}
@Override public void addLocalSortCost(EstimateProvider estimates, Costs costs) { final long s = estimates.getEstimatedOutputSize(); // we assume a two phase merge sort, so all in all 2 I/O operations per block if (s <= 0) { costs.setDiskCost(Costs.UNKNOWN); costs.setCpuCost(Costs.UNKNOWN); } else { costs.addDiskCost(2 * s); costs.addCpuCost((long) (s * SORTING_CPU_FACTOR)); } costs.addHeuristicDiskCost(2 * HEURISTIC_COST_BASE); costs.addHeuristicCpuCost((long) (HEURISTIC_COST_BASE * SORTING_CPU_FACTOR)); }
// Checks the relative ordering of join strategy costs under replication weights:
// weighted variants must cost more than unweighted ones, and the hash-based joins
// (build on either side) must be cheaper than the sort-merge join at equal weight.
// NOTE(review): the closing brace of this method is not visible in this chunk.
private void testJoinCostFormulasWithWeights(EstimateProvider e1, EstimateProvider e2) {
	// hash join costs, build side e1 (hf*) and build side e2 (hs*), weights 1 and 5
	Costs hf1 = new Costs();
	Costs hf5 = new Costs();
	Costs hs1 = new Costs();
	Costs hs5 = new Costs();
	// sort-merge join costs, weights 1 and 5
	Costs mm1 = new Costs();
	Costs mm5 = new Costs();
	costEstimator.addHybridHashCosts(e1, e2, hf1, 1);
	costEstimator.addHybridHashCosts(e1, e2, hf5, 5);
	costEstimator.addHybridHashCosts(e2, e1, hs1, 1);
	costEstimator.addHybridHashCosts(e2, e1, hs5, 5);
	// merge join: sort both inputs, then merge; the sort part is scaled by the weight
	// by multiplying before the (already weighted) merge cost is added
	costEstimator.addLocalSortCost(e1, mm1);
	costEstimator.addLocalSortCost(e2, mm1);
	costEstimator.addLocalMergeCost(e1, e2, mm1, 1);
	costEstimator.addLocalSortCost(e1, mm5);
	costEstimator.addLocalSortCost(e2, mm5);
	mm5.multiplyWith(5);
	costEstimator.addLocalMergeCost(e1, e2, mm5, 5);
	// weighted plans must be strictly more expensive than unweighted ones
	assertTrue(hf1.compareTo(hf5) < 0);
	assertTrue(hs1.compareTo(hs5) < 0);
	assertTrue(mm1.compareTo(mm5) < 0);
	// hash join must beat sort-merge join at the same weight
	assertTrue(hf1.compareTo(mm1) < 0);
	assertTrue(hs1.compareTo(mm1) < 0);
	assertTrue(hf5.compareTo(mm5) < 0);
	assertTrue(hs5.compareTo(mm5) < 0);
// Computes the costs of each ship strategy in isolation for the given estimates and
// compares them pairwise. NOTE(review): this chunk is truncated — the assertions that
// presumably consume the comparison results below are not visible here.
private void testShipStrategiesIsolated(EstimateProvider estimates, int targetParallelism) {
	Costs random = new Costs();
	costEstimator.addRandomPartitioningCost(estimates, random);
	Costs hash = new Costs();
	costEstimator.addHashPartitioningCost(estimates, hash);
	Costs range = new Costs();
	costEstimator.addRangePartitionCost(estimates, range);
	Costs broadcast = new Costs();
	costEstimator.addBroadcastCost(estimates, targetParallelism, broadcast);
	// pairwise orderings of the strategies' costs
	int randomVsHash = random.compareTo(hash);
	int hashVsRange = hash.compareTo(range);
	int hashVsBroadcast = hash.compareTo(broadcast);
	int rangeVsBroadcast = range.compareTo(broadcast);
// Checks cost ordering when one join input has known estimates and the other is unknown:
// hash partitioning both sides must beat broadcasting either side, the unknown-side
// broadcast must cost the same as the known-side broadcast (the unknown side contributes
// only heuristic costs), and higher target parallelism must make broadcasting costlier.
// NOTE(review): the closing brace of this method is not visible in this chunk.
private void testShipStrategyCombinationsWithUnknowns(EstimateProvider knownEstimates) {
	Costs hashBoth = new Costs();
	Costs bcKnown10 = new Costs();
	Costs bcUnknown10 = new Costs();
	Costs bcKnown1000 = new Costs();
	Costs bcUnknown1000 = new Costs();
	// hash cost is added for both sides; broadcast cost only for the side that is shipped
	costEstimator.addHashPartitioningCost(knownEstimates, hashBoth);
	costEstimator.addHashPartitioningCost(UNKNOWN_ESTIMATES, hashBoth);
	costEstimator.addBroadcastCost(knownEstimates, 10, bcKnown10);
	costEstimator.addBroadcastCost(UNKNOWN_ESTIMATES, 10, bcUnknown10);
	costEstimator.addBroadcastCost(knownEstimates, 1000, bcKnown1000);
	costEstimator.addBroadcastCost(UNKNOWN_ESTIMATES, 1000, bcUnknown1000);
	// repartitioning both sides is cheaper than broadcasting either side
	assertTrue(hashBoth.compareTo(bcKnown10) < 0);
	assertTrue(hashBoth.compareTo(bcUnknown10) < 0);
	assertTrue(hashBoth.compareTo(bcKnown1000) < 0);
	assertTrue(hashBoth.compareTo(bcUnknown1000) < 0);
	// known and unknown broadcasts compare equal at the same parallelism
	assertTrue(bcKnown10.compareTo(bcUnknown10) == 0);
	assertTrue(bcKnown1000.compareTo(bcUnknown1000) == 0);
	// broadcasting to more targets is more expensive
	assertTrue(bcKnown10.compareTo(bcKnown1000) < 0);
	assertTrue(bcUnknown10.compareTo(bcUnknown1000) < 0);
@Override public void addRangePartitionCost(EstimateProvider estimates, Costs costs) { final long dataSize = estimates.getEstimatedOutputSize(); if (dataSize > 0) { // Assume sampling of 10% of the data and spilling it to disk final long sampled = (long) (dataSize * 0.1f); // set shipping costs costs.addNetworkCost(dataSize + sampled); } else { costs.setNetworkCost(Costs.UNKNOWN); } // no costs known. use the same assumption as above on the heuristic costs final long sampled = (long) (HEURISTIC_COST_BASE * 0.1f); costs.addHeuristicNetworkCost(HEURISTIC_COST_BASE + sampled); costs.addHeuristicDiskCost(2 * sampled); }
@Override public void addRandomPartitioningCost(EstimateProvider estimates, Costs costs) { // conservative estimate: we need ship the whole data over the network to establish the // partitioning. no disk costs. final long estOutShipSize = estimates.getEstimatedOutputSize(); if (estOutShipSize <= 0) { costs.setNetworkCost(Costs.UNKNOWN); } else { costs.addNetworkCost(estOutShipSize); } costs.addHeuristicNetworkCost(HEURISTIC_COST_BASE); }
@Override
public void addFileInputCost(long fileSizeInBytes, Costs costs) {
	// A negative size signals that the file size is unknown.
	if (fileSizeInBytes < 0) {
		costs.setDiskCost(Costs.UNKNOWN);
	} else {
		costs.addDiskCost(fileSizeInBytes);
	}
	costs.addHeuristicDiskCost(HEURISTIC_COST_BASE);
}
/**
 * Creates a new n-ary union plan node. The node itself has no driver strategy
 * ({@code DriverStrategy.NONE}), zero node costs, and trivial local properties.
 *
 * @param template The binary union node this plan node was created for.
 * @param inputs The input channels to be unioned.
 * @param gProps The global properties of the union's output.
 * @param cumulativeCosts The cumulative costs of this node, as computed by the caller.
 */
public NAryUnionPlanNode(BinaryUnionNode template, List<Channel> inputs, GlobalProperties gProps,
		Costs cumulativeCosts) {
	super(template, "Union", DriverStrategy.NONE);
	this.inputs = inputs;
	this.globalProps = gProps;
	this.localProps = new LocalProperties();
	this.nodeCosts = new Costs();
	this.cumulativeCosts = cumulativeCosts;
}
/**
 * Sets the costs of this node, folding in the cumulative costs of the step function and,
 * if present, the termination criterion before delegating to the superclass.
 *
 * <p>NOTE: this method mutates the passed-in {@code nodeCosts} object by adding the
 * sub-plan costs to it.
 *
 * @param nodeCosts The base costs of this node; modified in place.
 */
public void setCosts(Costs nodeCosts) {
	// add the costs from the step function
	nodeCosts.addCosts(this.rootOfStepFunction.getCumulativeCosts());
	// add the costs for the termination criterion, if it exists
	// the costs are divided at branches, so we can simply add them up
	if (rootOfTerminationCriterion != null) {
		nodeCosts.addCosts(this.rootOfTerminationCriterion.getCumulativeCosts());
	}
	super.setCosts(nodeCosts);
}
/**
 * Runs the isolated ship-strategy check for every estimate fixture,
 * with target parallelism 1 and 10 each.
 */
@Test
public void testShipStrategiesIsolated() {
	final EstimateProvider[] fixtures = {
			UNKNOWN_ESTIMATES, ZERO_ESTIMATES, SMALL_ESTIMATES, BIG_ESTIMATES };
	for (EstimateProvider fixture : fixtures) {
		testShipStrategiesIsolated(fixture, 1);
		testShipStrategiesIsolated(fixture, 10);
	}
}
/**
 * Runs the weighted join-cost check for each fixture pair, in both input orders.
 */
@Test
public void testJoinCostFormulasWithWeights() {
	final EstimateProvider[][] pairs = {
			{ UNKNOWN_ESTIMATES, SMALL_ESTIMATES },
			{ UNKNOWN_ESTIMATES, MEDIUM_ESTIMATES },
			{ BIG_ESTIMATES, MEDIUM_ESTIMATES }
	};
	for (EstimateProvider[] pair : pairs) {
		testJoinCostFormulasWithWeights(pair[0], pair[1]);
		testJoinCostFormulasWithWeights(pair[1], pair[0]);
	}
}
/**
 * Runs the known-vs-unknown ship-strategy combination check for every estimate fixture.
 */
@Test
public void testShipStrategyCombinationsWithUnknowns() {
	for (EstimateProvider fixture : new EstimateProvider[] {
			UNKNOWN_ESTIMATES, ZERO_ESTIMATES, SMALL_ESTIMATES, MEDIUM_ESTIMATES, BIG_ESTIMATES }) {
		testShipStrategyCombinationsWithUnknowns(fixture);
	}
}
@Override public void addArtificialDamCost(EstimateProvider estimates, long bufferSize, Costs costs) { final long s = estimates.getEstimatedOutputSize(); // we assume spilling and re-reading if (s <= 0) { costs.setDiskCost(Costs.UNKNOWN); costs.setCpuCost(Costs.UNKNOWN); } else { costs.addDiskCost(2 * s); costs.setCpuCost((long) (s * MATERIALIZATION_CPU_FACTOR)); } costs.addHeuristicDiskCost(2 * HEURISTIC_COST_BASE); costs.addHeuristicCpuCost((long) (HEURISTIC_COST_BASE * MATERIALIZATION_CPU_FACTOR)); } }
/**
 * Creates a new optimizer instance. The optimizer has no access to statistics about the
 * inputs and can hence not determine any properties. It will perform all optimization with
 * unknown sizes and hence use only the heuristic cost functions, which result in the selection
 * of the most robust execution strategies.
 *
 * @param config The configuration passed on to the delegate constructor.
 */
public Optimizer(Configuration config) {
	this(null, new DefaultCostEstimator(), config);
}
@Override public void addHashPartitioningCost(EstimateProvider estimates, Costs costs) { // conservative estimate: we need ship the whole data over the network to establish the // partitioning. no disk costs. final long estOutShipSize = estimates.getEstimatedOutputSize(); if (estOutShipSize <= 0) { costs.setNetworkCost(Costs.UNKNOWN); } else { costs.addNetworkCost(estOutShipSize); } costs.addHeuristicNetworkCost(HEURISTIC_COST_BASE); }
@Override public void addHybridHashCosts(EstimateProvider buildSideInput, EstimateProvider probeSideInput, Costs costs, int costWeight) { long bs = buildSideInput.getEstimatedOutputSize(); long ps = probeSideInput.getEstimatedOutputSize(); if (bs > 0 && ps > 0) { long overall = 2*bs + ps; costs.addDiskCost(overall); costs.addCpuCost((long) (overall * HASHING_CPU_FACTOR)); } else { costs.setDiskCost(Costs.UNKNOWN); costs.setCpuCost(Costs.UNKNOWN); } costs.addHeuristicDiskCost(2 * HEURISTIC_COST_BASE); costs.addHeuristicCpuCost((long) (2 * HEURISTIC_COST_BASE * HASHING_CPU_FACTOR)); // cost weight applies to everything costs.multiplyWith(costWeight); }