/**
 * Pack the list of {@code WorkUnit}s into {@code MultiWorkUnit}s.
 *
 * TODO: this is currently a simple round-robin packing. More sophisticated bin packing may be necessary
 * if the round-robin approach leads to mapper skew.
 */
private static List<WorkUnit> pack(List<WorkUnit> workUnits, int numOfMultiWorkunits) {
  Preconditions.checkArgument(numOfMultiWorkunits > 0);

  if (workUnits.size() <= numOfMultiWorkunits) {
    return workUnits;
  }

  List<WorkUnit> result = Lists.newArrayListWithCapacity(numOfMultiWorkunits);
  for (int i = 0; i < numOfMultiWorkunits; i++) {
    result.add(MultiWorkUnit.createEmpty());
  }
  for (int i = 0; i < workUnits.size(); i++) {
    ((MultiWorkUnit) result.get(i % numOfMultiWorkunits)).addWorkUnit(workUnits.get(i));
  }
  return result;
}
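A minimal usage sketch of the round-robin packing above (the five-unit, two-bin numbers are arbitrary illustrations; WorkUnit and MultiWorkUnit are the Gobblin classes used throughout this section):

  List<WorkUnit> workUnits = Lists.newArrayList();
  for (int i = 0; i < 5; i++) {
    workUnits.add(WorkUnit.createEmpty());
  }
  // Five units into two bins: units 0, 2, 4 land in the first MultiWorkUnit,
  // units 1 and 3 in the second, so bin sizes differ by at most one.
  List<WorkUnit> packed = pack(workUnits, 2);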
@Override
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
  WorkUnit workUnit = (value.toString().endsWith(MULTI_WORK_UNIT_FILE_EXTENSION)
      ? MultiWorkUnit.createEmpty() : WorkUnit.createEmpty());
  SerializationUtils.deserializeState(this.fs, new Path(value.toString()), workUnit);

  if (workUnit instanceof MultiWorkUnit) {
    List<WorkUnit> flattenedWorkUnits =
        JobLauncherUtils.flattenWorkUnits(((MultiWorkUnit) workUnit).getWorkUnits());
    this.workUnits.addAll(flattenedWorkUnits);
  } else {
    this.workUnits.add(workUnit);
  }
}
}
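JobLauncherUtils.flattenWorkUnits is referenced above but not shown. A plausible sketch, assuming it recursively expands nested MultiWorkUnits, which matches the testFlattenWorkUnits expectations later in this section (3 plain units plus two 3-unit MultiWorkUnits flatten to 9):

  public static List<WorkUnit> flattenWorkUnits(Collection<WorkUnit> workUnits) {
    List<WorkUnit> flattened = Lists.newArrayList();
    for (WorkUnit workUnit : workUnits) {
      if (workUnit instanceof MultiWorkUnit) {
        // Recurse so MultiWorkUnits nested inside MultiWorkUnits are also expanded
        flattened.addAll(flattenWorkUnits(((MultiWorkUnit) workUnit).getWorkUnits()));
      } else {
        flattened.add(workUnit);
      }
    }
    return flattened;
  }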
/**
 * Group {@link WorkUnit}s into groups. Each group is a {@link MultiWorkUnit}. Each group has a capacity of
 * avgGroupSize. If there's a single {@link WorkUnit} whose size is larger than avgGroupSize, it forms a group
 * by itself.
 */
private static List<MultiWorkUnit> bestFitDecreasingBinPacking(List<WorkUnit> workUnits, double avgGroupSize) {

  // Sort work units by estimated data size in descending order
  Collections.sort(workUnits, LOAD_DESC_COMPARATOR);

  PriorityQueue<MultiWorkUnit> pQueue = new PriorityQueue<>(workUnits.size(), LOAD_DESC_COMPARATOR);
  for (WorkUnit workUnit : workUnits) {
    MultiWorkUnit bestGroup = findAndPopBestFitGroup(workUnit, pQueue, avgGroupSize);
    if (bestGroup != null) {
      addWorkUnitToMultiWorkUnit(workUnit, bestGroup);
    } else {
      bestGroup = MultiWorkUnit.createEmpty();
      addWorkUnitToMultiWorkUnit(workUnit, bestGroup);
    }
    pQueue.add(bestGroup);
  }
  return Lists.newArrayList(pQueue);
}
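findAndPopBestFitGroup is referenced above but not shown. A plausible sketch under the usual best-fit rule, reusing the getWorkUnitEstSize helper seen elsewhere in this section: since pQueue orders groups by load descending, the first polled group that can still accommodate the work unit is the fullest one that fits; groups that are too full are restored afterwards, and null tells the caller to open a new group:

  private static MultiWorkUnit findAndPopBestFitGroup(WorkUnit workUnit, PriorityQueue<MultiWorkUnit> pQueue,
      double avgGroupSize) {
    List<MultiWorkUnit> tooFull = Lists.newArrayList();
    MultiWorkUnit bestFit = null;
    while (!pQueue.isEmpty()) {
      MultiWorkUnit candidate = pQueue.poll();
      if (getWorkUnitEstSize(candidate) + getWorkUnitEstSize(workUnit) <= avgGroupSize) {
        bestFit = candidate; // fullest group that still fits, since the queue is load-descending
        break;
      }
      tooFull.add(candidate);
    }
    pQueue.addAll(tooFull); // put back the groups that could not accommodate the work unit
    return bestFit;
  }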
MultiWorkUnit mwu = MultiWorkUnit.createEmpty();
try {
  mwu.readFields(workUnitFileCloser.register(new DataInputStream(fs.open(status.getPath()))));
@Override
public List<WorkUnit> pack(Map<String, List<WorkUnit>> workUnitsByTopic, int numContainers) {
  double totalEstDataSize = setWorkUnitEstSizes(workUnitsByTopic);
  double avgGroupSize = totalEstDataSize / numContainers / getPreGroupingSizeFactor(this.state);

  List<MultiWorkUnit> mwuGroups = Lists.newArrayList();
  for (List<WorkUnit> workUnitsForTopic : workUnitsByTopic.values()) {
    double estimatedDataSizeForTopic = calcTotalEstSizeForTopic(workUnitsForTopic);
    if (estimatedDataSizeForTopic < avgGroupSize) {

      // If the total estimated size of a topic is smaller than the group size, put all partitions of this
      // topic in a single group.
      MultiWorkUnit mwuGroup = MultiWorkUnit.createEmpty();
      addWorkUnitsToMultiWorkUnit(workUnitsForTopic, mwuGroup);
      mwuGroups.add(mwuGroup);
    } else {

      // Use best-fit-decreasing to group work units for a topic into multiple groups.
      mwuGroups.addAll(bestFitDecreasingBinPacking(workUnitsForTopic, avgGroupSize));
    }
  }

  List<WorkUnit> groups = squeezeMultiWorkUnits(mwuGroups);
  return worstFitDecreasingBinPacking(groups, numContainers);
}
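To make the group sizing concrete (all numbers here are illustrative assumptions): if setWorkUnitEstSizes estimates a total of 3000 across all topics, numContainers is 10, and getPreGroupingSizeFactor returns 3.0, then avgGroupSize = 3000 / 10 / 3.0 = 100. A topic estimated at 80 becomes a single group; a topic estimated at 450 is split by best-fit-decreasing into groups of capacity 100, and any single work unit larger than 100 forms a group by itself. A factor greater than 1.0 yields more groups than containers, presumably so the second-level worstFitDecreasingBinPacking pass has finer-grained pieces to balance across containers.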
@Override
public List<WorkUnit> pack(Map<String, List<WorkUnit>> workUnitsByTopic, int numContainers) {
  setWorkUnitEstSizes(workUnitsByTopic);
  List<WorkUnit> workUnits = Lists.newArrayList();
  for (List<WorkUnit> workUnitsForTopic : workUnitsByTopic.values()) {

    // For each topic, merge all empty workunits into a single workunit, so that a single
    // empty task will be created instead of many.
    MultiWorkUnit zeroSizeWorkUnit = MultiWorkUnit.createEmpty();
    for (WorkUnit workUnit : workUnitsForTopic) {
      if (DoubleMath.fuzzyEquals(getWorkUnitEstSize(workUnit), 0.0, EPS)) {
        addWorkUnitToMultiWorkUnit(workUnit, zeroSizeWorkUnit);
      } else {
        workUnit.setWatermarkInterval(getWatermarkIntervalFromWorkUnit(workUnit));
        workUnits.add(workUnit);
      }
    }
    if (!zeroSizeWorkUnit.getWorkUnits().isEmpty()) {
      workUnits.add(squeezeMultiWorkUnit(zeroSizeWorkUnit));
    }
  }
  return worstFitDecreasingBinPacking(workUnits, numContainers);
}
}
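The zero-size check uses Guava's DoubleMath.fuzzyEquals rather than ==, since the estimated sizes are doubles that may carry rounding noise. A self-contained illustration (the 0.01 tolerance is hypothetical; the real EPS constant is defined elsewhere in the class):

  import com.google.common.math.DoubleMath;

  public class FuzzyZeroCheck {
    public static void main(String[] args) {
      double eps = 0.01; // hypothetical tolerance standing in for EPS
      System.out.println(DoubleMath.fuzzyEquals(1e-9, 0.0, eps)); // true: treated as an empty work unit
      System.out.println(DoubleMath.fuzzyEquals(0.5, 0.0, eps));  // false: a real, non-empty work unit
    }
  }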
pQueue.add(MultiWorkUnit.createEmpty());
} else {
  MultiWorkUnit newMultiWorkUnit = MultiWorkUnit.createEmpty();
  addToMultiWorkUnit(newMultiWorkUnit, workUnit, weight);
  pQueue.add(newMultiWorkUnit);
@Test
public void testFlattenWorkUnits() {
  List<WorkUnit> workUnitsOnly =
      Arrays.asList(WorkUnit.createEmpty(), WorkUnit.createEmpty(), WorkUnit.createEmpty());
  Assert.assertEquals(JobLauncherUtils.flattenWorkUnits(workUnitsOnly).size(), 3);

  MultiWorkUnit multiWorkUnit1 = MultiWorkUnit.createEmpty();
  multiWorkUnit1.addWorkUnits(Arrays.asList(WorkUnit.createEmpty(), WorkUnit.createEmpty(), WorkUnit.createEmpty()));

  MultiWorkUnit multiWorkUnit2 = MultiWorkUnit.createEmpty();
  multiWorkUnit2.addWorkUnits(Arrays.asList(WorkUnit.createEmpty(), WorkUnit.createEmpty(), WorkUnit.createEmpty()));

  List<WorkUnit> workUnitsAndMultiWorkUnits =
      Arrays.asList(WorkUnit.createEmpty(), WorkUnit.createEmpty(), WorkUnit.createEmpty(),
          multiWorkUnit1, multiWorkUnit2);
  Assert.assertEquals(JobLauncherUtils.flattenWorkUnits(workUnitsAndMultiWorkUnits).size(), 9);
}
@Override
public List<WorkUnit> getWorkunits(SourceState state) {
  String nameSpace = state.getProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY);
  Extract extract1 = createExtract(TableType.SNAPSHOT_ONLY, nameSpace, "TestTable1");
  Extract extract2 = createExtract(TableType.SNAPSHOT_ONLY, nameSpace, "TestTable2");

  String sourceFileList = state.getProp(SOURCE_FILE_LIST_KEY);
  List<String> list = SPLITTER.splitToList(sourceFileList);
  List<WorkUnit> workUnits = Lists.newArrayList();
  for (int i = 0; i < list.size(); i++) {
    WorkUnit workUnit = WorkUnit.create(i % 2 == 0 ? extract1 : extract2);
    workUnit.setProp(SOURCE_FILE_KEY, list.get(i));
    workUnits.add(workUnit);
  }

  if (state.getPropAsBoolean("use.multiworkunit", false)) {
    MultiWorkUnit multiWorkUnit = MultiWorkUnit.createEmpty();
    multiWorkUnit.addWorkUnits(workUnits);
    workUnits.clear();
    workUnits.add(multiWorkUnit);
  }

  return workUnits;
}
MinMaxPriorityQueue.orderedBy(LOAD_ASC_COMPARATOR).expectedSize(numOfMultiWorkUnits).create();
for (int i = 0; i < numOfMultiWorkUnits; i++) {
  MultiWorkUnit multiWorkUnit = MultiWorkUnit.createEmpty();
  setWorkUnitEstSize(multiWorkUnit, 0);
  pQueue.add(multiWorkUnit);
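The loop that consumes this seeded queue is not shown. A plausible sketch of the worst-fit-decreasing step, assuming the helper names used in the snippets above: because the queue is ordered by load ascending, its head is always the least-loaded group, which is exactly the "worst fit" for the next (largest remaining) work unit:

  Collections.sort(workUnits, LOAD_DESC_COMPARATOR); // "decreasing": place the largest work units first
  for (WorkUnit workUnit : workUnits) {
    MultiWorkUnit lightestGroup = pQueue.poll(); // least-loaded group = worst fit
    addWorkUnitToMultiWorkUnit(workUnit, lightestGroup);
    pQueue.add(lightestGroup); // re-insert so the ordering reflects the new load
  }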
@BeforeClass
public void setupWorkUnitFiles() throws IOException {
  this.conf = new Configuration();
  this.fs = FileSystem.getLocal(this.conf);
  this.stagingDirs = Lists.newArrayList();

  // Create a list of WorkUnits to serialize
  WorkUnit wu1 = createAndSetWorkUnit("wu1");
  WorkUnit wu2 = createAndSetWorkUnit("wu2");
  WorkUnit wu3 = createAndSetWorkUnit("wu3");
  WorkUnit wu4 = createAndSetWorkUnit("wu4");

  // Create a MultiWorkUnit to serialize
  MultiWorkUnit mwu1 = MultiWorkUnit.createEmpty();
  mwu1.setProp(ConfigurationKeys.TASK_ID_KEY, System.nanoTime());
  mwu1.addWorkUnits(Arrays.asList(wu3, wu4));

  Path inputDir = new Path(new Path(OUTPUT_PATH, JOB_NAME), "input");

  // Write each WorkUnit to a separate file under inputDir
  Closer closer = Closer.create();
  try {
    wu1.write(closer.register(this.fs.create(
        new Path(inputDir, wu1.getProp(ConfigurationKeys.TASK_ID_KEY) + Path.SEPARATOR + "_").suffix("wu"))));
    wu2.write(closer.register(this.fs.create(
        new Path(inputDir, wu2.getProp(ConfigurationKeys.TASK_ID_KEY) + Path.SEPARATOR + "_").suffix("wu"))));
    mwu1.write(closer.register(this.fs.create(
        new Path(inputDir, mwu1.getProp(ConfigurationKeys.TASK_ID_KEY) + Path.SEPARATOR + "_").suffix("mwu"))));
  } finally {
    closer.close();
  }
}