public ReduceWork createReduceWork(GenSparkProcContext context, Operator<?> root, SparkWork sparkWork) throws SemanticException { Preconditions.checkArgument(!root.getParentOperators().isEmpty(), "AssertionError: expected root.getParentOperators() to be non-empty"); ReduceWork reduceWork = new ReduceWork("Reducer " + (++sequenceNumber)); LOG.debug("Adding reduce work (" + reduceWork.getName() + ") for " + root); reduceWork.setReducer(root); reduceWork.setNeedsTagging(GenMapRedUtils.needsTagging(reduceWork)); // Pick the maximum # reducers across all parents as the # of reduce tasks. int maxExecutors = -1; for (Operator<? extends OperatorDesc> parentOfRoot : root.getParentOperators()) { Preconditions.checkArgument(parentOfRoot instanceof ReduceSinkOperator, "AssertionError: expected parentOfRoot to be an " + "instance of ReduceSinkOperator, but was " + parentOfRoot.getClass().getName()); ReduceSinkOperator reduceSink = (ReduceSinkOperator) parentOfRoot; maxExecutors = Math.max(maxExecutors, reduceSink.getConf().getNumReducers()); } reduceWork.setNumReduceTasks(maxExecutors); ReduceSinkOperator reduceSink = (ReduceSinkOperator) context.parentOfRoot; setupReduceSink(context, reduceWork, reduceSink); sparkWork.add(reduceWork); SparkEdgeProperty edgeProp = getEdgeProperty(context.conf, reduceSink, reduceWork); sparkWork.connect(context.preceedingWork, reduceWork, edgeProp); return reduceWork; }
// NOTE(review): this chunk is cut mid-method and at least one line has been elided
// (the bare "||" below has no left-hand operand); kept verbatim, comments only.
Operator<? extends OperatorDesc> reducer = ((ReduceWork)wrk).getReducer();
// Only join reducers can produce a cross product worth flagging here.
if ( reducer instanceof JoinOperator || reducer instanceof CommonMergeJoinOperator ) {
  boolean noOuterJoin = ((JoinDesc)reducer.getConf()).isNoOuterJoin();
  // Collect reduce-sink info for every tagged input feeding this reduce work.
  Map<Integer, ExtractReduceSinkInfo.Info> rsInfo = new TreeMap<Integer, ExtractReduceSinkInfo.Info>();
  for(Map.Entry<Integer, String> e : rWork.getTagToInput().entrySet()) {
    rsInfo.putAll(getReducerInfo(tezWork, rWork.getName(), e.getValue()));
  // If a cross product is detected and the cartesian-product edge is enabled (and the
  // join has no outer side), rewrite the incoming edges — TODO confirm against full file.
  if (checkForCrossProduct(rWork.getName(), reducer, rsInfo) && cartesianProductEdgeEnabled && noOuterJoin) {
    List<BaseWork> parents = tezWork.getParents(null == origWrk ? wrk : origWrk);
    // NOTE(review): left operand of this "||" is missing from the visible source.
    || prop.getEdgeType().equals(EdgeType.CUSTOM_EDGE)) {
    prop.setEdgeType(EdgeType.XPROD_EDGE);
    // Presumably disables reducer-count auto-tuning for the XPROD edge — verify.
    rWork.setNumReduceTasks(-1);
    rWork.setMaxReduceTasks(-1);
    rWork.setMinReduceTasks(-1);
// NOTE(review): fragment cut mid-method (the try block and for loop are not closed
// in the visible source); code kept verbatim, comments only.
reducer = gWork.getReducer();
isTagged = gWork.getNeedsTagging();
try {
  // Deserializer for the shuffle key, built from the key table descriptor.
  keyTableDesc = gWork.getKeyDesc();
  inputKeyDeserializer = ReflectionUtils.newInstance(keyTableDesc
      .getDeserializerClass(), null);
  SerDeUtils.initializeSerDe(inputKeyDeserializer, null, keyTableDesc.getProperties(), null);
  keyObjectInspector = inputKeyDeserializer.getObjectInspector();
  // One value descriptor / deserializer per tag.
  valueTableDesc = new TableDesc[gWork.getTagToValueDesc().size()];
  for (int tag = 0; tag < gWork.getTagToValueDesc().size(); tag++) {
    valueTableDesc[tag] = gWork.getTagToValueDesc().get(tag);
    inputValueDeserializer[tag] = ReflectionUtils.newInstance(
        valueTableDesc[tag].getDeserializerClass(), null);
/**
 * Replaces this work's root operator (the reducer) with its mapped replacement.
 *
 * @param replacementMap map from the current root operator to its replacement
 */
@Override
public void replaceRoots(Map<Operator<?>, Operator<?>> replacementMap) {
  Operator<?> currentReducer = getReducer();
  setReducer(replacementMap.get(currentReducer));
}
/**
 * Initializes a record source (and its object inspector) for every tag of the given
 * reduce work that has a value descriptor, checking the abort flag between tags.
 *
 * @param redWork the reduce work describing the tagged inputs
 * @param numTags number of tags (sizing hint; iteration is driven by the descriptor list)
 * @param ois     per-tag object inspectors, filled in by initializeSourceForTag
 * @param sources per-tag record sources, filled in by initializeSourceForTag
 */
private void initializeMultipleSources(ReduceWork redWork, int numTags, ObjectInspector[] ois,
    ReduceRecordSource[] sources) throws Exception {
  for (int tag = 0; tag < redWork.getTagToValueDesc().size(); tag++) {
    TableDesc valueDesc = redWork.getTagToValueDesc().get(tag);
    if (valueDesc == null) {
      // No value descriptor registered for this tag; nothing to initialize.
      continue;
    }
    checkAbortCondition();
    initializeSourceForTag(redWork, tag, ois, sources, valueDesc,
        redWork.getTagToInput().get(tag));
  }
}
private void initializeSourceForTag(ReduceWork redWork, int tag, ObjectInspector[] ois, ReduceRecordSource[] sources, TableDesc valueTableDesc, String inputName) throws Exception { reducer = redWork.getReducer(); reducer.getParentOperators().clear(); reducer.setParentOperators(null); // clear out any parents as reducer is the root TableDesc keyTableDesc = redWork.getKeyDesc(); Reader reader = inputs.get(inputName).getReader(); sources[tag] = new ReduceRecordSource(); // Only the big table input source should be vectorized (if applicable) // Note this behavior may have to change if we ever implement a vectorized merge join boolean vectorizedRecordSource = (tag == bigTablePosition) && redWork.getVectorMode(); sources[tag].init(jconf, redWork.getReducer(), vectorizedRecordSource, keyTableDesc, valueTableDesc, reader, tag == bigTablePosition, (byte) tag, redWork.getVectorizedRowBatchCtx(), redWork.getVectorizedVertexNum(), redWork.getVectorizedTestingReducerBatchSize()); ois[tag] = sources[tag].getObjectInspector(); }
@SuppressWarnings("unchecked") private void populateMapRedPlan1(Table src) throws SemanticException { ArrayList<String> outputColumns = new ArrayList<String>(); for (int i = 0; i < 2; i++) { outputColumns.add("_col" + i); } // map-side work Operator<ReduceSinkDesc> op1 = OperatorFactory.get(ctx, PlanUtils .getReduceSinkDesc(Utilities.makeList(getStringColumn("key")), Utilities.makeList(getStringColumn("value")), outputColumns, true, -1, 1, -1, AcidUtils.Operation.NOT_ACID)); addMapWork(mr, src, "a", op1); ReduceWork rWork = new ReduceWork(); rWork.setNumReduceTasks(Integer.valueOf(1)); rWork.setKeyDesc(op1.getConf().getKeySerializeInfo()); rWork.getTagToValueDesc().add(op1.getConf().getValueSerializeInfo()); mr.setReduceWork(rWork); // reduce side work Operator<FileSinkDesc> op3 = OperatorFactory.get(ctx, new FileSinkDesc(new Path(tmpdir + File.separator + "mapredplan1.out"), Utilities.defaultTd, false)); List<ExprNodeDesc> cols = new ArrayList<ExprNodeDesc>(); cols.add(getStringColumn(Utilities.ReduceField.VALUE.toString()+"."+outputColumns.get(1))); List<String> colNames = new ArrayList<String>(); colNames.add(HiveConf.getColumnInternalName(2)); Operator<SelectDesc> op2 = OperatorFactory.get(new SelectDesc(cols, colNames), op3); rWork.setReducer(op2); }
// NOTE(review): fragment cut mid-method — the "} else if" below closes a branch whose
// opening is not visible; code kept verbatim, comments only.
long bytesPerReducer = context.conf.getLongVar(HiveConf.ConfVars.BYTESPERREDUCER);
ReduceWork reduceWork = new ReduceWork(Utilities.REDUCENAME + context.nextSequenceNumber());
LOG.debug("Adding reduce work (" + reduceWork.getName() + ") for " + root);
reduceWork.setReducer(root);
reduceWork.setNeedsTagging(GenMapRedUtils.needsTagging(reduceWork));
// Reducer count comes from the parent ReduceSink of the root operator.
ReduceSinkOperator reduceSink = (ReduceSinkOperator) context.parentOfRoot;
reduceWork.setNumReduceTasks(reduceSink.getConf().getNumReducers());
reduceWork.setAutoReduceParallelism(true);
reduceWork.setMinReduceTasks(minPartition);
reduceWork.setMaxReduceTasks(maxPartition);
} else if (nReducers < maxPartition) {
  reduceWork.setNumReduceTasks(maxPartition);
// Auto-parallelism edges carry min/max task bounds and a bytes-per-reducer hint.
if (reduceWork.isAutoReduceParallelism()) {
  edgeProp = new TezEdgeProperty(context.conf, edgeType, true,
      reduceWork.getMinReduceTasks(), reduceWork.getMaxReduceTasks(), bytesPerReducer);
} else {
  edgeProp = new TezEdgeProperty(edgeType);
// NOTE(review): fragment cut mid-method (unmatched "} else if"); code kept verbatim,
// comments only. This variant additionally propagates slow-start and distribution hints.
long bytesPerReducer = context.conf.getLongVar(HiveConf.ConfVars.BYTESPERREDUCER);
ReduceWork reduceWork = new ReduceWork(Utilities.REDUCENAME + context.nextSequenceNumber());
LOG.debug("Adding reduce work (" + reduceWork.getName() + ") for " + root);
reduceWork.setReducer(root);
reduceWork.setNeedsTagging(GenMapRedUtils.needsTagging(reduceWork));
reduceWork.setNumReduceTasks(reduceSink.getConf().getNumReducers());
// Carry slow-start and UNIFORM-distribution traits from the ReduceSink config.
reduceWork.setSlowStart(reduceSink.getConf().isSlowStart());
reduceWork.setUniformDistribution(reduceSink.getConf().getReducerTraits().contains(UNIFORM));
reduceWork.setAutoReduceParallelism(true);
reduceWork.setMinReduceTasks(minPartition);
reduceWork.setMaxReduceTasks(maxPartition);
} else if (nReducers < maxPartition) {
  reduceWork.setNumReduceTasks(maxPartition);
if (reduceWork.isAutoReduceParallelism()) {
  // Auto-parallelism edge: bounds plus bytes-per-reducer drive runtime adjustment.
  edgeProp = new TezEdgeProperty(context.conf, edgeType, true, reduceWork.isSlowStart(),
      reduceWork.getMinReduceTasks(), reduceWork.getMaxReduceTasks(), bytesPerReducer);
} else {
  edgeProp = new TezEdgeProperty(edgeType);
  edgeProp.setSlowStart(reduceWork.isSlowStart());
// Keep a back-reference so later passes can retune the edge — TODO confirm.
reduceWork.setEdgePropRef(edgeProp);
// NOTE(review): fragment cut mid-method (try block and for loop not closed in the
// visible source); code kept verbatim, comments only.
reducer = gWork.getReducer();
vectorized = gWork.getVectorMode();
isTagged = gWork.getNeedsTagging();
try {
  // Key deserializer from the key table descriptor.
  keyTableDesc = gWork.getKeyDesc();
  inputKeyDeserializer = ReflectionUtils.newInstance(keyTableDesc
      .getDeserializerClass(), null);
  SerDeUtils.initializeSerDe(inputKeyDeserializer, null, keyTableDesc.getProperties(), null);
  keyObjectInspector = inputKeyDeserializer.getObjectInspector();
  valueTableDesc = new TableDesc[gWork.getTagToValueDesc().size()];
  final int maxTags = gWork.getTagToValueDesc().size();
  keyStructInspector = (StructObjectInspector) keyObjectInspector;
  // One vectorized row batch per tag.
  batches = new VectorizedRowBatch[maxTags];
  for (int tag = 0; tag < gWork.getTagToValueDesc().size(); tag++) {
    valueTableDesc[tag] = gWork.getTagToValueDesc().get(tag);
    inputValueDeserializer[tag] = ReflectionUtils.newInstance(
        valueTableDesc[tag].getDeserializerClass(), null);
    batches[tag] = gWork.getVectorizedRowBatchCtx().createVectorizedRowBatch();
  localWork = gWork.getMapRedLocalWork();
  execContext.setJc(jc);
  execContext.setLocalWork(localWork);
// NOTE(review): this chunk is a scatter of non-contiguous lines from a merge-join
// reduce-processor initialization — control flow between statements has been elided.
// Code kept verbatim, comments only; do not read this as a straight-line sequence.
l4j.info("Main work is " + reduceWork.getName());
List<HashTableDummyOperator> workOps = reduceWork.getDummyOps();
HashSet<HashTableDummyOperator> dummyOps = workOps == null ? null : new HashSet<>(workOps);
tagToReducerMap.put(redWork.getTag(), redWork);
if (mergeWorkList != null) {
  for (BaseWork mergeWork : mergeWorkList) {
    reducer = mergeReduceWork.getReducer();
    connectOps.put(mergeReduceWork.getTag(), dummyStoreOp);
    tagToReducerMap.put(mergeReduceWork.getTag(), mergeReduceWork);
// Presumably the main reduce work's tag marks the big table side — verify.
bigTablePosition = (byte) reduceWork.getTag();
int numTags = reduceWork.getTagToValueDesc().size();
reducer = reduceWork.getReducer();
for (int i : tagToReducerMap.keySet()) {
  redWork = tagToReducerMap.get(i);
  reducer = redWork.getReducer();
  redWork.getTagToValueDesc().get(0), redWork.getTagToInput().get(0));
  reducer.initializeLocalWork(jconf);
reducer = reduceWork.getReducer();
reducer = redWork.getReducer();
reducer = reduceWork.getReducer();
// NOTE(review): the body below is cut — several closing braces and intermediate lines
// (e.g. the computation of newMin/targetCount) are missing from the visible source.
// Code kept verbatim, comments only.
/**
 * Adjusts the reduce-task parallelism of a ReduceWork so that each executor gets at
 * least minReducersPerExec reducers; no-op for non-ReduceWork or when disabled.
 */
private void adjustAutoParallelism(BaseWork work) {
  if (minReducersPerExec <= 0 || !(work instanceof ReduceWork)) return;
  ReduceWork reduceWork = (ReduceWork)work;
  if (reduceWork.isAutoReduceParallelism() == false
      && reduceWork.isUniformDistribution() == false) {
    return; // Not based on ARP and cannot assume uniform distribution, bail.
  if (reduceWork.isAutoReduceParallelism()) {
    Math.max(reduceWork.getMinReduceTasks(), targetCount));
    if (newMin < reduceWork.getMaxReduceTasks()) {
      reduceWork.setMinReduceTasks(newMin);
      // Retune the Tez edge with the raised minimum.
      reduceWork.getEdgePropRef().setAutoReduce(conf, true, newMin,
          reduceWork.getMaxReduceTasks(), conf.getLongVar(HiveConf.ConfVars.BYTESPERREDUCER));
    } else {
      // Raised minimum meets/exceeds the maximum: pin the count and disable ARP.
      reduceWork.setAutoReduceParallelism(false);
      reduceWork.setNumReduceTasks(newMin);
      reduceWork.getEdgePropRef().setAutoReduce(null, false, 0, 0, 0);
  reduceWork.setNumReduceTasks(Math.max(reduceWork.getNumReduceTasks(), targetCount));
// NOTE(review): fragment cut mid-method — the try block these catches belong to, and
// the surrounding control flow, are not visible; code kept verbatim, comments only.
job.setNumReduceTasks(rWork != null ? rWork.getNumReduceTasks().intValue() : 0);
job.setReducerClass(ExecReducer.class);
} catch (IllegalStateException e) {
  // Sampling could not produce a partition scheme; degrade to a single reducer.
  console.printInfo("Not enough sampling data.. Rolling back to single reducer task");
  rWork.setNumReduceTasks(1);
  job.setNumReduceTasks(1);
} catch (Exception e) {
  console.printError(e.toString(), "\n"
      + org.apache.hadoop.util.StringUtils.stringifyException(e));
  // Best-effort fallback to one reducer on any sampling failure.
  rWork.setNumReduceTasks(1);
  job.setNumReduceTasks(1);
if (mWork.isGatheringStats() || (rWork != null && rWork.isGatheringStats())) {
  rWork.getReducer().jobClose(job, success);
/**
 * Returns a root operator of this work's operator tree; for a ReduceWork that is
 * always the reducer.
 */
@Override
public Operator<? extends OperatorDesc> getAnyRootOperator() {
  return getReducer();
}
/** * Met cRS in pOP(parentTask with RS)-cRS-cOP(noTask) case * Create new child task for cRS-cOP and link two tasks by temporary file : pOP-FS / TS-cRS-cOP * * @param cRS * the reduce sink operator encountered * @param opProcCtx * processing context */ static void splitPlan(ReduceSinkOperator cRS, GenMRProcContext opProcCtx) throws SemanticException { // Generate a new task ParseContext parseCtx = opProcCtx.getParseCtx(); Task<? extends Serializable> parentTask = opProcCtx.getCurrTask(); MapredWork childPlan = getMapRedWork(parseCtx); Task<? extends Serializable> childTask = TaskFactory.get(childPlan); Operator<? extends OperatorDesc> reducer = cRS.getChildOperators().get(0); // Add the reducer ReduceWork rWork = new ReduceWork(); childPlan.setReduceWork(rWork); rWork.setReducer(reducer); ReduceSinkDesc desc = cRS.getConf(); childPlan.getReduceWork().setNumReduceTasks(new Integer(desc.getNumReducers())); opProcCtx.getOpTaskMap().put(reducer, childTask); splitTasks(cRS, parentTask, childTask, opProcCtx); }
// NOTE(review): fragment cut mid-loop — the "continue" statements target an enclosing
// loop that is not visible; code kept verbatim, comments only.
ReduceWork reduceWork = mrWork.getReduceWork();
// Only plans with exactly one reducer, one map alias, no prior sampling and a real
// reducer operator are eligible here.
if (reduceWork == null || reduceWork.getNumReduceTasks() != 1
    || mapWork.getAliasToWork().size() != 1 || mapWork.getSamplingType() > 0
    || reduceWork.getReducer() == null) {
  continue;
// Skip plans whose reduce side contains a group-by — presumably because sampling-based
// repartitioning would change its results; verify against the full file.
if (OperatorUtils.findSingleOperator(reduceWork.getReducer(), GroupByOperator.class) != null) {
  continue;
// Let the number of reducers be determined at runtime and enable start-time sampling.
reduceWork.setNumReduceTasks(-1);
mapWork.setSamplingType(MapWork.SAMPLING_ON_START);
// NOTE(review): fragment cut mid-method (unclosed if/try); code kept verbatim,
// comments only.
ArrayList<TypeInfo> reduceTypeInfos = new ArrayList<TypeInfo>();
// Tagged (multi-input) reduce sides are not vectorizable; record the reason and bail.
if (reduceWork.getNeedsTagging()) {
  setNodeIssue("Tagging not supported");
  return false;
String columnNullOrder;
try {
  TableDesc keyTableDesc = reduceWork.getKeyDesc();
  if (LOG.isDebugEnabled()) {
    LOG.debug("Using reduce tag " + reduceWork.getTag());
  // Value descriptor for this work's own tag.
  TableDesc valueTableDesc = reduceWork.getTagToValueDesc().get(reduceWork.getTag());
/**
 * Vectorizes the operator tree of a ReduceWork in place: walks the tree from the
 * reducer with a PreOrderWalker, replaces the root with its vectorized counterpart,
 * and copies the column maps collected by the node processor onto the work.
 *
 * @param reduceWork the reduce work to vectorize (mutated in place)
 * @param isTez      whether the plan targets Tez (passed through to the node processor)
 */
private void vectorizeReduceWork(ReduceWork reduceWork, boolean isTez)
    throws SemanticException {
  LOG.info("Vectorizing ReduceWork...");
  reduceWork.setVectorMode(true);
  // For some reason, the DefaultGraphWalker does not descend down from the reducer Operator as
  // expected. We need to descend down, otherwise it breaks our algorithm that determines
  // VectorizationContext... Do we use PreOrderWalker instead of DefaultGraphWalker.
  Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
  ReduceWorkVectorizationNodeProcessor vnp =
      new ReduceWorkVectorizationNodeProcessor(reduceColumnNames, reduceTypeInfos, isTez);
  addReduceWorkRules(opRules, vnp);
  Dispatcher disp = new DefaultRuleDispatcher(vnp, opRules, null);
  GraphWalker ogw = new PreOrderWalker(disp);
  // iterator the reduce operator tree
  ArrayList<Node> topNodes = new ArrayList<Node>();
  topNodes.add(reduceWork.getReducer());
  LOG.info("vectorizeReduceWork reducer Operator: "
      + reduceWork.getReducer().getName() + "...");
  HashMap<Node, Object> nodeOutput = new HashMap<Node, Object>();
  ogw.startWalking(topNodes, nodeOutput);
  // Necessary since we are vectorizing the root operator in reduce.
  reduceWork.setReducer(vnp.getRootVectorOp());
  reduceWork.setVectorColumnNameMap(vnp.getVectorColumnNameMap());
  reduceWork.setVectorColumnTypeMap(vnp.getVectorColumnTypeMap());
  reduceWork.setVectorScratchColumnTypeMap(vnp.getVectorScratchColumnTypeMap());
  if (LOG.isDebugEnabled()) {
    debugDisplayAllMaps(reduceWork);
  }
}
// NOTE(review): this trailing brace closes an enclosing scope whose opening is not
// visible in this chunk; kept verbatim.
}
/**
 * Sets the key serialization descriptor on the ReduceWork and registers the
 * ReduceSink's value serialization descriptor under the sink's tag.
 *
 * @param work ReduceWork to update
 * @param rs ReduceSinkOperator supplying the key/value serialization info
 */
public static void setKeyAndValueDesc(ReduceWork work, ReduceSinkOperator rs) {
  work.setKeyDesc(rs.getConf().getKeySerializeInfo());
  // A negative tag is treated as tag 0.
  int tag = Math.max(0, rs.getConf().getTag());
  // Pad the tag-to-schema list with nulls until index `tag` exists, then install
  // the value descriptor in that slot.
  List<TableDesc> tagToSchema = work.getTagToValueDesc();
  while (tagToSchema.size() <= tag) {
    tagToSchema.add(null);
  }
  tagToSchema.set(tag, rs.getConf().getValueSerializeInfo());
}
// NOTE(review): fragment cut mid-method (the if block is not closed and `id` is never
// used in the visible source); code kept verbatim, comments only.
Operator<? extends OperatorDesc> reducerOp = cplan.getReduceWork().getReducer();
String id = null;
// Join reducers consume multiple tagged inputs, so the reduce work must tag rows.
if (reducerOp instanceof JoinOperator) {
  cplan.getReduceWork().setNeedsTagging(true);