/**
 * Returns the input schemas keyed by the input path they belong to.
 * Several inputs may be configured, so the map can hold more than one schema.
 * {@link #createPlan()} must have been called beforehand; the plan-existence
 * check runs before the map is returned.
 *
 * @return map from input path to the schema for that input
 */
public Map<String,Schema> getInputSchemasByPath() {
  checkPlanExists();
  final Map<String,Schema> schemasByPath = _inputSchemasByPath;
  return schemasByPath;
}
/**
 * Builds the execution plan by running each planning step in a fixed order.
 * A plan can only be created once per instance.
 *
 * @throws IOException if a planning step fails with an I/O error
 * @throws RuntimeException if a plan has already been created
 */
public void createPlan() throws IOException {
  if (_planExists) {
    throw new RuntimeException("Plan already exists");
  }

  // The flag is raised before the steps run, not after them; keep it here so
  // that the steps below observe the same state they did originally.
  _planExists = true;

  // Load what is currently on disk.
  loadInputData();
  loadOutputData();

  // Derive the plan from the loaded data; the order of these calls is preserved.
  determineAvailableInputDates();
  determineDateRange();
  determineInputsToProcess();
  determineInputSchemas();
  determineNumReducers();
}
/**
 * Determines what output data already exists. Inputs will not be consumed if the output already exists.
 * The result of {@code getDailyData(getOutputPath())} is stored in {@code _outputPathsByDate}.
 *
 * @throws IOException if the existing output data cannot be listed
 */
private void loadOutputData() throws IOException {
  // Pass the path as a format argument rather than concatenating it into the
  // format string: a '%' in the path would otherwise be parsed as a format
  // specifier and either throw or corrupt the log message.
  _log.info(String.format("Checking output data in %s", getOutputPath()));
  _outputPathsByDate = getDailyData(getOutputPath());
}
PartitionPreservingExecutionPlanner planner = new PartitionPreservingExecutionPlanner(getFileSystem(),getProperties()); planner.setInputPaths(getInputPaths()); planner.setOutputPath(getOutputPath()); planner.setStartDate(getStartDate()); planner.setEndDate(getEndDate()); planner.setDaysAgo(getDaysAgo()); planner.setNumDays(getNumDays()); planner.setMaxToProcess(getMaxToProcess()); planner.setFailOnMissing(isFailOnMissing()); planner.createPlan(); if (planner.getInputsToProcess().size() == 0) iterations, getMaxIterations(), planner.getInputsToProcess().size())); for (DatePath input : planner.getInputsToProcess()) PartitionPreservingSchemas fpSchemas = new PartitionPreservingSchemas(getSchemas(), planner.getInputSchemasByPath(), getOutputSchemaName(), getOutputSchemaNamespace() ); for (Date input : planner.getDatesToProcess()) numReducers = planner.getNumReducers(); _log.info(String.format("Using %d reducers (computed)",numReducers)); int avgReducersPerInput = (int)Math.ceil(numReducers/(double)planner.getDatesToProcess().size()); if (!planner.getNeedsAnotherPass())
int newDataCount = 0; Calendar cal = Calendar.getInstance(PathUtils.timeZone); for (Date currentDate=getDateRange().getBeginDate(); currentDate.compareTo(getDateRange().getEndDate()) <= 0; ) List<DatePath> inputs = getAvailableInputsByDate().get(currentDate); if (inputs != null) if (getMaxToProcess() != null && newDataCount >= getMaxToProcess()) else if (isFailOnMissing())
/**
 * Reports whether a further pass is needed after this one.
 * The number of inputs handled in a single run may be capped, so covering the
 * whole desired date range can take several runs.
 * {@link #createPlan()} must have been called beforehand.
 *
 * @return true if another pass is required, false otherwise
 */
public boolean getNeedsAnotherPass() {
  checkPlanExists();
  final boolean anotherPassNeeded = _needAnotherPass;
  return anotherPassNeeded;
}
/**
 * Returns the inputs selected for processing by the plan.
 * {@link #createPlan()} must have been called beforehand; the plan-existence
 * check runs before the list is returned.
 *
 * @return the inputs to process
 */
public List<DatePath> getInputsToProcess() {
  checkPlanExists();
  final List<DatePath> selectedInputs = _inputsToProcess;
  return selectedInputs;
}
/**
 * Returns the reducer count chosen by the plan, which is based on the size of
 * the input data.
 * {@link #createPlan()} must have been called beforehand.
 *
 * @return the number of reducers to use
 */
public int getNumReducers() {
  checkPlanExists();
  final int reducerCount = _numReducers;
  return reducerCount;
}
/**
 * Returns the schemas of the inputs.
 * Several inputs may be configured, so more than one schema can be present.
 * {@link #createPlan()} must have been called beforehand; the plan-existence
 * check runs before the list is returned.
 *
 * @return the input schemas
 */
public List<Schema> getInputSchemas() {
  checkPlanExists();
  final List<Schema> schemas = _inputSchemas;
  return schemas;
}
/**
 * Returns the dates of the inputs selected for processing.
 * Each date appears once, and the dates are returned in their natural order,
 * because they are collected through a {@link TreeSet}. A new list is built on
 * every call.
 * {@link #createPlan()} must have been called beforehand.
 *
 * @return the distinct dates to process
 */
public List<Date> getDatesToProcess() {
  checkPlanExists();

  // A sorted set both removes duplicate dates and orders them.
  Set<Date> uniqueDates = new TreeSet<Date>();
  for (DatePath selectedInput : _inputsToProcess) {
    Date inputDate = selectedInput.getDate();
    uniqueDates.add(inputDate);
  }

  List<Date> orderedDates = new ArrayList<Date>(uniqueDates);
  return orderedDates;
}