int count = 0;
for (Text val : values) {
    String[] items = val.toString().toLowerCase().split("\\s+");
    // Remember the last word of each value.
    lastwords.add(items[items.length - 1]);
    for (String item : items) {
        count++; // total number of words seen for this key
    }
}
struct.add(0, key.get());
struct.add(1, count);
List<List<Object>> lastWordInfoList = new ArrayList<List<Object>>();
struct.add(2, lastWordInfoList);
struct.add(3, wordCounts);
row = serde.serialize(struct, oip);
context.write(NullWritable.get(), row);
@Override
protected void setup(Context context) throws IOException, InterruptedException {
    this.outKey = new AvroKey<>();
    this.deltaComparatorOptional = Optional.absent();
    Configuration conf = context.getConfiguration();
    String deltaSchemaProviderClassName = conf.get(DELTA_SCHEMA_PROVIDER);
    if (deltaSchemaProviderClassName != null) {
        this.deltaFieldNamesProvider = GobblinConstructorUtils.invokeConstructor(
            AvroDeltaFieldNameProvider.class, deltaSchemaProviderClassName, conf);
        this.deltaComparatorOptional = Optional.of(new AvroValueDeltaSchemaComparator(deltaFieldNamesProvider));
    }
}
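Because the comparator is wrapped in a Guava Optional rather than left null, code later in the reducer has to guard on presence before using it. A minimal sketch of that idiom (the reduce body is not shown above, so prevRecord and newRecord are hypothetical stand-ins):

// Guava Optional guard (sketch): only consult the delta comparator when one was configured.
if (this.deltaComparatorOptional.isPresent()) {
    int cmp = this.deltaComparatorOptional.get().compare(prevRecord, newRecord);
    if (cmp < 0) {
        // newRecord supersedes prevRecord under the configured delta fields
    }
}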
/**
 * Handles common parameter initialization that a subclass might want to leverage.
 * @param context the task context
 * @param conf the job configuration
 */
protected void doSetup(Context context, Configuration conf) {
    // If a custom separator has been used, decode it back from Base64 encoding.
    separator = conf.get(ImportTsv.SEPARATOR_CONF_KEY);
    if (separator == null) {
        separator = ImportTsv.DEFAULT_SEPARATOR;
    } else {
        separator = Bytes.toString(Base64.getDecoder().decode(separator));
    }
    // Should never get 0, as this is set to a valid value in the job configuration.
    ts = conf.getLong(ImportTsv.TIMESTAMP_CONF_KEY, 0);
    skipBadLines = context.getConfiguration().getBoolean(ImportTsv.SKIP_LINES_CONF_KEY, true);
    badLineCount = context.getCounter("ImportTsv", "Bad Lines");
}
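The submission-side counterpart (a sketch, not taken from the snippet above): the raw separator is Base64-encoded before it is stored in the job configuration, so that characters such as '\t' or '|' survive the trip through the configuration; this matches the decode in doSetup(). Assumes the same ImportTsv.SEPARATOR_CONF_KEY constant.

static void setSeparator(org.apache.hadoop.conf.Configuration conf, String rawSeparator) {
    conf.set(ImportTsv.SEPARATOR_CONF_KEY,
        java.util.Base64.getEncoder()
            .encodeToString(rawSeparator.getBytes(java.nio.charset.StandardCharsets.UTF_8)));
}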
@Override
protected void setup(Reducer<BytesWritable, BytesWritable, BytesWritable, BytesWritable>.Context context)
        throws IOException, InterruptedException {
    this.timeperiodCountOutputRoot =
        new Path(context.getConfiguration().get(PairMutualInformation.TIMEPERIOD_COUNT_OUTPUT_ROOT));
    if (!HadoopToolsUtil.getFileSystem(this.timeperiodCountOutputRoot).mkdirs(this.timeperiodCountOutputRoot)) {
        throw new IOException("Couldn't create: " + this.timeperiodCountOutputRoot);
    }
}
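The same fail-fast mkdirs guard with the plain Hadoop API (a sketch; HadoopToolsUtil above is a project-specific helper, so Path.getFileSystem is substituted here):

static void ensureDir(org.apache.hadoop.conf.Configuration conf, org.apache.hadoop.fs.Path dir)
        throws java.io.IOException {
    org.apache.hadoop.fs.FileSystem fs = dir.getFileSystem(conf);
    // mkdirs() returns false when the directory could not be created.
    if (!fs.mkdirs(dir)) {
        throw new java.io.IOException("Couldn't create: " + dir);
    }
}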
@Override
protected void setup(Context ctx) throws IOException, InterruptedException {
    similarity = ClassUtils.instantiateAs(ctx.getConfiguration().get(SIMILARITY_CLASSNAME),
        VectorSimilarityMeasure.class);
    numberOfColumns = ctx.getConfiguration().getInt(NUMBER_OF_COLUMNS, -1);
    Preconditions.checkArgument(numberOfColumns > 0,
        "Number of columns must be greater than 0! But numberOfColumns = " + numberOfColumns);
    excludeSelfSimilarity = ctx.getConfiguration().getBoolean(EXCLUDE_SELF_SIMILARITY, false);
    norms = Vectors.read(new Path(ctx.getConfiguration().get(NORMS_PATH)), ctx.getConfiguration());
    threshold = Double.parseDouble(ctx.getConfiguration().get(THRESHOLD));
}
@Override
public void reduce(ShortWritable key, Iterable<Text> values, Context context)
        throws IOException, InterruptedException {
    String columnName = columns[key.get()];
    HashSet<ByteArray> set = new HashSet<ByteArray>();
    for (Text textValue : values) {
        // Copy the bytes out: the framework reuses the same Text instance across iterations.
        ByteArray value = new ByteArray(Bytes.copy(textValue.getBytes(), 0, textValue.getLength()));
        set.add(value);
    }
    Configuration conf = context.getConfiguration();
    FileSystem fs = FileSystem.get(conf);
    String outputPath = conf.get(BatchConstants.OUTPUT_PATH);
    FSDataOutputStream out = fs.create(new Path(outputPath, columnName));
    try {
        for (ByteArray value : set) {
            out.write(value.data);
            out.write('\n');
        }
    } finally {
        out.close();
    }
}
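The Bytes.copy above matters: Hadoop reuses a single Text object while iterating the values, so storing the object itself would fill the set with references to one reused buffer. A minimal self-contained illustration of the safe copy-before-store pattern (standard Hadoop API only; DistinctValuesReducer is a hypothetical name):

import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

// Hypothetical reducer demonstrating the copy-before-store pattern.
public class DistinctValuesReducer extends Reducer<Text, Text, Text, Text> {
    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        Set<String> distinct = new HashSet<>();
        for (Text value : values) {
            // toString() copies the underlying bytes, so the reused Text buffer is safe to store.
            distinct.add(value.toString());
        }
        context.write(key, new Text(String.join(",", distinct)));
    }
}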
context.progress();
final IncrementalIndex persistIndex = index;
// persistIndex is handed off asynchronously via persistFutures.

final FileSystem outputFS = new Path(config.getSchema().getIOConfig().getSegmentOutputPath())
    .getFileSystem(context.getConfiguration());
final DataSegment segment = JobHelper.serializeOutIndex(
    segmentTemplate,
    context.getConfiguration(),
    context,
    mergedBase,
    JobHelper.makeFileNamePath(
        new Path(config.getSchema().getIOConfig().getSegmentOutputPath()),
        outputFS,
        segmentTemplate,
        context.getTaskAttemptID(),
        config.DATA_SEGMENT_PUSHER
    ),
    FileSystem.get(descriptorPath.toUri(), context.getConfiguration()),
    descriptorPath
);
@Override
protected void cleanup(Context context) throws IOException, InterruptedException {
    final String output = context.getConfiguration().getStrings(TIMEINDEX_LOCATION_PROP)[0];
    final Path indexOut = new Path(output + "/" + context.getTaskAttemptID());
    System.out.println("Writing time index to: " + indexOut);
    System.out.println("Timemap contains: " + this.timeMap.size());
    CountTweetsInTimeperiod.writeTimeIndex(this.timeMap, indexOut);
}
@Override
protected void doSetup(Context context) throws IOException {
    super.bindCurrentConfiguration(context.getConfiguration());
    KylinConfig config = AbstractHadoopJob.loadKylinPropsAndMetadata();
    String cubeName = context.getConfiguration().get(BatchConstants.CFG_CUBE_NAME).toUpperCase(Locale.ROOT);
    CubeInstance cube = CubeManager.getInstance(config).getCube(cubeName);
    CubeDesc cubeDesc = cube.getDescriptor();
    List<MeasureDesc> measuresDescs = cubeDesc.getMeasures();
    codec = new BufferedMeasureCodec(measuresDescs);
    aggs = new MeasureAggregators(measuresDescs);
    input = new Object[measuresDescs.size()];
    result = new Object[measuresDescs.size()];
    outputKey = new Text();
    outputValue = new Text();
}
@Override
protected void doSetup(Context context) throws IOException {
    super.bindCurrentConfiguration(context.getConfiguration());
    cubeName = context.getConfiguration().get(BatchConstants.CFG_CUBE_NAME).toUpperCase(Locale.ROOT);
    // Only used in the Build job, not in the Merge job.
    cuboidLevel = context.getConfiguration().getInt(BatchConstants.CFG_CUBE_CUBOID_LEVEL, 0);
    KylinConfig config = AbstractHadoopJob.loadKylinPropsAndMetadata();
    cubeDesc = CubeManager.getInstance(config).getCube(cubeName).getDescriptor();
    measuresDescs = cubeDesc.getMeasures();
    codec = new BufferedMeasureCodec(measuresDescs);
    aggs = new MeasureAggregators(measuresDescs);
    input = new Object[measuresDescs.size()];
    result = new Object[measuresDescs.size()];
    List<Integer> needAggMeasuresList = Lists.newArrayList();
    for (int i = 0; i < measuresDescs.size(); i++) {
        // At the base cuboid (level 0) every measure is aggregated; above it, skip
        // measures whose type only aggregates in the base cuboid.
        if (cuboidLevel == 0
                || !measuresDescs.get(i).getFunction().getMeasureType().onlyAggrInBaseCuboid()) {
            needAggMeasuresList.add(i);
        }
    }
    needAggrMeasures = new int[needAggMeasuresList.size()];
    for (int i = 0; i < needAggMeasuresList.size(); i++) {
        needAggrMeasures[i] = needAggMeasuresList.get(i);
    }
}
/**
 * Merges the IP addresses of the same status.
 */
@Override
protected void reduce(Text key, Iterable<Text> values, Context context)
        throws IOException, InterruptedException {
    Set<String> addressSet = new HashSet<>();
    for (Text val : values) {
        addressSet.add(val.toString());
    }
    context.write(key, new Text(String.join(" ", addressSet)));
}
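A matching mapper (a hypothetical sketch, since only the reducer appears above) would emit status-to-IP pairs, here assumed to be parsed from whitespace-separated lines of the form "<ip> <status>":

import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Hypothetical mapper: assumes whitespace-separated lines of "<ip> <status>".
public class StatusAddressMapper extends Mapper<LongWritable, Text, Text, Text> {
    private final Text status = new Text();
    private final Text address = new Text();

    @Override
    protected void map(LongWritable offset, Text line, Context context)
            throws IOException, InterruptedException {
        String[] fields = line.toString().trim().split("\\s+");
        if (fields.length < 2) {
            return; // skip malformed lines
        }
        address.set(fields[0]);
        status.set(fields[1]);
        context.write(status, address); // key: status, value: IP address
    }
}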
@Override
protected void doCleanup(Context context) throws IOException, InterruptedException {
    long grandTotal = 0;
    for (HLLCounter hll : cuboidHLLMap.values()) {
        grandTotal += hll.getCountEstimate();
    }
    double mapperOverlapRatio = grandTotal == 0 ? 0 : (double) totalRowsBeforeMerge / grandTotal;
    CubeStatsWriter.writePartialCuboidStatistics(context.getConfiguration(), new Path(output), //
        cuboidHLLMap, samplingPercentage, baseCuboidRowCountInMappers.size(),
        mapperOverlapRatio, taskId);
}
@Override
public void reduce(Text key, Iterable<LongWritable> values, Context context)
        throws IOException, InterruptedException {
    long sum = 0;
    for (LongWritable count : values) {
        sum += count.get();
    }
    context.write(key, new LongWritable(sum));
}
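For context, a minimal driver wiring a sum reducer like this into a job (a sketch; WordCountDriver, WordCountReducer, and TokenizerMapper are assumed names, not taken from the snippet):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCountDriver {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "word count");
        job.setJarByClass(WordCountDriver.class);
        job.setMapperClass(TokenizerMapper.class);    // assumed mapper emitting <word, 1>
        job.setCombinerClass(WordCountReducer.class); // a sum reducer doubles as a combiner
        job.setReducerClass(WordCountReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}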
@Override
protected void reduce(Text key, Iterable<LongWritable> values, Context context)
        throws IOException, InterruptedException {
    long count = 0;
    for (LongWritable value : values) {
        count += value.get();
    }
    result.datum().put("key", key.toString());
    result.datum().put("value", count);
    context.write(result, NullWritable.get());
}
@Override
protected void reduce(Text key, Iterable<LongWritable> values, Context context)
        throws IOException, InterruptedException {
    long sum = 0;
    for (LongWritable value : values) {
        sum += value.get();
    }
    resultKey.datum(key.toString());
    resultValue.datum(sum);
    context.write(resultKey, resultValue);
}
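Reducers like the two above, which emit Avro-wrapped keys and values, need their output schemas declared on the job; a sketch using the Avro MapReduce API (the string/long schema choice is an assumption matching the datum types used here):

import org.apache.avro.Schema;
import org.apache.avro.mapreduce.AvroJob;
import org.apache.avro.mapreduce.AvroKeyValueOutputFormat;
import org.apache.hadoop.mapreduce.Job;

// Sketch: declare the Avro key/value schemas that the reducer's output relies on.
static void configureAvroOutput(Job job) {
    AvroJob.setOutputKeySchema(job, Schema.create(Schema.Type.STRING));
    AvroJob.setOutputValueSchema(job, Schema.create(Schema.Type.LONG));
    job.setOutputFormatClass(AvroKeyValueOutputFormat.class);
}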
@Override
public void reduce(Text productID, Iterable<Text> locations, Context context)
        throws IOException, InterruptedException {
    // Count the distinct locations for each product.
    Set<String> set = new HashSet<String>();
    for (Text location : locations) {
        set.add(location.toString());
    }
    context.write(productID, new LongWritable(set.size()));
}
/** {@inheritDoc} */
@Override
protected void setup(Context ctx) throws IOException, InterruptedException {
    X.println("___ Reducer: " + ctx.getTaskAttemptID());
    String taskId = ctx.getTaskAttemptID().toString();
    String workDir = FileSystem.getLocal(ctx.getConfiguration()).getWorkingDirectory().toString();
    assertNull(taskWorkDirs.put(workDir, taskId));
}