if (!fs.exists(outputFolder)) {
    fs.mkdirs(outputFolder);
}
final Path hfilePartitionFile = new Path(outputFolder, "part-r-00000_hfile");
short regionCount = (short) innerRegionSplits.size();
try (SequenceFile.Writer hfilePartitionWriter = SequenceFile.createWriter(hbaseConf,
        SequenceFile.Writer.file(hfilePartitionFile),
        SequenceFile.Writer.keyClass(RowKeyWritable.class),
        SequenceFile.Writer.valueClass(NullWritable.class))) {
    // The excerpt shows a single append using splits.get(i); the enclosing loop
    // over the split points was cut off and is reconstructed here.
    for (int i = 0; i < splits.size(); i++) {
        hfilePartitionWriter.append(
                new RowKeyWritable(KeyValueUtil.createFirstOnRow(splits.get(i), Long.MAX_VALUE)
                        .createKeyOnly(false).getKey()),
                NullWritable.get());
    }
}
public void testProjectionNoreducer(String inputDirectory) throws Exception {
    JobConf job = new JobConf();
    long onel = 1;
    Schema readerSchema = Schema.create(Schema.Type.STRING);
    AvroJob.setInputSchema(job, readerSchema);
    Path inputPath = new Path(inputDirectory + "/myavro2-m-00000.avro");
    FileStatus fileStatus = FileSystem.get(job).getFileStatus(inputPath);
    FileSplit fileSplit = new FileSplit(inputPath, 0, fileStatus.getLen(), job);
    AvroRecordReader<Utf8> recordReader = new AvroRecordReader<>(job, fileSplit);
    AvroWrapper<Utf8> inputPair = new AvroWrapper<>(null);
    NullWritable ignore = NullWritable.get();
    while (recordReader.next(inputPair, ignore)) {
        long testl = Long.parseLong(inputPair.datum().toString().split(":")[2].replace("}", "").trim());
        Assert.assertEquals(onel, testl);
    }
}
public void testTotalOrderBinarySearch() throws Exception {
    TotalOrderPartitioner<Text, NullWritable> partitioner =
            new TotalOrderPartitioner<Text, NullWritable>();
    Configuration conf = new Configuration();
    Path p = TestTotalOrderPartitioner.<Text>writePartitionFile(
            "totalorderbinarysearch", conf, splitStrings);
    conf.setBoolean(TotalOrderPartitioner.NATURAL_ORDER, false);
    conf.setClass(MRJobConfig.MAP_OUTPUT_KEY_CLASS, Text.class, Object.class);
    try {
        partitioner.setConf(conf);
        NullWritable nw = NullWritable.get();
        for (Check<Text> chk : testStrings) {
            assertEquals(chk.data.toString(), chk.part,
                    partitioner.getPartition(chk.data, nw, splitStrings.length + 1));
        }
    } finally {
        p.getFileSystem(conf).delete(p, true);
    }
}
FileSystem fs = FileSystem.get(context.getConfiguration());
if (EtlMultiOutputFormat.isRunMoveData(context)) {
    Path workPath = super.getWorkPath();
    Path baseOutDir = EtlMultiOutputFormat.getDestinationPath(context);
    log.info("Destination base path: " + baseOutDir);
    for (FileStatus f : fs.listStatus(workPath)) {
        String file = f.getPath().getName();
        log.info("work file: " + file);
        if (file.startsWith("data")) {
            // The excerpt dropped this assignment; getPartitionedPath yields the
            // relative destination for the data file.
            String partitionedFile =
                    getPartitionedPath(context, file, count.getEventCount(), count.getLastKey().getOffset());
            Path dest = new Path(baseOutDir, partitionedFile);
            if (!fs.exists(dest.getParent())) {
                mkdirs(fs, dest.getParent());
            }
            // ... (the move of f.getPath() to dest is elided in this excerpt)
        }
    }
}
// Persist the consumed offsets alongside the task output. The writer's key/value
// classes were lost in the excerpt; EtlKey/NullWritable are inferred from the append below.
SequenceFile.Writer offsetWriter = SequenceFile.createWriter(fs, context.getConfiguration(),
        new Path(super.getWorkPath(),
                EtlMultiOutputFormat.getUniqueFile(context, EtlMultiOutputFormat.OFFSET_PREFIX, "")),
        EtlKey.class, NullWritable.class);
for (String s : offsets.keySet()) {
    log.info("Avg record size for " + offsets.get(s).getTopic() + ":" + offsets.get(s).getPartition()
            + " = " + offsets.get(s).getMessageSize());
    offsetWriter.append(offsets.get(s), NullWritable.get());
}
offsetWriter.close();
super.commitTask(context);
FileSystem fs = FileSystem.get(URI.create(sequenceFile.getAbsolutePath()), conf);
Path path = new Path(sequenceFile.getAbsolutePath());
Text value = new Text();
SequenceFile.Writer writer = null;
try {
    // `key`, `kvCount`, `sequenceFileInPathNull`, and `value1` come from the
    // surrounding test fixture, which the excerpt does not show.
    writer = SequenceFile.createWriter(fs, conf, path, key.getClass(), value.getClass());
    for (int i = 0; i < kvCount; i++) {
        value.set(i + " - somestring");
        writer.append(key, value);
        if (i == 1) {
            // the excerpt repeats the append at i == 1, i.e. one record is written twice
            writer.append(key, value);
        }
    }
} finally {
    if (writer != null) {
        writer.close();
    }
}

// Second file, keyed by NullWritable.
path = new Path(sequenceFileInPathNull);
SequenceFile.Writer writer1 = SequenceFile.createWriter(fs, conf, path,
        NullWritable.class, value1.getClass());
for (int i = 0; i < kvCount; i++) {
    value1.set(i);
    writer1.append(NullWritable.get(), value1);
}
writer1.close();
public void testClose() throws IOException {
    Configuration conf = new Configuration();
    LocalFileSystem fs = FileSystem.getLocal(conf);

    Path path1 = new Path(System.getProperty("test.build.data", ".") + "/test1.seq");
    SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, path1,
            Text.class, NullWritable.class, CompressionType.BLOCK);
    writer.append(new Text("file1-1"), NullWritable.get());
    writer.append(new Text("file1-2"), NullWritable.get());
    writer.close();

    Path path2 = new Path(System.getProperty("test.build.data", ".") + "/test2.seq");
    writer = SequenceFile.createWriter(fs, conf, path2,
            Text.class, NullWritable.class, CompressionType.BLOCK);
    writer.append(new Text("file2-1"), NullWritable.get());
    writer.append(new Text("file2-2"), NullWritable.get());
    writer.close();
}
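A minimal companion sketch, not part of the test above: it reads one of the block-compressed files back to confirm that close() flushed the final block. It assumes the same fs, conf, and path1 as the snippet.

// Hedged sketch: read the records back after writer.close().
SequenceFile.Reader reader = new SequenceFile.Reader(fs, path1, conf);
try {
    Text key = new Text();
    NullWritable val = NullWritable.get();
    while (reader.next(key, val)) {
        System.out.println(key); // "file1-1", then "file1-2"
    }
} finally {
    reader.close();
}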
final String segmentId = optionsHelper.getOptionValue(OPTION_SEGMENT_ID);
final String outputPath = optionsHelper.getOptionValue(OPTION_OUTPUT_PATH);
final Path partitionFilePath = new Path(optionsHelper.getOptionValue(OPTION_PARTITION_FILE_PATH));
final String hbaseConfFile = optionsHelper.getOptionValue(AbstractHadoopJob.OPTION_HBASE_CONF_PATH);
final String counterPath = optionsHelper.getOptionValue(OPTION_COUNTER_PATH);

try (JavaSparkContext sc = new JavaSparkContext(conf)) {
    sc.sc().addSparkListener(jobListener);
    final FileSystem fs = partitionFilePath.getFileSystem(sc.hadoopConfiguration());
    if (!fs.exists(partitionFilePath)) {
        throw new IllegalArgumentException("File does not exist: " + partitionFilePath.toString());
    }

    List<RowKeyWritable> keys = new ArrayList<>(); // declaration lost in the excerpt
    try (SequenceFile.Reader reader = new SequenceFile.Reader(fs, partitionFilePath, sc.hadoopConfiguration())) {
        RowKeyWritable key = new RowKeyWritable();
        Writable value = NullWritable.get();
        while (reader.next(key, value)) {
            keys.add(key);
            key = new RowKeyWritable(); // fresh instance so the list does not hold one mutated object
        }
    }

    final Path hbaseConfFilePath = new Path(hbaseConfFile); // declaration lost in the excerpt
    final FileSystem hbaseClusterFs = hbaseConfFilePath.getFileSystem(sc.hadoopConfiguration());
    try (FSDataInputStream confInput = hbaseClusterFs.open(new Path(hbaseConfFile))) {
        Configuration hbaseJobConf = new Configuration();
        hbaseJobConf.addResource(confInput);
public static Job createTimesSquaredJob(Configuration initialConf, Vector v, int outputVectorDim,
        Path matrixInputPath, Path outputVectorPathBase,
        Class<? extends TimesSquaredMapper> mapClass,
        Class<? extends VectorSummingReducer> redClass) throws IOException {

    FileSystem fs = FileSystem.get(matrixInputPath.toUri(), initialConf);
    matrixInputPath = fs.makeQualified(matrixInputPath);
    outputVectorPathBase = fs.makeQualified(outputVectorPathBase);

    long now = System.nanoTime();
    Path inputVectorPath = new Path(outputVectorPathBase, INPUT_VECTOR + '/' + now);

    SequenceFile.Writer inputVectorPathWriter = null;
    try {
        inputVectorPathWriter = new SequenceFile.Writer(fs, initialConf, inputVectorPath,
                NullWritable.class, VectorWritable.class);
        inputVectorPathWriter.append(NullWritable.get(), new VectorWritable(v));
    } finally {
        Closeables.close(inputVectorPathWriter, false);
    }

    URI ivpURI = inputVectorPath.toUri();
    DistributedCache.setCacheFiles(new URI[] { ivpURI }, initialConf);

    Job job = HadoopUtil.prepareJob(matrixInputPath,
            new Path(outputVectorPathBase, OUTPUT_VECTOR_FILENAME),
            SequenceFileInputFormat.class, mapClass, NullWritable.class, VectorWritable.class,
            redClass, NullWritable.class, VectorWritable.class, SequenceFileOutputFormat.class,
            initialConf);
    job.setCombinerClass(redClass);
    job.setJobName("TimesSquaredJob: " + matrixInputPath);

    Configuration conf = job.getConfiguration();
    conf.set(INPUT_VECTOR, ivpURI.toString());
    conf.setBoolean(IS_SPARSE_OUTPUT, !v.isDense());
    conf.setInt(OUTPUT_VECTOR_DIMENSION, outputVectorDim);
    return job;
}
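A hedged usage sketch for the factory method above. The enclosing class name TimesSquaredJob (taken from the job-name string) and the choice to block on waitForCompletion are assumptions, not shown in the original.

// Hypothetical caller: build the job and run it synchronously.
Job job = TimesSquaredJob.createTimesSquaredJob(new Configuration(), v, v.size(),
        matrixInputPath, outputVectorPathBase,
        TimesSquaredMapper.class, VectorSummingReducer.class);
if (!job.waitForCompletion(true)) {
    throw new IllegalStateException("TimesSquared job failed");
}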
@Override
public ReadableSource<T> createSourceTarget(Configuration conf, Path path, Iterable<T> contents,
        int parallelism) throws IOException {
    FileSystem fs = FileSystem.get(conf);
    outputFn.setConfiguration(conf);
    outputFn.initialize();
    if (Text.class.equals(writableClass) && parallelism > 1) {
        FSDataOutputStream out = fs.create(path);
        byte[] newLine = "\r\n".getBytes(Charsets.UTF_8);
        double contentSize = 0;
        for (T value : contents) {
            Text txt = (Text) outputFn.map(value);
            out.write(txt.toString().getBytes(Charsets.UTF_8));
            out.write(newLine);
            contentSize++;
        }
        out.close(); // close was lost in the excerpt
        return new NLineFileSource<T>(path, this, (int) Math.ceil(contentSize / parallelism));
    } else {
        // Use sequence files, spreading records round-robin across `parallelism` writers
        fs.mkdirs(path);
        List<SequenceFile.Writer> writers = Lists.newArrayListWithExpectedSize(parallelism);
        for (int i = 0; i < parallelism; i++) {
            Path out = new Path(path, "out" + i);
            writers.add(SequenceFile.createWriter(fs, conf, out, NullWritable.class, writableClass));
        }
        int target = 0;
        for (T value : contents) {
            writers.get(target).append(NullWritable.get(), outputFn.map(value));
            target = (target + 1) % parallelism;
        }
        for (SequenceFile.Writer writer : writers) {
            writer.close();
        }
        // ... (the ReadableSource returned for this branch is elided in the excerpt)
    }
}
@Test
public void testNullKeys() throws Exception {
    JobConf conf = new JobConf(TestMapRed.class);
    FileSystem fs = FileSystem.getLocal(conf);
    HashSet<String> values = new HashSet<String>();
    String m = "AAAAAAAAAAAAAA";
    // The excerpt lost this loop header (iteration count inferred): each pass
    // stores m and shifts one character, so the set holds distinct strings.
    for (int i = 1; i < 11; ++i) {
        values.add(m);
        m = m.replace((char) ('A' + i - 1), (char) ('A' + i));
    }
    Path testdir = new Path(System.getProperty("test.build.data", "/tmp")).makeQualified(fs);
    fs.delete(testdir, true);
    Path inFile = new Path(testdir, "nullin/blah");
    SequenceFile.Writer w = SequenceFile.createWriter(fs, conf, inFile,
            NullWritable.class, Text.class, SequenceFile.CompressionType.NONE);
    Text t = new Text();
    for (String s : values) {
        t.set(s);
        w.append(NullWritable.get(), t);
    }
    w.close();
    FileInputFormat.setInputPaths(conf, inFile);
    FileOutputFormat.setOutputPath(conf, new Path(testdir, "nullout"));
    // ... (job setup and submission elided in this excerpt) ...
    SequenceFile.Reader r = new SequenceFile.Reader(fs,
            new Path(testdir, "nullout/part-00000"), conf);
    m = "AAAAAAAAAAAAAA";
    for (int i = 1; r.next(NullWritable.get(), t); ++i) {
        assertTrue("Unexpected value: " + t, values.remove(t.toString()));
        m = m.replace((char) ('A' + i - 1), (char) ('A' + i));
    }
}
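An aside, not from the test above: the reason null keys impose no ordering is that NullWritable.get() returns a singleton whose instances always compare equal. A minimal demonstration:

// NullWritable.get() always returns the same shared instance.
NullWritable a = NullWritable.get();
NullWritable b = NullWritable.get();
System.out.println(a == b);           // true: one shared instance
System.out.println(a.compareTo(b));   // 0: all NullWritables are equal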
Configuration conf = new Configuration();
OrcOutputFormat of = new OrcOutputFormat();
FileSystem fs = FileSystem.getLocal(conf);
Path root = new Path(tmpDir, "testRecordReaderDelta").makeQualified(fs);
fs.delete(root, true);
ObjectInspector inspector;
synchronized (TestOrcFile.class) {
    // The excerpt lost the body of this block; Hive's ORC tests conventionally
    // create the reflection ObjectInspector here.
    inspector = ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class,
            ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
}
JobConf job = new JobConf(conf); // declaration lost in the excerpt
job.set("mapred.min.split.size", "1");
job.set("mapred.max.split.size", "2");
job.set("mapred.input.dir", root.toString());
job.set("bucket_count", "1");
job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, MyRow.getColumnNamesProperty());
// ... (writing the delta files, computing splits, and creating the record
// reader `rr` are elided in this excerpt) ...
System.out.println("Checking " + i);
String msg = "split[" + j + "] at i=" + i;
assertEquals(msg, true, rr.next(NullWritable.get(), row));
assertEquals(msg, values[j][i], row.getFieldValue(0).toString());
assertEquals(false, rr.next(NullWritable.get(), row));
RawComparator<K> comparator = (RawComparator<K>) job.getGroupingComparator();
Arrays.sort(samples, comparator);
Path dst = new Path(TotalOrderPartitioner.getPartitionFile(conf));
FileSystem fs = dst.getFileSystem(conf);
if (fs.exists(dst)) {
    fs.delete(dst, false);
}
SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, dst,
        job.getMapOutputKeyClass(), NullWritable.class);
NullWritable nullValue = NullWritable.get();
float stepSize = samples.length / (float) numPartitions;
// The excerpt lost the sampling loop; in the usual InputSampler pattern each of
// the numPartitions - 1 cut points takes the sample nearest i * stepSize. The
// duplicate-skipping logic that consulted lastKey/lastKeyIndex is elided here.
for (int i = 1; i < numPartitions; ++i) {
    int currentKeyOffset = Math.round(stepSize * i);
    K currentKey = samples[currentKeyOffset];
    writer.append(currentKey, nullValue);
    lastKey = currentKey;
    lastKeyIndex = currentKeyOffset;
    splits.add(currentKey);
}
writer.close();
LOG.info("********************************************* ");
LOG.info(" START KEYs for new Regions: ");
long blockIndex = 0;
int datanodeId = key.get();
String dnFile = "dn" + datanodeId + "-a-" + context.getTaskAttemptID().getId();
Iterator<BlockInfo> it = values.iterator();
long startTimestamp = System.currentTimeMillis();
long endTimestamp; // declaration lost in the excerpt
String fullPath = new Path(baseOutputPath, dnFile).toString();
Text out = new Text();
while (it.hasNext()) {
    BlockInfo blockInfo = new BlockInfo(it.next());
    String blockLine = blockInfo.getBlockId() + ","
            + blockInfo.getBlockGenerationStamp() + ","
            + blockInfo.getSize();
    out.set(blockLine);
    multiOutputs.write(NullWritable.get(), out, fullPath);
    context.progress();
    blockIndex++;
    // The excerpt lost the guard around this timing block; logging every 1000
    // records is inferred from the message text.
    if (blockIndex % 1000 == 0) {
        endTimestamp = System.currentTimeMillis();
        System.out.println("Time taken to process 1000 records in ms:" + (endTimestamp - startTimestamp));
        startTimestamp = endTimestamp;
    }
}
private static <T extends WritableComparable<?>> Path writePartitionFile(
        String testname, Configuration conf, T[] splits) throws IOException {
    final FileSystem fs = FileSystem.getLocal(conf);
    final Path testdir = new Path(System.getProperty("test.build.data", "/tmp")).makeQualified(fs);
    Path p = new Path(testdir, testname + "/_partition.lst");
    TotalOrderPartitioner.setPartitionFile(conf, p);
    conf.setInt(MRJobConfig.NUM_REDUCES, splits.length + 1);
    SequenceFile.Writer w = null;
    try {
        w = SequenceFile.createWriter(fs, conf, p,
                splits[0].getClass(), NullWritable.class,
                SequenceFile.CompressionType.NONE);
        for (int i = 0; i < splits.length; ++i) {
            w.append(splits[i], NullWritable.get());
        }
    } finally {
        if (null != w) {
            w.close();
        }
    }
    return p;
}
Path intStringSeq = new Path(TEST_DATA_DIR + "/data/intString.seq");
LOG.info("Creating data file: " + intStringSeq);
SequenceFile.Writer seqFileWriter = SequenceFile.createWriter(
        intStringSeq.getFileSystem(hiveConf), hiveConf, intStringSeq,
        NullWritable.class, BytesWritable.class);
// The excerpt lost the enclosing loop and branch condition; one branch writes a
// deliberately corrupt record, the other a properly serialized IntString.
if (/* condition elided in the excerpt: marks the record to corrupt */) {
    seqFileWriter.append(NullWritable.get(), new BytesWritable("bad record".getBytes()));
} else {
    out.reset();
    intString.write(protocol);
    BytesWritable bytesWritable = new BytesWritable(out.toByteArray());
    seqFileWriter.append(NullWritable.get(), bytesWritable);
}
seqFileWriter.close();

// The CREATE TABLE statement was truncated in the excerpt; only its trailing
// outputformat clause survives.
Assert.assertEquals(0, driver.run("create table test_bad_records ..."
        + " outputformat 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'")
        .getResponseCode());
Assert.assertEquals(0, driver.run("load data local inpath '" + intStringSeq.getParent()
        + "' into table test_bad_records").getResponseCode());
@Override
public void write(String line) throws IOException {
    if (writer == null) {
        tmpFile = File.createTempFile("seq-", ".dat");
        writer = SequenceFile.createWriter(new Configuration(),
                Writer.file(new Path(tmpFile.toURI())),
                Writer.keyClass(NullWritable.class),
                Writer.valueClass(Text.class));
    }
    text.set(line);
    writer.append(NullWritable.get(), text);
}
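A hedged companion sketch, not part of the class above: it reads the buffered lines back using the option-based Reader API that mirrors the Writer options used here, and assumes the same tmpFile after the writer has been closed.

// Read the spilled lines back from the temp file.
try (SequenceFile.Reader reader = new SequenceFile.Reader(new Configuration(),
        SequenceFile.Reader.file(new Path(tmpFile.toURI())))) {
    NullWritable key = NullWritable.get();
    Text line = new Text();
    while (reader.next(key, line)) {
        System.out.println(line);
    }
}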
public void map(LongWritable key, Text value, Context context)
        throws InterruptedException, IOException {
    Path inputPath = new Path(value.toString());
    List<Path> inputPaths = null;
    try {
        inputPaths = Backup.getAllPaths(inputFs, inputPath);
        if (inputPaths != null && inputPaths.size() > 0) {
            for (Path p : inputPaths) {
                copyFile(p, context);
            }
        }
    } catch (Exception e) {
        LOG.error("Directory getPaths failed", e);
        context.getCounter(ReportStats.DIRECTORY_GET_PATHS_FAILED).increment(1L);
        context.write(new Text(value.toString()), NullWritable.get());
        return;
    }
}
private void createRCFile(final String fileName, final int numRecords,
        final int maxColumns, boolean addNullValue) throws IOException {
    // Write the sequence file
    SequenceFile.Metadata metadata = getMetadataForRCFile();
    Configuration conf = new Configuration();
    conf.set(RCFile.COLUMN_NUMBER_CONF_STR, String.valueOf(maxColumns));
    Path inputFile = dfs.makeQualified(new Path(testDirectory, fileName));
    RCFile.Writer rcFileWriter = new RCFile.Writer(dfs, conf, inputFile, null, metadata, null);
    for (int row = 0; row < numRecords; row++) {
        BytesRefArrayWritable dataWrite = new BytesRefArrayWritable(maxColumns);
        dataWrite.resetValid(maxColumns);
        for (int column = 0; column < maxColumns; column++) {
            Writable sampleText = new Text("ROW-NUM:" + row + ", COLUMN-NUM:" + column);
            // Set the last column of the last row as null
            if (addNullValue && column == maxColumns - 1 && row == numRecords - 1) {
                sampleText = NullWritable.get();
            }
            ByteArrayDataOutput dataOutput = ByteStreams.newDataOutput();
            sampleText.write(dataOutput);
            dataWrite.set(column, new BytesRefWritable(dataOutput.toByteArray()));
        }
        rcFileWriter.append(dataWrite);
    }
    rcFileWriter.close();
}
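A hedged sketch of the matching read side, not from the original test: it assumes the same dfs, conf, and inputFile, and uses the standard RCFile.Reader row-iteration API. Because NullWritable serializes to zero bytes, the null cell written above comes back as an empty BytesRefWritable.

// Iterate the rows written by createRCFile and report their column counts.
RCFile.Reader reader = new RCFile.Reader(dfs, inputFile, conf);
try {
    LongWritable rowID = new LongWritable();
    BytesRefArrayWritable cols = new BytesRefArrayWritable();
    while (reader.next(rowID)) {
        reader.getCurrentRow(cols);
        System.out.println("row " + rowID + " has " + cols.size() + " columns");
    }
} finally {
    reader.close();
}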