public SequenceFileReader(URI uri, Configuration c) throws IOException {
  this(new SequenceFile.Reader(FileSystem.get(uri, c), new Path(uri.toString()), c), c);
}
/**
 * Read the metadata from a Hadoop SequenceFile.
 *
 * @param fs The filesystem to read from
 * @param path The file to read from
 * @return The metadata from this file
 */
public static Map<String, String> getMetadataFromSequenceFile(FileSystem fs, Path path) {
  try {
    Configuration conf = new Configuration();
    conf.setInt("io.file.buffer.size", 4096);
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
    SequenceFile.Metadata meta = reader.getMetadata();
    reader.close();
    TreeMap<Text, Text> map = meta.getMetadata();
    Map<String, String> values = new HashMap<String, String>();
    for (Map.Entry<Text, Text> entry : map.entrySet()) {
      values.put(entry.getKey().toString(), entry.getValue().toString());
    }
    return values;
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
}
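// A hedged usage sketch (not from the source): read and print the metadata header of a
// SequenceFile on the default filesystem. The path "/tmp/example.seq" is an assumed example.
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(conf);
Map<String, String> metadata = getMetadataFromSequenceFile(fs, new Path("/tmp/example.seq"));
for (Map.Entry<String, String> entry : metadata.entrySet()) {
  System.out.println(entry.getKey() + " = " + entry.getValue());
}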
/**
 * Retrieves a {@link Token} from a given sequence file for a specified user. The sequence file should contain a list
 * of key, value pairs where each key corresponds to a user and each value corresponds to a {@link Token} for that
 * user.
 *
 * @param userNameKey The name of the user to retrieve a {@link Token} for
 * @param tokenFilePath The path to the sequence file containing the {@link Token}s
 *
 * @return A {@link Token} for the given user name
 */
public static Optional<Token<?>> getTokenFromSeqFile(String userNameKey, Path tokenFilePath) throws IOException {
  log.info("Reading tokens from sequence file " + tokenFilePath);
  try (Closer closer = Closer.create()) {
    FileSystem localFs = FileSystem.getLocal(new Configuration());
    @SuppressWarnings("deprecation")
    SequenceFile.Reader tokenReader =
        closer.register(new SequenceFile.Reader(localFs, tokenFilePath, localFs.getConf()));
    Text key = new Text();
    Token<?> value = new Token<>();
    while (tokenReader.next(key, value)) {
      log.debug("Found token for user: " + key);
      if (key.toString().equals(userNameKey)) {
        return Optional.<Token<?>> of(value);
      }
    }
  }
  log.warn("Did not find any tokens for user " + userNameKey);
  return Optional.absent();
}
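// A hedged usage sketch (user name and path are assumptions, not from the source): look up a
// user's token in a local token sequence file and report its kind if one is found.
Optional<Token<?>> token = getTokenFromSeqFile("alice", new Path("/tmp/tokens.seq"));
if (token.isPresent()) {
  System.out.println("Found token of kind: " + token.get().getKind());
}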
@Override
void doRecover(Path srcPath, long nBytes) throws Exception {
  SequenceFile.Reader reader = new SequenceFile.Reader(this.hdfsConfig,
      SequenceFile.Reader.file(srcPath), SequenceFile.Reader.length(nBytes));
  try {
    Writable key = (Writable) this.format.keyClass().newInstance();
    Writable value = (Writable) this.format.valueClass().newInstance();
    // Replay every record from the partially written source file into the recovery writer.
    while (reader.next(key, value)) {
      this.writer.append(key, value);
    }
  } finally {
    reader.close();
  }
}
public void performSequenceFileRead(FileSystem fs, int count, Path file) throws IOException {
  SequenceFile.Reader reader = new SequenceFile.Reader(fs, file, conf);
  try {
    ByteWritable key = new ByteWritable();
    BytesRefArrayWritable val = new BytesRefArrayWritable();
    // Read up to count records, reusing the same key/value instances.
    for (int i = 0; i < count; i++) {
      reader.next(key, val);
    }
  } finally {
    reader.close();
  }
}
private void readPartitionFile(FileSystem fs, Configuration conf, Path path) throws IOException {
  @SuppressWarnings("deprecation")
  SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
  ImmutableBytesWritable key = new ImmutableBytesWritable();
  partitions = new ArrayList<>();
  while (reader.next(key)) {
    partitions.add(new ImmutableBytesWritable(key.copyBytes()));
  }
  reader.close();
  if (!Ordering.natural().isOrdered(partitions)) {
    throw new IOException("Partitions are not ordered!");
  }
}
@Ignore("convenient trial tool for dev") @Test public void test() throws IOException, InterruptedException { Configuration hconf = HadoopUtil.getCurrentConfiguration(); HiveToBaseCuboidMapper mapper = new HiveToBaseCuboidMapper(); Context context = MockupMapContext.create(hconf, metadataUrl, cubeName, null); mapper.doSetup(context); Reader reader = new Reader(hconf, SequenceFile.Reader.file(srcPath)); Writable key = (Writable) ReflectionUtils.newInstance(reader.getKeyClass(), hconf); Text value = new Text(); while (reader.next(key, value)) { mapper.map(key, value, context); } reader.close(); }
@Override
public boolean validateInput(FileSystem fs, HiveConf conf, List<FileStatus> files) throws IOException {
  if (files.size() <= 0) {
    return false;
  }
  for (int fileId = 0; fileId < files.size(); fileId++) {
    SequenceFile.Reader reader = null;
    try {
      reader = new SequenceFile.Reader(fs, files.get(fileId).getPath(), conf);
      reader.close();
      reader = null;
    } catch (IOException e) {
      return false;
    } finally {
      IOUtils.closeStream(reader);
    }
  }
  return true;
}
@Override
public Void call() throws Exception {
  Configuration conf = new Configuration(ParallelRunner.this.fs.getConf());
  WritableShimSerialization.addToHadoopConfiguration(conf);
  try (@SuppressWarnings("deprecation")
      SequenceFile.Reader reader = new SequenceFile.Reader(ParallelRunner.this.fs, inputFilePath, conf)) {
    Writable key = keyClass.newInstance();
    T state = stateClass.newInstance();
    while (reader.next(key)) {
      state = (T) reader.getCurrentValue(state);
      states.add(state);
      state = stateClass.newInstance();
    }
    if (deleteAfter) {
      HadoopUtils.deletePath(ParallelRunner.this.fs, inputFilePath, false);
    }
  }
  return null;
}
}), "Deserialize state from file " + inputFilePath));
public void initIOContext(FileSplit split, JobConf job, Class inputFormatClass, RecordReader recordReader)
    throws IOException {
  boolean blockPointer = false;
  long blockStart = -1;
  FileSplit fileSplit = split;
  Path path = fileSplit.getPath();
  FileSystem fs = path.getFileSystem(job);
  if (inputFormatClass.getName().contains("SequenceFile")) {
    SequenceFile.Reader in = new SequenceFile.Reader(fs, path, job);
    blockPointer = in.isBlockCompressed();
    in.sync(fileSplit.getStart());
    blockStart = in.getPosition();
    in.close();
  } else if (recordReader instanceof RCFileRecordReader) {
    blockPointer = true;
    blockStart = ((RCFileRecordReader) recordReader).getStart();
  } else if (inputFormatClass.getName().contains("RCFile")) {
    blockPointer = true;
    RCFile.Reader in = new RCFile.Reader(fs, path, job);
    in.sync(fileSplit.getStart());
    blockStart = in.getPosition();
    in.close();
  }
  this.jobConf = job;
  this.initIOContext(blockStart, blockPointer, path.makeQualified(fs));
  this.initIOContextSortedProps(split, recordReader, job);
}
public CubeStatsResult(Path path, int precision) throws IOException {
  Configuration hadoopConf = HadoopUtil.getCurrentConfiguration();
  Option seqInput = SequenceFile.Reader.file(path);
  try (Reader reader = new SequenceFile.Reader(hadoopConf, seqInput)) {
    LongWritable key = (LongWritable) ReflectionUtils.newInstance(reader.getKeyClass(), hadoopConf);
    BytesWritable value = (BytesWritable) ReflectionUtils.newInstance(reader.getValueClass(), hadoopConf);
    while (reader.next(key, value)) {
      if (key.get() == 0L) {
        percentage = Bytes.toInt(value.getBytes());
      } else if (key.get() == -1) {
        mapperOverlapRatio = Bytes.toDouble(value.getBytes());
      } else if (key.get() == -2) {
        mapperNumber = Bytes.toInt(value.getBytes());
      } else if (key.get() == -3) {
        sourceRecordCount = Bytes.toLong(value.getBytes());
      } else if (key.get() > 0) {
        HLLCounter hll = new HLLCounter(precision);
        ByteArray byteArray = new ByteArray(value.getBytes());
        hll.readRegisters(byteArray.asBuffer());
        counterMap.put(key.get(), hll);
      }
    }
  }
}
/**
 * Check whether a partition file exists for the hfile; if so, replace the table splits so the job gets more reducers.
 * @param conf the job configuration
 * @param path the hfile partition file
 * @throws IOException
 */
@SuppressWarnings("deprecation")
private void reconfigurePartitions(Configuration conf, Path path) throws IOException {
  FileSystem fs = path.getFileSystem(conf);
  if (fs.exists(path)) {
    try (SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf)) {
      int partitionCount = 0;
      Writable key = (Writable) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
      Writable value = (Writable) ReflectionUtils.newInstance(reader.getValueClass(), conf);
      while (reader.next(key, value)) {
        partitionCount++;
      }
      TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), path);
      // The number of reduce tasks should be one more than the number of partition keys
      job.setNumReduceTasks(partitionCount + 1);
    }
  } else {
    logger.info("File '" + path.toString() + "' doesn't exist, will not reconfigure hfile partitions");
  }
}
/**
 * Open and read just the metadata header from a SequenceFile.
 *
 * @param fs The FileSystem the SequenceFile is on.
 * @param path The path to the file.
 * @param conf The Hadoop configuration.
 * @return The metadata header.
 * @throws IOException If the metadata cannot be read from the file.
 */
private static Metadata getMetadata(FileSystem fs, Path path, Configuration conf) throws IOException {
  SequenceFile.Reader metadataReader = null;
  try {
    metadataReader = new SequenceFile.Reader(fs, path, conf);
    return metadataReader.getMetadata();
  } finally {
    if (null != metadataReader) {
      metadataReader.close();
    }
  }
}
/**
 * Clones the attributes (like compression) of the input file and creates a
 * corresponding Writer.
 * @param inputFile the path of the input file whose attributes should be
 * cloned
 * @param outputFile the path of the output file
 * @param prog the Progressable to report status during the file write
 * @return Writer
 * @throws IOException
 */
public Writer cloneFileAttributes(Path inputFile, Path outputFile, Progressable prog) throws IOException {
  Reader reader = new Reader(conf, Reader.file(inputFile), new Reader.OnlyHeaderOption());
  CompressionType compress = reader.getCompressionType();
  CompressionCodec codec = reader.getCompressionCodec();
  reader.close();

  Writer writer = createWriter(conf, Writer.file(outputFile), Writer.keyClass(keyClass),
      Writer.valueClass(valClass), Writer.compression(compress, codec), Writer.progressable(prog));
  return writer;
}
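// A hedged, self-contained sketch of the same idea (not from the source): open the input file,
// create a writer with the same key/value classes and compression settings, and copy every record.
// The paths "/tmp/in.seq" and "/tmp/out.seq" are assumed examples.
Configuration conf = new Configuration();
Path input = new Path("/tmp/in.seq");
Path output = new Path("/tmp/out.seq");
try (SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(input))) {
  SequenceFile.Writer writer = SequenceFile.createWriter(conf,
      SequenceFile.Writer.file(output),
      SequenceFile.Writer.keyClass(reader.getKeyClass()),
      SequenceFile.Writer.valueClass(reader.getValueClass()),
      SequenceFile.Writer.compression(reader.getCompressionType(), reader.getCompressionCodec()));
  try {
    Writable key = (Writable) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
    Writable value = (Writable) ReflectionUtils.newInstance(reader.getValueClass(), conf);
    while (reader.next(key, value)) {
      writer.append(key, value);
    }
  } finally {
    writer.close();
  }
}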
final SequenceFile.Reader reader =
    new SequenceFile.Reader(configuration, Reader.file(fileSystem.makeQualified(file)));
final String inputfileName = file.getName() + "." + System.nanoTime() + ".";
int counter = 0;
Text key = new Text();
try {
  while (reader.next(key)) {
    String fileName = key.toString();
reader = new SequenceFile.Reader(configuration, Reader.file(fileSystem.makeQualified(file)));
final Text key = new Text();
final KeyValueWriterCallback callback = new KeyValueWriterCallback(reader);
LOG.debug("Read from SequenceFile: {} ", new Object[]{file});
try {
  while (reader.next(key)) {
    String fileName = key.toString();
in = new Reader(fs, inFiles[currentFile], conf);
compressionType = in.getCompressionType();
codec = in.getCompressionCodec();
int recordLength = in.nextRaw(rawKeys, rawValue);
if (recordLength == -1) {
  in.close();
  if (deleteInput) {
    fs.delete(inFiles[currentFile], true);
    atEof = currentFile >= inFiles.length;
    if (!atEof) {
      in = new Reader(fs, inFiles[currentFile], conf);
    } else {
      in = null;
SequenceFile.Reader reader = new SequenceFile.Reader(conf,
    SequenceFile.Reader.file(p), new Reader.OnlyHeaderOption());
try {
  reader.close();