private void positionAtFirstRecord(FSDataInputStream stream) throws IOException { if (start > 0) { // Advance to the start of the first line in our slice. // We use a temporary LineReader to read a partial line and find the // start of the first one on or after our starting position. // In case our slice starts right at the beginning of a line, we need to back // up by one position and then discard the first line. start -= 1; stream.seek(start); LineReader reader = new LineReader(stream); int bytesRead = reader.readLine(buffer, (int)Math.min(MAX_LINE_LENGTH, end - start)); start = start + bytesRead; stream.seek(start); } // else // if start == 0 we're starting at the beginning of a line pos = start; }
/**
 * Skip n bytes from the InputStream.
 *
 * <p>Bytes already held in the internal buffer are consumed first; the
 * buffer is then reloaded as often as needed. Skipping stops early when a
 * reload reports that zero bytes were loaded. A non-positive {@code n}
 * skips nothing and returns 0.
 *
 * @param n the number of bytes to skip.
 * @return the number of bytes skipped.
 * @throws IOException if the underlying stream throws.
 */
public long skip(long n) throws IOException {
    long remaining = n;
    boolean exhausted = false;

    while (!exhausted && remaining > 0) {
        if (bufferPosn < bufferLength) {
            // Take as much as the current buffer can offer, capped at what is still owed.
            final int available = bufferLength - bufferPosn;
            final int step = (int) Math.min(available, remaining);
            bufferPosn += step;
            remaining -= step;
        }

        if (bufferPosn >= bufferLength) {
            // Buffer fully consumed: reload it, and stop if nothing was loaded.
            exhausted = (loadBuffer() == 0);
        }
    }

    return n - remaining;
}
/**
 * Skips a single byte with a reader whose buffer size is 22, then checks
 * that the next line read is "123456789".
 */
@Test
public void testSkipOnBufferedLine() throws IOException {
    final byte[] data = input22.getBytes();
    reader = new LineReader(new ByteArrayInputStream(data), 22);

    assertEquals(1, reader.skip(1));

    reader.readLine(dest);
    final String remainder = dest.toString();
    assertEquals("123456789", remainder);
}
/**
 * Read from the InputStream into the given Text.
 *
 * <p>Convenience overload: forwards to the three-argument
 * {@code readLine}, passing {@link Integer#MAX_VALUE} for both of its
 * limit arguments.
 *
 * @param str the object to store the given line
 * @return the number of bytes read including the newline
 * @throws IOException if the underlying stream throws
 */
public int readLine(Text str) throws IOException {
    final int noLimit = Integer.MAX_VALUE;
    return readLine(str, noLimit, noLimit);
}
/**
 * Creates a reader over the part of a file described by {@code split}.
 *
 * <p>For a file with no matching compression codec the stream is advanced
 * to the first line boundary of the split via
 * {@code positionAtFirstRecord}. For a compressed file the split must begin
 * at offset 0; the stream is wrapped by the codec and {@code end} is set to
 * {@link Long#MAX_VALUE} so reading continues to the end of the file.
 *
 * <p>If construction fails after the file has been opened, the opened
 * stream is closed before the exception propagates.
 *
 * @param conf  configuration used to resolve the file system and codecs
 * @param split the file split to read
 * @throws IOException      if opening, seeking or reading the file fails
 * @throws RuntimeException if the file is compressed and the split does not
 *                          start at offset 0
 */
public FastaRecordReader(Configuration conf, FileSplit split) throws IOException {
    file = split.getPath();
    start = split.getStart();
    end = start + split.getLength();
    current_split_pos = 1;

    FileSystem fs = file.getFileSystem(conf);
    FSDataInputStream fileIn = fs.open(file);

    // From here on this constructor owns fileIn. Any failure below must
    // close it, otherwise the open file handle leaks.
    boolean constructed = false;
    try {
        CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf);
        CompressionCodec codec = codecFactory.getCodec(file);

        if (codec == null) {
            // no codec. Uncompressed file.
            positionAtFirstRecord(fileIn);
            inputStream = fileIn;
        } else {
            // compressed file
            if (start != 0) {
                throw new RuntimeException("Start position for compressed file is not 0! (found " + start + ")");
            }

            inputStream = codec.createInputStream(fileIn);
            end = Long.MAX_VALUE; // read until the end of the file
        }

        lineReader = new LineReader(inputStream);
        constructed = true;
    } finally {
        if (!constructed) {
            try {
                fileIn.close();
            } catch (IOException ignored) {
                // Best effort only: the failure that aborted construction
                // is the one that should reach the caller.
            }
        }
    }
}
/**
 * Requests an 11-byte skip twice from a reader with buffer size 5: the
 * first call is expected to report 10 bytes skipped and the second 0.
 */
@Test
public void testSkipBeyondInput() throws IOException {
    final ByteArrayInputStream source = new ByteArrayInputStream(input10.getBytes());
    reader = new LineReader(source, 5);

    final long firstAttempt = reader.skip(11);
    assertEquals(10, firstAttempt);

    final long secondAttempt = reader.skip(11);
    assertEquals(0, secondAttempt);
}
protected boolean lowLevelFastqRead(Text key, SequencedFragment value) throws IOException { // ID line long skipped = lineReader.skip(1); // skip @ pos += skipped; if (skipped == 0) return false; // EOF // ID readLineInto(key); // sequence value.clear(); readLineInto(value.getSequence()); readLineInto(buffer); if (buffer.getLength() == 0 || buffer.getBytes()[0] != '+') throw new RuntimeException("unexpected fastq line separating sequence and quality at " + makePositionMessage() + ". Line: " + buffer + ". \nSequence ID: " + key); readLineInto(value.getQuality()); // look for the Illumina-formatted name. Once it isn't found lookForIlluminaIdentifier will be set to false lookForIlluminaIdentifier = lookForIlluminaIdentifier && scanIlluminaId(key, value); if (!lookForIlluminaIdentifier) scanNameForReadNumber(key, value); return true; }
/**
 * Skips 11 bytes with a reader whose buffer size is only 5, then checks
 * that all 11 bytes were skipped and that the next line read is
 * "0987654321".
 */
@Test
public void testSkipBeyondBuffer() throws IOException {
    final byte[] data = input22.getBytes();
    reader = new LineReader(new ByteArrayInputStream(data), 5);

    assertEquals(11, reader.skip(11));

    reader.readLine(dest);
    final String nextLine = dest.toString();
    assertEquals("0987654321", nextLine);
}
/**
 * Read from the InputStream into the given Text.
 *
 * <p>Convenience overload: forwards to the three-argument
 * {@code readLine}, passing {@link Integer#MAX_VALUE} as its third
 * argument.
 *
 * @param str the object to store the given line
 * @param maxLineLength the maximum number of bytes to store into str.
 * @return the number of bytes read including the newline
 * @throws IOException if the underlying stream throws
 */
public int readLine(Text str, int maxLineLength) throws IOException {
    final int unbounded = Integer.MAX_VALUE;
    return readLine(str, maxLineLength, unbounded);
}
/**
 * Creates a reader over the part of a file described by {@code split}.
 *
 * <p>For a file with no matching compression codec the stream is advanced
 * to the first line boundary of the split via
 * {@code positionAtFirstRecord}. For a compressed file the split must begin
 * at offset 0; the stream is wrapped by the codec and {@code end} is set to
 * {@link Long#MAX_VALUE} so reading continues to the end of the file.
 *
 * <p>If construction fails after the file has been opened, the opened
 * stream is closed before the exception propagates.
 *
 * @param conf  configuration used to resolve the file system and codecs
 * @param split the file split to read
 * @throws IOException      if opening, seeking or reading the file fails
 * @throws RuntimeException if the file is compressed and the split does not
 *                          start at offset 0
 */
public QseqRecordReader(Configuration conf, FileSplit split) throws IOException {
    setConf(conf);
    file = split.getPath();
    start = split.getStart();
    end = start + split.getLength();

    FileSystem fs = file.getFileSystem(conf);
    FSDataInputStream fileIn = fs.open(file);

    // From here on this constructor owns fileIn. Any failure below must
    // close it, otherwise the open file handle leaks.
    boolean constructed = false;
    try {
        CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf);
        CompressionCodec codec = codecFactory.getCodec(file);

        if (codec == null) {
            // no codec. Uncompressed file.
            positionAtFirstRecord(fileIn);
            inputStream = fileIn;
        } else {
            // compressed file
            if (start != 0) {
                throw new RuntimeException("Start position for compressed file is not 0! (found " + start + ")");
            }

            inputStream = codec.createInputStream(fileIn);
            end = Long.MAX_VALUE; // read until the end of the file
        }

        lineReader = new LineReader(inputStream);
        constructed = true;
    } finally {
        if (!constructed) {
            try {
                fileIn.close();
            } catch (IOException ignored) {
                // Best effort only: the failure that aborted construction
                // is the one that should reach the caller.
            }
        }
    }
}
protected boolean lowLevelFastqRead(Text key, SequencedFragment value) throws IOException { // ID line long skipped = lineReader.skip(1); // skip @ pos += skipped; if (skipped == 0) return false; // EOF // ID readLineInto(key); // sequence value.clear(); readLineInto(value.getSequence()); readLineInto(buffer); if (buffer.getLength() == 0 || buffer.getBytes()[0] != '+') throw new RuntimeException("unexpected fastq line separating sequence and quality at " + makePositionMessage() + ". Line: " + buffer + ". \nSequence ID: " + key); readLineInto(value.getQuality()); // look for the Illumina-formatted name. Once it isn't found lookForIlluminaIdentifier will be set to false lookForIlluminaIdentifier = lookForIlluminaIdentifier && scanIlluminaId(key, value); if (!lookForIlluminaIdentifier) scanNameForReadNumber(key, value); return true; }
private void positionAtFirstRecord(FSDataInputStream stream) throws IOException { if (start > 0) { // Advance to the start of the first line in our slice. // We use a temporary LineReader to read a partial line and find the // start of the first one on or after our starting position. // In case our slice starts right at the beginning of a line, we need to back // up by one position and then discard the first line. start -= 1; stream.seek(start); LineReader reader = new LineReader(stream); int bytesRead = reader.readLine(buffer, (int)Math.min(MAX_LINE_LENGTH, end - start)); start = start + bytesRead; stream.seek(start); } // else // if start == 0 we're starting at the beginning of a line pos = start; }
/**
 * Read from the InputStream into the given Text.
 *
 * <p>Equivalent to calling the three-argument {@code readLine} with
 * {@link Integer#MAX_VALUE} for its second and third arguments.
 *
 * @param str the object to store the given line
 * @return the number of bytes read including the newline
 * @throws IOException if the underlying stream throws
 */
public int readLine(Text str) throws IOException {
    final int bytesRead = readLine(str, Integer.MAX_VALUE, Integer.MAX_VALUE);
    return bytesRead;
}
/**
 * Creates a reader over the part of a file described by {@code split}.
 *
 * <p>For a file with no matching compression codec the stream is advanced
 * to the first record of the split via {@code positionAtFirstRecord}. For a
 * compressed file the split must begin at offset 0; the stream is wrapped
 * by the codec and {@code end} is set to {@link Long#MAX_VALUE} so reading
 * continues to the end of the file.
 *
 * <p>If construction fails after the file has been opened, the opened
 * stream is closed before the exception propagates.
 *
 * @param conf  configuration used to resolve the file system and codecs
 * @param split the file split to read
 * @throws IOException      if opening, seeking or reading the file fails
 * @throws RuntimeException if the file is compressed and the split does not
 *                          start at offset 0
 */
public FastqRecordReader(Configuration conf, FileSplit split) throws IOException {
    setConf(conf);
    file = split.getPath();
    start = split.getStart();
    end = start + split.getLength();

    FileSystem fs = file.getFileSystem(conf);
    FSDataInputStream fileIn = fs.open(file);

    // From here on this constructor owns fileIn. Any failure below must
    // close it, otherwise the open file handle leaks.
    boolean constructed = false;
    try {
        CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf);
        CompressionCodec codec = codecFactory.getCodec(file);

        if (codec == null) {
            // no codec. Uncompressed file.
            positionAtFirstRecord(fileIn);
            inputStream = fileIn;
        } else {
            // compressed file
            if (start != 0) {
                throw new RuntimeException("Start position for compressed file is not 0! (found " + start + ")");
            }

            inputStream = codec.createInputStream(fileIn);
            end = Long.MAX_VALUE; // read until the end of the file
        }

        lineReader = new LineReader(inputStream);
        constructed = true;
    } finally {
        if (!constructed) {
            try {
                fileIn.close();
            } catch (IOException ignored) {
                // Best effort only: the failure that aborted construction
                // is the one that should reach the caller.
            }
        }
    }
}
protected boolean lowLevelFastqRead(Text key, SequencedFragment value) throws IOException { // ID line long skipped = lineReader.skip(1); // skip @ pos += skipped; if (skipped == 0) return false; // EOF // ID readLineInto(key); // sequence value.clear(); readLineInto(value.getSequence()); readLineInto(buffer); if (buffer.getLength() == 0 || buffer.getBytes()[0] != '+') throw new RuntimeException("unexpected fastq line separating sequence and quality at " + makePositionMessage() + ". Line: " + buffer + ". \nSequence ID: " + key); readLineInto(value.getQuality()); // look for the Illumina-formatted name. Once it isn't found lookForIlluminaIdentifier will be set to false lookForIlluminaIdentifier = lookForIlluminaIdentifier && scanIlluminaId(key, value); if (!lookForIlluminaIdentifier) scanNameForReadNumber(key, value); return true; }
/**
 * Skip n bytes from the InputStream.
 *
 * <p>Consumes buffered bytes first and reloads the buffer whenever it runs
 * out. The loop ends once {@code n} bytes have been skipped or a reload
 * loads zero bytes. If {@code n} is not positive, nothing is skipped and 0
 * is returned.
 *
 * @param n the number of bytes to skip.
 * @return the number of bytes skipped.
 * @throws IOException if the underlying stream throws.
 */
public long skip(long n) throws IOException {
    long outstanding = n;

    while (outstanding > 0) {
        if (bufferPosn < bufferLength) {
            // Advance within the current buffer, never past what was asked for.
            final int chunk = (int) Math.min(bufferLength - bufferPosn, outstanding);
            bufferPosn += chunk;
            outstanding -= chunk;
        }

        // Reload once the buffer is used up; an empty reload ends the skip.
        if (bufferPosn >= bufferLength && loadBuffer() == 0) {
            break;
        }
    }

    return n - outstanding;
}
private void positionAtFirstRecord(FSDataInputStream stream) throws IOException { if (start > 0) { // Advance to the start of the first line in our slice. // We use a temporary LineReader to read a partial line and find the // start of the first one on or after our starting position. // In case our slice starts right at the beginning of a line, we need to back // up by one position and then discard the first line. start -= 1; stream.seek(start); LineReader reader = new LineReader(stream); int bytesRead = reader.readLine(buffer, (int)Math.min(MAX_LINE_LENGTH, end - start)); start = start + bytesRead; stream.seek(start); } // else // if start == 0 we're starting at the beginning of a line pos = start; }
/**
 * Read from the InputStream into the given Text.
 *
 * <p>Equivalent to calling the three-argument {@code readLine} with
 * {@link Integer#MAX_VALUE} as its third argument.
 *
 * @param str the object to store the given line
 * @param maxLineLength the maximum number of bytes to store into str.
 * @return the number of bytes read including the newline
 * @throws IOException if the underlying stream throws
 */
public int readLine(Text str, int maxLineLength) throws IOException {
    final int bytesRead = readLine(str, maxLineLength, Integer.MAX_VALUE);
    return bytesRead;
}
/**
 * Creates a reader over the part of a file described by {@code split}.
 *
 * <p>For a file with no matching compression codec the stream is advanced
 * to the first line boundary of the split via
 * {@code positionAtFirstRecord}. For a compressed file the split must begin
 * at offset 0; the stream is wrapped by the codec and {@code end} is set to
 * {@link Long#MAX_VALUE} so reading continues to the end of the file.
 *
 * <p>If construction fails after the file has been opened, the opened
 * stream is closed before the exception propagates.
 *
 * @param conf  configuration stored via {@code setConf} and used to resolve
 *              the file system and codecs
 * @param split the file split to read
 * @throws IOException      if opening, seeking or reading the file fails
 * @throws RuntimeException if the file is compressed and the split does not
 *                          start at offset 0
 */
public FastaRecordReader(Configuration conf, FileSplit split) throws IOException {
    setConf(conf);
    file = split.getPath();
    start = split.getStart();
    end = start + split.getLength();
    current_split_pos = 1;

    FileSystem fs = file.getFileSystem(conf);
    FSDataInputStream fileIn = fs.open(file);

    // From here on this constructor owns fileIn. Any failure below must
    // close it, otherwise the open file handle leaks.
    boolean constructed = false;
    try {
        CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf);
        CompressionCodec codec = codecFactory.getCodec(file);

        if (codec == null) {
            // no codec. Uncompressed file.
            positionAtFirstRecord(fileIn);
            inputStream = fileIn;
        } else {
            // compressed file
            if (start != 0) {
                throw new RuntimeException("Start position for compressed file is not 0! (found " + start + ")");
            }

            inputStream = codec.createInputStream(fileIn);
            end = Long.MAX_VALUE; // read until the end of the file
        }

        lineReader = new LineReader(inputStream);
        constructed = true;
    } finally {
        if (!constructed) {
            try {
                fileIn.close();
            } catch (IOException ignored) {
                // Best effort only: the failure that aborted construction
                // is the one that should reach the caller.
            }
        }
    }
}
protected boolean lowLevelFastqRead(Text key, SequencedFragment value) throws IOException { // ID line long skipped = lineReader.skip(1); // skip @ pos += skipped; if (skipped == 0) { return false; // EOF } // ID readLineInto(key); // sequence value.clear(); readLineInto(value.getSequence()); readLineInto(buffer); if (buffer.getLength() == 0 || buffer.getBytes()[0] != '+') { throw new RuntimeException("unexpected fastq line separating sequence and quality at " + makePositionMessage() + ". Line: " + buffer + ". \nSequence ID: " + key); } readLineInto(value.getQuality()); // look for the Illumina-formatted name. Once it isn't found lookForIlluminaIdentifier will be set to false lookForIlluminaIdentifier = lookForIlluminaIdentifier && scanIlluminaId(key, value); if (!lookForIlluminaIdentifier) { scanNameForReadNumber(key, value); } return true; }