/**
 * Writes the two-record fixture as a gzip file and then builds a reader over a split
 * that starts at byte 10 of that file. The test passes only when a RuntimeException
 * escapes the method; creating the reader is the last statement executed.
 */
@Test(expected=RuntimeException.class)
public void testCompressedSplit() throws IOException
{
	// produce the gzip-compressed input file
	GzipCodec gzip = new GzipCodec();
	FileOutputStream rawOut = new FileOutputStream(tempGz);
	PrintWriter writer = new PrintWriter(new BufferedOutputStream(gzip.createOutputStream(rawOut)));
	writer.write(twoQseq);
	writer.close();

	// ask for a split that begins in the middle of the compressed file
	Path gzPath = new Path(tempGz.toURI().toString());
	split = new FileSplit(gzPath, 10, twoQseq.length(), null);
	new QseqRecordReader(conf, split);
}

// NOTE(review): the annotation below was the last token of the original line; it is kept
// so that it still applies to whichever declaration follows this method.
@Test
// Excerpt of scanQseqLine: it first calls setFieldPositionsAndLengths(line) and, on the
// path shown here, throws a FormatException whose message reports the position computed
// as this.pos - line.getLength() together with the offending line.
// NOTE(review): the method's braces and any statements between the call and the throw are
// not present in this excerpt -- presumably the throw guards a character-decoding failure
// (going by its message); confirm against the full source.
private void scanQseqLine(Text line, Text key, SequencedFragment fragment) setFieldPositionsAndLengths(line); throw new FormatException("Invalid character format at " + makePositionMessage(this.pos - line.getLength()) + "; line: " + line);
/**
 * Shared check used by the quality-configuration tests: writes the sangerQseq fixture to
 * the temporary qseq file, reads the first record with a reader built from the current
 * conf, and asserts the quality string of the resulting fragment.
 */
private void qualityConfigTest() throws IOException
{
	writeToTempQseq(sangerQseq);

	Path qseqPath = new Path(tempQseq.toURI().toString());
	split = new FileSplit(qseqPath, 0, sangerQseq.length(), null);
	QseqRecordReader reader = new QseqRecordReader(conf, split);

	boolean found = reader.next(key, fragment);
	assertTrue(found);

	String expectedQuality = "###########################################################################################";
	assertEquals(expectedQuality, fragment.getQuality().toString());
}
// Excerpt of scanQseqLine: it first calls setFieldPositionsAndLengths(line) and, on the
// path shown here, throws a FormatException whose message reports the position computed
// as this.pos - line.getLength() together with the offending line.
// NOTE(review): the method's braces and any statements between the call and the throw are
// not present in this excerpt -- presumably the throw guards a character-decoding failure
// (going by its message); confirm against the full source.
private void scanQseqLine(Text line, Text key, SequencedFragment fragment) setFieldPositionsAndLengths(line); throw new FormatException("Invalid character format at " + makePositionMessage(this.pos - line.getLength()) + "; line: " + line);
// Excerpt of scanQseqLine: it first calls setFieldPositionsAndLengths(line) and, on the
// path shown here, throws a FormatException whose message reports the position computed
// as this.pos - line.getLength() together with the offending line.
// NOTE(review): the method's braces and any statements between the call and the throw are
// not present in this excerpt -- presumably the throw guards a character-decoding failure
// (going by its message); confirm against the full source.
private void scanQseqLine(Text line, Text key, SequencedFragment fragment) setFieldPositionsAndLengths(line); throw new FormatException("Invalid character format at " + makePositionMessage(this.pos - line.getLength()) + "; line: " + line);
/**
 * Creates a reader over the given split.
 *
 * The file named by the split is opened through its FileSystem. When no compression
 * codec is registered for the path, the stream is positioned with
 * positionAtFirstRecord and read directly. When a codec is found, the split must start
 * at offset 0; the stream is wrapped by the codec and the end of the slice is set to
 * Long.MAX_VALUE so that reading continues to the end of the file.
 *
 * @param conf  configuration used to resolve the file system and the codec
 * @param split the portion of the file to read
 * @throws IOException      if opening or positioning the stream fails
 * @throws RuntimeException if the file is compressed and the split does not start at 0
 */
public QseqRecordReader(Configuration conf, FileSplit split) throws IOException
{
	setConf(conf);
	file = split.getPath();
	start = split.getStart();
	end = start + split.getLength();

	FileSystem fs = file.getFileSystem(conf);
	FSDataInputStream fileIn = fs.open(file);

	// From here on the raw stream is open: make sure it is released if anything below
	// fails, since the caller never receives an object it could close.
	boolean initialized = false;
	try
	{
		CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf);
		CompressionCodec codec = codecFactory.getCodec(file);

		if (codec == null) // no codec.  Uncompressed file.
		{
			positionAtFirstRecord(fileIn);
			inputStream = fileIn;
		}
		else
		{ // compressed file
			if (start != 0)
				throw new RuntimeException("Start position for compressed file is not 0! (found " + start + ")");

			inputStream = codec.createInputStream(fileIn);
			end = Long.MAX_VALUE; // read until the end of the file
		}

		lineReader = new LineReader(inputStream);
		initialized = true;
	}
	finally
	{
		if (!initialized)
		{
			try {
				fileIn.close();
			} catch (IOException ignored) {
				// keep the original failure; a secondary close error must not mask it
			}
		}
	}
}
/** * Reads the next key/value pair from the input for processing. */ public boolean next(Text key, SequencedFragment value) throws IOException { if (pos >= end) return false; // past end of slice int bytesRead = 0; boolean goodRecord; do { bytesRead = lowLevelQseqRead(key, value); // if bytesRead <= 0 EOF has been reached goodRecord = (bytesRead > 0) && (!filterFailedQC || value.getFilterPassed() == null || value.getFilterPassed()); } while (bytesRead > 0 && !goodRecord); if (goodRecord) // post process the record only if it's going to be used { try { postProcessSequencedFragment(value); } catch (FormatException e) { throw new FormatException(e.getMessage() + " Position: " + makePositionMessage(this.pos - bytesRead) + "; line: " + buffer); // last line read is still in the buffer } } return goodRecord; }
/** * Reads the next key/value pair from the input for processing. */ public boolean next(Text key, SequencedFragment value) throws IOException { if (pos >= end) return false; // past end of slice int bytesRead = 0; boolean goodRecord; do { bytesRead = lowLevelQseqRead(key, value); // if bytesRead <= 0 EOF has been reached goodRecord = (bytesRead > 0) && (!filterFailedQC || value.getFilterPassed() == null || value.getFilterPassed()); } while (bytesRead > 0 && !goodRecord); if (goodRecord) // post process the record only if it's going to be used { try { postProcessSequencedFragment(value); } catch (FormatException e) { throw new FormatException(e.getMessage() + " Position: " + makePositionMessage(this.pos - bytesRead) + "; line: " + buffer); // last line read is still in the buffer } } return goodRecord; }
/**
 * Creates a reader over the given split.
 *
 * The file named by the split is opened through its FileSystem. When no compression
 * codec is registered for the path, the stream is positioned with
 * positionAtFirstRecord and read directly. When a codec is found, the split must start
 * at offset 0; the stream is wrapped by the codec and the end of the slice is set to
 * Long.MAX_VALUE so that reading continues to the end of the file.
 *
 * @param conf  configuration used to resolve the file system and the codec
 * @param split the portion of the file to read
 * @throws IOException      if opening or positioning the stream fails
 * @throws RuntimeException if the file is compressed and the split does not start at 0
 */
public QseqRecordReader(Configuration conf, FileSplit split) throws IOException
{
	setConf(conf);
	file = split.getPath();
	start = split.getStart();
	end = start + split.getLength();

	FileSystem fs = file.getFileSystem(conf);
	FSDataInputStream fileIn = fs.open(file);

	// From here on the raw stream is open: make sure it is released if anything below
	// fails, since the caller never receives an object it could close.
	boolean initialized = false;
	try
	{
		CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf);
		CompressionCodec codec = codecFactory.getCodec(file);

		if (codec == null) // no codec.  Uncompressed file.
		{
			positionAtFirstRecord(fileIn);
			inputStream = fileIn;
		}
		else
		{ // compressed file
			if (start != 0)
				throw new RuntimeException("Start position for compressed file is not 0! (found " + start + ")");

			inputStream = codec.createInputStream(fileIn);
			end = Long.MAX_VALUE; // read until the end of the file
		}

		lineReader = new LineReader(inputStream);
		initialized = true;
	}
	finally
	{
		if (!initialized)
		{
			try {
				fileIn.close();
			} catch (IOException ignored) {
				// keep the original failure; a secondary close error must not mask it
			}
		}
	}
}
/**
 * Creates a reader over the given split.
 *
 * The file named by the split is opened through its FileSystem. When no compression
 * codec is registered for the path, the stream is positioned with
 * positionAtFirstRecord and read directly. When a codec is found, the split must start
 * at offset 0; the stream is wrapped by the codec and the end of the slice is set to
 * Long.MAX_VALUE so that reading continues to the end of the file.
 *
 * @param conf  configuration used to resolve the file system and the codec
 * @param split the portion of the file to read
 * @throws IOException      if opening or positioning the stream fails
 * @throws RuntimeException if the file is compressed and the split does not start at 0
 */
public QseqRecordReader(Configuration conf, FileSplit split) throws IOException
{
	setConf(conf);
	file = split.getPath();
	start = split.getStart();
	end = start + split.getLength();

	FileSystem fs = file.getFileSystem(conf);
	FSDataInputStream fileIn = fs.open(file);

	// From here on the raw stream is open: make sure it is released if anything below
	// fails, since the caller never receives an object it could close.
	boolean initialized = false;
	try
	{
		CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf);
		CompressionCodec codec = codecFactory.getCodec(file);

		if (codec == null) // no codec.  Uncompressed file.
		{
			positionAtFirstRecord(fileIn);
			inputStream = fileIn;
		}
		else
		{ // compressed file
			if (start != 0)
				throw new RuntimeException("Start position for compressed file is not 0! (found " + start + ")");

			inputStream = codec.createInputStream(fileIn);
			end = Long.MAX_VALUE; // read until the end of the file
		}

		lineReader = new LineReader(inputStream);
		initialized = true;
	}
	finally
	{
		if (!initialized)
		{
			try {
				fileIn.close();
			} catch (IOException ignored) {
				// keep the original failure; a secondary close error must not mask it
			}
		}
	}
}
/** * Reads the next key/value pair from the input for processing. */ public boolean next(Text key, SequencedFragment value) throws IOException { if (pos >= end) return false; // past end of slice int bytesRead = 0; boolean goodRecord; do { bytesRead = lowLevelQseqRead(key, value); // if bytesRead <= 0 EOF has been reached goodRecord = (bytesRead > 0) && (!filterFailedQC || value.getFilterPassed() == null || value.getFilterPassed()); } while (bytesRead > 0 && !goodRecord); if (goodRecord) // post process the record only if it's going to be used { try { postProcessSequencedFragment(value); } catch (FormatException e) { throw new FormatException(e.getMessage() + " Position: " + makePositionMessage(this.pos - bytesRead) + "; line: " + buffer); // last line read is still in the buffer } } return goodRecord; }
@Test public void testGzCompressedInput() throws IOException { // write gzip-compressed data GzipCodec codec = new GzipCodec(); PrintWriter qseqOut = new PrintWriter( new BufferedOutputStream( codec.createOutputStream( new FileOutputStream(tempGz) ) ) ); qseqOut.write(twoQseq); qseqOut.close(); // now try to read it split = new FileSplit(new Path(tempGz.toURI().toString()), 0, twoQseq.length(), null); QseqRecordReader reader = new QseqRecordReader(conf, split); boolean retval = reader.next(key, fragment); assertTrue(retval); assertEquals("ERR020229:10880:1:1:1373:2042:1", key.toString()); assertEquals("TTGGATGATAGGGATTATTTGACTCGAATATTGGAAATAGCTGTTTATATTTTTTAAAAATGGTCTGTAACTGGTGACAGGACGCTTCGAT", fragment.getSequence().toString()); retval = reader.next(key, fragment); assertTrue(retval); assertEquals("ERR020229:10883:1:1:1796:2044:2", key.toString()); assertEquals("TGAGCAGATGTGCTAAAGCTGCTTCTCCCCTAGGATCATTTGTACCTACCAGACTCAGGGAAAGGGGTGAGAATTGGGCCGTGGGGCAAGG", fragment.getSequence().toString()); }
/**
 * Reads the first record of the illuminaQseq fixture and checks every metadata field
 * exposed by the resulting fragment, including the two that are expected to be null.
 */
@Test
public void testIlluminaMetaInfo() throws IOException
{
	writeToTempQseq(illuminaQseq);
	Path qseqPath = new Path(tempQseq.toURI().toString());
	split = new FileSplit(qseqPath, 0, illuminaQseq.length(), null);

	QseqRecordReader reader = new QseqRecordReader(conf, split);
	assertTrue(reader.next(key, fragment));

	// instrument / run identification
	assertEquals("EAS139", fragment.getInstrument());
	assertEquals(136, fragment.getRunNumber().intValue());
	assertNull("flowcell id not null", fragment.getFlowcellId());

	// location of the read
	assertEquals(2, fragment.getLane().intValue());
	assertEquals(5, fragment.getTile().intValue());
	assertEquals(1000, fragment.getXpos().intValue());
	assertEquals(12850, fragment.getYpos().intValue());

	// read number, filter flag, control number and index
	assertEquals(1, fragment.getRead().intValue());
	assertFalse(fragment.getFilterPassed().booleanValue());
	assertNull("control number not null", fragment.getControlNumber());
	assertEquals("ATCACG", fragment.getIndexSequence());
}
/**
 * Reads one line from the input into the shared buffer, advances pos by the number of
 * bytes consumed, and parses the line into key/value when something was read.
 *
 * @param key   receives the key parsed from the line
 * @param value receives the fragment parsed from the line
 * @return the number of bytes consumed from the input; a value &lt;= 0 means no line was
 *         read
 * @throws IOException      if reading from the underlying stream fails
 * @throws RuntimeException if the line consumed MAX_LINE_LENGTH bytes or more
 */
private int lowLevelQseqRead(Text key, SequencedFragment value) throws IOException
{
	int bytesRead = lineReader.readLine(buffer, MAX_LINE_LENGTH);
	pos += bytesRead;
	if (bytesRead >= MAX_LINE_LENGTH)
	{
		// Show at most the first 500 bytes of the line, but never more than the buffer
		// actually holds: getBytes() exposes the backing array, which is only valid up
		// to getLength().
		final int snippetLength = Math.min(500, buffer.getLength());
		String line;
		try {
			line = Text.decode(buffer.getBytes(), 0, snippetLength);
		} catch (java.nio.charset.CharacterCodingException e) {
			line = "(line not convertible to printable format)";
		}
		throw new RuntimeException("found abnormally large line (length " + bytesRead + ") at " +
		    makePositionMessage(pos - bytesRead) + ": " + line);
	}
	else if (bytesRead > 0)
		scanQseqLine(buffer, key, value);

	return bytesRead;
}
/**
 * Reads one line from the input into the shared buffer, advances pos by the number of
 * bytes consumed, and parses the line into key/value when something was read.
 *
 * @param key   receives the key parsed from the line
 * @param value receives the fragment parsed from the line
 * @return the number of bytes consumed from the input; a value &lt;= 0 means no line was
 *         read
 * @throws IOException      if reading from the underlying stream fails
 * @throws RuntimeException if the line consumed MAX_LINE_LENGTH bytes or more
 */
private int lowLevelQseqRead(Text key, SequencedFragment value) throws IOException
{
	int bytesRead = lineReader.readLine(buffer, MAX_LINE_LENGTH);
	pos += bytesRead;
	if (bytesRead >= MAX_LINE_LENGTH)
	{
		// Show at most the first 500 bytes of the line, but never more than the buffer
		// actually holds: getBytes() exposes the backing array, which is only valid up
		// to getLength().
		final int snippetLength = Math.min(500, buffer.getLength());
		String line;
		try {
			line = Text.decode(buffer.getBytes(), 0, snippetLength);
		} catch (java.nio.charset.CharacterCodingException e) {
			line = "(line not convertible to printable format)";
		}
		throw new RuntimeException("found abnormally large line (length " + bytesRead + ") at " +
		    makePositionMessage(pos - bytesRead) + ": " + line);
	}
	else if (bytesRead > 0)
		scanQseqLine(buffer, key, value);

	return bytesRead;
}
@Test public void testReadStartInMiddle() throws IOException { writeToTempQseq(twoQseq); split = new FileSplit(new Path(tempQseq.toURI().toString()), 10, twoQseq.length() - 10, null); QseqRecordReader reader = new QseqRecordReader(conf, split); assertEquals(oneQseq.length() + 1, reader.getPos()); // The start of the second record. We +1 for the \n that is not in oneQseq assertEquals(0.0, reader.getProgress(), 0.01); boolean retval = reader.next(key, fragment); assertTrue(retval); assertEquals("ERR020229:10883:1:1:1796:2044:2", key.toString()); assertEquals("TGAGCAGATGTGCTAAAGCTGCTTCTCCCCTAGGATCATTTGTACCTACCAGACTCAGGGAAAGGGGTGAGAATTGGGCCGTGGGGCAAGG", fragment.getSequence().toString()); assertEquals("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%", fragment.getQuality().toString()); assertEquals(twoQseq.length(), reader.getPos()); // now should be at the end of the data assertEquals(1.0, reader.getProgress(), 0.01); retval = reader.next(key, fragment); assertFalse(retval); }
private void setFieldPositionsAndLengths(Text line) { int pos = 0; // the byte position within the record int fieldno = 0; // the field index within the record while (pos < line.getLength() && fieldno < NUM_QSEQ_COLS) // iterate over each field { int endpos = line.find(Delim, pos); // the field's end position if (endpos < 0) endpos = line.getLength(); fieldPositions[fieldno] = pos; fieldLengths[fieldno] = endpos - pos; pos = endpos + 1; // the next starting position is the current end + 1 fieldno += 1; } if (fieldno != NUM_QSEQ_COLS) throw new FormatException("found " + fieldno + " fields instead of 11 at " + makePositionMessage(this.pos - line.getLength()) + ". Line: " + line); }
private void setFieldPositionsAndLengths(Text line) { int pos = 0; // the byte position within the record int fieldno = 0; // the field index within the record while (pos < line.getLength() && fieldno < NUM_QSEQ_COLS) // iterate over each field { int endpos = line.find(Delim, pos); // the field's end position if (endpos < 0) endpos = line.getLength(); fieldPositions[fieldno] = pos; fieldLengths[fieldno] = endpos - pos; pos = endpos + 1; // the next starting position is the current end + 1 fieldno += 1; } if (fieldno != NUM_QSEQ_COLS) throw new FormatException("found " + fieldno + " fields instead of 11 at " + makePositionMessage(this.pos - line.getLength()) + ". Line: " + line); }
private void setFieldPositionsAndLengths(Text line) { int pos = 0; // the byte position within the record int fieldno = 0; // the field index within the record while (pos < line.getLength() && fieldno < NUM_QSEQ_COLS) // iterate over each field { int endpos = line.find(Delim, pos); // the field's end position if (endpos < 0) endpos = line.getLength(); fieldPositions[fieldno] = pos; fieldLengths[fieldno] = endpos - pos; pos = endpos + 1; // the next starting position is the current end + 1 fieldno += 1; } if (fieldno != NUM_QSEQ_COLS) throw new FormatException("found " + fieldno + " fields instead of 11 at " + makePositionMessage(this.pos - line.getLength()) + ". Line: " + line); }
@Test public void testReadFromStart() throws IOException { QseqRecordReader reader = createReaderForOneQseq(); assertEquals(0, reader.getPos()); assertEquals(0.0, reader.getProgress(), 0.01); boolean retval = reader.next(key, fragment); assertTrue(retval); //System.err.println("in testReadFromStart quality: " + fragment.getQuality().toString()); assertEquals("ERR020229:10880:1:1:1373:2042:1", key.toString()); assertEquals("TTGGATGATAGGGATTATTTGACTCGAATATTGGAAATAGCTGTTTATATTTTTTAAAAATGGTCTGTAACTGGTGACAGGACGCTTCGAT", fragment.getSequence().toString()); assertEquals("###########################################################################################", fragment.getQuality().toString()); assertEquals(oneQseq.length(), reader.getPos()); assertEquals(1.0, reader.getProgress(), 0.01); retval = reader.next(key, fragment); assertFalse(retval); }