@Test public void testGzCompressedInput() throws IOException { // write gzip-compressed data GzipCodec codec = new GzipCodec(); PrintWriter fastqOut = new PrintWriter( new BufferedOutputStream( codec.createOutputStream( new FileOutputStream(tempGz) ) ) ); fastqOut.write(twoFastq); fastqOut.close(); // now try to read it split = new FileSplit(new Path(tempGz.toURI().toString()), 0, twoFastq.length(), null); FastqRecordReader reader = new FastqRecordReader(conf, split); boolean retval = reader.next(key, fragment); assertTrue(retval); assertEquals("ERR020229.10880 HWI-ST168_161:1:1:1373:2042/1", key.toString()); assertEquals("TTGGATGATAGGGATTATTTGACTCGAATATTGGAAATAGCTGTTTATATTTTTTAAAAATGGTCTGTAACTGGTGACAGGACGCTTCGAT", fragment.getSequence().toString()); retval = reader.next(key, fragment); assertTrue(retval); assertEquals("ERR020229.10883 HWI-ST168_161:1:1:1796:2044/1", key.toString()); assertEquals("TGAGCAGATGTGCTAAAGCTGCTTCTCCCCTAGGATCATTTGTACCTACCAGACTCAGGGAAAGGGGTGAGAATTGGGCCGTGGGGCAAGG", fragment.getSequence().toString()); }
@Test public void testMakePositionMessage() throws IOException { writeToTempFastq(fastqWithIdTwice); split = new FileSplit(new Path(tempFastq.toURI().toString()), 0, fastqWithIdTwice.length(), null); FastqRecordReader reader = new FastqRecordReader(conf, split); assertNotNull(reader.makePositionMessage()); }
public FastqRecordReader(Configuration conf, FileSplit split) throws IOException { setConf(conf); file = split.getPath(); start = split.getStart(); end = start + split.getLength(); FileSystem fs = file.getFileSystem(conf); FSDataInputStream fileIn = fs.open(file); CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf); CompressionCodec codec = codecFactory.getCodec(file); if (codec == null) // no codec. Uncompressed file. { positionAtFirstRecord(fileIn); inputStream = fileIn; } else { // compressed file if (start != 0) throw new RuntimeException("Start position for compressed file is not 0! (found " + start + ")"); inputStream = codec.createInputStream(fileIn); end = Long.MAX_VALUE; // read until the end of the file } lineReader = new LineReader(inputStream); }
boolean goodRecord; do { gotData = lowLevelFastqRead(key, value); goodRecord = gotData && (!filterFailedQC || value.getFilterPassed() == null || value.getFilterPassed()); } while (gotData && !goodRecord); throw new FormatException(e.getMessage() + " Position: " + makePositionMessage() + "; Sequence ID: " + key); (value.getQuality().getBytes()[outOfRangeElement] - FormatConstants.SANGER_OFFSET) + ").\n" + "Although Sanger format has been requested, maybe qualities are in Illumina Phred+64 format?\n" + "Position: " + makePositionMessage() + "; Sequence ID: " + key); throw new RuntimeException("unexpected end of file in fastq record at " + makePositionMessage() + ". Id: " + key.toString());
boolean goodRecord; do { gotData = lowLevelFastqRead(key, value); goodRecord = gotData && (!filterFailedQC || value.getFilterPassed() == null || value.getFilterPassed()); } while (gotData && !goodRecord); throw new FormatException(e.getMessage() + " Position: " + makePositionMessage() + "; Sequence ID: " + key); (value.getQuality().getBytes()[outOfRangeElement] - FormatConstants.SANGER_OFFSET) + ").\n" + "Although Sanger format has been requested, maybe qualities are in Illumina Phred+64 format?\n" + "Position: " + makePositionMessage() + "; Sequence ID: " + key); throw new RuntimeException("unexpected end of file in fastq record at " + makePositionMessage() + ". Id: " + key.toString());
boolean goodRecord; do { gotData = lowLevelFastqRead(key, value); goodRecord = gotData && (!filterFailedQC || value.getFilterPassed() == null || value.getFilterPassed()); } while (gotData && !goodRecord); throw new FormatException(e.getMessage() + " Position: " + makePositionMessage() + "; Sequence ID: " + key); (value.getQuality().getBytes()[outOfRangeElement] - FormatConstants.SANGER_OFFSET) + ").\n" + "Although Sanger format has been requested, maybe qualities are in Illumina Phred+64 format?\n" + "Position: " + makePositionMessage() + "; Sequence ID: " + key); throw new RuntimeException("unexpected end of file in fastq record at " + makePositionMessage() + ". Id: " + key.toString());
public FastqRecordReader(Configuration conf, FileSplit split) throws IOException { setConf(conf); file = split.getPath(); start = split.getStart(); end = start + split.getLength(); FileSystem fs = file.getFileSystem(conf); FSDataInputStream fileIn = fs.open(file); CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf); CompressionCodec codec = codecFactory.getCodec(file); if (codec == null) // no codec. Uncompressed file. { positionAtFirstRecord(fileIn); inputStream = fileIn; } else { // compressed file if (start != 0) throw new RuntimeException("Start position for compressed file is not 0! (found " + start + ")"); inputStream = codec.createInputStream(fileIn); end = Long.MAX_VALUE; // read until the end of the file } lineReader = new LineReader(inputStream); }
public FastqRecordReader(Configuration conf, FileSplit split) throws IOException { setConf(conf); file = split.getPath(); start = split.getStart(); end = start + split.getLength(); FileSystem fs = file.getFileSystem(conf); FSDataInputStream fileIn = fs.open(file); CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf); CompressionCodec codec = codecFactory.getCodec(file); if (codec == null) // no codec. Uncompressed file. { positionAtFirstRecord(fileIn); inputStream = fileIn; } else { // compressed file if (start != 0) throw new RuntimeException("Start position for compressed file is not 0! (found " + start + ")"); inputStream = codec.createInputStream(fileIn); end = Long.MAX_VALUE; // read until the end of the file } lineReader = new LineReader(inputStream); }
protected boolean lowLevelFastqRead(Text key, SequencedFragment value) throws IOException { // ID line long skipped = lineReader.skip(1); // skip @ pos += skipped; if (skipped == 0) return false; // EOF // ID readLineInto(key); // sequence value.clear(); readLineInto(value.getSequence()); readLineInto(buffer); if (buffer.getLength() == 0 || buffer.getBytes()[0] != '+') throw new RuntimeException("unexpected fastq line separating sequence and quality at " + makePositionMessage() + ". Line: " + buffer + ". \nSequence ID: " + key); readLineInto(value.getQuality()); // look for the Illumina-formatted name. Once it isn't found lookForIlluminaIdentifier will be set to false lookForIlluminaIdentifier = lookForIlluminaIdentifier && scanIlluminaId(key, value); if (!lookForIlluminaIdentifier) scanNameForReadNumber(key, value); return true; }
@Test public void testIlluminaMetaInfoNegativeXYpos() throws IOException { writeToTempFastq(illuminaFastqNegativeXYPos); split = new FileSplit(new Path(tempFastq.toURI().toString()), 0, illuminaFastqNegativeXYPos.length(), null); FastqRecordReader reader = new FastqRecordReader(conf, split); boolean found = reader.next(key, fragment); assertTrue(found); assertEquals("EAS139", fragment.getInstrument()); assertEquals(136, fragment.getRunNumber().intValue()); assertEquals("FC706VJ", fragment.getFlowcellId()); assertEquals(2, fragment.getLane().intValue()); assertEquals(5, fragment.getTile().intValue()); assertEquals(-1000, fragment.getXpos().intValue()); assertEquals(-12850, fragment.getYpos().intValue()); assertEquals(1, fragment.getRead().intValue()); assertEquals(false, fragment.getFilterPassed().booleanValue()); assertEquals(18, fragment.getControlNumber().intValue()); assertEquals("ATCACG", fragment.getIndexSequence()); }
@Test public void testIlluminaMetaInfoNullFC() throws IOException { writeToTempFastq(illuminaFastqNoFlowCellID); split = new FileSplit(new Path(tempFastq.toURI().toString()), 0, illuminaFastqNoFlowCellID.length(), null); FastqRecordReader reader = new FastqRecordReader(conf, split); boolean found = reader.next(key, fragment); assertTrue(found); assertEquals("EAS139", fragment.getInstrument()); assertEquals(136, fragment.getRunNumber().intValue()); assertEquals("", fragment.getFlowcellId()); assertEquals(2, fragment.getLane().intValue()); assertEquals(5, fragment.getTile().intValue()); assertEquals(1000, fragment.getXpos().intValue()); assertEquals(12850, fragment.getYpos().intValue()); assertEquals(1, fragment.getRead().intValue()); assertEquals(false, fragment.getFilterPassed().booleanValue()); assertEquals(18, fragment.getControlNumber().intValue()); assertEquals("ATCACG", fragment.getIndexSequence()); }
@Test public void testIlluminaMetaInfo() throws IOException { writeToTempFastq(illuminaFastq); split = new FileSplit(new Path(tempFastq.toURI().toString()), 0, illuminaFastq.length(), null); FastqRecordReader reader = new FastqRecordReader(conf, split); boolean found = reader.next(key, fragment); assertTrue(found); assertEquals("EAS139", fragment.getInstrument()); assertEquals(136, fragment.getRunNumber().intValue()); assertEquals("FC706VJ", fragment.getFlowcellId()); assertEquals(2, fragment.getLane().intValue()); assertEquals(5, fragment.getTile().intValue()); assertEquals(1000, fragment.getXpos().intValue()); assertEquals(12850, fragment.getYpos().intValue()); assertEquals(1, fragment.getRead().intValue()); assertEquals(false, fragment.getFilterPassed().booleanValue()); assertEquals(18, fragment.getControlNumber().intValue()); assertEquals("ATCACG", fragment.getIndexSequence()); }
protected boolean lowLevelFastqRead(Text key, SequencedFragment value) throws IOException { // ID line long skipped = lineReader.skip(1); // skip @ pos += skipped; if (skipped == 0) return false; // EOF // ID readLineInto(key); // sequence value.clear(); readLineInto(value.getSequence()); readLineInto(buffer); if (buffer.getLength() == 0 || buffer.getBytes()[0] != '+') throw new RuntimeException("unexpected fastq line separating sequence and quality at " + makePositionMessage() + ". Line: " + buffer + ". \nSequence ID: " + key); readLineInto(value.getQuality()); // look for the Illumina-formatted name. Once it isn't found lookForIlluminaIdentifier will be set to false lookForIlluminaIdentifier = lookForIlluminaIdentifier && scanIlluminaId(key, value); if (!lookForIlluminaIdentifier) scanNameForReadNumber(key, value); return true; }
protected boolean lowLevelFastqRead(Text key, SequencedFragment value) throws IOException { // ID line long skipped = lineReader.skip(1); // skip @ pos += skipped; if (skipped == 0) return false; // EOF // ID readLineInto(key); // sequence value.clear(); readLineInto(value.getSequence()); readLineInto(buffer); if (buffer.getLength() == 0 || buffer.getBytes()[0] != '+') throw new RuntimeException("unexpected fastq line separating sequence and quality at " + makePositionMessage() + ". Line: " + buffer + ". \nSequence ID: " + key); readLineInto(value.getQuality()); // look for the Illumina-formatted name. Once it isn't found lookForIlluminaIdentifier will be set to false lookForIlluminaIdentifier = lookForIlluminaIdentifier && scanIlluminaId(key, value); if (!lookForIlluminaIdentifier) scanNameForReadNumber(key, value); return true; }
@Test public void testReadStartInMiddle() throws IOException { writeToTempFastq(twoFastq); split = new FileSplit(new Path(tempFastq.toURI().toString()), 10, twoFastq.length() - 10, null); FastqRecordReader reader = new FastqRecordReader(conf, split); assertEquals(oneFastq.length() + 1, reader.getPos()); // The start of the second record. We +1 for the \n that is not in oneFastq assertEquals(0.0, reader.getProgress(), 0.01); boolean retval = reader.next(key, fragment); assertTrue(retval); assertEquals("ERR020229.10883 HWI-ST168_161:1:1:1796:2044/1", key.toString()); assertEquals("TGAGCAGATGTGCTAAAGCTGCTTCTCCCCTAGGATCATTTGTACCTACCAGACTCAGGGAAAGGGGTGAGAATTGGGCCGTGGGGCAAGG", fragment.getSequence().toString()); assertEquals("BDDCDBDD?A=?=:=7,7*@A;;53/53.:@>@@4=>@@@=?1?###############################################", fragment.getQuality().toString()); assertEquals(twoFastq.length(), reader.getPos()); // now should be at the end of the data assertEquals(1.0, reader.getProgress(), 0.01); retval = reader.next(key, fragment); assertFalse(retval); }
@Test public void testReadFromStart() throws IOException { FastqRecordReader reader = createReaderForOneFastq(); assertEquals(0, reader.getPos()); assertEquals(0.0, reader.getProgress(), 0.01); boolean retval = reader.next(key, fragment); assertTrue(retval); assertEquals("ERR020229.10880 HWI-ST168_161:1:1:1373:2042/1", key.toString()); assertEquals("TTGGATGATAGGGATTATTTGACTCGAATATTGGAAATAGCTGTTTATATTTTTTAAAAATGGTCTGTAACTGGTGACAGGACGCTTCGAT", fragment.getSequence().toString()); assertEquals("###########################################################################################", fragment.getQuality().toString()); assertEquals(oneFastq.length(), reader.getPos()); assertEquals(1.0, reader.getProgress(), 0.01); retval = reader.next(key, fragment); assertFalse(retval); }
@Test public void testReadFastqWithAmpersandQuality() throws IOException { writeToTempFastq(fastqWithAmpersandQuality); // split doesn't start at 0, forcing reader to advance looking for first complete record split = new FileSplit(new Path(tempFastq.toURI().toString()), 3, fastqWithAmpersandQuality.length(), null); FastqRecordReader reader = new FastqRecordReader(conf, split); boolean retval = reader.next(key, fragment); assertTrue(retval); assertEquals("ERR020229.10880 HWI-ST168_161:1:1:1373:2042/1", key.toString()); assertEquals("TTGGATGATAGGGATTATTTGACTCGAATATTGGAAATAGCTGTTTATATTTTTTAAAAATGGTCTGTAACTGGTGACAGGACGCTTCGAT", fragment.getSequence().toString()); assertEquals("###########################################################################################", fragment.getQuality().toString()); retval = reader.next(key, fragment); assertFalse(retval); }
@Test public void testReadFastqWithIdTwice() throws IOException { writeToTempFastq(fastqWithIdTwice); split = new FileSplit(new Path(tempFastq.toURI().toString()), 0, fastqWithIdTwice.length(), null); FastqRecordReader reader = new FastqRecordReader(conf, split); boolean retval = reader.next(key, fragment); assertTrue(retval); assertEquals("ERR020229.10880 HWI-ST168_161:1:1:1373:2042/1", key.toString()); assertEquals("TTGGATGATAGGGATTATTTGACTCGAATATTGGAAATAGCTGTTTATATTTTTTAAAAATGGTCTGTAACTGGTGACAGGACGCTTCGAT", fragment.getSequence().toString()); assertEquals("###########################################################################################", fragment.getQuality().toString()); retval = reader.next(key, fragment); assertFalse(retval); }
@Test public void testIlluminaNoIndex() throws IOException { writeToTempFastq(illuminaFastqNoIndex); split = new FileSplit(new Path(tempFastq.toURI().toString()), 0, illuminaFastqNoIndex.length(), null); FastqRecordReader reader = new FastqRecordReader(conf, split); boolean found = reader.next(key, fragment); assertTrue(found); // ensure all meta-data was picked up assertEquals("EAS139", fragment.getInstrument()); assertEquals(136, fragment.getRunNumber().intValue()); // now verify the index assertEquals("", fragment.getIndexSequence()); }
private void verifySkipFailedQC() throws IOException { writeToTempFastq(twoFastqWithIllumina); split = new FileSplit(new Path(tempFastq.toURI().toString()), 0, twoFastqWithIllumina.length(), null); FastqRecordReader reader = new FastqRecordReader(conf, split); boolean found = reader.next(key, fragment); assertTrue(found); assertEquals(2, (int)fragment.getRead()); found = reader.next(key, fragment); assertTrue(found); assertEquals(3, (int)fragment.getRead()); found = reader.next(key, fragment); assertFalse(found); }