/**
 * Converts one sequenced fragment into a {@code Read} and emits it.
 *
 * <p>The {@code Read} is built from the key text plus the fragment's sequence and quality strings,
 * wrapped in an {@code AvroValue}, and written under the constant key {@code LongWritable(1)}.
 *
 * @param key text whose string form is the first constructor argument of the {@code Read}
 * @param value fragment supplying the sequence and the quality
 * @param context sink that receives the output pair
 * @throws IOException declared for {@code context.write}
 * @throws InterruptedException declared for {@code context.write}
 */
public void map(Text key, SequencedFragment value, Context context)
    throws IOException, InterruptedException {
  String name = key.toString();
  String bases = value.getSequence().toString();
  String qualities = value.getQuality().toString();

  Read read = new Read(name, bases, qualities);
  // Every record is emitted under the same key value, 1.
  context.write(new LongWritable(1), new AvroValue<>(read));
}
} // closes an enclosing scope that is opened outside this excerpt
private void postProcessSequencedFragment(SequencedFragment fragment) { byte[] bytes = fragment.getSequence().getBytes(); // replace . with N for (int i = 0; i < fieldLengths[8]; ++i) if (bytes[i] == '.') bytes[i] = 'N'; if (qualityEncoding == BaseQualityEncoding.Illumina) { // convert illumina to sanger scale SequencedFragment.convertQuality(fragment.getQuality(), BaseQualityEncoding.Illumina, BaseQualityEncoding.Sanger); } else // sanger qualities. { int outOfRangeElement = SequencedFragment.verifyQuality(fragment.getQuality(), BaseQualityEncoding.Sanger); if (outOfRangeElement >= 0) { throw new FormatException("qseq base quality score out of range for Sanger Phred+33 format (found " + (fragment.getQuality().getBytes()[outOfRangeElement] - FormatConstants.SANGER_OFFSET) + ").\n" + "Although Sanger format has been requested, maybe qualities are in Illumina Phred+64 format?\n"); } } } }
private void postProcessSequencedFragment(SequencedFragment fragment) { byte[] bytes = fragment.getSequence().getBytes(); // replace . with N for (int i = 0; i < fieldLengths[8]; ++i) if (bytes[i] == '.') bytes[i] = 'N'; if (qualityEncoding == BaseQualityEncoding.Illumina) { // convert illumina to sanger scale SequencedFragment.convertQuality(fragment.getQuality(), BaseQualityEncoding.Illumina, BaseQualityEncoding.Sanger); } else // sanger qualities. { int outOfRangeElement = SequencedFragment.verifyQuality(fragment.getQuality(), BaseQualityEncoding.Sanger); if (outOfRangeElement >= 0) { throw new FormatException("qseq base quality score out of range for Sanger Phred+33 format (found " + (fragment.getQuality().getBytes()[outOfRangeElement] - FormatConstants.SANGER_OFFSET) + ").\n" + "Although Sanger format has been requested, maybe qualities are in Illumina Phred+64 format?\n"); } } } }
/**
 * Writes the {@code nQseq} fixture to the temporary qseq file, reads the first record back through a
 * {@code QseqRecordReader}, and checks that its sequence consists solely of {@code N} characters.
 */
@Test
public void testNs() throws IOException {
  final String expectedSequence =
      "NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN";

  writeToTempQseq(nQseq);
  Path input = new Path(tempQseq.toURI().toString());
  split = new FileSplit(input, 0, nQseq.length(), null);

  QseqRecordReader reader = new QseqRecordReader(conf, split);
  boolean gotRecord = reader.next(key, fragment);

  assertTrue(gotRecord);
  assertEquals(expectedSequence, fragment.getSequence().toString());
}
protected boolean lowLevelFastqRead(Text key, SequencedFragment value) throws IOException { // ID line long skipped = lineReader.skip(1); // skip @ pos += skipped; if (skipped == 0) return false; // EOF // ID readLineInto(key); // sequence value.clear(); readLineInto(value.getSequence()); readLineInto(buffer); if (buffer.getLength() == 0 || buffer.getBytes()[0] != '+') throw new RuntimeException("unexpected fastq line separating sequence and quality at " + makePositionMessage() + ". Line: " + buffer + ". \nSequence ID: " + key); readLineInto(value.getQuality()); // look for the Illumina-formatted name. Once it isn't found lookForIlluminaIdentifier will be set to false lookForIlluminaIdentifier = lookForIlluminaIdentifier && scanIlluminaId(key, value); if (!lookForIlluminaIdentifier) scanNameForReadNumber(key, value); return true; }
protected boolean lowLevelFastqRead(Text key, SequencedFragment value) throws IOException { // ID line long skipped = lineReader.skip(1); // skip @ pos += skipped; if (skipped == 0) return false; // EOF // ID readLineInto(key); // sequence value.clear(); readLineInto(value.getSequence()); readLineInto(buffer); if (buffer.getLength() == 0 || buffer.getBytes()[0] != '+') throw new RuntimeException("unexpected fastq line separating sequence and quality at " + makePositionMessage() + ". Line: " + buffer + ". \nSequence ID: " + key); readLineInto(value.getQuality()); // look for the Illumina-formatted name. Once it isn't found lookForIlluminaIdentifier will be set to false lookForIlluminaIdentifier = lookForIlluminaIdentifier && scanIlluminaId(key, value); if (!lookForIlluminaIdentifier) scanNameForReadNumber(key, value); return true; }
protected boolean lowLevelFastqRead(Text key, SequencedFragment value) throws IOException { // ID line long skipped = lineReader.skip(1); // skip @ pos += skipped; if (skipped == 0) return false; // EOF // ID readLineInto(key); // sequence value.clear(); readLineInto(value.getSequence()); readLineInto(buffer); if (buffer.getLength() == 0 || buffer.getBytes()[0] != '+') throw new RuntimeException("unexpected fastq line separating sequence and quality at " + makePositionMessage() + ". Line: " + buffer + ". \nSequence ID: " + key); readLineInto(value.getQuality()); // look for the Illumina-formatted name. Once it isn't found lookForIlluminaIdentifier will be set to false lookForIlluminaIdentifier = lookForIlluminaIdentifier && scanIlluminaId(key, value); if (!lookForIlluminaIdentifier) scanNameForReadNumber(key, value); return true; }
/**
 * Checks that {@code equals} takes the sequence into account: the fragments differ while only
 * {@code frag} has the bases appended, and are equal again once {@code frag2} receives the same bases.
 */
@Test
public void testEqualsSequence() {
  final byte[] bases = "AAAA".getBytes();

  frag.getSequence().append(bases, 0, 4);
  boolean equalWhileDifferent = frag.equals(frag2);
  assertFalse(equalWhileDifferent);

  frag2.getSequence().append(bases, 0, 4);
  boolean equalAfterMatching = frag.equals(frag2);
  assertTrue(equalAfterMatching);
}
@Test public void testGzCompressedInput() throws IOException { // write gzip-compressed data GzipCodec codec = new GzipCodec(); PrintWriter fastqOut = new PrintWriter( new BufferedOutputStream( codec.createOutputStream( new FileOutputStream(tempGz) ) ) ); fastqOut.write(twoFastq); fastqOut.close(); // now try to read it split = new FileSplit(new Path(tempGz.toURI().toString()), 0, twoFastq.length(), null); FastqRecordReader reader = new FastqRecordReader(conf, split); boolean retval = reader.next(key, fragment); assertTrue(retval); assertEquals("ERR020229.10880 HWI-ST168_161:1:1:1373:2042/1", key.toString()); assertEquals("TTGGATGATAGGGATTATTTGACTCGAATATTGGAAATAGCTGTTTATATTTTTTAAAAATGGTCTGTAACTGGTGACAGGACGCTTCGAT", fragment.getSequence().toString()); retval = reader.next(key, fragment); assertTrue(retval); assertEquals("ERR020229.10883 HWI-ST168_161:1:1:1796:2044/1", key.toString()); assertEquals("TGAGCAGATGTGCTAAAGCTGCTTCTCCCCTAGGATCATTTGTACCTACCAGACTCAGGGAAAGGGGTGAGAATTGGGCCGTGGGGCAAGG", fragment.getSequence().toString()); }
@Test public void testGzCompressedInput() throws IOException { // write gzip-compressed data GzipCodec codec = new GzipCodec(); PrintWriter qseqOut = new PrintWriter( new BufferedOutputStream( codec.createOutputStream( new FileOutputStream(tempGz) ) ) ); qseqOut.write(twoQseq); qseqOut.close(); // now try to read it split = new FileSplit(new Path(tempGz.toURI().toString()), 0, twoQseq.length(), null); QseqRecordReader reader = new QseqRecordReader(conf, split); boolean retval = reader.next(key, fragment); assertTrue(retval); assertEquals("ERR020229:10880:1:1:1373:2042:1", key.toString()); assertEquals("TTGGATGATAGGGATTATTTGACTCGAATATTGGAAATAGCTGTTTATATTTTTTAAAAATGGTCTGTAACTGGTGACAGGACGCTTCGAT", fragment.getSequence().toString()); retval = reader.next(key, fragment); assertTrue(retval); assertEquals("ERR020229:10883:1:1:1796:2044:2", key.toString()); assertEquals("TGAGCAGATGTGCTAAAGCTGCTTCTCCCCTAGGATCATTTGTACCTACCAGACTCAGGGAAAGGGGTGAGAATTGGGCCGTGGGGCAAGG", fragment.getSequence().toString()); }
/**
 * Reads the {@code fastqWithIdTwice} fixture and checks that exactly one record is returned, with the
 * expected key, sequence and quality.
 */
@Test
public void testReadFastqWithIdTwice() throws IOException {
  final String expectedKey = "ERR020229.10880 HWI-ST168_161:1:1:1373:2042/1";
  final String expectedSequence =
      "TTGGATGATAGGGATTATTTGACTCGAATATTGGAAATAGCTGTTTATATTTTTTAAAAATGGTCTGTAACTGGTGACAGGACGCTTCGAT";
  final String expectedQuality =
      "###########################################################################################";

  writeToTempFastq(fastqWithIdTwice);
  Path input = new Path(tempFastq.toURI().toString());
  split = new FileSplit(input, 0, fastqWithIdTwice.length(), null);
  FastqRecordReader reader = new FastqRecordReader(conf, split);

  boolean hasRecord = reader.next(key, fragment);
  assertTrue(hasRecord);
  assertEquals(expectedKey, key.toString());
  assertEquals(expectedSequence, fragment.getSequence().toString());
  assertEquals(expectedQuality, fragment.getQuality().toString());

  // No second record is expected.
  hasRecord = reader.next(key, fragment);
  assertFalse(hasRecord);
}
/**
 * Writes {@code fragment} with the writer under test and checks the four output lines: an ID line
 * starting with {@code @} whose metadata matches the fragment, the sequence, a bare {@code +}, and the
 * quality.
 */
@Test
public void testSimple() throws IOException {
  writer.write(null, fragment);
  writer.close(null);

  String output = new String(outputBuffer.toByteArray(), "US-ASCII");
  String[] lines = output.split("\n");
  assertEquals(4, lines.length);

  String header = lines[0];
  assertTrue(header.startsWith("@"));
  compareMetadata(fragment, header);

  String expectedSequence = fragment.getSequence().toString();
  assertEquals(expectedSequence, lines[1]);
  assertEquals("+", lines[2]);
  String expectedQuality = fragment.getQuality().toString();
  assertEquals(expectedQuality, lines[3]);
}
@Test public void testReadFastqWithAmpersandQuality() throws IOException { writeToTempFastq(fastqWithAmpersandQuality); // split doesn't start at 0, forcing reader to advance looking for first complete record split = new FileSplit(new Path(tempFastq.toURI().toString()), 3, fastqWithAmpersandQuality.length(), null); FastqRecordReader reader = new FastqRecordReader(conf, split); boolean retval = reader.next(key, fragment); assertTrue(retval); assertEquals("ERR020229.10880 HWI-ST168_161:1:1:1373:2042/1", key.toString()); assertEquals("TTGGATGATAGGGATTATTTGACTCGAATATTGGAAATAGCTGTTTATATTTTTTAAAAATGGTCTGTAACTGGTGACAGGACGCTTCGAT", fragment.getSequence().toString()); assertEquals("###########################################################################################", fragment.getQuality().toString()); retval = reader.next(key, fragment); assertFalse(retval); }
/**
 * Checks that {@code frag} and {@code frag2} start out equal and stop being equal once bases are
 * appended to the sequence of {@code frag} only.
 */
@Test
public void testEquals() {
  boolean equalInitially = frag.equals(frag2);
  assertTrue(equalInitially);

  byte[] bases = "AAAA".getBytes();
  frag.getSequence().append(bases, 0, 4);

  boolean equalAfterAppend = frag.equals(frag2);
  assertFalse(equalAfterAppend);
}
@Test public void testReadStartInMiddle() throws IOException { writeToTempQseq(twoQseq); split = new FileSplit(new Path(tempQseq.toURI().toString()), 10, twoQseq.length() - 10, null); QseqRecordReader reader = new QseqRecordReader(conf, split); assertEquals(oneQseq.length() + 1, reader.getPos()); // The start of the second record. We +1 for the \n that is not in oneQseq assertEquals(0.0, reader.getProgress(), 0.01); boolean retval = reader.next(key, fragment); assertTrue(retval); assertEquals("ERR020229:10883:1:1:1796:2044:2", key.toString()); assertEquals("TGAGCAGATGTGCTAAAGCTGCTTCTCCCCTAGGATCATTTGTACCTACCAGACTCAGGGAAAGGGGTGAGAATTGGGCCGTGGGGCAAGG", fragment.getSequence().toString()); assertEquals("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%", fragment.getQuality().toString()); assertEquals(twoQseq.length(), reader.getPos()); // now should be at the end of the data assertEquals(1.0, reader.getProgress(), 0.01); retval = reader.next(key, fragment); assertFalse(retval); }
@Test public void testReadStartInMiddle() throws IOException { writeToTempFastq(twoFastq); split = new FileSplit(new Path(tempFastq.toURI().toString()), 10, twoFastq.length() - 10, null); FastqRecordReader reader = new FastqRecordReader(conf, split); assertEquals(oneFastq.length() + 1, reader.getPos()); // The start of the second record. We +1 for the \n that is not in oneFastq assertEquals(0.0, reader.getProgress(), 0.01); boolean retval = reader.next(key, fragment); assertTrue(retval); assertEquals("ERR020229.10883 HWI-ST168_161:1:1:1796:2044/1", key.toString()); assertEquals("TGAGCAGATGTGCTAAAGCTGCTTCTCCCCTAGGATCATTTGTACCTACCAGACTCAGGGAAAGGGGTGAGAATTGGGCCGTGGGGCAAGG", fragment.getSequence().toString()); assertEquals("BDDCDBDD?A=?=:=7,7*@A;;53/53.:@>@@4=>@@@=?1?###############################################", fragment.getQuality().toString()); assertEquals(twoFastq.length(), reader.getPos()); // now should be at the end of the data assertEquals(1.0, reader.getProgress(), 0.01); retval = reader.next(key, fragment); assertFalse(retval); }
/**
 * Writes {@code fragment} with the writer under test and checks each of the eleven tab-separated
 * output fields against the corresponding fragment property.
 *
 * <p>The quality field is compared with every {@code '#'} of the fragment's quality replaced by
 * {@code 'B'}, and the last field carries the trailing newline.
 */
@Test
public void testSimple() throws IOException {
  writer.write(null, fragment);
  writer.close(null);

  String output = new String(outputBuffer.toByteArray(), "US-ASCII");
  String[] fields = output.split("\t");
  assertEquals(11, fields.length);

  // Assertions stay one-per-field and in field order so the first mismatch is the one reported.
  assertEquals(fragment.getInstrument(), fields[0]);
  assertEquals(fragment.getRunNumber().toString(), fields[1]);
  assertEquals(fragment.getLane().toString(), fields[2]);
  assertEquals(fragment.getTile().toString(), fields[3]);
  assertEquals(fragment.getXpos().toString(), fields[4]);
  assertEquals(fragment.getYpos().toString(), fields[5]);
  assertEquals(fragment.getIndexSequence().toString(), fields[6]);
  assertEquals(fragment.getRead().toString(), fields[7]);
  assertEquals(fragment.getSequence().toString(), fields[8]);

  String expectedQuality = fragment.getQuality().toString().replace('#', 'B');
  assertEquals(expectedQuality, fields[9]);

  String expectedFilter;
  if (fragment.getFilterPassed()) {
    expectedFilter = "1\n";
  } else {
    expectedFilter = "0\n";
  }
  assertEquals(expectedFilter, fields[10]);
}
/**
 * Reads the single-record FASTQ fixture from the beginning and checks position and progress before
 * and after the read, the record contents, and that no second record follows.
 */
@Test
public void testReadFromStart() throws IOException {
  FastqRecordReader reader = createReaderForOneFastq();

  // Nothing consumed yet.
  assertEquals(0, reader.getPos());
  assertEquals(0.0, reader.getProgress(), 0.01);

  boolean hasRecord = reader.next(key, fragment);
  assertTrue(hasRecord);
  assertEquals("ERR020229.10880 HWI-ST168_161:1:1:1373:2042/1", key.toString());
  assertEquals(
      "TTGGATGATAGGGATTATTTGACTCGAATATTGGAAATAGCTGTTTATATTTTTTAAAAATGGTCTGTAACTGGTGACAGGACGCTTCGAT",
      fragment.getSequence().toString());
  assertEquals(
      "###########################################################################################",
      fragment.getQuality().toString());

  // The whole fixture has been consumed.
  assertEquals(oneFastq.length(), reader.getPos());
  assertEquals(1.0, reader.getProgress(), 0.01);

  hasRecord = reader.next(key, fragment);
  assertFalse(hasRecord);
}
@Test public void testReadFromStart() throws IOException { QseqRecordReader reader = createReaderForOneQseq(); assertEquals(0, reader.getPos()); assertEquals(0.0, reader.getProgress(), 0.01); boolean retval = reader.next(key, fragment); assertTrue(retval); //System.err.println("in testReadFromStart quality: " + fragment.getQuality().toString()); assertEquals("ERR020229:10880:1:1:1373:2042:1", key.toString()); assertEquals("TTGGATGATAGGGATTATTTGACTCGAATATTGGAAATAGCTGTTTATATTTTTTAAAAATGGTCTGTAACTGGTGACAGGACGCTTCGAT", fragment.getSequence().toString()); assertEquals("###########################################################################################", fragment.getQuality().toString()); assertEquals(oneQseq.length(), reader.getPos()); assertEquals(1.0, reader.getProgress(), 0.01); retval = reader.next(key, fragment); assertFalse(retval); }
/**
 * Checks the state of {@code frag} before anything is set on it: sequence and quality are non-null,
 * every metadata getter returns {@code null}, and {@code toString()} returns a non-null value.
 */
@Test
public void testInitialState() {
  // Sequence and quality are expected to be present.
  assertNotNull(frag.getSequence());
  assertNotNull(frag.getQuality());

  // Instrument / run identification is unset.
  assertNull(frag.getInstrument());
  assertNull(frag.getRunNumber());
  assertNull(frag.getFlowcellId());

  // Location on the flowcell is unset.
  assertNull(frag.getLane());
  assertNull(frag.getTile());
  assertNull(frag.getXpos());
  assertNull(frag.getYpos());

  // Remaining per-read attributes are unset.
  assertNull(frag.getRead());
  assertNull(frag.getFilterPassed());
  assertNull(frag.getControlNumber());
  assertNull(frag.getIndexSequence());

  // toString() must return a value even when every metadata getter returns null.
  assertNotNull(frag.toString());
}