/**
 * Supplies the object that will hold the value half of each record.
 *
 * @return a newly constructed {@link SequencedFragment}
 */
public SequencedFragment createValue() {
    final SequencedFragment freshValue = new SequencedFragment();
    return freshValue;
}
private void postProcessSequencedFragment(SequencedFragment fragment) { byte[] bytes = fragment.getSequence().getBytes(); // replace . with N for (int i = 0; i < fieldLengths[8]; ++i) if (bytes[i] == '.') bytes[i] = 'N'; if (qualityEncoding == BaseQualityEncoding.Illumina) { // convert illumina to sanger scale SequencedFragment.convertQuality(fragment.getQuality(), BaseQualityEncoding.Illumina, BaseQualityEncoding.Sanger); } else // sanger qualities. { int outOfRangeElement = SequencedFragment.verifyQuality(fragment.getQuality(), BaseQualityEncoding.Sanger); if (outOfRangeElement >= 0) { throw new FormatException("qseq base quality score out of range for Sanger Phred+33 format (found " + (fragment.getQuality().getBytes()[outOfRangeElement] - FormatConstants.SANGER_OFFSET) + ").\n" + "Although Sanger format has been requested, maybe qualities are in Illumina Phred+64 format?\n"); } } } }
protected String makeId(SequencedFragment seq) throws IOException { String delim = ":"; sBuilder.delete(0, sBuilder.length()); // clear sBuilder.append( seq.getInstrument() == null ? "" : seq.getInstrument() ).append(delim); sBuilder.append( seq.getRunNumber() == null ? "" : seq.getRunNumber().toString() ).append(delim); sBuilder.append( seq.getFlowcellId() == null ? "" : seq.getFlowcellId() ).append(delim); sBuilder.append( seq.getLane() == null ? "" : seq.getLane().toString() ).append(delim); sBuilder.append( seq.getTile() == null ? "" : seq.getTile().toString() ).append(delim); sBuilder.append( seq.getXpos() == null ? "" : seq.getXpos().toString() ).append(delim); sBuilder.append( seq.getYpos() == null ? "" : seq.getYpos().toString() ); sBuilder.append(" "); // space sBuilder.append( seq.getRead() == null ? "" : seq.getRead().toString() ).append(delim); sBuilder.append(seq.getFilterPassed() == null || seq.getFilterPassed() ? "N" : "Y"); sBuilder.append(delim); sBuilder.append( seq.getControlNumber() == null ? "0" : seq.getControlNumber().toString()).append(delim); sBuilder.append( seq.getIndexSequence() == null ? "" : seq.getIndexSequence()); return sBuilder.toString(); }
/**
 * Tries to parse a read name with {@code ILLUMINA_PATTERN} and, on a full match,
 * copies the captured groups into the fragment's metadata fields.
 *
 * <p>The fragment is left untouched when the name does not match. The
 * filter-passed flag is set to true exactly when capture group 9 equals "N".
 *
 * @param name     the read name to parse
 * @param fragment the fragment that receives the parsed fields
 * @return {@code true} if the name matched the pattern, {@code false} otherwise
 */
private boolean scanIlluminaId(Text name, SequencedFragment fragment) {
    final Matcher idMatcher = ILLUMINA_PATTERN.matcher(name.toString());
    if (!idMatcher.matches()) {
        return false;
    }

    fragment.setInstrument(idMatcher.group(1));
    fragment.setRunNumber(Integer.parseInt(idMatcher.group(2)));
    fragment.setFlowcellId(idMatcher.group(3));
    fragment.setLane(Integer.parseInt(idMatcher.group(4)));
    fragment.setTile(Integer.parseInt(idMatcher.group(5)));
    fragment.setXpos(Integer.parseInt(idMatcher.group(6)));
    fragment.setYpos(Integer.parseInt(idMatcher.group(7)));
    fragment.setRead(Integer.parseInt(idMatcher.group(8)));
    fragment.setFilterPassed("N".equals(idMatcher.group(9)));
    fragment.setControlNumber(Integer.parseInt(idMatcher.group(10)));
    fragment.setIndexSequence(idMatcher.group(11));
    return true;
}
/**
 * Populates a fragment and checks the tab-separated text produced by
 * {@code toString()} against the expected rendering.
 */
@Test
public void testToString() {
    final String bases = "AGTAGTAGTAGTAGTAGTAGTAGTAGTAGT";
    final String qualities = "##############################";

    // read data
    frag.setSequence(new Text(bases));
    frag.setQuality(new Text(qualities));

    // metadata
    frag.setInstrument("machine");
    frag.setRunNumber(123);
    frag.setFlowcellId("flowcell");
    frag.setLane(3);
    frag.setTile(1001);
    frag.setXpos(1234);
    frag.setYpos(4321);
    frag.setIndexSequence("CAT");
    frag.setRead(1);

    final String expected =
            "machine\t123\tflowcell\t3\t1001\t1234\t4321\tCAT\t1\t" + bases + "\t" + qualities + "\t1";
    assertEquals(expected, frag.toString());
}
fragment.clear(); fragment.setInstrument( Text.decode(line.getBytes(), fieldPositions[0], fieldLengths[0]) ); fragment.setRunNumber( Integer.parseInt(Text.decode(line.getBytes(), fieldPositions[1], fieldLengths[1])) ); fragment.setLane( Integer.parseInt(Text.decode(line.getBytes(), fieldPositions[2], fieldLengths[2])) ); fragment.setTile( Integer.parseInt(Text.decode(line.getBytes(), fieldPositions[3], fieldLengths[3])) ); fragment.setXpos( Integer.parseInt(Text.decode(line.getBytes(), fieldPositions[4], fieldLengths[4])) ); fragment.setYpos( Integer.parseInt(Text.decode(line.getBytes(), fieldPositions[5], fieldLengths[5])) ); fragment.setRead( Integer.parseInt(Text.decode(line.getBytes(), fieldPositions[7], fieldLengths[7])) ); fragment.setFilterPassed( line.getBytes()[fieldPositions[10]] != '0' ); fragment.setIndexSequence(null); else fragment.setIndexSequence(Text.decode(line.getBytes(), fieldPositions[6], fieldLengths[6]).replace('.', 'N')); fragment.getSequence().append(line.getBytes(), fieldPositions[8], fieldLengths[8]); fragment.getQuality().append(line.getBytes(), fieldPositions[9], fieldLengths[9]);
sBuilder.append( seq.getInstrument() == null ? "" : seq.getInstrument() ).append(delim); sBuilder.append( seq.getRunNumber() == null ? "" : seq.getRunNumber().toString() ).append(delim); sBuilder.append( seq.getLane() == null ? "" : seq.getLane().toString() ).append(delim); sBuilder.append( seq.getTile() == null ? "" : seq.getTile().toString() ).append(delim); sBuilder.append( seq.getXpos() == null ? "" : seq.getXpos().toString() ).append(delim); sBuilder.append( seq.getYpos() == null ? "" : seq.getYpos().toString() ).append(delim); if (seq.getIndexSequence() == null || seq.getIndexSequence().isEmpty()) index = "0"; else index = seq.getIndexSequence().replace('N', '.'); sBuilder.append( index ).append(delim); sBuilder.append( seq.getRead() == null ? "" : seq.getRead().toString() ).append(delim); sBuilder.append( seq.getSequence() == null ? "" : seq.getSequence().toString().replace('N', '.')).append(delim); if (seq.getQuality() == null) sBuilder.append(""); else sBuilder.append(seq.getQuality().toString()); if (baseQualityFormat == BaseQualityEncoding.Sanger) sBuilder.append((seq.getFilterPassed() == null || seq.getFilterPassed() ) ? 1 : 0);
/**
 * Checks the state of the shared {@code frag} fixture before any setter is called:
 * the sequence and quality buffers exist, every metadata field is {@code null},
 * and {@code toString()} still yields a non-null value.
 */
@Test
public void testInitialState() {
    // the two buffers are expected to be present
    assertNotNull(frag.getSequence());
    assertNotNull(frag.getQuality());

    // every metadata field is expected to be unset
    assertNull(frag.getInstrument());
    assertNull(frag.getRunNumber());
    assertNull(frag.getFlowcellId());
    assertNull(frag.getLane());
    assertNull(frag.getTile());
    assertNull(frag.getXpos());
    assertNull(frag.getYpos());
    assertNull(frag.getRead());
    assertNull(frag.getFilterPassed());
    assertNull(frag.getControlNumber());
    assertNull(frag.getIndexSequence());

    // rendering a fragment with unset fields must still produce a string
    assertNotNull(frag.toString());
}
gotData = lowLevelFastqRead(key, value); goodRecord = gotData && (!filterFailedQC || value.getFilterPassed() == null || value.getFilterPassed()); } while (gotData && !goodRecord); try { SequencedFragment.convertQuality(value.getQuality(), BaseQualityEncoding.Illumina, BaseQualityEncoding.Sanger); } catch (FormatException e) { int outOfRangeElement = SequencedFragment.verifyQuality(value.getQuality(), BaseQualityEncoding.Sanger); if (outOfRangeElement >= 0) { throw new FormatException("Base quality out of range for Sanger Phred+33 format (found " + (value.getQuality().getBytes()[outOfRangeElement] - FormatConstants.SANGER_OFFSET) + ").\n" + "Maybe qualities are in Illumina Phred+64 format?\n"
/**
 * Checks that a fragment with only some metadata fields set is equal to its
 * copy after a write/readFields round trip.
 *
 * @throws IOException if serialization or deserialization fails
 */
@Test
public void testSerializationWithFields() throws IOException {
    frag.setSequence(new Text("AGTAGTAGTAGTAGTAGTAGTAGTAGTAGT"));
    frag.setQuality(new Text("BBBBBBBBBBBBBBBBBBBBBBBBBBBBBB"));

    // only a subset of the metadata is populated; the rest stays unset
    frag.setInstrument("machine");
    frag.setLane(3);
    frag.setRead(1);
    frag.setIndexSequence("CAT");

    final SequencedFragment roundTripped = cloneBySerialization(frag);
    assertEquals(frag, roundTripped);
}
/**
 * Wraps one fastq record in a {@code Read} and emits it under the constant key 1.
 *
 * @param key     the record's name; its string form is passed to {@code Read}
 * @param value   the fragment supplying the sequence and quality strings
 * @param context the context the output pair is written to
 * @throws IOException          if writing to the context fails
 * @throws InterruptedException if writing to the context is interrupted
 */
public void map(Text key, SequencedFragment value, Context context) throws IOException, InterruptedException {
    final String readName = key.toString();
    final String bases = value.getSequence().toString();
    final String qualities = value.getQuality().toString();

    final Read read = new Read(readName, bases, qualities);
    context.write(new LongWritable(1), new AvroValue<>(read));
}
}
/**
 * Checks that asking {@code convertQuality} to convert from Sanger to Sanger
 * is rejected with an {@link IllegalArgumentException}.
 */
@Test(expected = IllegalArgumentException.class)
public void testConvertQualityNoop() {
    final FormatConstants.BaseQualityEncoding sameEncoding = FormatConstants.BaseQualityEncoding.Sanger;

    frag.setSequence(new Text("AGTAGTAGTAGTAGTAGTAGTAGTAGTAGT"));
    frag.setQuality(new Text("[[[[[[[[[[[[[[[[[[[[[[[[[[[[[["));

    // source and target encodings are identical on purpose
    SequencedFragment.convertQuality(frag.getQuality(), sameEncoding, sameEncoding);
}
protected boolean lowLevelFastqRead(Text key, SequencedFragment value) throws IOException { // ID line long skipped = lineReader.skip(1); // skip @ pos += skipped; if (skipped == 0) return false; // EOF // ID readLineInto(key); // sequence value.clear(); readLineInto(value.getSequence()); readLineInto(buffer); if (buffer.getLength() == 0 || buffer.getBytes()[0] != '+') throw new RuntimeException("unexpected fastq line separating sequence and quality at " + makePositionMessage() + ". Line: " + buffer + ". \nSequence ID: " + key); readLineInto(value.getQuality()); // look for the Illumina-formatted name. Once it isn't found lookForIlluminaIdentifier will be set to false lookForIlluminaIdentifier = lookForIlluminaIdentifier && scanIlluminaId(key, value); if (!lookForIlluminaIdentifier) scanNameForReadNumber(key, value); return true; }
public void write(Text key, SequencedFragment seq) throws IOException { // write the id line out.write('@'); if (key != null) out.write(key.getBytes(), 0, key.getLength()); else out.write(makeId(seq).getBytes(UTF8)); out.write('\n'); // write the sequence and separator out.write(seq.getSequence().getBytes(), 0, seq.getSequence().getLength()); out.write(PLUS_LINE); // now the quality if (baseQualityFormat == BaseQualityEncoding.Sanger) out.write(seq.getQuality().getBytes(), 0, seq.getQuality().getLength()); else if (baseQualityFormat == BaseQualityEncoding.Illumina) { buffer.set(seq.getQuality()); SequencedFragment.convertQuality(buffer, BaseQualityEncoding.Sanger, baseQualityFormat); out.write(buffer.getBytes(), 0, buffer.getLength()); } else throw new RuntimeException("FastqOutputFormat: unknown base quality format " + baseQualityFormat); // and the final newline out.write('\n'); }
@Test public void testVerifyQualitySangerOutOfRange() { frag.setSequence(new Text("AGTAGTAGTAGTAGTAGTAGTAGTAGTAGT")); frag.setQuality(new Text("#############################" + Character.toString((char)127))); // over range assertEquals(29, SequencedFragment.verifyQuality(frag.getQuality(), FormatConstants.BaseQualityEncoding.Sanger)); frag.setQuality(new Text("##### ########################")); // under range assertEquals(5, SequencedFragment.verifyQuality(frag.getQuality(), FormatConstants.BaseQualityEncoding.Sanger)); }
/**
 * Copies a fragment by writing it to an in-memory byte array and reading those
 * bytes back into a new instance.
 *
 * @param original the fragment to serialize
 * @return a new fragment populated via {@code readFields} from the serialized bytes
 * @throws IOException if writing or reading the serialized form fails
 */
private static SequencedFragment cloneBySerialization(SequencedFragment original) throws IOException {
    // serialize into memory
    final ByteArrayOutputStream sink = new ByteArrayOutputStream();
    final DataOutputStream writer = new DataOutputStream(sink);
    original.write(writer);
    writer.close();

    // deserialize into a fresh instance
    final byte[] serialized = sink.toByteArray();
    final DataInputStream reader = new DataInputStream(new ByteArrayInputStream(serialized));
    final SequencedFragment copy = new SequencedFragment();
    copy.readFields(reader);
    return copy;
}
/**
 * Checks that {@code equals} takes the sequence into account: the two fixtures
 * differ while only one holds the bases and are equal once both do.
 */
@Test
public void testEqualsSequence() {
    final byte[] bases = "AAAA".getBytes();

    // only the first fragment has a sequence
    frag.getSequence().append(bases, 0, 4);
    final boolean equalWhileDifferent = frag.equals(frag2);
    assertFalse(equalWhileDifferent);

    // both fragments now hold the same sequence
    frag2.getSequence().append(bases, 0, 4);
    final boolean equalWhenSame = frag.equals(frag2);
    assertTrue(equalWhenSame);
}
/**
 * Reads the {@code nQseq} fixture through a {@code QseqRecordReader} and checks
 * that the sequence of the first record consists solely of 'N' characters.
 *
 * @throws IOException if the temporary qseq file cannot be written or read
 */
@Test
public void testNs() throws IOException {
    writeToTempQseq(nQseq);

    // one split covering the whole fixture
    final Path qseqPath = new Path(tempQseq.toURI().toString());
    split = new FileSplit(qseqPath, 0, nQseq.length(), null);

    final QseqRecordReader reader = new QseqRecordReader(conf, split);
    final boolean found = reader.next(key, fragment);
    assertTrue(found);

    final String expectedBases = "NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN";
    assertEquals(expectedBases, fragment.getSequence().toString());
}
/** * Reads the next key/value pair from the input for processing. */ public boolean next(Text key, SequencedFragment value) throws IOException { if (pos >= end) return false; // past end of slice int bytesRead = 0; boolean goodRecord; do { bytesRead = lowLevelQseqRead(key, value); // if bytesRead <= 0 EOF has been reached goodRecord = (bytesRead > 0) && (!filterFailedQC || value.getFilterPassed() == null || value.getFilterPassed()); } while (bytesRead > 0 && !goodRecord); if (goodRecord) // post process the record only if it's going to be used { try { postProcessSequencedFragment(value); } catch (FormatException e) { throw new FormatException(e.getMessage() + " Position: " + makePositionMessage(this.pos - bytesRead) + "; line: " + buffer); // last line read is still in the buffer } } return goodRecord; }
/**
 * Checks that {@code equals} takes the flowcell id into account: the two
 * fixtures differ while only one has it set and are equal once both do.
 */
@Test
public void testEqualsFlowcellId() {
    final String flowcellId = "id";

    // only the first fragment has a flowcell id
    frag.setFlowcellId(flowcellId);
    final boolean equalWhileDifferent = frag.equals(frag2);
    assertFalse(equalWhileDifferent);

    // both fragments now carry the same id
    frag2.setFlowcellId(flowcellId);
    final boolean equalWhenSame = frag.equals(frag2);
    assertTrue(equalWhenSame);
}