/** * Updates the cache to include the new contig * @param contig */ private synchronized ReferenceSequence addToCache(String contig) { ReferenceSequence seq = cache.get(contig); if (seq != null) { // already populated by another thread while we were waiting to enter // this synchronized block return seq; } log.debug("Caching reference genome contig ", contig); seq = underlying.getSequence(contig); cache = ImmutableMap.<String, ReferenceSequence>builder() .putAll(cache) .put(contig, seq) .build(); referenceIndexLookup[underlying.getSequenceDictionary().getSequence(contig).getSequenceIndex()] = seq; return seq; } @Override
public ReferenceSequence queryReferenceSequence(String chromosome, int start, int end) { ReferenceSequence referenceSequence = indexedFastaSequenceFile.getSubsequenceAt(chromosome, start, end); return referenceSequence; }
@Override public ReferenceSequence nextSequence() { return underlying.nextSequence(); } @Override
public static byte[] getBasesFromReferenceFile(String referenceFilePath, String seqName, int from, int length) { ReferenceSequenceFile referenceSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(new File( referenceFilePath)); ReferenceSequence sequence = referenceSequenceFile.getSequence(seqName); byte[] bases = referenceSequenceFile.getSubsequenceAt(sequence.getName(), from, from + length).getBases(); return bases; }
byte[] findBasesByName(final String name, final boolean tryVariants) { if (rsFile == null || !rsFile.isIndexed()) return null; ReferenceSequence sequence = null; try { sequence = rsFile.getSequence(name); } catch (final SAMException e) { // the only way to test if rsFile contains the sequence is to try and catch exception. } if (sequence != null) return sequence.getBases(); if (tryVariants) { for (final String variant : getVariants(name)) { try { sequence = rsFile.getSequence(variant); } catch (final SAMException e) { log.warn("Sequence not found: " + variant); } if (sequence != null) return sequence.getBases(); } } return null; }
/** * Get reference sequence without validating name or length. This is OK if the entire sequence * dictionary was validated before reading sequences. */ public ReferenceSequence get(final int sequenceIndex) { if (referenceSequence != null && referenceSequence.getContigIndex() == sequenceIndex) { return referenceSequence; } if (referenceSequence != null && referenceSequence.getContigIndex() > sequenceIndex) { throw new SAMException("Requesting earlier reference sequence: " + sequenceIndex + " < " + referenceSequence.getContigIndex()); } referenceSequence = null; if(referenceSequenceFile.isIndexed() && referenceSequenceFile.getSequenceDictionary() != null) { final SAMSequenceRecord samSequenceRecord = referenceSequenceFile.getSequenceDictionary().getSequence(sequenceIndex); if(samSequenceRecord != null) { referenceSequence = referenceSequenceFile.getSequence(samSequenceRecord.getSequenceName()) ; } // else referenceSequence will remain null } else { do { referenceSequence = referenceSequenceFile.nextSequence(); } while (referenceSequence != null && referenceSequence.getContigIndex() < sequenceIndex); } if (referenceSequence == null || referenceSequence.getContigIndex() != sequenceIndex) { throw new SAMException("Reference sequence (" + sequenceIndex + ") not found in " + referenceSequenceFile.toString()); } return referenceSequence; }
/** * set our internal reference contig order * @param refFile the reference file */ @Requires("refFile != null") public GenomeLocParser(final ReferenceSequenceFile refFile) { this(refFile.getSequenceDictionary()); }
/** * Get the reference bases from referenceReader spanned by the extended location of this active region, * including additional padding bp on either side. If this expanded region would exceed the boundaries * of the active region's contig, the returned result will be truncated to only include on-genome reference * bases * @param referenceReader the source of the reference genome bases * @param padding the padding, in BP, we want to add to either side of this active region extended region * @param genomeLoc a non-null genome loc indicating the base span of the bp we'd like to get the reference for * @return a non-null array of bytes holding the reference bases in referenceReader */ @Ensures("result != null") public byte[] getReference( final ReferenceSequenceFile referenceReader, final int padding, final GenomeLoc genomeLoc ) { if ( referenceReader == null ) throw new IllegalArgumentException("referenceReader cannot be null"); if ( padding < 0 ) throw new IllegalArgumentException("padding must be a positive integer but got " + padding); if ( genomeLoc == null ) throw new IllegalArgumentException("genomeLoc cannot be null"); if ( genomeLoc.size() == 0 ) throw new IllegalArgumentException("GenomeLoc must have size > 0 but got " + genomeLoc); final byte[] reference = referenceReader.getSubsequenceAt( genomeLoc.getContig(), Math.max(1, genomeLoc.getStart() - padding), Math.min(referenceReader.getSequenceDictionary().getSequence(genomeLoc.getContig()).getSequenceLength(), genomeLoc.getStop() + padding) ).getBases(); return reference; }
@Override protected int doWork() { IOUtil.assertFileIsReadable(REFERENCE_SEQUENCE); IOUtil.assertFileIsWritable(OUTPUT); final ReferenceSequenceFile refFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(REFERENCE_SEQUENCE, true); if (!refFile.isIndexed()) { throw new IllegalStateException("Reference file must be indexed, but no index file was found"); } if (refFile.getSequenceDictionary() == null) { throw new IllegalStateException("Reference file must include a dictionary, but no dictionary file was found"); } // get the intervals final IntervalList intervals = segregateReference(refFile, MAX_TO_MERGE); log.info(String.format("Found %d intervals in %d loci during %s seconds", intervalProgress.getCount(), locusProgress.getCount(), locusProgress.getElapsedSeconds())); /********************************** * Now output regions for calling * **********************************/ final IntervalList outputIntervals = new IntervalList(intervals.getHeader().clone()); log.info(String.format("Collecting requested type of intervals (%s)", OUTPUT_TYPE)); intervals.getIntervals().stream().filter(i -> OUTPUT_TYPE.accepts(i.getName())).forEach(outputIntervals::add); log.info("Writing Intervals."); outputIntervals.write(OUTPUT); log.info(String.format("Execution ending. Total time %d seconds", locusProgress.getElapsedSeconds())); return 0; }
/** * Ensures that a sequence dictionary exists for the given reference * @param referenceFile reference genome fasta * @param invoking program */ public static void ensureSequenceDictionary(File referenceFile, CommandLineProgram program) { try { ReferenceSequenceFile rsf = new FastaSequenceFile(referenceFile, false); Path path = referenceFile.toPath().toAbsolutePath(); if (rsf.getSequenceDictionary() == null) { log.info("Attempting to create sequence dictionary for " + referenceFile); Path dictPath = path.resolveSibling(path.getFileName().toString() + htsjdk.samtools.util.IOUtil.DICT_FILE_EXTENSION); picard.sam.CreateSequenceDictionary csd = new picard.sam.CreateSequenceDictionary(); if (program != null) { CommandLineProgramHelper.copyInputs(program, csd); } csd.instanceMain(new String[] { "OUTPUT=" + dictPath.toFile(), "R=" + referenceFile.getPath() }); } rsf.close(); } catch (Exception e) { log.error("Sequence dictionary creation failed. Please create using picard CreateSequenceDictionary.", e); } } public void setReference(ReferenceLookup ref) {
@Override public synchronized ReferenceSequence getSequence(String contig) { if (isClosed) throw new IllegalStateException("Underlying reference has closed"); return underlying.getSequence(contig); }
/** * Checks if the set FASTA file is indexed * @return */ public Boolean hasIndex() { Boolean hasIndex = false; if (this.indexedFastaSequenceFile != null) { hasIndex = this.indexedFastaSequenceFile.isIndexed(); } return hasIndex; }
@Override public void close() throws IOException { referenceSequenceFile.close(); } }
/** * Get reference sequence without validating name or length. This is OK if the entire sequence * dictionary was validated before reading sequences. */ public ReferenceSequence get(final int sequenceIndex) { if (referenceSequence != null && referenceSequence.getContigIndex() == sequenceIndex) { return referenceSequence; } if (referenceSequence != null && referenceSequence.getContigIndex() > sequenceIndex) { throw new SAMException("Requesting earlier reference sequence: " + sequenceIndex + " < " + referenceSequence.getContigIndex()); } referenceSequence = null; if(referenceSequenceFile.isIndexed() && referenceSequenceFile.getSequenceDictionary() != null) { final SAMSequenceRecord samSequenceRecord = referenceSequenceFile.getSequenceDictionary().getSequence(sequenceIndex); if(samSequenceRecord != null) { referenceSequence = referenceSequenceFile.getSequence(samSequenceRecord.getSequenceName()) ; } // else referenceSequence will remain null } else { do { referenceSequence = referenceSequenceFile.nextSequence(); } while (referenceSequence != null && referenceSequence.getContigIndex() < sequenceIndex); } if (referenceSequence == null || referenceSequence.getContigIndex() != sequenceIndex) { throw new SAMException("Reference sequence (" + sequenceIndex + ") not found in " + referenceSequenceFile.toString()); } return referenceSequence; }
@Override SAMSequenceDictionary extractDictionary(Path reference) { final SAMSequenceDictionary dict = ReferenceSequenceFileFactory.getReferenceSequenceFile(reference).getSequenceDictionary(); if (dict == null) throw new SAMException("Could not find dictionary next to reference file " + reference.toUri().toString()); return dict; } },
protected byte[] getReferenceBases( GenomeLoc genomeLoc ) { SAMSequenceRecord sequenceInfo = reference.getSequenceDictionary().getSequence(genomeLoc.getContig()); long start = genomeLoc.getStart(); long stop = Math.min( genomeLoc.getStop(), sequenceInfo.getSequenceLength() ); // Read with no aligned bases? Return an empty array. if(stop - start + 1 == 0) return new byte[0]; ReferenceSequence subsequence = reference.getSubsequenceAt(genomeLoc.getContig(), start, stop); int overhang = (int)(genomeLoc.getStop() - stop); if ( overhang > 0 ) { if ( overhang > BUFFER ) // todo -- this is a bit dangerous throw new ReviewedGATKException("Insufficient buffer size for Xs overhanging genome -- expand BUFFER"); byte[] all = new byte[subsequence.getBases().length + overhang]; System.arraycopy(subsequence.getBases(), 0, all, 0, subsequence.getBases().length); System.arraycopy(Xs, 0, all, subsequence.getBases().length, overhang); return all; } else { // fast path return subsequence.getBases(); } } }
private String fetchBaseString(final ReferenceSequenceFile reader, final String contig, final int start, final int stop) { if ( start == -1 ) return new String(reader.getSequence(contig).getBases()); else return new String(reader.getSubsequenceAt(contig, start, stop).getBases()); }
byte[] findBasesByName(final String name, final boolean tryVariants) { if (rsFile == null || !rsFile.isIndexed()) return null; ReferenceSequence sequence = null; try { sequence = rsFile.getSequence(name); } catch (final SAMException e) { // the only way to test if rsFile contains the sequence is to try and catch exception. } if (sequence != null) return sequence.getBases(); if (tryVariants) { for (final String variant : getVariants(name)) { try { sequence = rsFile.getSequence(variant); } catch (final SAMException e) { log.warn("Sequence not found: " + variant); } if (sequence != null) return sequence.getBases(); } } return null; }
@Override protected int doWork() { IOUtil.assertFileIsReadable(REFERENCE_SEQUENCE); IOUtil.assertFileIsWritable(OUTPUT); final ReferenceSequenceFile refFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(REFERENCE_SEQUENCE, true); if (!refFile.isIndexed()) { throw new IllegalStateException("Reference file must be indexed, but no index file was found"); } if (refFile.getSequenceDictionary() == null) { throw new IllegalStateException("Reference file must include a dictionary, but no dictionary file was found"); } // get the intervals final IntervalList intervals = segregateReference(refFile, MAX_TO_MERGE); log.info(String.format("Found %d intervals in %d loci during %s seconds", intervalProgress.getCount(), locusProgress.getCount(), locusProgress.getElapsedSeconds())); /********************************** * Now output regions for calling * **********************************/ final IntervalList outputIntervals = new IntervalList(intervals.getHeader().clone()); log.info(String.format("Collecting requested type of intervals (%s)", OUTPUT_TYPE)); intervals.getIntervals().stream().filter(i -> OUTPUT_TYPE.accepts(i.getName())).forEach(outputIntervals::add); log.info("Writing Intervals."); outputIntervals.write(OUTPUT); log.info(String.format("Execution ending. Total time %d seconds", locusProgress.getElapsedSeconds())); return 0; }
public static void checkRefMD5(SAMSequenceDictionary d, ReferenceSequenceFile refFile, boolean checkExistingMD5, boolean failIfMD5Mismatch) throws NoSuchAlgorithmException { for (SAMSequenceRecord r : d.getSequences()) { ReferenceSequence sequence = refFile.getSequence(r.getSequenceName()); if (!r.getAttributes().contains(SAMSequenceRecord.MD5_TAG)) { String md5 = calculateMD5String(sequence.getBases()); r.setAttribute(SAMSequenceRecord.MD5_TAG, md5); } else { if (checkExistingMD5) { String existingMD5 = r.getAttribute(SAMSequenceRecord.MD5_TAG); String md5 = calculateMD5String(sequence.getBases()); if (!md5.equals(existingMD5)) { String message = String.format("For sequence %s the md5 %s does not match the actual md5 %s.", r.getSequenceName(), existingMD5, md5); if (failIfMD5Mismatch) throw new RuntimeException(message); else log.warn(message); } } } } }