// NOTE(review): fragment of a getSplits() variant that first strips index files
// and can fall back to a BAI-based split calculator. The enclosing try/loop/
// method braces open and close outside this view, and addProbabilisticSplits is
// invoked twice in a row below — verify against the complete file whether that
// duplication is intentional.
final List<InputSplit> origSplits = removeIndexFiles(splits);
i = addIndexedSplits (origSplits, i, newSplits, cfg);
} catch (IOException | ProviderNotFoundException e) {
if (cfg.getBoolean(ENABLE_BAI_SPLIT_CALCULATOR, false)) {
try {
// Prefer BAI-derived splits when the calculator is enabled.
i = addBAISplits (origSplits, i, newSplits, cfg);
} catch (IOException | ProviderNotFoundException e2) {
// Last resort: estimate BGZF block boundaries probabilistically.
i = addProbabilisticSplits (origSplits, i, newSplits, cfg);
i = addProbabilisticSplits (origSplits, i, newSplits, cfg);
return filterByInterval(newSplits, cfg);
/** * Converts a List of SimpleIntervals into the format required by the SamReader query API * @param rawIntervals SimpleIntervals to be converted * @return A sorted, merged list of QueryIntervals suitable for passing to the SamReader query API */ static QueryInterval[] prepareQueryIntervals( final List<Interval> rawIntervals, final SAMSequenceDictionary sequenceDictionary ) { if ( rawIntervals == null || rawIntervals.isEmpty() ) { return null; } // Convert each SimpleInterval to a QueryInterval final QueryInterval[] convertedIntervals = rawIntervals.stream() .map(rawInterval -> convertSimpleIntervalToQueryInterval(rawInterval, sequenceDictionary)) .toArray(QueryInterval[]::new); // Intervals must be optimized (sorted and merged) in order to use the htsjdk query API return QueryInterval.optimizeIntervals(convertedIntervals); } /**
/**
 * Creates a record writer targeting {@code output}. The {@code writeHeader}
 * flag is forwarded to init() — presumably it controls whether the SAM header
 * is emitted (TODO confirm in init()). When
 * {@code BAMOutputFormat.WRITE_SPLITTING_BAI} is set in the configuration, a
 * splitting BAM index is additionally written alongside the output.
 */
public BAMRecordWriter(
    Path output, SAMFileHeader header, boolean writeHeader,
    TaskAttemptContext ctx)
    throws IOException {
  init(
      output.getFileSystem(ctx.getConfiguration()).create(output),
      header, writeHeader);
  if (ctx.getConfiguration().getBoolean(BAMOutputFormat.WRITE_SPLITTING_BAI, false)) {
    // Splitting index location is derived from the output path via getIdxPath.
    Path splittingIndex = BAMInputFormat.getIdxPath(output);
    OutputStream splittingIndexOutput =
        output.getFileSystem(ctx.getConfiguration()).create(splittingIndex);
    splittingBAMIndexer = new SplittingBAMIndexer(splittingIndexOutput);
  }
}
// NOTE(review): the next constructor's header is truncated in this view.
public BAMRecordWriter(
public List<InputSplit> getSplits( List<InputSplit> splits, Configuration cfg) throws IOException { // Align the splits so that they don't cross blocks. // addIndexedSplits() requires the given splits to be sorted by file // path, so do so. Although FileInputFormat.getSplits() does, at the time // of writing this, generate them in that order, we shouldn't rely on it. splits.sort((a, b) -> { FileSplit fa = (FileSplit) a, fb = (FileSplit) b; return fa.getPath().compareTo(fb.getPath()); }); final List<InputSplit> newSplits = new ArrayList<>(splits.size()); for (int i = 0; i < splits.size();) { try { i = addIndexedSplits (splits, i, newSplits, cfg); } catch (IOException e) { i = addProbabilisticSplits(splits, i, newSplits, cfg); } } return filterByInterval(newSplits, cfg); }
// NOTE(review): fragment — statements from a composite getSplits() that merges
// BAM and CRAM splits after removing index files; the enclosing method and the
// declarations of newSplits/split/bamOrigSplits/cramOrigSplits are outside
// this view.
BAMInputFormat.removeIndexFiles(super.getSplits(job));
newSplits.add(split);
newSplits.addAll(bamIF.getSplits(bamOrigSplits, job.getConfiguration()));
newSplits.addAll(cramIF.getSplits(cramOrigSplits, job.getConfiguration()));
return newSplits;
/**
 * Regression test for the scenario its name describes: a BAM written with a
 * large header should still produce exactly one input split.
 */
@Test
public void testNoReadsInFirstSplitBug() throws Exception {
  input = BAMTestUtil.writeBamFileWithLargeHeader().getAbsolutePath();
  completeSetup();

  final BAMInputFormat format = new BAMInputFormat();
  final List<InputSplit> computedSplits = format.getSplits(jobContext);
  assertEquals(1, computedSplits.size());
}
/**
 * With the BAI split calculator enabled and a 40 KB max split size, the
 * 1000-read coordinate-sorted test BAM must yield three splits containing
 * 1080, 524 and 398 records respectively.
 */
@Test
public void testMultipleSplitsBaiEnabled() throws Exception {
  input = BAMTestUtil.writeBamFile(1000, SAMFileHeader.SortOrder.coordinate)
      .getAbsolutePath();
  completeSetup();
  BAMInputFormat.setEnableBAISplitCalculator(jobContext.getConfiguration(), true);
  jobContext.getConfiguration().setInt(FileInputFormat.SPLIT_MAXSIZE, 40000);

  final BAMInputFormat format = new BAMInputFormat();
  final List<InputSplit> computedSplits = format.getSplits(jobContext);
  assertEquals(3, computedSplits.size());

  // Per-split record counts pin down exactly where the cuts were placed.
  final int[] expectedCounts = {1080, 524, 398};
  for (int s = 0; s < expectedCounts.length; s++) {
    assertEquals(expectedCounts[s],
        getSAMRecordsFromSplit(format, computedSplits.get(s)).size());
  }
}
// NOTE(review): fragment — reader-setup code; the enclosing method and the
// body of the trailing else-if are outside this view. useIntelInflater is not
// used within the visible span — presumably consumed later; confirm.
boolean useIntelInflater = BAMInputFormat.useIntelInflater(conf);
boolean boundedTraversal = BAMInputFormat.isBoundedTraversal(conf);
// Bounded traversal with precomputed file pointers: iterate only over the
// regions covered by the configured intervals.
if (boundedTraversal && split.getIntervalFilePointers() != null) {
  List<Interval> intervals = BAMInputFormat.getIntervals(conf);
  QueryInterval[] queryIntervals = BAMInputFormat.prepareQueryIntervals(intervals,
      header.getSequenceDictionary());
  iterator = bamFileReader.createIndexIterator(queryIntervals, false,
      split.getIntervalFilePointers());
} else if (boundedTraversal && split.getIntervalFilePointers() == null) {
// NOTE(review): this span looks garbled by extraction — "return splits;" is
// immediately followed by a dangling ".setUseAsyncIo(false);" chained call
// whose receiver is not visible, and the method body is clearly incomplete.
// Verify against the complete file before changing anything here.
private List<InputSplit> filterByInterval(List<InputSplit> splits, Configuration conf)
    throws IOException {
  // Nothing to filter when no interval-bounded traversal was requested.
  if (!isBoundedTraversal(conf)) {
    return splits;
.setUseAsyncIo(false);
List<Interval> intervals = getIntervals(conf);
boolean traverseUnplacedUnmapped = traverseUnplacedUnmapped(conf);
QueryInterval[] queryIntervals = prepareQueryIntervals(intervals, dict);
fileToSpan.put(bamFile, BAMFileReader.getFileSpan(queryIntervals, idx));
// NOTE(review): fragment — tail of a larger call (note the unbalanced
// parentheses) that opens the splitting index for the current path, followed
// by a fallback to probabilistic splits; enclosing method not visible.
path.getFileSystem(cfg).open(getIdxPath(path)));
return addProbabilisticSplits(splits, i, newSplits, cfg);
// NOTE(review): fragment — arms of a format-dispatch switch; the switch header
// and enclosing method are outside this view.
case BAM:
  return bamIF.createRecordReader(split, ctx);
case CRAM:
  return cramIF.createRecordReader(split, ctx);
default:
  // Presumably unreachable for known formats; the assert documents that.
  assert false;
  return null;
// NOTE(review): fragment — chooses an interval-bounded index iterator when
// intervals are configured; the enclosing method and the else-branch body are
// outside this view.
List<Interval> intervals = BAMInputFormat.getIntervals(conf);
if (intervals != null) {
  QueryInterval[] queryIntervals = BAMInputFormat.prepareQueryIntervals(intervals,
      header.getSequenceDictionary());
  iterator = bamFileReader.createIndexIterator(queryIntervals, false,
      split.getIntervalFilePointers());
} else {
// NOTE(review): fragment — merges BAM and CRAM splits into a combined result;
// the enclosing method and the declarations of newSplits/split/bamOrigSplits/
// cramOrigSplits are outside this view.
newSplits.add(split);
newSplits.addAll(bamIF.getSplits(bamOrigSplits, job.getConfiguration()));
newSplits.addAll(cramIF.getSplits(cramOrigSplits, job.getConfiguration()));
return newSplits;
/**
 * Defers to {@link BAMInputFormat}, {@link CRAMInputFormat}, or
 * {@link SAMInputFormat} as appropriate for the given path; falls back to the
 * superclass when the format cannot be determined or the path is missing.
 */
@Override
public boolean isSplitable(JobContext job, Path path) {
  if (this.conf == null) {
    this.conf = job.getConfiguration();
  }
  try {
    final SAMFormat detected = getFormat(path);
    if (detected == null) {
      // Unknown format: let the generic FileInputFormat policy decide.
      return super.isSplitable(job, path);
    }
    switch (detected) {
      case SAM:
        return samIF.isSplitable(job, path);
      case BAM:
        return bamIF.isSplitable(job, path);
      case CRAM:
        return cramIF.isSplitable(job, path);
      default:
        assert false;
        return false;
    }
  } catch (PathNotFoundException e) {
    // Path does not exist: defer to the superclass rather than failing here.
    return super.isSplitable(job, path);
  }
}
/**
 * Bounded traversal configured with no intervals and the second setup flag
 * enabled (presumably unplaced/unmapped traversal — see
 * completeSetupWithBoundedTraversal): the 1000-read BAM should yield a single
 * split containing exactly 2 records.
 */
@Test
public void testUnmapped() throws Exception {
  input = BAMTestUtil.writeBamFile(1000, SAMFileHeader.SortOrder.coordinate)
      .getAbsolutePath();
  completeSetupWithBoundedTraversal(null, true);
  jobContext.getConfiguration().setInt(FileInputFormat.SPLIT_MAXSIZE, 40000);

  final BAMInputFormat format = new BAMInputFormat();
  final List<InputSplit> computedSplits = format.getSplits(jobContext);
  assertEquals(1, computedSplits.size());
  assertEquals(2, getSAMRecordsFromSplit(format, computedSplits.get(0)).size());
}
/**
 * Same scenario as the non-suffix BAI test, but with the .bai index stored
 * under the suffix naming scheme ("file.bam.bai" instead of "file.bai"):
 * renames the generated index and expects the same three splits with the
 * same record counts.
 */
@Test
public void testMultipleSplitsBaiEnabledSuffixPath() throws Exception {
  input = BAMTestUtil.writeBamFile(1000, SAMFileHeader.SortOrder.coordinate)
      .getAbsolutePath();
  // Move the index from "<name>.bai" to "<name>.bam.bai". File.renameTo()
  // reports failure only via its return value, so it must be checked: a
  // silently failed rename would leave the original index in place and this
  // test would no longer exercise the suffix-path lookup at all.
  File index = new File(input.replaceFirst("\\.bam$", BAMIndex.BAMIndexSuffix));
  File suffixedIndex = new File(input + BAMIndex.BAMIndexSuffix);
  if (!index.renameTo(suffixedIndex)) {
    throw new IllegalStateException(
        "Failed to rename " + index + " to " + suffixedIndex);
  }
  completeSetup();
  BAMInputFormat.setEnableBAISplitCalculator(jobContext.getConfiguration(), true);
  jobContext.getConfiguration().setInt(FileInputFormat.SPLIT_MAXSIZE, 40000);
  BAMInputFormat inputFormat = new BAMInputFormat();
  List<InputSplit> splits = inputFormat.getSplits(jobContext);
  assertEquals(3, splits.size());
  List<SAMRecord> split0Records = getSAMRecordsFromSplit(inputFormat, splits.get(0));
  List<SAMRecord> split1Records = getSAMRecordsFromSplit(inputFormat, splits.get(1));
  List<SAMRecord> split2Records = getSAMRecordsFromSplit(inputFormat, splits.get(2));
  assertEquals(1080, split0Records.size());
  assertEquals(524, split1Records.size());
  assertEquals(398, split2Records.size());
}
// NOTE(review): fragment (appears to duplicate an earlier span in this file) —
// reader-setup code; the enclosing method and the body of the trailing else-if
// are outside this view. useIntelInflater is not used within the visible span.
boolean useIntelInflater = BAMInputFormat.useIntelInflater(conf);
boolean boundedTraversal = BAMInputFormat.isBoundedTraversal(conf);
// Bounded traversal with precomputed file pointers: iterate only over the
// regions covered by the configured intervals.
if (boundedTraversal && split.getIntervalFilePointers() != null) {
  List<Interval> intervals = BAMInputFormat.getIntervals(conf);
  QueryInterval[] queryIntervals = BAMInputFormat.prepareQueryIntervals(intervals,
      header.getSequenceDictionary());
  iterator = bamFileReader.createIndexIterator(queryIntervals, false,
      split.getIntervalFilePointers());
} else if (boundedTraversal && split.getIntervalFilePointers() == null) {
// NOTE(review): this span looks garbled by extraction (and appears to
// duplicate an earlier span) — "return splits;" is immediately followed by a
// dangling ".setUseAsyncIo(false);" whose receiver is not visible, and the
// method body is clearly incomplete. Verify against the complete file.
private List<InputSplit> filterByInterval(List<InputSplit> splits, Configuration conf)
    throws IOException {
  // Nothing to filter when no interval-bounded traversal was requested.
  if (!isBoundedTraversal(conf)) {
    return splits;
.setUseAsyncIo(false);
List<Interval> intervals = getIntervals(conf);
boolean traverseUnplacedUnmapped = traverseUnplacedUnmapped(conf);
QueryInterval[] queryIntervals = prepareQueryIntervals(intervals, dict);
fileToSpan.put(bamFile, BAMFileReader.getFileSpan(queryIntervals, idx));
// NOTE(review): fragment — tail of a larger call (note the unbalanced
// parentheses) that opens the splitting index for the current file, followed
// by a fallback to probabilistic splits; enclosing method not visible.
file.getFileSystem(cfg).open(getIdxPath(file)));
return addProbabilisticSplits(splits, i, newSplits, cfg);
// NOTE(review): fragment (appears to duplicate an earlier span) — arms of a
// format-dispatch switch; the switch header and enclosing method are outside
// this view.
case BAM:
  return bamIF.createRecordReader(split, ctx);
case CRAM:
  return cramIF.createRecordReader(split, ctx);
default:
  // Presumably unreachable for known formats; the assert documents that.
  assert false;
  return null;