newSplits.add(split); newSplits.addAll(bamIF.getSplits(bamOrigSplits, job.getConfiguration())); newSplits.addAll(cramIF.getSplits(cramOrigSplits, job.getConfiguration())); return newSplits;
newSplits.add(split); newSplits.addAll(bamIF.getSplits(bamOrigSplits, job.getConfiguration())); newSplits.addAll(cramIF.getSplits(cramOrigSplits, job.getConfiguration())); return newSplits;
newSplits.add(split); newSplits.addAll(bamIF.getSplits(bamOrigSplits, job.getConfiguration())); newSplits.addAll(cramIF.getSplits(cramOrigSplits, job.getConfiguration())); return newSplits;
public List<InputSplit> getSplits( List<InputSplit> splits, Configuration cfg) throws IOException { // Align the splits so that they don't cross blocks. // addIndexedSplits() requires the given splits to be sorted by file // path, so do so. Although FileInputFormat.getSplits() does, at the time // of writing this, generate them in that order, we shouldn't rely on it. splits.sort((a, b) -> { FileSplit fa = (FileSplit) a, fb = (FileSplit) b; return fa.getPath().compareTo(fb.getPath()); }); final List<InputSplit> newSplits = new ArrayList<>(splits.size()); for (int i = 0; i < splits.size();) { try { i = addIndexedSplits (splits, i, newSplits, cfg); } catch (IOException e) { i = addProbabilisticSplits(splits, i, newSplits, cfg); } } return filterByInterval(newSplits, cfg); }
public List<InputSplit> getSplits( List<InputSplit> splits, Configuration cfg) throws IOException { final List<InputSplit> origSplits = removeIndexFiles(splits); // Align the splits so that they don't cross blocks. // addIndexedSplits() requires the given splits to be sorted by file // path, so do so. Although FileInputFormat.getSplits() does, at the time // of writing this, generate them in that order, we shouldn't rely on it. Collections.sort(origSplits, new Comparator<InputSplit>() { public int compare(InputSplit a, InputSplit b) { FileSplit fa = (FileSplit)a, fb = (FileSplit)b; return fa.getPath().compareTo(fb.getPath()); } }); final List<InputSplit> newSplits = new ArrayList<InputSplit>(origSplits.size()); for (int i = 0; i < origSplits.size();) { try { i = addIndexedSplits (origSplits, i, newSplits, cfg); } catch (IOException | ProviderNotFoundException e) { if (cfg.getBoolean(ENABLE_BAI_SPLIT_CALCULATOR, false)) { try { i = addBAISplits (origSplits, i, newSplits, cfg); } catch (IOException | ProviderNotFoundException e2) {
public List<InputSplit> getSplits( List<InputSplit> splits, Configuration cfg) throws IOException { final List<InputSplit> origSplits = removeIndexFiles(splits); // Align the splits so that they don't cross blocks. // addIndexedSplits() requires the given splits to be sorted by file // path, so do so. Although FileInputFormat.getSplits() does, at the time // of writing this, generate them in that order, we shouldn't rely on it. Collections.sort(origSplits, new Comparator<InputSplit>() { public int compare(InputSplit a, InputSplit b) { FileSplit fa = (FileSplit)a, fb = (FileSplit)b; return fa.getPath().compareTo(fb.getPath()); } }); final List<InputSplit> newSplits = new ArrayList<InputSplit>(origSplits.size()); for (int i = 0; i < origSplits.size();) { try { i = addIndexedSplits (origSplits, i, newSplits, cfg); } catch (IOException | ProviderNotFoundException e) { if (cfg.getBoolean(ENABLE_BAI_SPLIT_CALCULATOR, false)) { try { i = addBAISplits (origSplits, i, newSplits, cfg); } catch (IOException | ProviderNotFoundException e2) {
/**
 * Runs a bounded traversal with no intervals and the boolean flag set to
 * {@code true} over a coordinate-sorted BAM, and expects exactly one split
 * containing two records.
 *
 * <p>NOTE(review): judging by the test name, the flag presumably requests the
 * unmapped reads; confirm against {@code completeSetupWithBoundedTraversal}.
 */
@Test
public void testUnmapped() throws Exception {
  input = BAMTestUtil
      .writeBamFile(1000, SAMFileHeader.SortOrder.coordinate)
      .getAbsolutePath();
  completeSetupWithBoundedTraversal(null, true);
  jobContext.getConfiguration().setInt(FileInputFormat.SPLIT_MAXSIZE, 40000);

  final BAMInputFormat format = new BAMInputFormat();
  final List<InputSplit> computed = format.getSplits(jobContext);
  assertEquals(1, computed.size());

  final List<SAMRecord> records =
      getSAMRecordsFromSplit(format, computed.get(0));
  assertEquals(2, records.size());
}
@Test public void testIntervals() throws Exception { input = BAMTestUtil.writeBamFile(1000, SAMFileHeader.SortOrder.coordinate) .getAbsolutePath(); List<Interval> intervals = new ArrayList<Interval>(); intervals.add(new Interval("chr21", 5000, 9999)); // includes two unpaired fragments intervals.add(new Interval("chr21", 20000, 22999)); completeSetupWithIntervals(intervals); jobContext.getConfiguration().setInt(FileInputFormat.SPLIT_MAXSIZE, 40000); BAMInputFormat inputFormat = new BAMInputFormat(); List<InputSplit> splits = inputFormat.getSplits(jobContext); assertEquals(1, splits.size()); List<SAMRecord> split0Records = getSAMRecordsFromSplit(inputFormat, splits.get(0)); assertEquals(16, split0Records.size()); }
/**
 * Regression test: a BAM written by
 * {@code BAMTestUtil.writeBamFileWithLargeHeader()} must produce exactly one
 * split with the default setup.
 */
@Test
public void testNoReadsInFirstSplitBug() throws Exception {
  input = BAMTestUtil.writeBamFileWithLargeHeader().getAbsolutePath();
  completeSetup();

  assertEquals(1, new BAMInputFormat().getSplits(jobContext).size());
}
/**
 * Uses one interval spanning chr21 positions 1 to 1000135 and expects two
 * splits carrying 1577 and 423 records respectively.
 */
@Test
public void testIntervalCoveringWholeChromosome() throws Exception {
  input = BAMTestUtil
      .writeBamFile(1000, SAMFileHeader.SortOrder.coordinate)
      .getAbsolutePath();

  final List<Interval> wholeChromosome = new ArrayList<>();
  wholeChromosome.add(new Interval("chr21", 1, 1000135));
  completeSetupWithIntervals(wholeChromosome);
  jobContext.getConfiguration().setInt(FileInputFormat.SPLIT_MAXSIZE, 40000);

  final BAMInputFormat format = new BAMInputFormat();
  final List<InputSplit> computed = format.getSplits(jobContext);
  assertEquals(2, computed.size());

  // Read both splits before checking either count.
  final List<SAMRecord> firstRecords =
      getSAMRecordsFromSplit(format, computed.get(0));
  final List<SAMRecord> secondRecords =
      getSAMRecordsFromSplit(format, computed.get(1));
  assertEquals(1577, firstRecords.size());
  assertEquals(423, secondRecords.size());
}
/**
 * Checks that the BAI split calculator still produces the expected three
 * splits when the index file is named by appending the index suffix to the
 * full BAM file name, rather than by replacing the {@code .bam} extension.
 */
@Test
public void testMultipleSplitsBaiEnabledSuffixPath() throws Exception {
  input = BAMTestUtil.writeBamFile(1000, SAMFileHeader.SortOrder.coordinate)
      .getAbsolutePath();

  // Move the index from "<name><suffix>" to "<name>.bam<suffix>".
  File index = new File(input.replaceFirst("\\.bam$", BAMIndex.BAMIndexSuffix));
  File suffixedIndex = new File(input + BAMIndex.BAMIndexSuffix);
  // File.renameTo reports failure only through its return value. Without this
  // check, a failed rename would leave the fixture in the wrong state silently.
  assertTrue(index.renameTo(suffixedIndex));

  completeSetup();
  BAMInputFormat.setEnableBAISplitCalculator(jobContext.getConfiguration(), true);
  jobContext.getConfiguration().setInt(FileInputFormat.SPLIT_MAXSIZE, 40000);

  BAMInputFormat inputFormat = new BAMInputFormat();
  List<InputSplit> splits = inputFormat.getSplits(jobContext);
  assertEquals(3, splits.size());

  List<SAMRecord> split0Records = getSAMRecordsFromSplit(inputFormat, splits.get(0));
  List<SAMRecord> split1Records = getSAMRecordsFromSplit(inputFormat, splits.get(1));
  List<SAMRecord> split2Records = getSAMRecordsFromSplit(inputFormat, splits.get(2));
  assertEquals(1080, split0Records.size());
  assertEquals(524, split1Records.size());
  assertEquals(398, split2Records.size());
}
@Test public void testIntervalsAndUnmapped() throws Exception { input = BAMTestUtil.writeBamFile(1000, SAMFileHeader.SortOrder.coordinate) .getAbsolutePath(); List<Interval> intervals = new ArrayList<Interval>(); intervals.add(new Interval("chr21", 5000, 9999)); // includes two unpaired fragments intervals.add(new Interval("chr21", 20000, 22999)); completeSetupWithBoundedTraversal(intervals, true); jobContext.getConfiguration().setInt(FileInputFormat.SPLIT_MAXSIZE, 40000); BAMInputFormat inputFormat = new BAMInputFormat(); List<InputSplit> splits = inputFormat.getSplits(jobContext); assertEquals(2, splits.size()); List<SAMRecord> split0Records = getSAMRecordsFromSplit(inputFormat, splits.get(0)); List<SAMRecord> split1Records = getSAMRecordsFromSplit(inputFormat, splits.get(1)); assertEquals(16, split0Records.size()); assertEquals(2, split1Records.size()); }
/**
 * Enables the BAI split calculator on a coordinate-sorted BAM and expects
 * three splits holding 1080, 524 and 398 records.
 */
@Test
public void testMultipleSplitsBaiEnabled() throws Exception {
  input = BAMTestUtil
      .writeBamFile(1000, SAMFileHeader.SortOrder.coordinate)
      .getAbsolutePath();
  completeSetup();
  BAMInputFormat.setEnableBAISplitCalculator(jobContext.getConfiguration(), true);
  jobContext.getConfiguration().setInt(FileInputFormat.SPLIT_MAXSIZE, 40000);

  final BAMInputFormat format = new BAMInputFormat();
  final List<InputSplit> computed = format.getSplits(jobContext);
  assertEquals(3, computed.size());

  // Read every split before checking any count.
  final List<SAMRecord> firstRecords =
      getSAMRecordsFromSplit(format, computed.get(0));
  final List<SAMRecord> secondRecords =
      getSAMRecordsFromSplit(format, computed.get(1));
  final List<SAMRecord> thirdRecords =
      getSAMRecordsFromSplit(format, computed.get(2));
  assertEquals(1080, firstRecords.size());
  assertEquals(524, secondRecords.size());
  assertEquals(398, thirdRecords.size());
}
/**
 * Enables the BAI split calculator on a queryname-sorted BAM and expects two
 * splits of 1577 and 425 records. Also verifies that the split boundary falls
 * inside a read pair: the last record of the first split and the first record
 * of the second split share a read name and are first/second of pair.
 *
 * <p>NOTE(review): the test name suggests no index is available for this
 * input; that is not visible here and should be confirmed in
 * {@code BAMTestUtil}.
 */
@Test
public void testMultipleSplitsBaiEnabledNoIndex() throws Exception {
  input = BAMTestUtil
      .writeBamFile(1000, SAMFileHeader.SortOrder.queryname)
      .getAbsolutePath();
  completeSetup();
  BAMInputFormat.setEnableBAISplitCalculator(jobContext.getConfiguration(), true);
  jobContext.getConfiguration().setInt(FileInputFormat.SPLIT_MAXSIZE, 40000);

  final BAMInputFormat format = new BAMInputFormat();
  final List<InputSplit> computed = format.getSplits(jobContext);
  assertEquals(2, computed.size());

  final List<SAMRecord> firstRecords =
      getSAMRecordsFromSplit(format, computed.get(0));
  final List<SAMRecord> secondRecords =
      getSAMRecordsFromSplit(format, computed.get(1));
  assertEquals(1577, firstRecords.size());
  assertEquals(425, secondRecords.size());

  // The two mates of one pair straddle the split boundary.
  final SAMRecord tailOfFirst = firstRecords.get(firstRecords.size() - 1);
  final SAMRecord headOfSecond = secondRecords.get(0);
  assertEquals(tailOfFirst.getReadName(), headOfSecond.getReadName());
  assertTrue(tailOfFirst.getFirstOfPairFlag());
  assertTrue(headOfSecond.getSecondOfPairFlag());
}
/**
 * With the default setup and a queryname-sorted BAM, expects two splits of
 * 1577 and 425 records. Also verifies that the split boundary falls inside a
 * read pair: the last record of the first split and the first record of the
 * second split share a read name and are first/second of pair.
 */
@Test
public void testMultipleSplits() throws Exception {
  input = BAMTestUtil
      .writeBamFile(1000, SAMFileHeader.SortOrder.queryname)
      .getAbsolutePath();
  completeSetup();
  jobContext.getConfiguration().setInt(FileInputFormat.SPLIT_MAXSIZE, 40000);

  final BAMInputFormat format = new BAMInputFormat();
  final List<InputSplit> computed = format.getSplits(jobContext);
  assertEquals(2, computed.size());

  final List<SAMRecord> firstRecords =
      getSAMRecordsFromSplit(format, computed.get(0));
  final List<SAMRecord> secondRecords =
      getSAMRecordsFromSplit(format, computed.get(1));
  assertEquals(1577, firstRecords.size());
  assertEquals(425, secondRecords.size());

  // The two mates of one pair straddle the split boundary.
  final SAMRecord tailOfFirst = firstRecords.get(firstRecords.size() - 1);
  final SAMRecord headOfSecond = secondRecords.get(0);
  assertEquals(tailOfFirst.getReadName(), headOfSecond.getReadName());
  assertTrue(tailOfFirst.getFirstOfPairFlag());
  assertTrue(headOfSecond.getSecondOfPairFlag());
}