/**
 * Computes the splits for {@code job} by taking the superclass's splits and
 * passing them, together with the job's configuration, to
 * {@link #getSplits(List, Configuration)}.
 *
 * @param job the job whose input is being split
 * @return the splits returned by the two-argument overload
 * @throws IOException if either the superclass or the overload fails
 */
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
  final List<InputSplit> defaultSplits = super.getSplits(job);
  return getSplits(defaultSplits, job.getConfiguration());
}
public List<InputSplit> getSplits(List<InputSplit> splits, Configuration conf) throws IOException { // update splits to align with CRAM container boundaries List<InputSplit> newSplits = new ArrayList<InputSplit>(); Map<Path, List<Long>> fileToOffsets = new HashMap<Path, List<Long>>(); for (InputSplit split : splits) { FileSplit fileSplit = (FileSplit) split; Path path = fileSplit.getPath(); List<Long> containerOffsets = fileToOffsets.get(path); if (containerOffsets == null) { containerOffsets = getContainerOffsets(conf, path); fileToOffsets.put(path, containerOffsets); } long newStart = nextContainerOffset(containerOffsets, fileSplit.getStart()); long newEnd = nextContainerOffset(containerOffsets, fileSplit.getStart() + fileSplit.getLength()); long newLength = newEnd - newStart; if (newLength == 0) { // split is wholly within a container continue; } FileSplit newSplit = new FileSplit(fileSplit.getPath(), newStart, newLength, fileSplit.getLocations()); newSplits.add(newSplit); } return newSplits; }
/**
 * Checks that reading the input through {@link CRAMInputFormat} yields the
 * same number of records as iterating the file directly with a
 * {@link SamReader}.
 *
 * <p>Both readers are closed in {@code finally} blocks so that file handles
 * are released even when an assertion or read fails.
 */
@Test
public void testReader() throws Exception {
  // Reference count: iterate the file directly.
  int expectedCount = 0;
  SamReader samReader = SamReaderFactory.makeDefault()
      .referenceSequence(new File(URI.create(reference)))
      .open(new File(input));
  try {
    for (SAMRecord ignored : samReader) {
      expectedCount++;
    }
  } finally {
    samReader.close();
  }

  CRAMInputFormat inputFormat = new CRAMInputFormat();
  List<InputSplit> splits = inputFormat.getSplits(jobContext);
  assertEquals(1, splits.size());

  RecordReader<LongWritable, SAMRecordWritable> reader =
      inputFormat.createRecordReader(splits.get(0), taskAttemptContext);
  // Initialize outside the try block: close() is only attempted on a reader
  // that was successfully initialized.
  reader.initialize(splits.get(0), taskAttemptContext);

  int actualCount = 0;
  try {
    while (reader.nextKeyValue()) {
      actualCount++;
    }
  } finally {
    reader.close();
  }

  assertEquals(expectedCount, actualCount);
}
private void checkSplits(int splitMaxSize) throws IOException { // test.cram has containers at positions 1069 and 3403. The file length is 3433. // expected splits = 1069+2334, 3403+30 jobContext.getConfiguration().setInt(FileInputFormat.SPLIT_MAXSIZE, splitMaxSize); CRAMInputFormat inputFormat = new CRAMInputFormat(); List<InputSplit> splits = inputFormat.getSplits(jobContext); assertEquals(2, splits.size()); FileSplit split0 = (FileSplit) splits.get(0); FileSplit split1 = (FileSplit) splits.get(1); assertEquals(1069, split0.getStart()); assertEquals(2334, split0.getLength()); assertEquals(3403, split1.getStart()); assertEquals(30, split1.getLength()); }
case SAM: return samIF.createRecordReader(split, ctx); case BAM: return bamIF.createRecordReader(split, ctx); case CRAM: return cramIF.createRecordReader(split, ctx); default: assert false; return null;
/**
 * Decides whether {@code path} may be split by deferring to
 * {@link BAMInputFormat}, {@link CRAMInputFormat}, or {@link SAMInputFormat},
 * whichever matches the format detected for the path.
 *
 * <p>If no format is detected, or the lookup throws
 * {@code PathNotFoundException}, the superclass's answer is returned.
 */
@Override
public boolean isSplitable(JobContext job, Path path) {
  // Adopt the job's configuration if none has been set on this instance yet.
  if (this.conf == null) {
    this.conf = job.getConfiguration();
  }

  try {
    final SAMFormat detected = getFormat(path);
    if (detected == null) {
      return super.isSplitable(job, path);
    }

    final boolean splittable;
    switch (detected) {
      case SAM:
        splittable = samIF.isSplitable(job, path);
        break;
      case BAM:
        splittable = bamIF.isSplitable(job, path);
        break;
      case CRAM:
        splittable = cramIF.isSplitable(job, path);
        break;
      default:
        // Unknown format constant: flagged when assertions are enabled.
        assert false;
        splittable = false;
        break;
    }
    return splittable;
  } catch (PathNotFoundException e) {
    return super.isSplitable(job, path);
  }
}
case SAM: return samIF.createRecordReader(split, ctx); case BAM: return bamIF.createRecordReader(split, ctx); case CRAM: return cramIF.createRecordReader(split, ctx); default: assert false; return null;
/**
 * Decides whether {@code path} may be split by deferring to
 * {@link BAMInputFormat}, {@link CRAMInputFormat}, or {@link SAMInputFormat},
 * whichever matches the format detected for the path.
 *
 * <p>If no format is detected, or the lookup throws
 * {@code PathNotFoundException}, the superclass's answer is returned.
 */
@Override
public boolean isSplitable(JobContext job, Path path) {
  // Adopt the job's configuration if none has been set on this instance yet.
  if (this.conf == null) {
    this.conf = job.getConfiguration();
  }

  try {
    final SAMFormat detected = getFormat(path);
    if (detected == null) {
      return super.isSplitable(job, path);
    }

    final boolean splittable;
    switch (detected) {
      case SAM:
        splittable = samIF.isSplitable(job, path);
        break;
      case BAM:
        splittable = bamIF.isSplitable(job, path);
        break;
      case CRAM:
        splittable = cramIF.isSplitable(job, path);
        break;
      default:
        // Unknown format constant: flagged when assertions are enabled.
        assert false;
        splittable = false;
        break;
    }
    return splittable;
  } catch (PathNotFoundException e) {
    return super.isSplitable(job, path);
  }
}
/**
 * Computes the splits for {@code job} by taking the superclass's splits and
 * passing them, together with the job's configuration, to
 * {@link #getSplits(List, Configuration)}.
 *
 * @param job the job whose input is being split
 * @return the splits returned by the two-argument overload
 * @throws IOException if either the superclass or the overload fails
 */
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
  final List<InputSplit> defaultSplits = super.getSplits(job);
  return getSplits(defaultSplits, job.getConfiguration());
}
public List<InputSplit> getSplits(List<InputSplit> splits, Configuration conf) throws IOException { // update splits to align with CRAM container boundaries List<InputSplit> newSplits = new ArrayList<InputSplit>(); Map<Path, List<Long>> fileToOffsets = new HashMap<Path, List<Long>>(); for (InputSplit split : splits) { FileSplit fileSplit = (FileSplit) split; Path path = fileSplit.getPath(); List<Long> containerOffsets = fileToOffsets.get(path); if (containerOffsets == null) { containerOffsets = getContainerOffsets(conf, path); fileToOffsets.put(path, containerOffsets); } long newStart = nextContainerOffset(containerOffsets, fileSplit.getStart()); long newEnd = nextContainerOffset(containerOffsets, fileSplit.getStart() + fileSplit.getLength()); long newLength = newEnd - newStart; if (newLength == 0) { // split is wholly within a container continue; } FileSplit newSplit = new FileSplit(fileSplit.getPath(), newStart, newLength, fileSplit.getLocations()); newSplits.add(newSplit); } return newSplits; }
case SAM: return samIF.createRecordReader(split, ctx); case BAM: return bamIF.createRecordReader(split, ctx); case CRAM: return cramIF.createRecordReader(split, ctx); default: assert false; return null;
/**
 * Decides whether {@code path} may be split by deferring to
 * {@link BAMInputFormat}, {@link CRAMInputFormat}, or {@link SAMInputFormat},
 * whichever matches the format detected for the path.
 *
 * <p>If no format is detected, or the lookup throws
 * {@code PathNotFoundException}, the superclass's answer is returned.
 */
@Override
public boolean isSplitable(JobContext job, Path path) {
  // Adopt the job's configuration if none has been set on this instance yet.
  if (this.conf == null) {
    this.conf = job.getConfiguration();
  }

  try {
    final SAMFormat detected = getFormat(path);
    if (detected == null) {
      return super.isSplitable(job, path);
    }

    final boolean splittable;
    switch (detected) {
      case SAM:
        splittable = samIF.isSplitable(job, path);
        break;
      case BAM:
        splittable = bamIF.isSplitable(job, path);
        break;
      case CRAM:
        splittable = cramIF.isSplitable(job, path);
        break;
      default:
        // Unknown format constant: flagged when assertions are enabled.
        assert false;
        splittable = false;
        break;
    }
    return splittable;
  } catch (PathNotFoundException e) {
    return super.isSplitable(job, path);
  }
}
/**
 * Computes the splits for {@code job} by taking the superclass's splits and
 * passing them, together with the job's configuration, to
 * {@link #getSplits(List, Configuration)}.
 *
 * @param job the job whose input is being split
 * @return the splits returned by the two-argument overload
 * @throws IOException if either the superclass or the overload fails
 */
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
  final List<InputSplit> defaultSplits = super.getSplits(job);
  return getSplits(defaultSplits, job.getConfiguration());
}
public List<InputSplit> getSplits(List<InputSplit> splits, Configuration conf) throws IOException { // update splits to align with CRAM container boundaries List<InputSplit> newSplits = new ArrayList<InputSplit>(); Map<Path, List<Long>> fileToOffsets = new HashMap<Path, List<Long>>(); for (InputSplit split : splits) { FileSplit fileSplit = (FileSplit) split; Path path = fileSplit.getPath(); List<Long> containerOffsets = fileToOffsets.get(path); if (containerOffsets == null) { containerOffsets = getContainerOffsets(conf, path); fileToOffsets.put(path, containerOffsets); } long newStart = nextContainerOffset(containerOffsets, fileSplit.getStart()); long newEnd = nextContainerOffset(containerOffsets, fileSplit.getStart() + fileSplit.getLength()); long newLength = newEnd - newStart; if (newLength == 0) { // split is wholly within a container continue; } FileSplit newSplit = new FileSplit(fileSplit.getPath(), newStart, newLength, fileSplit.getLocations()); newSplits.add(newSplit); } return newSplits; }
newSplits.addAll(cramIF.getSplits(cramOrigSplits, job.getConfiguration())); return newSplits;
newSplits.addAll(cramIF.getSplits(cramOrigSplits, job.getConfiguration())); return newSplits;
newSplits.addAll(cramIF.getSplits(cramOrigSplits, job.getConfiguration())); return newSplits;