private void fixBCFSplits( List<FileSplit> splits, List<InputSplit> newSplits) throws IOException { // addGuessedSplits() requires the given splits to be sorted by file // path, so do so. Although FileInputFormat.getSplits() does, at the time // of writing this, generate them in that order, we shouldn't rely on it. splits.sort(Comparator.comparing(FileSplit::getPath)); for (int i = 0; i < splits.size();) i = addGuessedSplits(splits, i, newSplits); }
/** Defers to {@link BCFSplitGuesser} as appropriate for each individual * path. VCF paths do not require special handling, so their splits are left * unchanged. */ @Override public List<InputSplit> getSplits(JobContext job) throws IOException { if (this.conf == null) this.conf = job.getConfiguration(); final List<InputSplit> origSplits = super.getSplits(job); // We have to partition the splits by input format and hand the BCF ones // over to getBCFSplits(). final List<FileSplit> bcfOrigSplits = new ArrayList<FileSplit>(origSplits.size()); final List<InputSplit> newSplits = new ArrayList<InputSplit>(origSplits.size()); for (final InputSplit iSplit : origSplits) { final FileSplit split = (FileSplit)iSplit; if (VCFFormat.BCF.equals(getFormat(split.getPath()))) bcfOrigSplits.add(split); else newSplits.add(split); } fixBCFSplits(bcfOrigSplits, newSplits); return filterByInterval(newSplits, conf); }
private List<InputSplit> filterByInterval(List<InputSplit> splits, Configuration conf) throws IOException { List<Interval> intervals = getIntervals(conf); if (intervals == null) { return splits; long blockStart = block.getStartPosition(); long blockEnd = block.getEndPosition(); if (overlaps(splitStart, splitEnd, blockStart, blockEnd)) { filteredSplits.add(split); break; long blockStart = block.getStartPosition(); long blockEnd = block.getEndPosition(); if (overlaps(splitStart, splitEnd, blockStart, blockEnd)) { long overlapStart = Math.max(splitStart, blockStart); long overlapEnd = Math.min(splitEnd, blockEnd);
VCFInputFormat.setIntervals(conf, ImmutableList.of(interval)); JobContext ctx = new JobContextImpl(conf, taskAttemptContext.getJobID()); VCFInputFormat inputFormat = new VCFInputFormat(conf); List<InputSplit> splits = inputFormat.getSplits(ctx); switch (expectedSplits) { case EXACTLY_ONE: RecordReader<LongWritable, VariantContextWritable> reader = inputFormat.createRecordReader(split, taskAttemptContext); reader.initialize(split, taskAttemptContext); readers.add(reader);
/**
 * Reads the classpath resource {@code invalid_info_field.vcf} through
 * {@link VCFInputFormat} and verifies that exactly one split is produced
 * and that four non-null variant records can be read from it.
 *
 * @param validationStringency stringency to configure on the record reader,
 *                             or {@code null} to leave the configuration's
 *                             setting untouched
 * @throws Exception if split computation, reading, or closing the reader fails
 */
public void checkReading(ValidationStringency validationStringency) throws Exception {
    String filename = "invalid_info_field.vcf";
    Configuration conf = new Configuration();
    String inputFile = ClassLoader.getSystemClassLoader().getResource(filename).getFile();
    conf.set("mapred.input.dir", "file://" + inputFile);
    if (validationStringency != null) {
        VCFRecordReader.setValidationStringency(conf, validationStringency);
    }

    TaskAttemptContext taskAttemptContext =
        new TaskAttemptContextImpl(conf, mock(TaskAttemptID.class));
    JobContext ctx = new JobContextImpl(conf, taskAttemptContext.getJobID());

    VCFInputFormat inputFormat = new VCFInputFormat(conf);
    List<InputSplit> splits = inputFormat.getSplits(ctx);
    assertEquals(1, splits.size());

    int counter = 0;
    // The reader holds an open input stream; close it even when an
    // assertion or read error aborts the loop.
    // NOTE(review): the reader is used without an explicit initialize()
    // call, as in the original; presumably createRecordReader() returns an
    // initialized reader -- confirm.
    try (RecordReader<LongWritable, VariantContextWritable> reader =
             inputFormat.createRecordReader(splits.get(0), taskAttemptContext)) {
        while (reader.nextKeyValue()) {
            VariantContextWritable writable = reader.getCurrentValue();
            assertNotNull(writable);
            VariantContext vc = writable.get();
            assertNotNull(vc);
            String value = vc.toString();
            assertNotNull(value);
            counter++;
        }
    }
    assertEquals(4, counter);
}
this.conf = ctx.getConfiguration(); final VCFFormat fmt = getFormat(path); if (fmt == null) throw new IllegalArgumentException(
intervals = VCFInputFormat.getIntervals(ctx.getConfiguration()); if (intervals != null) { overlapDetector = OverlapDetector.create(intervals);
private List<InputSplit> filterByInterval(List<InputSplit> splits, Configuration conf) throws IOException { List<Interval> intervals = getIntervals(conf); if (intervals == null) { return splits; long blockStart = block.getStartPosition(); long blockEnd = block.getEndPosition(); if (overlaps(splitStart, splitEnd, blockStart, blockEnd)) { filteredSplits.add(split); break; long blockStart = block.getStartPosition(); long blockEnd = block.getEndPosition(); if (overlaps(splitStart, splitEnd, blockStart, blockEnd)) { long overlapStart = Math.max(splitStart, blockStart); long overlapEnd = Math.min(splitEnd, blockEnd);
this.conf = ctx.getConfiguration(); final VCFFormat fmt = getFormat(path); if (fmt == null) throw new IllegalArgumentException(
intervals = VCFInputFormat.getIntervals(ctx.getConfiguration()); if (intervals != null) { overlapDetector = OverlapDetector.create(intervals);
/** Defers to {@link BCFSplitGuesser} as appropriate for each individual * path. VCF paths do not require special handling, so their splits are left * unchanged. */ @Override public List<InputSplit> getSplits(JobContext job) throws IOException { if (this.conf == null) this.conf = job.getConfiguration(); final List<InputSplit> origSplits = super.getSplits(job); // We have to partition the splits by input format and hand the BCF ones // over to getBCFSplits(). final List<FileSplit> bcfOrigSplits = new ArrayList<>(origSplits.size()); final List<InputSplit> newSplits = new ArrayList<>(origSplits.size()); for (final InputSplit iSplit : origSplits) { final FileSplit split = (FileSplit)iSplit; if (VCFFormat.BCF.equals(getFormat(split.getPath()))) bcfOrigSplits.add(split); else newSplits.add(split); } fixBCFSplits(bcfOrigSplits, newSplits); return filterByInterval(newSplits, conf); }
private List<InputSplit> filterByInterval(List<InputSplit> splits, Configuration conf) throws IOException { List<Interval> intervals = getIntervals(conf); if (intervals == null) { return splits; long blockStart = block.getStartPosition(); long blockEnd = block.getEndPosition(); if (overlaps(splitStart, splitEnd, blockStart, blockEnd)) { filteredSplits.add(split); break; long blockStart = block.getStartPosition(); long blockEnd = block.getEndPosition(); if (overlaps(splitStart, splitEnd, blockStart, blockEnd)) { long overlapStart = Math.max(splitStart, blockStart); long overlapEnd = Math.min(splitEnd, blockEnd);
private void fixBCFSplits( List<FileSplit> splits, List<InputSplit> newSplits) throws IOException { // addGuessedSplits() requires the given splits to be sorted by file // path, so do so. Although FileInputFormat.getSplits() does, at the time // of writing this, generate them in that order, we shouldn't rely on it. Collections.sort(splits, new Comparator<FileSplit>() { public int compare(FileSplit a, FileSplit b) { return a.getPath().compareTo(b.getPath()); } }); for (int i = 0; i < splits.size();) i = addGuessedSplits(splits, i, newSplits); }
this.conf = ctx.getConfiguration(); final VCFFormat fmt = getFormat(path); if (fmt == null) throw new IllegalArgumentException(
intervals = VCFInputFormat.getIntervals(ctx.getConfiguration()); if (intervals != null) { overlapDetector = new OverlapDetector<>(0, 0);
/** Defers to {@link BCFSplitGuesser} as appropriate for each individual * path. VCF paths do not require special handling, so their splits are left * unchanged. */ @Override public List<InputSplit> getSplits(JobContext job) throws IOException { if (this.conf == null) this.conf = job.getConfiguration(); final List<InputSplit> origSplits = super.getSplits(job); // We have to partition the splits by input format and hand the BCF ones // over to getBCFSplits(). final List<FileSplit> bcfOrigSplits = new ArrayList<FileSplit>(origSplits.size()); final List<InputSplit> newSplits = new ArrayList<InputSplit>(origSplits.size()); for (final InputSplit iSplit : origSplits) { final FileSplit split = (FileSplit)iSplit; if (VCFFormat.BCF.equals(getFormat(split.getPath()))) bcfOrigSplits.add(split); else newSplits.add(split); } fixBCFSplits(bcfOrigSplits, newSplits); return filterByInterval(newSplits, conf); }
private void fixBCFSplits( List<FileSplit> splits, List<InputSplit> newSplits) throws IOException { // addGuessedSplits() requires the given splits to be sorted by file // path, so do so. Although FileInputFormat.getSplits() does, at the time // of writing this, generate them in that order, we shouldn't rely on it. Collections.sort(splits, new Comparator<FileSplit>() { public int compare(FileSplit a, FileSplit b) { return a.getPath().compareTo(b.getPath()); } }); for (int i = 0; i < splits.size();) i = addGuessedSplits(splits, i, newSplits); }