@Override public LazyGenotypesContext.LazyData parse(final Object data) { //System.out.printf("Loading genotypes... %s:%d%n", contig, start); return createGenotypeMap((String) data, alleles, contig, start); } }
/**
 * Decodes one line of text into a feature (VariantContext).
 *
 * Delegates to decodeLine with its second argument set to true.
 *
 * @param line the line
 * @return a VariantContext
 */
@Override
public VariantContext decode(String line) {
    final VariantContext decoded = decodeLine(line, true);
    return decoded;
}
/**
 * Checks whether a file looks decodable by probing it three times with
 * isVCFStream: as a plain stream, wrapped in a GZIPInputStream, and wrapped in
 * a BlockCompressedInputStream. The probes run in that order and the first
 * success wins.
 *
 * Each probe opens its own FileInputStream and always closes it afterwards.
 * Previously a stream opened inline leaked whenever a wrapper constructor threw
 * (GZIPInputStream reads the gzip header on construction and throws for any
 * non-gzip file), which happened for every plain-text file that was not a VCF.
 *
 * @param potentialInput    path of the file to probe
 * @param MAGIC_HEADER_LINE the header text passed to isVCFStream for each probe
 * @return true if any probe succeeds; false if none does, or if the file cannot
 *         be opened or read
 */
public final static boolean canDecodeFile(final String potentialInput, final String MAGIC_HEADER_LINE) {
    try {
        // probe 1: the file as-is
        FileInputStream raw = new FileInputStream(potentialInput);
        try {
            if ( isVCFStream(raw, MAGIC_HEADER_LINE) )
                return true;
        } finally {
            closeIgnoringErrors(raw);
        }

        // probe 2: gzip; the constructor itself throws IOException on non-gzip content,
        // which (as before) ends the check with a negative answer
        raw = new FileInputStream(potentialInput);
        try {
            if ( isVCFStream(new GZIPInputStream(raw), MAGIC_HEADER_LINE) )
                return true;
        } finally {
            closeIgnoringErrors(raw);
        }

        // probe 3: block-compressed
        raw = new FileInputStream(potentialInput);
        try {
            return isVCFStream(new BlockCompressedInputStream(raw), MAGIC_HEADER_LINE);
        } finally {
            closeIgnoringErrors(raw);
        }
    } catch ( IOException e ) {
        // includes FileNotFoundException: an unreadable file is simply not decodable
        return false;
    }
}

/**
 * Closes a probe stream, ignoring any failure to do so. Safe to call on a stream
 * that has already been closed (for example by the code that consumed it).
 *
 * @param stream the stream to close; may be null
 */
private static void closeIgnoringErrors(final FileInputStream stream) {
    if ( stream == null )
        return;
    try {
        stream.close();
    } catch ( IOException ignored ) {
        // nothing useful can be done about a failed close of a read-only probe
    }
}
builder.source(getName()); final String chr = getCachedString(parts[0]); builder.chr(chr); int pos = -1; pos = Integer.valueOf(parts[1]); } catch (NumberFormatException e) { generateException(parts[1] + " is not a valid start position in the VCF format"); generateException("The VCF specification requires a valid ID field"); else if ( parts[2].equals(VCFConstants.EMPTY_ID_FIELD) ) builder.noID(); final String ref = getCachedString(parts[3].toUpperCase()); final String alts = getCachedString(parts[4]); builder.log10PError(parseQual(parts[5])); final List<String> filters = parseFilters(getCachedString(parts[6])); if ( filters != null ) builder.filters(new HashSet<String>(filters)); final Map<String, Object> attrs = parseInfo(parts[7]); builder.attributes(attrs); builder.stop(Integer.valueOf(attrs.get(VCFConstants.END_KEY).toString())); } catch (Exception e) { generateException("the END value in the INFO field is not valid"); final List<Allele> alleles = parseAlleles(ref, alts, lineNo); builder.alleles(alleles);
generateException("there are " + (nParts-1) + " genotypes while the header requires that " + (genotypeParts.length-1) + " genotypes be present for all records at " + chr + ":" + pos, lineNo); generateException("There are too many keys for the sample " + sampleName + ", keys = " + parts[8] + ", values = " + parts[genotypeOffset]); final List<String> filters = parseFilters(getCachedString(genotypeValues.get(i))); if ( filters != null ) gb.filters(filters); } else if ( genotypeValues.get(i).equals(VCFConstants.MISSING_VALUE_v4) ) { gb.GQ((int)Math.round(Double.valueOf(genotypeValues.get(i)))); } else if (gtKey.equals(VCFConstants.GENOTYPE_ALLELE_DEPTHS)) { gb.AD(decodeInts(genotypeValues.get(i))); } else if (gtKey.equals(VCFConstants.GENOTYPE_PL_KEY)) { gb.PL(decodeInts(genotypeValues.get(i))); PlIsSet = true; } else if (gtKey.equals(VCFConstants.GENOTYPE_LIKELIHOODS_KEY)) { generateException("Unable to find the GT field for the record; the GT field is required before VCF4.1"); if ( genotypeAlleleLocation > 0 ) generateException("Saw GT field at position " + genotypeAlleleLocation + ", but it must be at the first position for genotypes when present"); final List<Allele> GTalleles = (genotypeAlleleLocation == -1 ? new ArrayList<Allele>(0) : parseGenotypeAlleles(genotypeValues.get(genotypeAlleleLocation), alleles, alleleMap)); gb.alleles(GTalleles); gb.phased(genotypeAlleleLocation != -1 && genotypeValues.get(genotypeAlleleLocation).indexOf(VCFConstants.PHASED) != -1);
/** * check to make sure the allele is an acceptable allele * @param allele the allele to check * @param isRef are we the reference allele? * @param lineNo the line number for this record */ private static void checkAllele(String allele, boolean isRef, int lineNo) { if ( allele == null || allele.isEmpty() ) generateException(generateExceptionTextForBadAlleleBases(""), lineNo); if ( GeneralUtils.DEBUG_MODE_ENABLED && MAX_ALLELE_SIZE_BEFORE_WARNING != -1 && allele.length() > MAX_ALLELE_SIZE_BEFORE_WARNING ) { System.err.println(String.format("Allele detected with length %d exceeding max size %d at approximately line %d, likely resulting in degraded VCF processing performance", allele.length(), MAX_ALLELE_SIZE_BEFORE_WARNING, lineNo)); } if ( isSymbolicAllele(allele) ) { if ( isRef ) { generateException("Symbolic alleles not allowed as reference allele: " + allele, lineNo); } } else { // check for VCF3 insertions or deletions if ( (allele.charAt(0) == VCFConstants.DELETION_ALLELE_v3) || (allele.charAt(0) == VCFConstants.INSERTION_ALLELE_v3) ) generateException("Insertions/Deletions are not supported when reading 3.x VCF's. Please" + " convert your file to VCF4 using VCFTools, available at http://vcftools.sourceforge.net/index.html", lineNo); if (!Allele.acceptableAlleleBases(allele, isRef)) generateException(generateExceptionTextForBadAlleleBases(allele), lineNo); if ( isRef && allele.equals(VCFConstants.EMPTY_ALLELE) ) generateException("The reference allele cannot be missing", lineNo); } }
/** * parse out the alleles * @param ref the reference base * @param alts a string of alternates to break into alleles * @param lineNo the line number for this record * @return a list of alleles, and a pair of the shortest and longest sequence */ protected static List<Allele> parseAlleles(String ref, String alts, int lineNo) { List<Allele> alleles = new ArrayList<Allele>(2); // we are almost always biallelic // ref checkAllele(ref, true, lineNo); Allele refAllele = Allele.create(ref, true); alleles.add(refAllele); if ( alts.indexOf(',') == -1 ) // only 1 alternatives, don't call string split parseSingleAltAllele(alleles, alts, lineNo); else for ( String alt : alts.split(",") ) parseSingleAltAllele(alleles, alt, lineNo); return alleles; }
/**
 * Validates one alternate allele string and appends the resulting allele to the
 * list, unless the created allele reports itself as a no-call.
 *
 * @param alleles the alleles collected so far; may be appended to
 * @param alt     the allele to parse
 * @param lineNo  the line number for this record
 */
private static void parseSingleAltAllele(List<Allele> alleles, String alt, int lineNo) {
    checkAllele(alt, false, lineNo);

    final Allele parsedAlt = Allele.create(alt, false);
    if ( parsedAlt.isNoCall() ) {
        // no-call alternates are validated but never recorded
        return;
    }
    alleles.add(parsedAlt);
}
generateException("The VCF specification requires a valid (non-zero length) info field"); generateException("The VCF specification does not allow for whitespace in the INFO field. Offending field value was \"" + infoField + "\"");
builder.source(getName()); final String chr = getCachedString(parts[0]); builder.chr(chr); int pos = -1; pos = Integer.valueOf(parts[1]); } catch (NumberFormatException e) { generateException(parts[1] + " is not a valid start position in the VCF format"); generateException("The VCF specification requires a valid ID field"); else if ( parts[2].equals(VCFConstants.EMPTY_ID_FIELD) ) builder.noID(); final String ref = getCachedString(parts[3].toUpperCase()); final String alts = getCachedString(parts[4]); builder.log10PError(parseQual(parts[5])); final List<String> filters = parseFilters(getCachedString(parts[6])); if ( filters != null ) builder.filters(new HashSet<String>(filters)); final Map<String, Object> attrs = parseInfo(parts[7]); builder.attributes(attrs); builder.stop(Integer.valueOf(attrs.get(VCFConstants.END_KEY).toString())); } catch (Exception e) { generateException("the END value in the INFO field is not valid"); final List<Allele> alleles = parseAlleles(ref, alts, lineNo); builder.alleles(alleles);
generateException("there are " + (nParts-1) + " genotypes while the header requires that " + (genotypeParts.length-1) + " genotypes be present for all records at " + chr + ":" + pos, lineNo); generateException("There are too many keys for the sample " + sampleName + ", keys = " + parts[8] + ", values = " + parts[genotypeOffset]); final List<String> filters = parseFilters(getCachedString(genotypeValues.get(i))); if ( filters != null ) gb.filters(filters); } else if ( genotypeValues.get(i).equals(VCFConstants.MISSING_VALUE_v4) ) { gb.GQ((int)Math.round(Double.valueOf(genotypeValues.get(i)))); } else if (gtKey.equals(VCFConstants.GENOTYPE_ALLELE_DEPTHS)) { gb.AD(decodeInts(genotypeValues.get(i))); } else if (gtKey.equals(VCFConstants.GENOTYPE_PL_KEY)) { gb.PL(decodeInts(genotypeValues.get(i))); PlIsSet = true; } else if (gtKey.equals(VCFConstants.GENOTYPE_LIKELIHOODS_KEY)) { generateException("Unable to find the GT field for the record; the GT field is required before VCF4.1"); if ( genotypeAlleleLocation > 0 ) generateException("Saw GT field at position " + genotypeAlleleLocation + ", but it must be at the first position for genotypes when present"); final List<Allele> GTalleles = (genotypeAlleleLocation == -1 ? new ArrayList<Allele>(0) : parseGenotypeAlleles(genotypeValues.get(genotypeAlleleLocation), alleles, alleleMap)); gb.alleles(GTalleles); gb.phased(genotypeAlleleLocation != -1 && genotypeValues.get(genotypeAlleleLocation).indexOf(VCFConstants.PHASED) != -1);
/** * check to make sure the allele is an acceptable allele * @param allele the allele to check * @param isRef are we the reference allele? * @param lineNo the line number for this record */ private static void checkAllele(String allele, boolean isRef, int lineNo) { if ( allele == null || allele.isEmpty() ) generateException(generateExceptionTextForBadAlleleBases(""), lineNo); if ( GeneralUtils.DEBUG_MODE_ENABLED && MAX_ALLELE_SIZE_BEFORE_WARNING != -1 && allele.length() > MAX_ALLELE_SIZE_BEFORE_WARNING ) { System.err.println(String.format("Allele detected with length %d exceeding max size %d at approximately line %d, likely resulting in degraded VCF processing performance", allele.length(), MAX_ALLELE_SIZE_BEFORE_WARNING, lineNo)); } if ( isSymbolicAllele(allele) ) { if ( isRef ) { generateException("Symbolic alleles not allowed as reference allele: " + allele, lineNo); } } else { // check for VCF3 insertions or deletions if ( (allele.charAt(0) == VCFConstants.DELETION_ALLELE_v3) || (allele.charAt(0) == VCFConstants.INSERTION_ALLELE_v3) ) generateException("Insertions/Deletions are not supported when reading 3.x VCF's. Please" + " convert your file to VCF4 using VCFTools, available at http://vcftools.sourceforge.net/index.html", lineNo); if (!Allele.acceptableAlleleBases(allele, isRef)) generateException(generateExceptionTextForBadAlleleBases(allele), lineNo); if ( isRef && allele.equals(VCFConstants.EMPTY_ALLELE) ) generateException("The reference allele cannot be missing", lineNo); } }
/** * parse out the alleles * @param ref the reference base * @param alts a string of alternates to break into alleles * @param lineNo the line number for this record * @return a list of alleles, and a pair of the shortest and longest sequence */ protected static List<Allele> parseAlleles(String ref, String alts, int lineNo) { List<Allele> alleles = new ArrayList<Allele>(2); // we are almost always biallelic // ref checkAllele(ref, true, lineNo); Allele refAllele = Allele.create(ref, true); alleles.add(refAllele); if ( alts.indexOf(',') == -1 ) // only 1 alternatives, don't call string split parseSingleAltAllele(alleles, alts, lineNo); else for ( String alt : alts.split(",") ) parseSingleAltAllele(alleles, alt, lineNo); return alleles; }
/**
 * Validates one alternate allele string and appends the resulting allele to the
 * list, unless the created allele reports itself as a no-call.
 *
 * @param alleles the alleles collected so far; may be appended to
 * @param alt     the allele to parse
 * @param lineNo  the line number for this record
 */
private static void parseSingleAltAllele(List<Allele> alleles, String alt, int lineNo) {
    checkAllele(alt, false, lineNo);

    final Allele parsedAlt = Allele.create(alt, false);
    if ( parsedAlt.isNoCall() ) {
        // no-call alternates are validated but never recorded
        return;
    }
    alleles.add(parsedAlt);
}
generateException("The VCF specification requires a valid (non-zero length) info field"); generateException("The VCF specification does not allow for whitespace in the INFO field. Offending field value was \"" + infoField + "\"");
builder.source(getName()); final String chr = getCachedString(parts[0]); builder.chr(chr); int pos = -1; pos = Integer.valueOf(parts[1]); } catch (NumberFormatException e) { generateException(parts[1] + " is not a valid start position in the VCF format"); generateException("The VCF specification requires a valid ID field"); else if ( parts[2].equals(VCFConstants.EMPTY_ID_FIELD) ) builder.noID(); final String ref = getCachedString(parts[3].toUpperCase()); final String alts = getCachedString(parts[4]); builder.log10PError(parseQual(parts[5])); final List<String> filters = parseFilters(getCachedString(parts[6])); if ( filters != null ) builder.filters(new HashSet<String>(filters)); final Map<String, Object> attrs = parseInfo(parts[7]); builder.attributes(attrs); builder.stop(Integer.valueOf(attrs.get(VCFConstants.END_KEY).toString())); } catch (Exception e) { generateException("the END value in the INFO field is not valid"); final List<Allele> alleles = parseAlleles(ref, alts, lineNo); builder.alleles(alleles);
generateException("there are " + (nParts-1) + " genotypes while the header requires that " + (genotypeParts.length-1) + " genotypes be present for all records at " + chr + ":" + pos, lineNo); generateException("There are too many keys for the sample " + sampleName + ", keys = " + parts[8] + ", values = " + parts[genotypeOffset]); final List<String> filters = parseFilters(getCachedString(GTValueArray[i])); if ( filters != null ) gb.filters(filters); } else if ( GTValueArray[i].equals(VCFConstants.MISSING_VALUE_v4) ) { gb.GQ((int)Math.round(Double.valueOf(GTValueArray[i]))); } else if (gtKey.equals(VCFConstants.GENOTYPE_ALLELE_DEPTHS)) { gb.AD(decodeInts(GTValueArray[i])); } else if (gtKey.equals(VCFConstants.GENOTYPE_PL_KEY)) { gb.PL(decodeInts(GTValueArray[i])); } else if (gtKey.equals(VCFConstants.GENOTYPE_LIKELIHOODS_KEY)) { gb.PL(GenotypeLikelihoods.fromGLField(GTValueArray[i]).getAsPLs()); generateException("Unable to find the GT field for the record; the GT field is required before VCF4.1"); if ( genotypeAlleleLocation > 0 ) generateException("Saw GT field at position " + genotypeAlleleLocation + ", but it must be at the first position for genotypes when present"); final List<Allele> GTalleles = (genotypeAlleleLocation == -1 ? new ArrayList<Allele>(0) : parseGenotypeAlleles(GTValueArray[genotypeAlleleLocation], alleles, alleleMap)); gb.alleles(GTalleles); gb.phased(genotypeAlleleLocation != -1 && GTValueArray[genotypeAlleleLocation].indexOf(VCFConstants.PHASED) != -1);
/** * check to make sure the allele is an acceptable allele * @param allele the allele to check * @param isRef are we the reference allele? * @param lineNo the line number for this record */ private static void checkAllele(String allele, boolean isRef, int lineNo) { if ( allele == null || allele.length() == 0 ) generateException(generateExceptionTextForBadAlleleBases(""), lineNo); if ( GeneralUtils.DEBUG_MODE_ENABLED && MAX_ALLELE_SIZE_BEFORE_WARNING != -1 && allele.length() > MAX_ALLELE_SIZE_BEFORE_WARNING ) { System.err.println(String.format("Allele detected with length %d exceeding max size %d at approximately line %d, likely resulting in degraded VCF processing performance", allele.length(), MAX_ALLELE_SIZE_BEFORE_WARNING, lineNo)); } if ( isSymbolicAllele(allele) ) { if ( isRef ) { generateException("Symbolic alleles not allowed as reference allele: " + allele, lineNo); } } else { // check for VCF3 insertions or deletions if ( (allele.charAt(0) == VCFConstants.DELETION_ALLELE_v3) || (allele.charAt(0) == VCFConstants.INSERTION_ALLELE_v3) ) generateException("Insertions/Deletions are not supported when reading 3.x VCF's. Please" + " convert your file to VCF4 using VCFTools, available at http://vcftools.sourceforge.net/index.html", lineNo); if (!Allele.acceptableAlleleBases(allele)) generateException(generateExceptionTextForBadAlleleBases(allele), lineNo); if ( isRef && allele.equals(VCFConstants.EMPTY_ALLELE) ) generateException("The reference allele cannot be missing", lineNo); } }
/** * parse out the alleles * @param ref the reference base * @param alts a string of alternates to break into alleles * @param lineNo the line number for this record * @return a list of alleles, and a pair of the shortest and longest sequence */ protected static List<Allele> parseAlleles(String ref, String alts, int lineNo) { List<Allele> alleles = new ArrayList<Allele>(2); // we are almost always biallelic // ref checkAllele(ref, true, lineNo); Allele refAllele = Allele.create(ref, true); alleles.add(refAllele); if ( alts.indexOf(",") == -1 ) // only 1 alternatives, don't call string split parseSingleAltAllele(alleles, alts, lineNo); else for ( String alt : alts.split(",") ) parseSingleAltAllele(alleles, alt, lineNo); return alleles; }
/**
 * The fast decode function: delegates to decodeLine with its second argument
 * set to false.
 *
 * @param line the line of text for the record
 * @return a feature (not guaranteed complete) that has the correct start and stop
 */
public Feature decodeLoc(String line) {
    final Feature located = decodeLine(line, false);
    return located;
}