/**
 * Looks up the data of a sample by its name.
 *
 * <p>The name is first translated into a position through {@code samplesPosition};
 * the actual lookup is then delegated to the position-based overload.
 *
 * @param sampleName name of the sample to look up
 * @return whatever the position-based lookup returns, or {@code null} when
 *         {@code samplesPosition} has no entry for {@code sampleName}
 */
public List<String> getSampleData(String sampleName) {
    requireSamplesPosition();
    Integer position = samplesPosition.get(sampleName);
    // Unknown sample name: nothing to delegate to.
    if (position == null) {
        return null;
    }
    return getSampleData(position);
}
/**
 * Extracts the feature under test from a study entry: the value returned by
 * {@code getSampleData(sampleName, formatField)} for the sample name and format
 * field captured from the enclosing scope.
 */
@Override protected String featureValueOf(StudyEntry actual) { return actual.getSampleData(sampleName, formatField); } };
/**
 * Tells whether every conflicting variant has the given type and the same genotype.
 *
 * <p>The reference genotype is read from the first study of one of the variants, using the
 * first sample name of that study. Every variant is then compared (first study, same sample)
 * against that genotype with a null-safe comparison.
 *
 * @param conflicts variants to compare; must not be empty
 * @param type      type that all the variants are expected to have
 * @return {@code true} if all variants are of {@code type} and share the same genotype value
 * @throws java.util.NoSuchElementException if {@code conflicts} is empty or the chosen study
 *                                          has no sample names
 */
private boolean allSameTypeAndGT(Collection<Variant> conflicts, VariantType type) {
    // Any variant of a different type settles the answer immediately.
    if (conflicts.stream().anyMatch(v -> !v.getType().equals(type))) {
        return false;
    }

    StudyEntry studyEntry = conflicts.stream().findAny().get().getStudies().get(0);
    String sample = studyEntry.getSamplesName().stream().findFirst().get();
    String gt = studyEntry.getSampleData(sample, GENOTYPE_KEY);

    // Types were already verified above, so only the genotype needs to be compared here.
    // allMatch stops at the first mismatch instead of counting every element.
    return conflicts.stream()
            .allMatch(v -> StringUtils.equals(gt, v.getStudies().get(0).getSampleData(sample, GENOTYPE_KEY)));
}
/**
 * Collects the ids in {@code samples} whose first sample-data value in the variant's first
 * study is {@code "?/?"} and delegates to the overload that receives those missing samples.
 *
 * @param variant Variant to fill
 * @return Put with required changes
 * @throws IOException if fails reading from HBase
 */
public Put fillGaps(Variant variant) throws IOException {
    HashSet<Integer> missingSamples = new HashSet<>();
    for (Integer sampleId : samples) {
        // Sample ids are translated back to names through the inverse view of the id map.
        String firstValue = variant.getStudies().get(0)
                .getSampleData(studyConfiguration.getSampleIds().inverse().get(sampleId))
                .get(0);
        if (firstValue.equals("?/?")) {
            missingSamples.add(sampleId);
        }
    }
    return fillGaps(variant, missingSamples);
}

/**
currAlts.addAll(currentStudy.getSecondaryAlternates()); for (String dupSample : duplicateSamples) { String currGt = getStudy(current).getSampleData(dupSample, getGtKey()); Set<Integer> gtIdxSet = Arrays.stream(currGt.split(",")) .flatMap(e -> Arrays.stream(e.split("/")))
/** * @return an array of IBS of length: (samples.size()*(samples.size() -1))/2 * which is samples.size() choose 2 */ public List<IdentityByState> countIBS(Iterator<Variant> iterator, List<String> samples) { // assumptions if (samples.size() < 1 || samples.size() > MAX_SAMPLES_ALLOWED) { throw new IllegalArgumentException("samples.size() is " + samples.size() + " and it should be between 1 and" + MAX_SAMPLES_ALLOWED); } final int studyIndex = 0; // loops List<IdentityByState> counts = new ArrayList<>(getAmountOfPairs(samples.size())); for (int i = 0; i < getAmountOfPairs(samples.size()); i++) { counts.add(new IdentityByState()); } while (iterator.hasNext()) { Variant variant = iterator.next(); forEachPair(samples, (int i, int j, int compoundIndex) -> { StudyEntry studyEntry = variant.getStudies().get(studyIndex); String gtI = studyEntry.getSampleData(samples.get(i), "GT"); String gtJ = studyEntry.getSampleData(samples.get(j), "GT"); Genotype genotypeI = new Genotype(gtI); Genotype genotypeJ = new Genotype(gtJ); int whichIBS = countSharedAlleles(genotypeI.getAllelesIdx().length, genotypeI, genotypeJ); counts.get(compoundIndex).ibs[whichIBS]++; }); } return counts; }
/**
 * Asserts the secondary alternates and the genotypes of samples NA12877 and NA12878 for the
 * two variants at 1:10297 (C:G and C:T), queried with the unknown genotype set to "?".
 *
 * @param dbAdaptor adaptor used to fetch the variants
 */
public void checkNewMultiAllelicVariants(VariantHadoopDBAdaptor dbAdaptor) {
    // 1:10297:C:G
    Variant variantG = dbAdaptor.get(
            new Query(VariantQueryParam.ID.key(), "1:10297:C:G")
                    .append(VariantQueryParam.UNKNOWN_GENOTYPE.key(), "?"),
            null).first();
    assertEquals(1, variantG.getStudies().get(0).getSecondaryAlternates().size());
    assertEquals("0/1", variantG.getStudies().get(0).getSampleData("NA12877", "GT"));
    assertEquals("0/2", variantG.getStudies().get(0).getSampleData("NA12878", "GT"));

    // 1:10297:C:T
    Variant variantT = dbAdaptor.get(
            new Query(VariantQueryParam.ID.key(), "1:10297:C:T")
                    .append(VariantQueryParam.UNKNOWN_GENOTYPE.key(), "?"),
            null).first();
    assertEquals(1, variantT.getStudies().get(0).getSecondaryAlternates().size());
    assertEquals("0/2", variantT.getStudies().get(0).getSampleData("NA12877", "GT"));
    assertEquals("0/1", variantT.getStudies().get(0).getSampleData("NA12878", "GT"));
}
protected void checkFillGaps(StudyConfiguration studyConfiguration, VariantHadoopDBAdaptor dbAdaptor, List<Integer> sampleIds, Set<String> variantsWithGaps) { for (Variant variant : dbAdaptor) { boolean anyUnknown = false; boolean allUnknown = true; for (Integer sampleId : sampleIds) { boolean unknown = variant.getStudies().get(0).getSampleData(studyConfiguration.getSampleIds().inverse().get(sampleId), "GT").equals("?/?"); anyUnknown |= unknown; allUnknown &= unknown; } // Fail if any, but not all samples are unknown if (anyUnknown && !allUnknown) { if (variantsWithGaps.contains(variant.toString())) { System.out.println("Gaps in variant " + variant); } else { Assert.fail("Gaps in variant " + variant); } } } }
/**
 * Loads the two "gaps2" genome VCF files, runs the conflicting-files checks and then verifies
 * the genotypes, allele depths and first secondary alternate of variant 1:10035:A:G.
 */
@Test
public void testFillGapsConflictingFilesNonRef() throws Exception {
    StudyConfiguration studyConfiguration = load(new QueryOptions(), Arrays.asList(
            getResourceUri("gaps2/file1.genome.vcf"),
            getResourceUri("gaps2/file2.genome.vcf")));

    checkConflictingFiles(studyConfiguration);

    VariantDBAdaptor dbAdaptor = variantStorageEngine.getDBAdaptor();
    Query variantQuery = new Query(VariantQueryParam.ID.key(), "1:10035:A:G");
    Variant variantMulti = dbAdaptor.get(variantQuery, null).first();

    // Sample s1
    assertEquals("0/0", variantMulti.getStudies().get(0).getSampleData("s1", "GT"));
    AlternateCoordinate expectedAlternate =
            new AlternateCoordinate("1", 10035, 10035, "A", "<*>", VariantType.NO_VARIATION);
    assertEquals(expectedAlternate, variantMulti.getStudies().get(0).getSecondaryAlternates().get(0));
    assertEquals("4,0,1", variantMulti.getStudies().get(0).getSampleData("s1", "AD"));

    // Sample s2
    assertEquals("0/1", variantMulti.getStudies().get(0).getSampleData("s2", "GT"));
    assertEquals("13,23,0", variantMulti.getStudies().get(0).getSampleData("s2", "AD"));
}
/**
 * Asserts that variants 1:10821:T:A and 1:10635:C:G, queried with the unknown genotype set to
 * "?", have no secondary alternates and report "./." as "GT" for the checked samples.
 *
 * @param dbAdaptor adaptor used to fetch the variants
 */
public void checkNewMissingPositions(VariantHadoopDBAdaptor dbAdaptor) {
    // 1:10821:T:A
    Variant first = dbAdaptor.get(
            new Query(VariantQueryParam.ID.key(), "1:10821:T:A")
                    .append(VariantQueryParam.UNKNOWN_GENOTYPE.key(), "?"),
            null).first();
    assertEquals(0, first.getStudies().get(0).getSecondaryAlternates().size());
    assertEquals("./.", first.getStudies().get(0).getSampleData("NA12878", "GT"));
    assertEquals("./.", first.getStudies().get(0).getSampleData("NA12880", "GT"));

    // 1:10635:C:G
    Variant second = dbAdaptor.get(
            new Query(VariantQueryParam.ID.key(), "1:10635:C:G")
                    .append(VariantQueryParam.UNKNOWN_GENOTYPE.key(), "?"),
            null).first();
    assertEquals(0, second.getStudies().get(0).getSecondaryAlternates().size());
    assertEquals("./.", second.getStudies().get(0).getSampleData("NA12880", "GT"));
}
/**
 * Runs fill-gaps over all the samples of the study (sorted by id), prints the variants, runs
 * the fill-gaps and sample-index checks, and finally asserts the genotypes and secondary
 * alternates of three specific variants for samples s1 and s2.
 *
 * @param studyConfiguration configuration of the study that was loaded
 * @throws Exception if any of the underlying operations fails
 */
public void checkConflictingFiles(StudyConfiguration studyConfiguration) throws Exception {
    HadoopVariantStorageEngine hadoopEngine = (HadoopVariantStorageEngine) this.variantStorageEngine;
    VariantHadoopDBAdaptor dbAdaptor = hadoopEngine.getDBAdaptor();

    List<Integer> sampleIds = new ArrayList<>(studyConfiguration.getSampleIds().values());
    Collections.sort(sampleIds);

    fillGaps(hadoopEngine, studyConfiguration, sampleIds);
    printVariants(
            dbAdaptor.getStudyConfigurationManager()
                    .getStudyConfiguration(studyConfiguration.getStudyId(), null)
                    .first(),
            dbAdaptor,
            newOutputUri(1));

    // This variant is the only one allowed to keep a gap.
    String gapVariantId = "1:10020:A:T";
    checkFillGaps(studyConfiguration, dbAdaptor, sampleIds, Collections.singleton(gapVariantId));
    checkSampleIndexTable(dbAdaptor);

    Variant variantGap = dbAdaptor.get(new Query(VariantQueryParam.ID.key(), gapVariantId), null).first();
    assertEquals("0/1", variantGap.getStudies().get(0).getSampleData("s1", "GT"));
    assertEquals(GenotypeClass.UNKNOWN_GENOTYPE, variantGap.getStudies().get(0).getSampleData("s2", "GT"));

    Variant variantMulti = dbAdaptor.get(new Query(VariantQueryParam.ID.key(), "1:10012:TTT:-"), null).first();
    assertEquals("<*>", variantMulti.getStudies().get(0).getSecondaryAlternates().get(0).getAlternate());
    assertEquals("0/1", variantMulti.getStudies().get(0).getSampleData("s1", "GT"));
    assertEquals("2/2", variantMulti.getStudies().get(0).getSampleData("s2", "GT"));

    Variant variantNonMulti = dbAdaptor.get(new Query(VariantQueryParam.ID.key(), "1:10054:A:G"), null).first();
    assertEquals(
            new HashSet<>(Arrays.asList("C", "T")),
            variantNonMulti.getStudies().get(0).getSecondaryAlternates().stream()
                    .map(AlternateCoordinate::getAlternate)
                    .collect(Collectors.toSet()));
    assertEquals("2/3", variantNonMulti.getStudies().get(0).getSampleData("s1", "GT"));
    assertEquals("0/1", variantNonMulti.getStudies().get(0).getSampleData("s2", "GT"));
}
/**
 * Asserts that, for every sample of the given file, the value of {@code field} stored in the
 * variant's study equals the value produced by {@code valueProvider} for that sample id.
 *
 * <p>It also asserts that the file id is known to the study configuration and that each
 * sample name is present in the study.
 *
 * @param variant            variant under test
 * @param studyConfiguration study configuration providing files, samples and the study name
 * @param fileId             id of the file whose samples are checked
 * @param valueProvider      supplies the expected value for a sample id
 * @param field              name of the sample data field to compare
 */
public void checkSampleData(Variant variant, StudyConfiguration studyConfiguration, Integer fileId,
                            Function<Integer, String> valueProvider, String field) {
    assertTrue(studyConfiguration.getFileIds().values().contains(fileId));

    for (Integer sampleId : studyConfiguration.getSamplesInFiles().get(fileId)) {
        String sampleName = studyConfiguration.getSampleIds().inverse().get(sampleId);
        StudyEntry study = variant.getStudy(studyConfiguration.getStudyName());
        assertTrue(study.getSamplesName().contains(sampleName));

        // Failure message carries the full context plus the variant as JSON.
        String message = "Variant=" + variant + " StudyId=" + studyConfiguration.getStudyId()
                + " FileId=" + fileId + " Field=" + field
                + " Sample=" + sampleName + " (" + sampleId + ")\n" + variant.toJson();
        String expected = valueProvider.apply(sampleId);
        String actual = study.getSampleData(sampleName, field);
        assertEquals(message, expected, actual);
    }
}
/**
 * Resolves the conflict between an insertion at 1:102 and a NO_VARIATION block at 1:100-103,
 * expecting three resulting variants where the third one is flagged "SiteConflict" in the
 * genotype filter of sample "1".
 */
@Test
public void resolve_INS_REF_Split() throws Exception {
    Variant insertion = addGTAndFilter(
            addAttribute(getVariantFilter("1:102::TTT", "PASS"), QUAL, "731"), "0/1", "PASS");
    Variant referenceBlock = addGTAndFilter(
            addAttribute(getVariantFilter("1:100-103:AAAA", "PASS"), QUAL, "390"), "0/0", "PASS");
    referenceBlock.setType(NO_VARIATION);

    VariantLocalConflictResolver resolver = new VariantLocalConflictResolver();
    List<Variant> resolved = new ArrayList<>(resolver.resolveConflicts(Arrays.asList(insertion, referenceBlock)));

    assertEquals(3, resolved.size());
    Variant last = resolved.get(2);
    assertEquals("SiteConflict", last.getStudies().get(0).getSampleData("1", GENOTYPE_FILTER_KEY));
}
Integer dpIdx = study.getFormatPositions().get("DP"); if (dpIdx != null) { String dpStr = study.getSampleData(0).get(dpIdx); try { Integer dp = Integer.valueOf(dpStr); assertEquals("0/0", variant.getStudies().get(0).getSampleData("s1", "GT")); assertEquals("5", variant.getStudies().get(0).getSampleData("s1", "DP")); variant = dbAdaptor.get(new Query(ID.key(), "1:10050:A:T"), null).first(); assertEquals("0/0", variant.getStudies().get(0).getSampleData("s2", "GT")); assertEquals("other", variant.getStudies().get(0).getSampleData("s2", "OTHER"));
@Test public void indexWithOtherFieldsNoGT() throws Exception { //GL:DP:GU:TU:AU:CU StudyConfiguration studyConfiguration = newStudyConfiguration(); StoragePipelineResult etlResult = runDefaultETL(getResourceUri("variant-test-somatic.vcf"), getVariantStorageEngine(), studyConfiguration, new ObjectMap(VariantStorageEngine.Options.EXTRA_GENOTYPE_FIELDS.key(), Arrays.asList("GL", "DP", "AU", "CU", "GU", "TU")) // .append(VariantStorageEngine.Options.FILE_ID.key(), 2) .append(VariantStorageEngine.Options.ANNOTATE.key(), false) .append(VariantStorageEngine.Options.CALCULATE_STATS.key(), false) ); VariantDBIterator iterator = getVariantStorageEngine().getDBAdaptor().iterator(new Query(VariantQueryParam.UNKNOWN_GENOTYPE.key(), "./."), new QueryOptions()); while (iterator.hasNext()) { Variant variant = iterator.next(); assertThat(variant.getStudy(STUDY_NAME).getSampleData("SAMPLE_1", "GT"), anyOf(is("./."), is("."))); assertNotNull(variant.getStudy(STUDY_NAME).getSampleData("SAMPLE_1", "DP")); assertNotNull(variant.getStudy(STUDY_NAME).getSampleData("SAMPLE_1", "GL")); assertNotNull(variant.getStudy(STUDY_NAME).getSampleData("SAMPLE_1", "AU")); assertNotNull(variant.getStudy(STUDY_NAME).getSampleData("SAMPLE_1", "CU")); assertNotNull(variant.getStudy(STUDY_NAME).getSampleData("SAMPLE_1", "GU")); assertNotNull(variant.getStudy(STUDY_NAME).getSampleData("SAMPLE_1", "TU")); } }
assertEquals("0/1", variants.get("1:10013:T:C").getStudy(studyName).getSampleData("s1", "GT")); assertEquals(null, variants.get("1:10013:T:C").getStudy(studyName).getSampleData("s2", "GT")); variants = buildVariantsIdx(); assertThat(variants.keySet(), hasItem("1:10014:A:G")); assertEquals("0/2", variants.get("1:10014:A:G").getStudy(studyName).getSampleData("s1", "GT")); assertEquals("0/1", variants.get("1:10014:A:G").getStudy(studyName).getSampleData("s2", "GT")); assertEquals("0/1", variants.get("1:10013:T:C").getStudy(studyName).getSampleData("s1", "GT")); assertEquals("0/0", variants.get("1:10013:T:C").getStudy(studyName).getSampleData("s2", "GT")); assertEquals("0/1", variants.get("1:10013:T:C").getStudy(studyName).getSampleData("s1", "GT")); assertEquals(null, variants.get("1:10013:T:C").getStudy(studyName).getSampleData("s2", "GT"));
/**
 * Loads "s1_s2.genome.vcf" with reference loading enabled and verifies that no sample has an
 * unknown genotype, that every hom-ref genotype comes with a numeric "DP", and that at least
 * one hom-ref genotype was found.
 */
@Test
public void testMultiSampleFile() throws Exception {
    StudyConfiguration studyConfiguration = VariantStorageBaseTest.newStudyConfiguration();
    VariantHadoopDBAdaptor dbAdaptor = getVariantStorageEngine().getDBAdaptor();

    ObjectMap loadOptions = new ObjectMap(HadoopVariantStorageEngine.VARIANT_TABLE_LOAD_REFERENCE, true);
    loadFile("s1_s2.genome.vcf", studyConfiguration, loadOptions);

    checkArchiveTableTimeStamp(dbAdaptor);
    printVariants(studyConfiguration, dbAdaptor, newOutputUri());

    int homRefCount = 0;
    for (Variant variant : dbAdaptor) {
        StudyEntry study = variant.getStudies().get(0);
        for (String sampleName : study.getSamplesName()) {
            String genotype = study.getSampleData(sampleName, "GT");
            assertNotEquals(GenotypeClass.UNKNOWN_GENOTYPE, genotype);
            if (!GenotypeClass.HOM_REF.test(genotype)) {
                continue;
            }
            homRefCount++;
            assertTrue(StringUtils.isNumeric(study.getSampleData(sampleName, "DP")));
        }
    }
    assertNotEquals(0, homRefCount);
}
assertEquals("0/1", variants.get("1:10013:T:C").getStudy(studyName).getSampleData("s1", "GT")); assertEquals(null, variants.get("1:10013:T:C").getStudy(studyName).getSampleData("s2", "GT")); assertThat(variants.keySet(), hasItem("1:10014:A:G")); assertEquals("0/1", variants.get("1:10014:A:G").getStudy(studyName).getSampleData("s2", "GT")); assertEquals("0/1", variants.get("1:10013:T:C").getStudy(studyName).getSampleData("s1", "GT")); assertEquals("0/1", variants.get("1:10013:T:C").getStudy(studyName).getSampleData("s1", "GT")); assertEquals(null, variants.get("1:10013:T:C").getStudy(studyName).getSampleData("s2", "GT"));
@Test public void removeSingleFileTest() throws Exception { StudyConfiguration studyConfiguration = VariantStorageBaseTest.newStudyConfiguration(); System.out.println("studyConfiguration = " + studyConfiguration); String studyName = studyConfiguration.getStudyName(); Map<String, Object> options = Collections.singletonMap(HadoopVariantStorageEngine.VARIANT_TABLE_INDEXES_SKIP, true); loadFile("s1.genome.vcf", studyConfiguration, options); Map<String, Variant> variants = buildVariantsIdx(); assertFalse(variants.containsKey("1:10014:A:G")); assertTrue(variants.containsKey("1:10013:T:C")); assertEquals("0/1", variants.get("1:10013:T:C").getStudy(studyName).getSampleData("s1", "GT")); VariantHadoopDBAdaptor dbAdaptor = getVariantStorageEngine().getDBAdaptor(); VariantHbaseTestUtils.printVariants(getVariantStorageEngine().getDBAdaptor(), newOutputUri()); // delete removeFile("s1.genome.vcf", studyConfiguration, options); VariantHbaseTestUtils.printVariants(getVariantStorageEngine().getDBAdaptor(), newOutputUri()); checkSampleIndexTable(studyConfiguration, dbAdaptor, "s1.genome.vcf"); variants = buildVariantsIdx(); assertEquals("Expected none variants", 0, variants.size()); assertEquals("Expected none indexed files", 0, studyConfiguration.getIndexedFiles().size()); }
assertSame(samplesPosition2, variant2.getStudy(studyConfiguration.getStudyName()).getSamplesPosition()); for (String sampleName : samplesName) { String gt1 = variant1.getStudy(studyConfiguration.getStudyName()).getSampleData(sampleName, "GT"); String gt2 = variant2.getStudy(studyConfiguration.getStudyName()).getSampleData(sampleName, "GT"); assertEquals(sampleName + " " + variant1.getChromosome() + ":" + variant1.getStart(), gt1, gt2);