From d8fb88c544a9ce29ed335c95a19fe027a04e2103 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Wed, 9 Oct 2024 15:03:29 -0400 Subject: [PATCH 01/58] Initial commit --- .../spark/sv/utils/GATKSVVCFConstants.java | 19 + .../tools/walkers/sv/SVCleanPt1a.java | 407 ++++++++++++++++++ 2 files changed, 426 insertions(+) create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java index 5224fe5f7d6..b5f6606d213 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java @@ -3,6 +3,8 @@ import com.google.common.collect.HashBiMap; import htsjdk.variant.variantcontext.Allele; +import java.util.Arrays; +import java.util.List; import java.util.Map; import static java.util.Map.entry; @@ -147,6 +149,23 @@ public enum ComplexVariantSubtype { public static final String LOW_QS_SCORE_FILTER_KEY = "LOW_QS"; public static final String FREQUENCY_FILTER_KEY = "FREQ"; + // CleanPt1 + public static final String EV = "EV"; + public static final List evValues = Arrays.asList( + null, "RD", "PE", "RD,PE", "SR", "RD,SR", "PE,SR", "RD,PE,SR" + ); + public static final String ME = ":ME"; + public static final String VAR_GQ = "varGQ"; + public static final String MULTIALLELIC = "MULTIALLELIC"; + public static final String UNRESOLVED = "UNRESOLVED"; + public static final String HIGH_SR_BACKGROUND = "HIGH_SR_BACKGROUND"; + public static final String BOTHSIDES_SUPPORT = "BOTHSIDES_SUPPORT"; + public static final String END = "END"; + public static final String RD_CN = "RD_CN"; + + // CleanPt2 + + // Clustering public static final String CLUSTER_MEMBER_IDS_KEY = "MEMBERS"; diff --git 
a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java new file mode 100644 index 00000000000..bb7c500bbe2 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java @@ -0,0 +1,407 @@ +package org.broadinstitute.hellbender.tools.walkers.sv; + +import htsjdk.variant.variantcontext.Allele; +import htsjdk.variant.variantcontext.Genotype; +import htsjdk.variant.variantcontext.GenotypeBuilder; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.variantcontext.VariantContextBuilder; +import htsjdk.variant.variantcontext.writer.VariantContextWriter; +import htsjdk.variant.vcf.VCFHeader; +import htsjdk.variant.vcf.VCFHeaderLine; +import htsjdk.variant.vcf.VCFInfoHeaderLine; +import htsjdk.variant.vcf.VCFFilterHeaderLine; +import htsjdk.variant.vcf.VCFHeaderLineType; +import org.broadinstitute.barclay.argparser.Argument; +import org.broadinstitute.barclay.argparser.BetaFeature; +import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; +import org.broadinstitute.barclay.help.DocumentedFeature; +import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; +import org.broadinstitute.hellbender.cmdline.programgroups.StructuralVariantDiscoveryProgramGroup; +import org.broadinstitute.hellbender.engine.*; +import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants; + +import java.io.BufferedReader; +import java.io.BufferedWriter; +import java.io.FileReader; +import java.io.FileWriter; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Paths; + +import java.util.Arrays; +import java.util.List; +import java.util.Set; +import java.util.Map; +import java.util.LinkedHashSet; +import java.util.HashSet; +import java.util.HashMap; +import java.util.stream.Collectors; + + +@CommandLineProgramProperties( + summary = "Clean and format structural 
variant VCFs", + oneLineSummary = "Clean and format structural variant VCFs", + programGroup = StructuralVariantDiscoveryProgramGroup.class +) +@BetaFeature +@DocumentedFeature +public final class SVCleanPt1a extends VariantWalker { + public static final String PED_FILE_LONG_NAME = "ped-file"; + public static final String CHRX_LONG_NAME = "chrX"; + public static final String CHRY_LONG_NAME = "chrY"; + public static final String FAIL_LIST_LONG_NAME = "fail-list"; + public static final String PASS_LIST_LONG_NAME = "pass-list"; + public static final String OUTPUT_SAMPLES_LIST_LONG_NAME = "sample-list"; + public static final String OUTPUT_REVISED_EVENTS_LIST_LONG_NAME = "revised-list"; + + @Argument( + fullName = PED_FILE_LONG_NAME, + doc = "Sample PED file" + ) + private GATKPath pedFile; + + @Argument( + fullName = CHRX_LONG_NAME, + doc = "chrX column name" + ) + private String chrX; + + @Argument( + fullName = CHRY_LONG_NAME, + doc = "chrY column name" + ) + private String chrY; + + @Argument( + fullName = FAIL_LIST_LONG_NAME, + doc = "List of complex variants failing the background test" + ) + private GATKPath failList; + + @Argument( + fullName = PASS_LIST_LONG_NAME, + doc = "List of complex variants passing both sides" + ) + private GATKPath passList; + + @Argument( + fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, + shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, + doc = "Output VCF name" + ) + private GATKPath outputVcf; + + @Argument( + fullName = OUTPUT_SAMPLES_LIST_LONG_NAME, + doc="Output list of samples" + ) + private GATKPath outputSamplesList; + + @Argument( + fullName = OUTPUT_REVISED_EVENTS_LIST_LONG_NAME, + doc="Output list of revised genotyped events" + ) + private GATKPath outputRevisedEventsList; + + private VariantContextWriter vcfWriter = null; + private BufferedWriter samplesWriter = null; + private BufferedWriter revisedEventsWriter = null; + + private Map sampleSexMap = null; + private Set failSet = null; + private Set 
passSet = null; + private Set writtenRevisedEvents = new HashSet<>(); + + private static final int MIN_ALLOSOME_EVENT_SIZE = 5000; + + + @Override + public void onTraversalStart() { + // Read supporting files into appropriate structures + sampleSexMap = readPedFile(pedFile); + failSet = readLastColumn(failList); + passSet = readLastColumn(passList); + + // Create header without the 'UNRESOLVED' INFO line + final VCFHeader header = getHeaderForVariants(); + Set newHeaderLines = new LinkedHashSet<>(); + for (VCFHeaderLine line : header.getMetaDataInInputOrder()) { + if (!(line instanceof VCFInfoHeaderLine) || !((VCFInfoHeaderLine) line).getID().equals(GATKSVVCFConstants.UNRESOLVED)) { + newHeaderLines.add(line); + } + } + + // Add new header lines + VCFHeader newHeader = new VCFHeader(newHeaderLines, header.getGenotypeSamples()); + newHeader.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.HIGH_SR_BACKGROUND, 0, VCFHeaderLineType.Flag, "High number of SR splits in background samples indicating messy region")); + newHeader.addMetaDataLine(new VCFFilterHeaderLine(GATKSVVCFConstants.UNRESOLVED, "Variant is unresolved")); + newHeader.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.BOTHSIDES_SUPPORT, 0, VCFHeaderLineType.Flag, "Variant has read-level support for both sides of breakpoint")); + + // Write header + vcfWriter = createVCFWriter(outputVcf); + vcfWriter.writeHeader(newHeader); + + // Create output writers + try { + samplesWriter = new BufferedWriter(new FileWriter(outputSamplesList.toPath().toFile())); + revisedEventsWriter = new BufferedWriter(new FileWriter(outputRevisedEventsList.toPath().toFile())); + writeSamples(); + } catch (IOException e) { + throw new RuntimeException("Can't create output file", e); + } + } + + @Override + public void closeTool() { + if (vcfWriter != null) { + vcfWriter.close(); + } + try { + if (samplesWriter != null) { + samplesWriter.close(); + } + if (revisedEventsWriter != null) { + revisedEventsWriter.close(); + } 
+ } catch (IOException e) { + throw new RuntimeException("Error closing output file", e); + } + } + + @Override + public void apply(VariantContext variant, ReadsContext readsContext, ReferenceContext referenceContext, FeatureContext featureContext) { + VariantContextBuilder variantBuilder = new VariantContextBuilder(variant); + List processedGenotypes = processGenotypes(variant); + variantBuilder.genotypes(processedGenotypes); + processVariant(variant, variantBuilder); + vcfWriter.add(variantBuilder.make()); + } + + private List processGenotypes(VariantContext variant) { + return variant.getGenotypes().stream() + .map(genotype -> { + GenotypeBuilder genotypeBuilder = new GenotypeBuilder(genotype); + processEVGenotype(genotype, genotypeBuilder); + processSVTypeGenotype(variant, genotype, genotypeBuilder); + processAllosomesGenotype(variant, genotype, genotypeBuilder); + return genotypeBuilder.make(); + }) + .collect(Collectors.toList()); + } + + private void processVariant(VariantContext variant, VariantContextBuilder builder) { + processSVType(variant, builder); + processVarGQ(variant, builder); + processMultiallelic(builder); + processUnresolved(variant, builder); + processNoisyEvents(variant, builder); + processBothsidesSupportEvents(variant, builder); + } + + private void processEVGenotype(Genotype genotype, GenotypeBuilder genotypeBuilder) { + if (genotype.hasExtendedAttribute(GATKSVVCFConstants.EV)) { + String evAttribute = (String) genotype.getExtendedAttribute(GATKSVVCFConstants.EV); + try { + int evIndex = Integer.parseInt(evAttribute); + if (evIndex >= 0 && evIndex < GATKSVVCFConstants.evValues.size()) { + genotypeBuilder.attribute(GATKSVVCFConstants.EV, GATKSVVCFConstants.evValues.get(evIndex)); + } + } catch (NumberFormatException e) { + throw new RuntimeException("Invalid EV attribute for genotype: " + genotype.getSampleName(), e); + } + } + } + + private void processSVTypeGenotype(VariantContext variant, Genotype genotype, GenotypeBuilder 
genotypeBuilder) { + String svType = variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, null); + if (svType != null && variant.getAlleles().stream().noneMatch(allele -> allele.getDisplayString().contains(GATKSVVCFConstants.ME))) { + Allele refAllele = variant.getReference(); + Allele altAllele = Allele.create("<" + svType + ">", false); + List newGenotypeAlleles = genotype.getAlleles().stream() + .map(allele -> allele.isReference() ? refAllele : altAllele) + .collect(Collectors.toList()); + genotypeBuilder.alleles(newGenotypeAlleles); + } + } + + private void processAllosomesGenotype(VariantContext variant, Genotype genotype, GenotypeBuilder genotypeBuilder) { + String chromosome = variant.getContig(); + if (chromosome.equals(chrX) || chromosome.equals(chrY)) { + boolean isY = chromosome.equals(chrY); + String svType = variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); + if ((svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL) || svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DUP)) && + (variant.getEnd() - variant.getStart() >= MIN_ALLOSOME_EVENT_SIZE)) { + String sampleName = genotype.getSampleName(); + int sex = sampleSexMap.get(sampleName); + if (sex == 1 && isRevisableEvent(variant, isY)) { // Male + writeRevisedEvents(variant); + adjustMaleGenotype(genotype, genotypeBuilder, svType); + } else if (sex == 2 && isY) { // Female + genotypeBuilder.alleles(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL)); + } else if (sex == 0) { // Unknown + genotypeBuilder.alleles(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL)); + } + } + } + } + + private void adjustMaleGenotype(Genotype genotype, GenotypeBuilder genotypeBuilder, String svType) { + if (genotype.hasExtendedAttribute(GATKSVVCFConstants.RD_CN)) { + int rdCN = Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN).toString()); + genotypeBuilder.attribute(GATKSVVCFConstants.RD_CN, rdCN + 1); + Allele refAllele = genotype.getAllele(0); + Allele altAllele = genotype.getAllele(1); + + if 
(svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL)) { + if (rdCN >= 1) genotypeBuilder.alleles(Arrays.asList(refAllele, refAllele)); + else if (rdCN == 0) genotypeBuilder.alleles(Arrays.asList(refAllele, altAllele)); + } else if (svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DUP)) { + if (rdCN <= 1) genotypeBuilder.alleles(Arrays.asList(refAllele, refAllele)); + else if (rdCN == 2) genotypeBuilder.alleles(Arrays.asList(refAllele, altAllele)); + else genotypeBuilder.alleles(Arrays.asList(altAllele, altAllele)); + } + } + } + + private boolean isRevisableEvent(VariantContext variant, boolean isY) { + List genotypes = variant.getGenotypes(); + int[] maleCounts = new int[4]; + int[] femaleCounts = new int[4]; + for (Genotype genotype : genotypes) { + String sampleName = genotype.getSampleName(); + Integer sex = sampleSexMap.get(sampleName); + if (sex == null) continue; + + int rdCN = (int) genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN, -1); + if (rdCN == -1) continue; + + int rdCNVal = Math.min(rdCN, 3); + if (sex == 1) { + maleCounts[rdCNVal]++; + } else if (sex == 2) { + femaleCounts[rdCNVal]++; + } + } + + double maleMedian = calcMedian(maleCounts); + double femaleMedian = calcMedian(femaleCounts); + return maleMedian == 1.0 && (isY ? 
femaleMedian == 0.0 : femaleMedian == 2.0); + } + + private double calcMedian(int[] counts) { + int total = Arrays.stream(counts).sum(); + if (total == 0) return Double.NaN; + + double target = total / 2.0; + int runningTotal = 0; + for (int i = 0; i < 4; i++) { + runningTotal += counts[i]; + if (runningTotal == target) { + return i + 0.5; + } else if (runningTotal > target) { + return i; + } + } + throw new RuntimeException("Median calculation failed"); + } + + private void processSVType(VariantContext variant, VariantContextBuilder builder) { + String svType = variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, null); + if (svType != null && variant.getAlleles().stream().noneMatch(allele -> allele.getDisplayString().contains(GATKSVVCFConstants.ME))) { + Allele refAllele = variant.getReference(); + Allele altAllele = Allele.create("<" + svType + ">", false); + List newAlleles = Arrays.asList(refAllele, altAllele); + builder.alleles(newAlleles); + } + } + + private void processVarGQ(VariantContext variant, VariantContextBuilder builder) { + if (variant.hasAttribute(GATKSVVCFConstants.VAR_GQ)) { + double varGQ = variant.getAttributeAsDouble(GATKSVVCFConstants.VAR_GQ, 0); + builder.rmAttribute(GATKSVVCFConstants.VAR_GQ); + builder.log10PError(varGQ / -10.0); + } + } + + private void processMultiallelic(VariantContextBuilder builder) { + builder.rmAttribute(GATKSVVCFConstants.MULTIALLELIC); + } + + private void processUnresolved(VariantContext variant, VariantContextBuilder builder) { + if (variant.hasAttribute(GATKSVVCFConstants.UNRESOLVED)) { + builder.rmAttribute(GATKSVVCFConstants.UNRESOLVED); + builder.filter(GATKSVVCFConstants.UNRESOLVED); + } + } + + private void processNoisyEvents(VariantContext variant, VariantContextBuilder builder) { + if (failSet.contains(variant.getID())) { + builder.attribute(GATKSVVCFConstants.HIGH_SR_BACKGROUND, true); + } + } + + private void processBothsidesSupportEvents(VariantContext variant, VariantContextBuilder builder) { + 
if (passSet.contains(variant.getID())) { + builder.attribute(GATKSVVCFConstants.BOTHSIDES_SUPPORT, true); + } + } + + private Set readLastColumn(GATKPath filePath) { + try { + return Files.lines(Paths.get(filePath.toString())) + .filter(line -> !line.trim().isEmpty() && !line.startsWith("#")) + .map(line -> { + int lastTabIndex = line.lastIndexOf('\t'); + return lastTabIndex != -1 ? line.substring(lastTabIndex + 1).trim() : line.trim(); + }) + .collect(Collectors.toSet()); + } catch (IOException e) { + throw new RuntimeException("Can't read variant list file: " + filePath, e); + } + } + + private Map readPedFile(GATKPath pedFile) { + Map sampleSexMap = new HashMap<>(); + try (BufferedReader reader = new BufferedReader(new FileReader(pedFile.toPath().toFile()))) { + String line; + while ((line = reader.readLine()) != null) { + if (line.startsWith("#")) continue; + String[] fields = line.split("\t"); + if (fields.length >= 5) { + String sampleName = fields[1]; + int sex = Integer.parseInt(fields[4]); + sampleSexMap.put(sampleName, sex); + } + } + } catch (IOException e) { + throw new RuntimeException("Error reading PED file: " + pedFile, e); + } + return sampleSexMap; + } + + private void writeRevisedEvents(VariantContext variant) { + String variantId = variant.getID(); + if (!writtenRevisedEvents.contains(variantId)) { + try { + revisedEventsWriter.write(variantId); + revisedEventsWriter.newLine(); + writtenRevisedEvents.add(variantId); + } catch (IOException e) { + throw new RuntimeException("Error writing to revised events output file", e); + } + } + } + + private void writeSamples() { + VCFHeader header = getHeaderForVariants(); + try { + for (String sample : header.getGenotypeSamples()) { + samplesWriter.write(sample); + samplesWriter.newLine(); + } + samplesWriter.flush(); + } catch (IOException e) { + throw new RuntimeException("Error writing to samples output file", e); + } + } +} \ No newline at end of file From 0c72bd2de6f1f18edef500e9d66f817a0fd6fab0 Mon 
Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Fri, 11 Oct 2024 09:08:42 -0400 Subject: [PATCH 02/58] Silenced SV type processing --- .../hellbender/tools/walkers/sv/SVCleanPt1a.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java index bb7c500bbe2..2a1b5593295 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java @@ -182,7 +182,7 @@ private List processGenotypes(VariantContext variant) { .map(genotype -> { GenotypeBuilder genotypeBuilder = new GenotypeBuilder(genotype); processEVGenotype(genotype, genotypeBuilder); - processSVTypeGenotype(variant, genotype, genotypeBuilder); + // processSVTypeGenotype(variant, genotype, genotypeBuilder); processAllosomesGenotype(variant, genotype, genotypeBuilder); return genotypeBuilder.make(); }) @@ -190,7 +190,7 @@ private List processGenotypes(VariantContext variant) { } private void processVariant(VariantContext variant, VariantContextBuilder builder) { - processSVType(variant, builder); + // processSVType(variant, builder); processVarGQ(variant, builder); processMultiallelic(builder); processUnresolved(variant, builder); From 05501ced125e2bfe95ae1dd8dd80ccf16fe9009a Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Wed, 16 Oct 2024 11:40:09 -0400 Subject: [PATCH 03/58] Created initial commit for 1b --- .../spark/sv/utils/GATKSVVCFConstants.java | 11 +- .../tools/walkers/sv/SVCleanPt1a.java | 29 +- .../tools/walkers/sv/SVCleanPt1b.java | 286 ++++++++++++++++++ 3 files changed, 309 insertions(+), 17 deletions(-) create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1b.java diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java 
b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java index b5f6606d213..34f4bf0b5be 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java @@ -9,6 +9,7 @@ import static java.util.Map.entry; + public final class GATKSVVCFConstants { // todo: add these and the other standard SV info fields from the VCF spec to htsjdk VCFStandardHeaderLines @@ -149,7 +150,7 @@ public enum ComplexVariantSubtype { public static final String LOW_QS_SCORE_FILTER_KEY = "LOW_QS"; public static final String FREQUENCY_FILTER_KEY = "FREQ"; - // CleanPt1 + // CleanPt1a public static final String EV = "EV"; public static final List evValues = Arrays.asList( null, "RD", "PE", "RD,PE", "SR", "RD,SR", "PE,SR", "RD,PE,SR" @@ -163,8 +164,12 @@ public enum ComplexVariantSubtype { public static final String END = "END"; public static final String RD_CN = "RD_CN"; - // CleanPt2 - + // CleanPt1b + public static final String GT = "GT"; + public static final String GQ = "GQ"; + public static final String RD_GQ = "RD_GQ"; + public static final String CNVS_DEFAULT_FILE = "multi.cnvs.txt"; + public static final String BLANK_SAMPLES = "blanksample"; // Clustering public static final String CLUSTER_MEMBER_IDS_KEY = "MEMBERS"; diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java index 2a1b5593295..acf15d791ad 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java @@ -11,6 +11,7 @@ import htsjdk.variant.vcf.VCFInfoHeaderLine; import htsjdk.variant.vcf.VCFFilterHeaderLine; import htsjdk.variant.vcf.VCFHeaderLineType; + import org.broadinstitute.barclay.argparser.Argument; import 
org.broadinstitute.barclay.argparser.BetaFeature; import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; @@ -37,10 +38,9 @@ import java.util.HashMap; import java.util.stream.Collectors; - @CommandLineProgramProperties( - summary = "Clean and format structural variant VCFs", - oneLineSummary = "Clean and format structural variant VCFs", + summary = "Clean and format structural variant VCFs (Step 1a)", + oneLineSummary = "Clean and format structural variant VCFs (Step 1a)", programGroup = StructuralVariantDiscoveryProgramGroup.class ) @BetaFeature @@ -103,13 +103,13 @@ public final class SVCleanPt1a extends VariantWalker { ) private GATKPath outputRevisedEventsList; - private VariantContextWriter vcfWriter = null; - private BufferedWriter samplesWriter = null; - private BufferedWriter revisedEventsWriter = null; + private VariantContextWriter vcfWriter; + private BufferedWriter samplesWriter; + private BufferedWriter revisedEventsWriter; - private Map sampleSexMap = null; - private Set failSet = null; - private Set passSet = null; + private Map sampleSexMap; + private Set failSet; + private Set passSet; private Set writtenRevisedEvents = new HashSet<>(); private static final int MIN_ALLOSOME_EVENT_SIZE = 5000; @@ -117,7 +117,7 @@ public final class SVCleanPt1a extends VariantWalker { @Override public void onTraversalStart() { - // Read supporting files into appropriate structures + // Read supporting files sampleSexMap = readPedFile(pedFile); failSet = readLastColumn(failList); passSet = readLastColumn(passList); @@ -147,7 +147,7 @@ public void onTraversalStart() { revisedEventsWriter = new BufferedWriter(new FileWriter(outputRevisedEventsList.toPath().toFile())); writeSamples(); } catch (IOException e) { - throw new RuntimeException("Can't create output file", e); + throw new RuntimeException("Error creating output file", e); } } @@ -156,6 +156,7 @@ public void closeTool() { if (vcfWriter != null) { vcfWriter.close(); } + try { if (samplesWriter 
!= null) { samplesWriter.close(); @@ -302,7 +303,7 @@ private double calcMedian(int[] counts) { return i; } } - throw new RuntimeException("Median calculation failed"); + throw new RuntimeException("Error calculating median"); } private void processSVType(VariantContext variant, VariantContextBuilder builder) { @@ -356,7 +357,7 @@ private Set readLastColumn(GATKPath filePath) { }) .collect(Collectors.toSet()); } catch (IOException e) { - throw new RuntimeException("Can't read variant list file: " + filePath, e); + throw new RuntimeException("Error reading variant list file: " + filePath, e); } } @@ -374,7 +375,7 @@ private Map readPedFile(GATKPath pedFile) { } } } catch (IOException e) { - throw new RuntimeException("Error reading PED file: " + pedFile, e); + throw new RuntimeException("Error reading PED file", e); } return sampleSexMap; } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1b.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1b.java new file mode 100644 index 00000000000..ce4ed20533d --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1b.java @@ -0,0 +1,286 @@ +package org.broadinstitute.hellbender.tools.walkers.sv; + +import htsjdk.variant.variantcontext.Genotype; +import htsjdk.variant.variantcontext.GenotypeBuilder; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.variantcontext.VariantContextBuilder; +import htsjdk.variant.variantcontext.writer.VariantContextWriter; + +import org.broadinstitute.barclay.argparser.Argument; +import org.broadinstitute.barclay.argparser.BetaFeature; +import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; +import org.broadinstitute.barclay.help.DocumentedFeature; +import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; +import org.broadinstitute.hellbender.cmdline.programgroups.StructuralVariantDiscoveryProgramGroup; +import 
org.broadinstitute.hellbender.engine.*; +import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants; + +import java.io.BufferedReader; +import java.io.BufferedWriter; +import java.io.FileWriter; +import java.io.InputStreamReader; +import java.util.zip.GZIPInputStream; +import java.nio.file.Files; +import java.io.IOException; + +import java.util.Arrays; +import java.util.List; +import java.util.ArrayList; +import java.util.Set; +import java.util.Map; +import java.util.HashSet; +import java.util.HashMap; + +import org.apache.commons.lang3.tuple.ImmutablePair; +import org.apache.commons.lang3.tuple.Pair; + +@CommandLineProgramProperties( + summary = "Clean and format structural variant VCFs (Step 1b)", + oneLineSummary = "Clean and format structural variant VCFs (Step 1b)", + programGroup = StructuralVariantDiscoveryProgramGroup.class +) +@BetaFeature +@DocumentedFeature +public class SVCleanPt1b extends TwoPassVariantWalker { + public static final String BED_FILE_LONG_NAME = "bed-file"; + public static final String CNV_FILE_LONG_NAME = "cnv-file"; + + @Argument( + fullName = BED_FILE_LONG_NAME, + doc = "BED file" + ) + private GATKPath bedFile; + + @Argument( + fullName = CNV_FILE_LONG_NAME, + doc = "Output CNVs file name", + optional = true + ) + private GATKPath outputCnvs = new GATKPath(GATKSVVCFConstants.CNVS_DEFAULT_FILE); + + @Argument( + fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, + shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, + doc = "Output VCF name" + ) + private GATKPath outputVcf; + + private VariantContextWriter vcfWriter; + private BufferedWriter cnvsWriter; + + private Set multiCnvs = new HashSet<>(); + private Map>> revisedEventsAll = new HashMap<>(); + private Map> revisedEventsFiltered = new HashMap<>(); + private Map> revisedRdCn = new HashMap<>(); + + @Override + public void onTraversalStart() { + // Pre-process BED file + processBedFile(); + + // Write header + vcfWriter = createVCFWriter(outputVcf); + 
vcfWriter.writeHeader(getHeaderForVariants()); + } + + @Override + public Object onTraversalSuccess() { + try { + cnvsWriter = new BufferedWriter(new FileWriter(outputCnvs.toPath().toFile())); + for (String variantId : multiCnvs) { + cnvsWriter.write(variantId); + cnvsWriter.newLine(); + } + } catch (IOException e) { + throw new RuntimeException("Error creating CNVs file", e); + } + return null; + } + + @Override + public void closeTool() { + try { + if (vcfWriter != null) { + vcfWriter.close(); + } + + if (cnvsWriter != null) { + cnvsWriter.close(); + } + } catch (IOException e) { + throw new RuntimeException("Error closing output file", e); + } + } + + @Override + public void firstPassApply(VariantContext variant, ReadsContext readsContext, ReferenceContext referenceContext, FeatureContext featureContext) { + if (shouldInitializeRdCn(variant)) { + initializeRdCn(variant); + } + } + + @Override + public void afterFirstPass() { + return; + } + + @Override + public void secondPassApply(VariantContext variant, ReadsContext readsContext, ReferenceContext referenceContext, FeatureContext featureContext) { + VariantContextBuilder builder = new VariantContextBuilder(variant); + if (shouldProcessVariant(variant)) { + processVariant(builder, variant); + } + if (shouldProcessCnvs(variant)) { + processCnvs(variant); + } + vcfWriter.add(builder.make()); + } + + private boolean shouldInitializeRdCn(VariantContext variant) { + return revisedEventsFiltered.containsKey(variant.getID()); + } + + private boolean shouldProcessVariant(VariantContext variant) { + return revisedEventsAll.containsKey(variant.getID()); + } + + private boolean shouldProcessCnvs(VariantContext variant) { + String svType = variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); + boolean isDelDup = svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DUP) || svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL); + boolean isLarge = variant.getEnd() - variant.getStart() >= 1000; + return isDelDup && 
isLarge; + } + + private void initializeRdCn(VariantContext variant) { + // Initialize data structures + String variantId = variant.getID(); + Set samples = revisedEventsFiltered.get(variantId); + Map variantRdCn = new HashMap<>(); + + // Initialize revisedRdCn value for each variant + for (String sampleName : samples) { + Genotype genotype = variant.getGenotype(sampleName); + if (genotype.hasExtendedAttribute(GATKSVVCFConstants.RD_CN)) { + String rdCn = (String) genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN); + variantRdCn.put(sampleName, Integer.parseInt(rdCn)); + } + } + revisedRdCn.put(variantId, variantRdCn); + } + + private void processVariant(VariantContextBuilder builder, VariantContext variant) { + // Initialize data structures + String variantId = variant.getID(); + Map> variantEvents = revisedEventsAll.get(variantId); + List newGenotypes = new ArrayList<>(); + + // Create updated genotypes + for (String sample : variant.getSampleNamesOrderedByName()) { + Genotype oldGenotype = variant.getGenotype(sample); + Pair event = variantEvents.get(sample); + + if (event != null) { + String widerVariantId = event.getLeft(); + String widerSvType = event.getRight(); + int currentRdCn = revisedRdCn.get(variantId).getOrDefault(sample, 0); + int widerRdCn = revisedRdCn.getOrDefault(widerVariantId, new HashMap<>()).getOrDefault(sample, 0); + if (!revisedEventsFiltered.getOrDefault(widerVariantId, new HashSet<>()).contains(sample)) { + System.err.println(sample + " " + widerVariantId); + } + + int newVal = -1; + if (widerSvType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DUP) && currentRdCn == 2 && widerRdCn == 3) { + newVal = 1; + } else if (widerSvType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL) && currentRdCn == 2 && widerRdCn == 1) { + newVal = 3; + } + + if (newVal != -1) { + GenotypeBuilder gb = new GenotypeBuilder(oldGenotype); + gb.alleles(Arrays.asList(variant.getReference(), variant.getAlternateAllele(0))); + gb.GQ(Integer.parseInt((String) 
oldGenotype.getExtendedAttribute(GATKSVVCFConstants.RD_GQ))); + newGenotypes.add(gb.make()); + } else { + newGenotypes.add(oldGenotype); + } + } else { + newGenotypes.add(oldGenotype); + } + } + builder.genotypes(newGenotypes); + } + + private void processCnvs(VariantContext variant) { + boolean isDel = variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, "").equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL); + for (String sample : variant.getSampleNamesOrderedByName()) { + Genotype genotype = variant.getGenotype(sample); + if (genotype.hasExtendedAttribute(GATKSVVCFConstants.RD_CN)) { + String rdCnString = (String) genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN); + int rdCn = Integer.parseInt(rdCnString); + if ((isDel && rdCn > 3) || (!isDel && (rdCn < 1 || rdCn > 4))) { + multiCnvs.add(variant.getID()); + break; + } + } + } + } + + private void processBedFile() { + try (BufferedReader reader = new BufferedReader(new InputStreamReader(new GZIPInputStream(Files.newInputStream(bedFile.toPath()))))) { + String line; + while ((line = reader.readLine()) != null) { + String[] fields = line.split("\t"); + if (fields.length < 12) continue; + + String[] wider = Integer.parseInt(fields[2]) - Integer.parseInt(fields[1]) >= Integer.parseInt(fields[8]) - Integer.parseInt(fields[7]) + ? Arrays.copyOfRange(fields, 0, 6) + : Arrays.copyOfRange(fields, 6, 12); + String[] narrower = Integer.parseInt(fields[2]) - Integer.parseInt(fields[1]) >= Integer.parseInt(fields[8]) - Integer.parseInt(fields[7]) + ? 
Arrays.copyOfRange(fields, 6, 12) + : Arrays.copyOfRange(fields, 0, 6); + if (wider[5].equals(GATKSVVCFConstants.BLANK_SAMPLES)) continue; + + double coverage = getCoverage(wider, narrower); + if (coverage >= 0.5) { + Set widerSamples = new HashSet<>(Arrays.asList(wider[5].split(","))); + Set narrowerSamples = new HashSet<>(Arrays.asList(narrower[5].split(","))); + Set uniqueSamples = new HashSet<>(widerSamples); + uniqueSamples.removeAll(narrowerSamples); + + for (String sample : uniqueSamples) { + revisedEventsAll.computeIfAbsent(narrower[3], k -> new HashMap<>()) + .put(sample, new ImmutablePair<>(wider[3], wider[4])); + } + } + } + + for (Map.Entry>> entry : revisedEventsAll.entrySet()) { + for (Map.Entry> innerEntry : entry.getValue().entrySet()) { + String sampleName = innerEntry.getKey(); + String variantId = entry.getKey(); + String widerVariantId = innerEntry.getValue().getLeft(); + String svType = innerEntry.getValue().getRight(); + if (svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DUP) || svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL)) { + revisedEventsFiltered.computeIfAbsent(variantId, k -> new HashSet<>()).add(sampleName); + revisedEventsFiltered.computeIfAbsent(widerVariantId, k -> new HashSet<>()).add(sampleName); + } + } + } + } catch (IOException e) { + throw new RuntimeException("Error reading bed file", e); + } + } + + private double getCoverage(String[] wider, String[] narrower) { + int nStart = Integer.parseInt(narrower[1]); + int nStop = Integer.parseInt(narrower[2]); + int wStart = Integer.parseInt(wider[1]); + int wStop = Integer.parseInt(wider[2]); + + if (wStart <= nStop && nStart <= wStop) { + int intersectionSize = Math.min(nStop, wStop) - Math.max(nStart, wStart); + return (double) intersectionSize / (nStop - nStart); + } + return 0.0; + } +} From 40295e1589575ff5fa1a4f1bf073dca3c86af5f5 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Wed, 16 Oct 2024 13:04:54 -0400 Subject: [PATCH 04/58] Reformatting per GATK style guide 
--- .../tools/walkers/sv/SVCleanPt1a.java | 153 +++++++++++------- .../tools/walkers/sv/SVCleanPt1b.java | 87 +++++++--- 2 files changed, 155 insertions(+), 85 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java index acf15d791ad..dde4bd06ad0 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java @@ -38,9 +38,53 @@ import java.util.HashMap; import java.util.stream.Collectors; +/** + * Completes an initial series of cleaning steps for a VCF produced by the GATK-SV pipeline. + * + *

Inputs

+ *
    + *
  • + * VCF containing structural variant (SV) records from the GATK-SV pipeline. + *
  • + *
  • + * TODO + *
  • + *
+ * + *

Output

+ *
    + *
  • + * Annotated VCF. + *
  • + *
+ * + *

Usage Example

+ *
+ *     gatk SVCleanPt1a \
+ *       -V structural.vcf.gz \
+ *       -O cleansed.vcf.gz \
+ *       --ped-file pedigree.ped \
+ *       --chrX chrX \
+ *       --chrY chrY \
+ *       --fail-list background_fail.txt \
+ *       --pass-list bothsides_pass.txt \
+ *       --sample-list sample_list.txt \
+ *       --revised-list revised_list.txt
+ * 
+ * + *

Cleaning Steps

+ *
    + *
  1. + * Adds new FILTER and INFO tags to header. + *
  2. + *
  3. + * TODO + *
  4. + *
+ */ @CommandLineProgramProperties( - summary = "Clean and format structural variant VCFs (Step 1a)", - oneLineSummary = "Clean and format structural variant VCFs (Step 1a)", + summary = "Clean and format structural variant VCFs per Step 1a", + oneLineSummary = "Clean and format structural variant VCFs per Step 1a", programGroup = StructuralVariantDiscoveryProgramGroup.class ) @BetaFeature @@ -125,7 +169,7 @@ public void onTraversalStart() { // Create header without the 'UNRESOLVED' INFO line final VCFHeader header = getHeaderForVariants(); Set newHeaderLines = new LinkedHashSet<>(); - for (VCFHeaderLine line : header.getMetaDataInInputOrder()) { + for (final VCFHeaderLine line : header.getMetaDataInInputOrder()) { if (!(line instanceof VCFInfoHeaderLine) || !((VCFInfoHeaderLine) line).getID().equals(GATKSVVCFConstants.UNRESOLVED)) { newHeaderLines.add(line); } @@ -143,9 +187,14 @@ public void onTraversalStart() { // Create output writers try { - samplesWriter = new BufferedWriter(new FileWriter(outputSamplesList.toPath().toFile())); revisedEventsWriter = new BufferedWriter(new FileWriter(outputRevisedEventsList.toPath().toFile())); - writeSamples(); + samplesWriter = new BufferedWriter(new FileWriter(outputSamplesList.toPath().toFile())); + + for (final String sample : header.getGenotypeSamples()) { + samplesWriter.write(sample); + samplesWriter.newLine(); + } + samplesWriter.flush(); } catch (IOException e) { throw new RuntimeException("Error creating output file", e); } @@ -153,11 +202,11 @@ public void onTraversalStart() { @Override public void closeTool() { - if (vcfWriter != null) { - vcfWriter.close(); - } - try { + if (vcfWriter != null) { + vcfWriter.close(); + } + if (samplesWriter != null) { samplesWriter.close(); } @@ -172,7 +221,7 @@ public void closeTool() { @Override public void apply(VariantContext variant, ReadsContext readsContext, ReferenceContext referenceContext, FeatureContext featureContext) { VariantContextBuilder variantBuilder = new 
VariantContextBuilder(variant); - List processedGenotypes = processGenotypes(variant); + final List processedGenotypes = processGenotypes(variant); variantBuilder.genotypes(processedGenotypes); processVariant(variant, variantBuilder); vcfWriter.add(variantBuilder.make()); @@ -202,38 +251,33 @@ private void processVariant(VariantContext variant, VariantContextBuilder builde private void processEVGenotype(Genotype genotype, GenotypeBuilder genotypeBuilder) { if (genotype.hasExtendedAttribute(GATKSVVCFConstants.EV)) { String evAttribute = (String) genotype.getExtendedAttribute(GATKSVVCFConstants.EV); - try { - int evIndex = Integer.parseInt(evAttribute); - if (evIndex >= 0 && evIndex < GATKSVVCFConstants.evValues.size()) { - genotypeBuilder.attribute(GATKSVVCFConstants.EV, GATKSVVCFConstants.evValues.get(evIndex)); - } - } catch (NumberFormatException e) { - throw new RuntimeException("Invalid EV attribute for genotype: " + genotype.getSampleName(), e); + final int evIndex = Integer.parseInt(evAttribute); + if (evIndex >= 0 && evIndex < GATKSVVCFConstants.evValues.size()) { + genotypeBuilder.attribute(GATKSVVCFConstants.EV, GATKSVVCFConstants.evValues.get(evIndex)); } } } private void processSVTypeGenotype(VariantContext variant, Genotype genotype, GenotypeBuilder genotypeBuilder) { - String svType = variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, null); + final String svType = variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, null); if (svType != null && variant.getAlleles().stream().noneMatch(allele -> allele.getDisplayString().contains(GATKSVVCFConstants.ME))) { - Allele refAllele = variant.getReference(); - Allele altAllele = Allele.create("<" + svType + ">", false); - List newGenotypeAlleles = genotype.getAlleles().stream() - .map(allele -> allele.isReference() ? 
refAllele : altAllele) - .collect(Collectors.toList()); + List newGenotypeAlleles = Arrays.asList( + variant.getReference(), + Allele.create("<" + svType + ">", false) + ); genotypeBuilder.alleles(newGenotypeAlleles); } } private void processAllosomesGenotype(VariantContext variant, Genotype genotype, GenotypeBuilder genotypeBuilder) { - String chromosome = variant.getContig(); + final String chromosome = variant.getContig(); if (chromosome.equals(chrX) || chromosome.equals(chrY)) { - boolean isY = chromosome.equals(chrY); - String svType = variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); + final boolean isY = chromosome.equals(chrY); + final String svType = variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); if ((svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL) || svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DUP)) && (variant.getEnd() - variant.getStart() >= MIN_ALLOSOME_EVENT_SIZE)) { - String sampleName = genotype.getSampleName(); - int sex = sampleSexMap.get(sampleName); + final String sampleName = genotype.getSampleName(); + final int sex = sampleSexMap.get(sampleName); if (sex == 1 && isRevisableEvent(variant, isY)) { // Male writeRevisedEvents(variant); adjustMaleGenotype(genotype, genotypeBuilder, svType); @@ -248,11 +292,11 @@ private void processAllosomesGenotype(VariantContext variant, Genotype genotype, private void adjustMaleGenotype(Genotype genotype, GenotypeBuilder genotypeBuilder, String svType) { if (genotype.hasExtendedAttribute(GATKSVVCFConstants.RD_CN)) { - int rdCN = Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN).toString()); + final int rdCN = Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN).toString()); genotypeBuilder.attribute(GATKSVVCFConstants.RD_CN, rdCN + 1); - Allele refAllele = genotype.getAllele(0); - Allele altAllele = genotype.getAllele(1); + final Allele refAllele = genotype.getAllele(0); + final Allele altAllele = genotype.getAllele(1); if 
(svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL)) { if (rdCN >= 1) genotypeBuilder.alleles(Arrays.asList(refAllele, refAllele)); else if (rdCN == 0) genotypeBuilder.alleles(Arrays.asList(refAllele, altAllele)); @@ -265,18 +309,18 @@ private void adjustMaleGenotype(Genotype genotype, GenotypeBuilder genotypeBuild } private boolean isRevisableEvent(VariantContext variant, boolean isY) { - List genotypes = variant.getGenotypes(); + final List genotypes = variant.getGenotypes(); int[] maleCounts = new int[4]; int[] femaleCounts = new int[4]; - for (Genotype genotype : genotypes) { - String sampleName = genotype.getSampleName(); - Integer sex = sampleSexMap.get(sampleName); + for (final Genotype genotype : genotypes) { + final String sampleName = genotype.getSampleName(); + final Integer sex = sampleSexMap.get(sampleName); if (sex == null) continue; int rdCN = (int) genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN, -1); if (rdCN == -1) continue; - int rdCNVal = Math.min(rdCN, 3); + final int rdCNVal = Math.min(rdCN, 3); if (sex == 1) { maleCounts[rdCNVal]++; } else if (sex == 2) { @@ -284,16 +328,16 @@ private boolean isRevisableEvent(VariantContext variant, boolean isY) { } } - double maleMedian = calcMedian(maleCounts); - double femaleMedian = calcMedian(femaleCounts); + final double maleMedian = calcMedian(maleCounts); + final double femaleMedian = calcMedian(femaleCounts); return maleMedian == 1.0 && (isY ? 
femaleMedian == 0.0 : femaleMedian == 2.0); } private double calcMedian(int[] counts) { - int total = Arrays.stream(counts).sum(); + final int total = Arrays.stream(counts).sum(); if (total == 0) return Double.NaN; - double target = total / 2.0; + final double target = total / 2.0; int runningTotal = 0; for (int i = 0; i < 4; i++) { runningTotal += counts[i]; @@ -307,10 +351,10 @@ private double calcMedian(int[] counts) { } private void processSVType(VariantContext variant, VariantContextBuilder builder) { - String svType = variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, null); + final String svType = variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, null); if (svType != null && variant.getAlleles().stream().noneMatch(allele -> allele.getDisplayString().contains(GATKSVVCFConstants.ME))) { - Allele refAllele = variant.getReference(); - Allele altAllele = Allele.create("<" + svType + ">", false); + final Allele refAllele = variant.getReference(); + final Allele altAllele = Allele.create("<" + svType + ">", false); List newAlleles = Arrays.asList(refAllele, altAllele); builder.alleles(newAlleles); } @@ -318,7 +362,7 @@ private void processSVType(VariantContext variant, VariantContextBuilder builder private void processVarGQ(VariantContext variant, VariantContextBuilder builder) { if (variant.hasAttribute(GATKSVVCFConstants.VAR_GQ)) { - double varGQ = variant.getAttributeAsDouble(GATKSVVCFConstants.VAR_GQ, 0); + final double varGQ = variant.getAttributeAsDouble(GATKSVVCFConstants.VAR_GQ, 0); builder.rmAttribute(GATKSVVCFConstants.VAR_GQ); builder.log10PError(varGQ / -10.0); } @@ -367,10 +411,10 @@ private Map readPedFile(GATKPath pedFile) { String line; while ((line = reader.readLine()) != null) { if (line.startsWith("#")) continue; - String[] fields = line.split("\t"); + final String[] fields = line.split("\t"); if (fields.length >= 5) { - String sampleName = fields[1]; - int sex = Integer.parseInt(fields[4]); + final String sampleName = fields[1]; + 
final int sex = Integer.parseInt(fields[4]); sampleSexMap.put(sampleName, sex); } } @@ -381,7 +425,7 @@ private Map readPedFile(GATKPath pedFile) { } private void writeRevisedEvents(VariantContext variant) { - String variantId = variant.getID(); + final String variantId = variant.getID(); if (!writtenRevisedEvents.contains(variantId)) { try { revisedEventsWriter.write(variantId); @@ -392,17 +436,4 @@ private void writeRevisedEvents(VariantContext variant) { } } } - - private void writeSamples() { - VCFHeader header = getHeaderForVariants(); - try { - for (String sample : header.getGenotypeSamples()) { - samplesWriter.write(sample); - samplesWriter.newLine(); - } - samplesWriter.flush(); - } catch (IOException e) { - throw new RuntimeException("Error writing to samples output file", e); - } - } } \ No newline at end of file diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1b.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1b.java index ce4ed20533d..f1db226a7a3 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1b.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1b.java @@ -34,9 +34,47 @@ import org.apache.commons.lang3.tuple.ImmutablePair; import org.apache.commons.lang3.tuple.Pair; +/** + * Completes an initial series of cleaning steps for a VCF produced by the GATK-SV pipeline. + * + *

Inputs

+ *
    + *
  • + * VCF containing structural variant (SV) records from the GATK-SV pipeline. + *
  • + *
  • + * TODO + *
  • + *
+ * + *

Output

+ *
    + *
  • + * Annotated VCF. + *
  • + *
+ * + *

Usage Example

+ *
+ *     gatk SVCleanPt1b \
+ *       -V structural.vcf.gz \
+ *       -O cleansed.vcf.gz \
+ *       --bed-file overlap.bed
+ * 
+ * + *

Cleaning Steps

+ *
    + *
  1. + * Calculates new copy numbers for variant genotypes that match an overlapping variant. + *
  2. + *
  3. + * TODO + *
  4. + *
+ */ @CommandLineProgramProperties( - summary = "Clean and format structural variant VCFs (Step 1b)", - oneLineSummary = "Clean and format structural variant VCFs (Step 1b)", + summary = "Clean and format structural variant VCFs per Step 1b", + oneLineSummary = "Clean and format structural variant VCFs per Step 1b", programGroup = StructuralVariantDiscoveryProgramGroup.class ) @BetaFeature @@ -145,23 +183,23 @@ private boolean shouldProcessVariant(VariantContext variant) { } private boolean shouldProcessCnvs(VariantContext variant) { - String svType = variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); - boolean isDelDup = svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DUP) || svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL); - boolean isLarge = variant.getEnd() - variant.getStart() >= 1000; + final String svType = variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); + final boolean isDelDup = svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DUP) || svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL); + final boolean isLarge = variant.getEnd() - variant.getStart() >= 1000; return isDelDup && isLarge; } private void initializeRdCn(VariantContext variant) { // Initialize data structures - String variantId = variant.getID(); - Set samples = revisedEventsFiltered.get(variantId); + final String variantId = variant.getID(); + final Set samples = revisedEventsFiltered.get(variantId); Map variantRdCn = new HashMap<>(); // Initialize revisedRdCn value for each variant - for (String sampleName : samples) { - Genotype genotype = variant.getGenotype(sampleName); + for (final String sampleName : samples) { + final Genotype genotype = variant.getGenotype(sampleName); if (genotype.hasExtendedAttribute(GATKSVVCFConstants.RD_CN)) { - String rdCn = (String) genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN); + final String rdCn = (String) genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN); variantRdCn.put(sampleName, Integer.parseInt(rdCn)); } } 
@@ -170,20 +208,20 @@ private void initializeRdCn(VariantContext variant) { private void processVariant(VariantContextBuilder builder, VariantContext variant) { // Initialize data structures - String variantId = variant.getID(); - Map> variantEvents = revisedEventsAll.get(variantId); + final String variantId = variant.getID(); + final Map> variantEvents = revisedEventsAll.get(variantId); List newGenotypes = new ArrayList<>(); // Create updated genotypes for (String sample : variant.getSampleNamesOrderedByName()) { - Genotype oldGenotype = variant.getGenotype(sample); - Pair event = variantEvents.get(sample); + final Genotype oldGenotype = variant.getGenotype(sample); + final Pair event = variantEvents.get(sample); if (event != null) { - String widerVariantId = event.getLeft(); - String widerSvType = event.getRight(); - int currentRdCn = revisedRdCn.get(variantId).getOrDefault(sample, 0); - int widerRdCn = revisedRdCn.getOrDefault(widerVariantId, new HashMap<>()).getOrDefault(sample, 0); + final String widerVariantId = event.getLeft(); + final String widerSvType = event.getRight(); + final int currentRdCn = revisedRdCn.get(variantId).getOrDefault(sample, 0); + final int widerRdCn = revisedRdCn.getOrDefault(widerVariantId, new HashMap<>()).getOrDefault(sample, 0); if (!revisedEventsFiltered.getOrDefault(widerVariantId, new HashSet<>()).contains(sample)) { System.err.println(sample + " " + widerVariantId); } @@ -211,12 +249,12 @@ private void processVariant(VariantContextBuilder builder, VariantContext varian } private void processCnvs(VariantContext variant) { - boolean isDel = variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, "").equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL); + final boolean isDel = variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, "").equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL); for (String sample : variant.getSampleNamesOrderedByName()) { - Genotype genotype = variant.getGenotype(sample); + final Genotype genotype = 
variant.getGenotype(sample); if (genotype.hasExtendedAttribute(GATKSVVCFConstants.RD_CN)) { - String rdCnString = (String) genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN); - int rdCn = Integer.parseInt(rdCnString); + final String rdCnString = (String) genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN); + final int rdCn = Integer.parseInt(rdCnString); if ((isDel && rdCn > 3) || (!isDel && (rdCn < 1 || rdCn > 4))) { multiCnvs.add(variant.getID()); break; @@ -226,10 +264,11 @@ private void processCnvs(VariantContext variant) { } private void processBedFile() { - try (BufferedReader reader = new BufferedReader(new InputStreamReader(new GZIPInputStream(Files.newInputStream(bedFile.toPath()))))) { + try { String line; + BufferedReader reader = new BufferedReader(new InputStreamReader(new GZIPInputStream(Files.newInputStream(bedFile.toPath())))); while ((line = reader.readLine()) != null) { - String[] fields = line.split("\t"); + final String[] fields = line.split("\t"); if (fields.length < 12) continue; String[] wider = Integer.parseInt(fields[2]) - Integer.parseInt(fields[1]) >= Integer.parseInt(fields[8]) - Integer.parseInt(fields[7]) From 5cd8ea3772efe95ad2c129021a5e1f6bde838c10 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Wed, 16 Oct 2024 14:24:48 -0400 Subject: [PATCH 05/58] Update src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java Co-authored-by: Mark Walker --- .../broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java index dde4bd06ad0..9ca9e8132ae 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java @@ -154,7 +154,7 @@ public final class SVCleanPt1a extends VariantWalker 
{ private Map sampleSexMap; private Set failSet; private Set passSet; - private Set writtenRevisedEvents = new HashSet<>(); + private final Set writtenRevisedEvents = new HashSet<>(); private static final int MIN_ALLOSOME_EVENT_SIZE = 5000; From 75f476b117810823d2469fdff9553a41fe00fa57 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Wed, 16 Oct 2024 14:24:57 -0400 Subject: [PATCH 06/58] Update src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java Co-authored-by: Mark Walker --- .../broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java index 9ca9e8132ae..b63b986c9cb 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java @@ -219,7 +219,7 @@ public void closeTool() { } @Override - public void apply(VariantContext variant, ReadsContext readsContext, ReferenceContext referenceContext, FeatureContext featureContext) { + public void apply(final VariantContext variant, final ReadsContext readsContext, final ReferenceContext referenceContext, final FeatureContext featureContext) { VariantContextBuilder variantBuilder = new VariantContextBuilder(variant); final List processedGenotypes = processGenotypes(variant); variantBuilder.genotypes(processedGenotypes); From 8d31a61a0ae6afa877e4131a63b5ec0af1b10b25 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Wed, 16 Oct 2024 14:25:03 -0400 Subject: [PATCH 07/58] Update src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java Co-authored-by: Mark Walker --- .../hellbender/tools/walkers/sv/SVCleanPt1a.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java index b63b986c9cb..b26c7226f3d 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java @@ -310,8 +310,8 @@ private void adjustMaleGenotype(Genotype genotype, GenotypeBuilder genotypeBuild private boolean isRevisableEvent(VariantContext variant, boolean isY) { final List genotypes = variant.getGenotypes(); - int[] maleCounts = new int[4]; - int[] femaleCounts = new int[4]; + final int[] maleCounts = new int[4]; + final int[] femaleCounts = new int[4]; for (final Genotype genotype : genotypes) { final String sampleName = genotype.getSampleName(); final Integer sex = sampleSexMap.get(sampleName); From 12cc930ab5d586a741750223c60dea9a99761bb9 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Wed, 16 Oct 2024 14:25:09 -0400 Subject: [PATCH 08/58] Update src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java Co-authored-by: Mark Walker --- .../broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java index b26c7226f3d..4480fe19fa3 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java @@ -317,7 +317,7 @@ private boolean isRevisableEvent(VariantContext variant, boolean isY) { final Integer sex = sampleSexMap.get(sampleName); if (sex == null) continue; - int rdCN = (int) genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN, -1); + final int rdCN = (int) genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN, -1); if (rdCN == -1) 
continue; final int rdCNVal = Math.min(rdCN, 3); From 4ecd5250356220ec0fcd005db46ed942d0857e48 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Wed, 16 Oct 2024 14:25:22 -0400 Subject: [PATCH 09/58] Update src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java Co-authored-by: Mark Walker --- .../broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java index 4480fe19fa3..8e73029d472 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java @@ -168,7 +168,7 @@ public void onTraversalStart() { // Create header without the 'UNRESOLVED' INFO line final VCFHeader header = getHeaderForVariants(); - Set newHeaderLines = new LinkedHashSet<>(); + final Set newHeaderLines = new LinkedHashSet<>(); for (final VCFHeaderLine line : header.getMetaDataInInputOrder()) { if (!(line instanceof VCFInfoHeaderLine) || !((VCFInfoHeaderLine) line).getID().equals(GATKSVVCFConstants.UNRESOLVED)) { newHeaderLines.add(line); From df375cc5d889aeae0a81a4a4fb07d81d096f0d93 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Thu, 17 Oct 2024 09:19:36 -0400 Subject: [PATCH 10/58] PR feedback --- .../tools/walkers/sv/SVCleanPt1a.java | 49 ++++++------------- 1 file changed, 16 insertions(+), 33 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java index 8e73029d472..d80a6894319 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java @@ -20,6 +20,7 @@ import 
org.broadinstitute.hellbender.cmdline.programgroups.StructuralVariantDiscoveryProgramGroup; import org.broadinstitute.hellbender.engine.*; import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants; +import org.broadinstitute.hellbender.utils.MathUtils; import java.io.BufferedReader; import java.io.BufferedWriter; @@ -135,12 +136,6 @@ public final class SVCleanPt1a extends VariantWalker { ) private GATKPath outputVcf; - @Argument( - fullName = OUTPUT_SAMPLES_LIST_LONG_NAME, - doc="Output list of samples" - ) - private GATKPath outputSamplesList; - @Argument( fullName = OUTPUT_REVISED_EVENTS_LIST_LONG_NAME, doc="Output list of revised genotyped events" @@ -148,7 +143,6 @@ public final class SVCleanPt1a extends VariantWalker { private GATKPath outputRevisedEventsList; private VariantContextWriter vcfWriter; - private BufferedWriter samplesWriter; private BufferedWriter revisedEventsWriter; private Map sampleSexMap; @@ -188,13 +182,6 @@ public void onTraversalStart() { // Create output writers try { revisedEventsWriter = new BufferedWriter(new FileWriter(outputRevisedEventsList.toPath().toFile())); - samplesWriter = new BufferedWriter(new FileWriter(outputSamplesList.toPath().toFile())); - - for (final String sample : header.getGenotypeSamples()) { - samplesWriter.write(sample); - samplesWriter.newLine(); - } - samplesWriter.flush(); } catch (IOException e) { throw new RuntimeException("Error creating output file", e); } @@ -206,10 +193,6 @@ public void closeTool() { if (vcfWriter != null) { vcfWriter.close(); } - - if (samplesWriter != null) { - samplesWriter.close(); - } if (revisedEventsWriter != null) { revisedEventsWriter.close(); } @@ -227,19 +210,19 @@ public void apply(final VariantContext variant, final ReadsContext readsContext, vcfWriter.add(variantBuilder.make()); } - private List processGenotypes(VariantContext variant) { + private List processGenotypes(final VariantContext variant) { return variant.getGenotypes().stream() .map(genotype 
-> { GenotypeBuilder genotypeBuilder = new GenotypeBuilder(genotype); processEVGenotype(genotype, genotypeBuilder); - // processSVTypeGenotype(variant, genotype, genotypeBuilder); + // processSVTypeGenotype(variant, genotypeBuilder); processAllosomesGenotype(variant, genotype, genotypeBuilder); return genotypeBuilder.make(); }) .collect(Collectors.toList()); } - private void processVariant(VariantContext variant, VariantContextBuilder builder) { + private void processVariant(final VariantContext variant, final VariantContextBuilder builder) { // processSVType(variant, builder); processVarGQ(variant, builder); processMultiallelic(builder); @@ -248,7 +231,7 @@ private void processVariant(VariantContext variant, VariantContextBuilder builde processBothsidesSupportEvents(variant, builder); } - private void processEVGenotype(Genotype genotype, GenotypeBuilder genotypeBuilder) { + private void processEVGenotype(final Genotype genotype, final GenotypeBuilder genotypeBuilder) { if (genotype.hasExtendedAttribute(GATKSVVCFConstants.EV)) { String evAttribute = (String) genotype.getExtendedAttribute(GATKSVVCFConstants.EV); final int evIndex = Integer.parseInt(evAttribute); @@ -258,7 +241,7 @@ private void processEVGenotype(Genotype genotype, GenotypeBuilder genotypeBuilde } } - private void processSVTypeGenotype(VariantContext variant, Genotype genotype, GenotypeBuilder genotypeBuilder) { + private void processSVTypeGenotype(final VariantContext variant, final GenotypeBuilder genotypeBuilder) { final String svType = variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, null); if (svType != null && variant.getAlleles().stream().noneMatch(allele -> allele.getDisplayString().contains(GATKSVVCFConstants.ME))) { List newGenotypeAlleles = Arrays.asList( @@ -318,9 +301,9 @@ private boolean isRevisableEvent(VariantContext variant, boolean isY) { if (sex == null) continue; final int rdCN = (int) genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN, -1); - if (rdCN == -1) continue; 
- final int rdCNVal = Math.min(rdCN, 3); + if (rdCNVal == -1) continue; + if (sex == 1) { maleCounts[rdCNVal]++; } else if (sex == 2) { @@ -328,23 +311,23 @@ private boolean isRevisableEvent(VariantContext variant, boolean isY) { } } - final double maleMedian = calcMedian(maleCounts); - final double femaleMedian = calcMedian(femaleCounts); - return maleMedian == 1.0 && (isY ? femaleMedian == 0.0 : femaleMedian == 2.0); + final int maleMedian = calcMedianDistribution(maleCounts); + final int femaleMedian = calcMedianDistribution(femaleCounts); + return maleMedian == 2 && (isY ? femaleMedian == 0 : femaleMedian == 4); } - private double calcMedian(int[] counts) { + private int calcMedianDistribution(int[] counts) { final int total = Arrays.stream(counts).sum(); - if (total == 0) return Double.NaN; + if (total == 0) return -1; - final double target = total / 2.0; + final int target = total / 2; int runningTotal = 0; for (int i = 0; i < 4; i++) { runningTotal += counts[i]; if (runningTotal == target) { - return i + 0.5; + return i * 2 + 1; } else if (runningTotal > target) { - return i; + return i * 2; } } throw new RuntimeException("Error calculating median"); From 2e28e1355c621afe2f8e619c540c3376f0d1226e Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Thu, 17 Oct 2024 17:35:53 -0400 Subject: [PATCH 11/58] WIP commit - prior to deprecating BED file input in 1b --- .../spark/sv/utils/GATKSVVCFConstants.java | 6 +- .../tools/walkers/sv/SVCleanPt1a.java | 150 +++++++----------- .../tools/walkers/sv/SVCleanPt1b.java | 113 +++++++------ 3 files changed, 120 insertions(+), 149 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java index 34f4bf0b5be..e9e6569cbc6 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java +++ 
b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java @@ -155,18 +155,16 @@ public enum ComplexVariantSubtype { public static final List evValues = Arrays.asList( null, "RD", "PE", "RD,PE", "SR", "RD,SR", "PE,SR", "RD,PE,SR" ); - public static final String ME = ":ME"; + public static final String ME = "ME"; public static final String VAR_GQ = "varGQ"; public static final String MULTIALLELIC = "MULTIALLELIC"; public static final String UNRESOLVED = "UNRESOLVED"; public static final String HIGH_SR_BACKGROUND = "HIGH_SR_BACKGROUND"; public static final String BOTHSIDES_SUPPORT = "BOTHSIDES_SUPPORT"; - public static final String END = "END"; + public static final String REVISED_EVENT = "REVISED_EVENT"; public static final String RD_CN = "RD_CN"; // CleanPt1b - public static final String GT = "GT"; - public static final String GQ = "GQ"; public static final String RD_GQ = "RD_GQ"; public static final String CNVS_DEFAULT_FILE = "multi.cnvs.txt"; public static final String BLANK_SAMPLES = "blanksample"; diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java index d80a6894319..c3a5417be29 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java @@ -20,13 +20,16 @@ import org.broadinstitute.hellbender.cmdline.programgroups.StructuralVariantDiscoveryProgramGroup; import org.broadinstitute.hellbender.engine.*; import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants; -import org.broadinstitute.hellbender.utils.MathUtils; +import org.broadinstitute.hellbender.utils.tsv.TableUtils; +import org.broadinstitute.hellbender.utils.tsv.TableReader; +import org.broadinstitute.hellbender.utils.variant.GATKSVVariantContextUtils; import java.io.BufferedReader; import java.io.BufferedWriter; import 
java.io.FileReader; import java.io.FileWriter; import java.io.IOException; +import java.nio.file.Path; import java.nio.file.Files; import java.nio.file.Paths; @@ -38,6 +41,7 @@ import java.util.HashSet; import java.util.HashMap; import java.util.stream.Collectors; +import java.util.function.Function; /** * Completes an initial series of cleaning steps for a VCF produced by the GATK-SV pipeline. @@ -96,26 +100,21 @@ public final class SVCleanPt1a extends VariantWalker { public static final String CHRY_LONG_NAME = "chrY"; public static final String FAIL_LIST_LONG_NAME = "fail-list"; public static final String PASS_LIST_LONG_NAME = "pass-list"; - public static final String OUTPUT_SAMPLES_LIST_LONG_NAME = "sample-list"; public static final String OUTPUT_REVISED_EVENTS_LIST_LONG_NAME = "revised-list"; - @Argument( - fullName = PED_FILE_LONG_NAME, - doc = "Sample PED file" - ) - private GATKPath pedFile; - @Argument( fullName = CHRX_LONG_NAME, - doc = "chrX column name" + doc = "chrX column name", + optional = true ) - private String chrX; + private String chrX = "chrX"; @Argument( fullName = CHRY_LONG_NAME, - doc = "chrY column name" + doc = "chrY column name", + optional = true ) - private String chrY; + private String chrY = "chrY"; @Argument( fullName = FAIL_LIST_LONG_NAME, @@ -136,19 +135,11 @@ public final class SVCleanPt1a extends VariantWalker { ) private GATKPath outputVcf; - @Argument( - fullName = OUTPUT_REVISED_EVENTS_LIST_LONG_NAME, - doc="Output list of revised genotyped events" - ) - private GATKPath outputRevisedEventsList; - private VariantContextWriter vcfWriter; - private BufferedWriter revisedEventsWriter; - private Map sampleSexMap; private Set failSet; private Set passSet; - private final Set writtenRevisedEvents = new HashSet<>(); + private final Set revisedSet = new HashSet<>(); private static final int MIN_ALLOSOME_EVENT_SIZE = 5000; @@ -156,7 +147,6 @@ public final class SVCleanPt1a extends VariantWalker { @Override public void 
onTraversalStart() { // Read supporting files - sampleSexMap = readPedFile(pedFile); failSet = readLastColumn(failList); passSet = readLastColumn(passList); @@ -174,30 +164,17 @@ public void onTraversalStart() { newHeader.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.HIGH_SR_BACKGROUND, 0, VCFHeaderLineType.Flag, "High number of SR splits in background samples indicating messy region")); newHeader.addMetaDataLine(new VCFFilterHeaderLine(GATKSVVCFConstants.UNRESOLVED, "Variant is unresolved")); newHeader.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.BOTHSIDES_SUPPORT, 0, VCFHeaderLineType.Flag, "Variant has read-level support for both sides of breakpoint")); + newHeader.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.REVISED_EVENT, 0, VCFHeaderLineType.Flag, "Variant has been revised due to a copy number mismatch")); // Write header vcfWriter = createVCFWriter(outputVcf); vcfWriter.writeHeader(newHeader); - - // Create output writers - try { - revisedEventsWriter = new BufferedWriter(new FileWriter(outputRevisedEventsList.toPath().toFile())); - } catch (IOException e) { - throw new RuntimeException("Error creating output file", e); - } } @Override public void closeTool() { - try { - if (vcfWriter != null) { - vcfWriter.close(); - } - if (revisedEventsWriter != null) { - revisedEventsWriter.close(); - } - } catch (IOException e) { - throw new RuntimeException("Error closing output file", e); + if (vcfWriter != null) { + vcfWriter.close(); } } @@ -229,6 +206,7 @@ private void processVariant(final VariantContext variant, final VariantContextBu processUnresolved(variant, builder); processNoisyEvents(variant, builder); processBothsidesSupportEvents(variant, builder); + processAllosomes(variant, builder); } private void processEVGenotype(final Genotype genotype, final GenotypeBuilder genotypeBuilder) { @@ -243,7 +221,11 @@ private void processEVGenotype(final Genotype genotype, final GenotypeBuilder ge private void 
processSVTypeGenotype(final VariantContext variant, final GenotypeBuilder genotypeBuilder) { final String svType = variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, null); - if (svType != null && variant.getAlleles().stream().noneMatch(allele -> allele.getDisplayString().contains(GATKSVVCFConstants.ME))) { + boolean hasMobileElement = variant.getAlleles().stream() + .map(allele -> GATKSVVariantContextUtils.getSymbolicAlleleSymbols(allele)) + .flatMap(Arrays::stream) + .anyMatch(symbol -> symbol.equals(GATKSVVCFConstants.ME)); + if (svType != null && !hasMobileElement) { List newGenotypeAlleles = Arrays.asList( variant.getReference(), Allele.create("<" + svType + ">", false) @@ -252,17 +234,18 @@ private void processSVTypeGenotype(final VariantContext variant, final GenotypeB } } - private void processAllosomesGenotype(VariantContext variant, Genotype genotype, GenotypeBuilder genotypeBuilder) { + private void processAllosomesGenotype(final VariantContext variant, final Genotype genotype, final GenotypeBuilder genotypeBuilder) { final String chromosome = variant.getContig(); if (chromosome.equals(chrX) || chromosome.equals(chrY)) { - final boolean isY = chromosome.equals(chrY); final String svType = variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); if ((svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL) || svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DUP)) && (variant.getEnd() - variant.getStart() >= MIN_ALLOSOME_EVENT_SIZE)) { - final String sampleName = genotype.getSampleName(); - final int sex = sampleSexMap.get(sampleName); - if (sex == 1 && isRevisableEvent(variant, isY)) { // Male - writeRevisedEvents(variant); + final boolean isY = chromosome.equals(chrY); + final int chrCN = (int) genotype.getExtendedAttribute(GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT); + final int sex = chrCN == 1 ? 
1 : 2; + + if (sex == 1 && isRevisableEvent(variant, isY, sex)) { // Male + revisedSet.add(variant.getID()); adjustMaleGenotype(genotype, genotypeBuilder, svType); } else if (sex == 2 && isY) { // Female genotypeBuilder.alleles(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL)); @@ -273,7 +256,7 @@ private void processAllosomesGenotype(VariantContext variant, Genotype genotype, } } - private void adjustMaleGenotype(Genotype genotype, GenotypeBuilder genotypeBuilder, String svType) { + private void adjustMaleGenotype(final Genotype genotype, final GenotypeBuilder genotypeBuilder, final String svType) { if (genotype.hasExtendedAttribute(GATKSVVCFConstants.RD_CN)) { final int rdCN = Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN).toString()); genotypeBuilder.attribute(GATKSVVCFConstants.RD_CN, rdCN + 1); @@ -291,15 +274,11 @@ private void adjustMaleGenotype(Genotype genotype, GenotypeBuilder genotypeBuild } } - private boolean isRevisableEvent(VariantContext variant, boolean isY) { + private boolean isRevisableEvent(final VariantContext variant, final boolean isY, final int sex) { final List genotypes = variant.getGenotypes(); final int[] maleCounts = new int[4]; final int[] femaleCounts = new int[4]; for (final Genotype genotype : genotypes) { - final String sampleName = genotype.getSampleName(); - final Integer sex = sampleSexMap.get(sampleName); - if (sex == null) continue; - final int rdCN = (int) genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN, -1); final int rdCNVal = Math.min(rdCN, 3); if (rdCNVal == -1) continue; @@ -316,7 +295,7 @@ private boolean isRevisableEvent(VariantContext variant, boolean isY) { return maleMedian == 2 && (isY ? 
femaleMedian == 0 : femaleMedian == 4); } - private int calcMedianDistribution(int[] counts) { + private int calcMedianDistribution(final int[] counts) { final int total = Arrays.stream(counts).sum(); if (total == 0) return -1; @@ -333,7 +312,7 @@ private int calcMedianDistribution(int[] counts) { throw new RuntimeException("Error calculating median"); } - private void processSVType(VariantContext variant, VariantContextBuilder builder) { + private void processSVType(final VariantContext variant, final VariantContextBuilder builder) { final String svType = variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, null); if (svType != null && variant.getAlleles().stream().noneMatch(allele -> allele.getDisplayString().contains(GATKSVVCFConstants.ME))) { final Allele refAllele = variant.getReference(); @@ -343,7 +322,7 @@ private void processSVType(VariantContext variant, VariantContextBuilder builder } } - private void processVarGQ(VariantContext variant, VariantContextBuilder builder) { + private void processVarGQ(final VariantContext variant, final VariantContextBuilder builder) { if (variant.hasAttribute(GATKSVVCFConstants.VAR_GQ)) { final double varGQ = variant.getAttributeAsDouble(GATKSVVCFConstants.VAR_GQ, 0); builder.rmAttribute(GATKSVVCFConstants.VAR_GQ); @@ -351,72 +330,49 @@ private void processVarGQ(VariantContext variant, VariantContextBuilder builder) } } - private void processMultiallelic(VariantContextBuilder builder) { + private void processMultiallelic(final VariantContextBuilder builder) { builder.rmAttribute(GATKSVVCFConstants.MULTIALLELIC); } - private void processUnresolved(VariantContext variant, VariantContextBuilder builder) { + private void processUnresolved(final VariantContext variant, final VariantContextBuilder builder) { if (variant.hasAttribute(GATKSVVCFConstants.UNRESOLVED)) { builder.rmAttribute(GATKSVVCFConstants.UNRESOLVED); builder.filter(GATKSVVCFConstants.UNRESOLVED); } } - private void processNoisyEvents(VariantContext variant, 
VariantContextBuilder builder) { + private void processNoisyEvents(final VariantContext variant, final VariantContextBuilder builder) { if (failSet.contains(variant.getID())) { builder.attribute(GATKSVVCFConstants.HIGH_SR_BACKGROUND, true); } } - private void processBothsidesSupportEvents(VariantContext variant, VariantContextBuilder builder) { + private void processBothsidesSupportEvents(final VariantContext variant, final VariantContextBuilder builder) { if (passSet.contains(variant.getID())) { builder.attribute(GATKSVVCFConstants.BOTHSIDES_SUPPORT, true); } } - private Set readLastColumn(GATKPath filePath) { - try { - return Files.lines(Paths.get(filePath.toString())) - .filter(line -> !line.trim().isEmpty() && !line.startsWith("#")) - .map(line -> { - int lastTabIndex = line.lastIndexOf('\t'); - return lastTabIndex != -1 ? line.substring(lastTabIndex + 1).trim() : line.trim(); - }) - .collect(Collectors.toSet()); - } catch (IOException e) { - throw new RuntimeException("Error reading variant list file: " + filePath, e); + private void processAllosomes(final VariantContext variant, final VariantContextBuilder builder) { + if (revisedSet.contains(variant.getID())) { + builder.attribute(GATKSVVCFConstants.REVISED_EVENT, true); } } - private Map readPedFile(GATKPath pedFile) { - Map sampleSexMap = new HashMap<>(); - try (BufferedReader reader = new BufferedReader(new FileReader(pedFile.toPath().toFile()))) { - String line; - while ((line = reader.readLine()) != null) { - if (line.startsWith("#")) continue; - final String[] fields = line.split("\t"); - if (fields.length >= 5) { - final String sampleName = fields[1]; - final int sex = Integer.parseInt(fields[4]); - sampleSexMap.put(sampleName, sex); - } - } - } catch (IOException e) { - throw new RuntimeException("Error reading PED file", e); - } - return sampleSexMap; - } + private Set readLastColumn(final GATKPath filePath) { + try { + final Path path = filePath.toPath(); + final TableReader reader = 
TableUtils.reader(path, (columns, exceptionFactory) -> + (dataline) -> { + return dataline.get(columns.columnCount() - 1); + } + ); - private void writeRevisedEvents(VariantContext variant) { - final String variantId = variant.getID(); - if (!writtenRevisedEvents.contains(variantId)) { - try { - revisedEventsWriter.write(variantId); - revisedEventsWriter.newLine(); - writtenRevisedEvents.add(variantId); - } catch (IOException e) { - throw new RuntimeException("Error writing to revised events output file", e); - } + Set result = reader.stream().collect(Collectors.toSet()); + reader.close(); + return result; + } catch (IOException e) { + throw new RuntimeException("Error reading variant list file: " + filePath, e); } } } \ No newline at end of file diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1b.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1b.java index f1db226a7a3..ace531b5bbd 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1b.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1b.java @@ -79,7 +79,7 @@ ) @BetaFeature @DocumentedFeature -public class SVCleanPt1b extends TwoPassVariantWalker { +public class SVCleanPt1b extends MultiplePassVariantWalker { public static final String BED_FILE_LONG_NAME = "bed-file"; public static final String CNV_FILE_LONG_NAME = "cnv-file"; @@ -106,10 +106,20 @@ public class SVCleanPt1b extends TwoPassVariantWalker { private VariantContextWriter vcfWriter; private BufferedWriter cnvsWriter; - private Set multiCnvs = new HashSet<>(); - private Map>> revisedEventsAll = new HashMap<>(); - private Map> revisedEventsFiltered = new HashMap<>(); - private Map> revisedRdCn = new HashMap<>(); + final private Set multiCnvs = new HashSet<>(); + final private Map>> revisedEventsAll = new HashMap<>(); + final private Map> revisedEventsFiltered = new HashMap<>(); + final private Map> revisedRdCn = new HashMap<>(); + + 
@Override + protected int numberOfPasses() { + return 3; + } + + @Override + protected void afterNthPass(int n) { + return; + } @Override public void onTraversalStart() { @@ -129,10 +139,10 @@ public Object onTraversalSuccess() { cnvsWriter.write(variantId); cnvsWriter.newLine(); } + return null; } catch (IOException e) { throw new RuntimeException("Error creating CNVs file", e); } - return null; } @Override @@ -141,7 +151,6 @@ public void closeTool() { if (vcfWriter != null) { vcfWriter.close(); } - if (cnvsWriter != null) { cnvsWriter.close(); } @@ -151,19 +160,31 @@ public void closeTool() { } @Override - public void firstPassApply(VariantContext variant, ReadsContext readsContext, ReferenceContext referenceContext, FeatureContext featureContext) { - if (shouldInitializeRdCn(variant)) { - initializeRdCn(variant); + protected void nthPassApply(VariantContext variant, ReadsContext readsContext, ReferenceContext referenceContext, FeatureContext featureContext, int n) { + switch(n) { + case 1: + firstPassApply(variant, readsContext, referenceContext, featureContext); + break; + case 2: + secondPassApply(variant, readsContext, referenceContext, featureContext); + break; + case 3: + thirdPassApply(variant, readsContext, referenceContext, featureContext); + break; } } - @Override - public void afterFirstPass() { + public void firstPassApply(final VariantContext variant, final ReadsContext readsContext, final ReferenceContext referenceContext, final FeatureContext featureContext) { return; } - @Override - public void secondPassApply(VariantContext variant, ReadsContext readsContext, ReferenceContext referenceContext, FeatureContext featureContext) { + public void secondPassApply(final VariantContext variant, final ReadsContext readsContext, final ReferenceContext referenceContext, final FeatureContext featureContext) { + if (shouldInitializeRdCn(variant)) { + initializeRdCn(variant); + } + } + + public void thirdPassApply(final VariantContext variant, final ReadsContext 
readsContext, final ReferenceContext referenceContext, final FeatureContext featureContext) { VariantContextBuilder builder = new VariantContextBuilder(variant); if (shouldProcessVariant(variant)) { processVariant(builder, variant); @@ -174,43 +195,41 @@ public void secondPassApply(VariantContext variant, ReadsContext readsContext, R vcfWriter.add(builder.make()); } - private boolean shouldInitializeRdCn(VariantContext variant) { + private boolean shouldInitializeRdCn(final VariantContext variant) { return revisedEventsFiltered.containsKey(variant.getID()); } - private boolean shouldProcessVariant(VariantContext variant) { + private boolean shouldProcessVariant(final VariantContext variant) { return revisedEventsAll.containsKey(variant.getID()); } - private boolean shouldProcessCnvs(VariantContext variant) { + private boolean shouldProcessCnvs(final VariantContext variant) { final String svType = variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); final boolean isDelDup = svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DUP) || svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL); final boolean isLarge = variant.getEnd() - variant.getStart() >= 1000; return isDelDup && isLarge; } - private void initializeRdCn(VariantContext variant) { + private void initializeRdCn(final VariantContext variant) { // Initialize data structures final String variantId = variant.getID(); final Set samples = revisedEventsFiltered.get(variantId); - Map variantRdCn = new HashMap<>(); + final Map variantRdCn = new HashMap<>(); // Initialize revisedRdCn value for each variant for (final String sampleName : samples) { final Genotype genotype = variant.getGenotype(sampleName); - if (genotype.hasExtendedAttribute(GATKSVVCFConstants.RD_CN)) { - final String rdCn = (String) genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN); - variantRdCn.put(sampleName, Integer.parseInt(rdCn)); - } + final int rdCn = (int) genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN); + 
variantRdCn.put(sampleName, rdCn); } revisedRdCn.put(variantId, variantRdCn); } - private void processVariant(VariantContextBuilder builder, VariantContext variant) { + private void processVariant(final VariantContextBuilder builder, final VariantContext variant) { // Initialize data structures final String variantId = variant.getID(); final Map> variantEvents = revisedEventsAll.get(variantId); - List newGenotypes = new ArrayList<>(); + final List newGenotypes = new ArrayList<>(); // Create updated genotypes for (String sample : variant.getSampleNamesOrderedByName()) { @@ -234,7 +253,7 @@ private void processVariant(VariantContextBuilder builder, VariantContext varian } if (newVal != -1) { - GenotypeBuilder gb = new GenotypeBuilder(oldGenotype); + final GenotypeBuilder gb = new GenotypeBuilder(oldGenotype); gb.alleles(Arrays.asList(variant.getReference(), variant.getAlternateAllele(0))); gb.GQ(Integer.parseInt((String) oldGenotype.getExtendedAttribute(GATKSVVCFConstants.RD_GQ))); newGenotypes.add(gb.make()); @@ -252,13 +271,10 @@ private void processCnvs(VariantContext variant) { final boolean isDel = variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, "").equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL); for (String sample : variant.getSampleNamesOrderedByName()) { final Genotype genotype = variant.getGenotype(sample); - if (genotype.hasExtendedAttribute(GATKSVVCFConstants.RD_CN)) { - final String rdCnString = (String) genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN); - final int rdCn = Integer.parseInt(rdCnString); - if ((isDel && rdCn > 3) || (!isDel && (rdCn < 1 || rdCn > 4))) { - multiCnvs.add(variant.getID()); - break; - } + final int rdCn = (int) genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN); + if ((isDel && rdCn > 3) || (!isDel && (rdCn < 1 || rdCn > 4))) { + multiCnvs.add(variant.getID()); + break; } } } @@ -266,15 +282,15 @@ private void processCnvs(VariantContext variant) { private void processBedFile() { try { String line; - 
BufferedReader reader = new BufferedReader(new InputStreamReader(new GZIPInputStream(Files.newInputStream(bedFile.toPath())))); + final BufferedReader reader = new BufferedReader(new InputStreamReader(new GZIPInputStream(Files.newInputStream(bedFile.toPath())))); while ((line = reader.readLine()) != null) { final String[] fields = line.split("\t"); if (fields.length < 12) continue; - String[] wider = Integer.parseInt(fields[2]) - Integer.parseInt(fields[1]) >= Integer.parseInt(fields[8]) - Integer.parseInt(fields[7]) + final String[] wider = Integer.parseInt(fields[2]) - Integer.parseInt(fields[1]) >= Integer.parseInt(fields[8]) - Integer.parseInt(fields[7]) ? Arrays.copyOfRange(fields, 0, 6) : Arrays.copyOfRange(fields, 6, 12); - String[] narrower = Integer.parseInt(fields[2]) - Integer.parseInt(fields[1]) >= Integer.parseInt(fields[8]) - Integer.parseInt(fields[7]) + final String[] narrower = Integer.parseInt(fields[2]) - Integer.parseInt(fields[1]) >= Integer.parseInt(fields[8]) - Integer.parseInt(fields[7]) ? 
Arrays.copyOfRange(fields, 6, 12) : Arrays.copyOfRange(fields, 0, 6); if (wider[5].equals(GATKSVVCFConstants.BLANK_SAMPLES)) continue; @@ -292,13 +308,14 @@ private void processBedFile() { } } } - - for (Map.Entry>> entry : revisedEventsAll.entrySet()) { - for (Map.Entry> innerEntry : entry.getValue().entrySet()) { - String sampleName = innerEntry.getKey(); - String variantId = entry.getKey(); - String widerVariantId = innerEntry.getValue().getLeft(); - String svType = innerEntry.getValue().getRight(); + reader.close(); + + for (final Map.Entry>> entry : revisedEventsAll.entrySet()) { + for (final Map.Entry> innerEntry : entry.getValue().entrySet()) { + final String sampleName = innerEntry.getKey(); + final String variantId = entry.getKey(); + final String widerVariantId = innerEntry.getValue().getLeft(); + final String svType = innerEntry.getValue().getRight(); if (svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DUP) || svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL)) { revisedEventsFiltered.computeIfAbsent(variantId, k -> new HashSet<>()).add(sampleName); revisedEventsFiltered.computeIfAbsent(widerVariantId, k -> new HashSet<>()).add(sampleName); @@ -311,13 +328,13 @@ private void processBedFile() { } private double getCoverage(String[] wider, String[] narrower) { - int nStart = Integer.parseInt(narrower[1]); - int nStop = Integer.parseInt(narrower[2]); - int wStart = Integer.parseInt(wider[1]); - int wStop = Integer.parseInt(wider[2]); + final int nStart = Integer.parseInt(narrower[1]); + final int nStop = Integer.parseInt(narrower[2]); + final int wStart = Integer.parseInt(wider[1]); + final int wStop = Integer.parseInt(wider[2]); if (wStart <= nStop && nStart <= wStop) { - int intersectionSize = Math.min(nStop, wStop) - Math.max(nStart, wStart); + final int intersectionSize = Math.min(nStop, wStop) - Math.max(nStart, wStart); return (double) intersectionSize / (nStop - nStart); } return 0.0; From f0c0e0f5407c0345b1e43523ecef3cf6b1bf0b13 Mon Sep 17 
00:00:00 2001 From: Karan Jaisingh Date: Fri, 18 Oct 2024 12:42:48 -0400 Subject: [PATCH 12/58] Updated to no longer ingest BED file --- .../spark/sv/utils/GATKSVVCFConstants.java | 1 - .../tools/walkers/sv/SVCleanPt1a.java | 9 - .../tools/walkers/sv/SVCleanPt1b.java | 208 ++++++++++-------- 3 files changed, 114 insertions(+), 104 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java index e9e6569cbc6..200a448e717 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java @@ -167,7 +167,6 @@ public enum ComplexVariantSubtype { // CleanPt1b public static final String RD_GQ = "RD_GQ"; public static final String CNVS_DEFAULT_FILE = "multi.cnvs.txt"; - public static final String BLANK_SAMPLES = "blanksample"; // Clustering public static final String CLUSTER_MEMBER_IDS_KEY = "MEMBERS"; diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java index c3a5417be29..4731e4c8017 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java @@ -24,24 +24,15 @@ import org.broadinstitute.hellbender.utils.tsv.TableReader; import org.broadinstitute.hellbender.utils.variant.GATKSVVariantContextUtils; -import java.io.BufferedReader; -import java.io.BufferedWriter; -import java.io.FileReader; -import java.io.FileWriter; import java.io.IOException; import java.nio.file.Path; -import java.nio.file.Files; -import java.nio.file.Paths; import java.util.Arrays; import java.util.List; import java.util.Set; -import java.util.Map; import java.util.LinkedHashSet; import 
java.util.HashSet; -import java.util.HashMap; import java.util.stream.Collectors; -import java.util.function.Function; /** * Completes an initial series of cleaning steps for a VCF produced by the GATK-SV pipeline. diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1b.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1b.java index ace531b5bbd..d43a1173921 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1b.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1b.java @@ -15,12 +15,8 @@ import org.broadinstitute.hellbender.engine.*; import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants; -import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.FileWriter; -import java.io.InputStreamReader; -import java.util.zip.GZIPInputStream; -import java.nio.file.Files; import java.io.IOException; import java.util.Arrays; @@ -30,6 +26,7 @@ import java.util.Map; import java.util.HashSet; import java.util.HashMap; +import java.util.Comparator; import org.apache.commons.lang3.tuple.ImmutablePair; import org.apache.commons.lang3.tuple.Pair; @@ -80,15 +77,8 @@ @BetaFeature @DocumentedFeature public class SVCleanPt1b extends MultiplePassVariantWalker { - public static final String BED_FILE_LONG_NAME = "bed-file"; public static final String CNV_FILE_LONG_NAME = "cnv-file"; - @Argument( - fullName = BED_FILE_LONG_NAME, - doc = "BED file" - ) - private GATKPath bedFile; - @Argument( fullName = CNV_FILE_LONG_NAME, doc = "Output CNVs file name", @@ -106,6 +96,7 @@ public class SVCleanPt1b extends MultiplePassVariantWalker { private VariantContextWriter vcfWriter; private BufferedWriter cnvsWriter; + private List overlappingVariantsBuffer = new ArrayList<>(); final private Set multiCnvs = new HashSet<>(); final private Map>> revisedEventsAll = new HashMap<>(); final private Map> revisedEventsFiltered = new HashMap<>(); @@ -116,19 
+107,9 @@ protected int numberOfPasses() { return 3; } - @Override - protected void afterNthPass(int n) { - return; - } - @Override public void onTraversalStart() { - // Pre-process BED file - processBedFile(); - - // Write header - vcfWriter = createVCFWriter(outputVcf); - vcfWriter.writeHeader(getHeaderForVariants()); + return; } @Override @@ -161,53 +142,124 @@ public void closeTool() { @Override protected void nthPassApply(VariantContext variant, ReadsContext readsContext, ReferenceContext referenceContext, FeatureContext featureContext, int n) { - switch(n) { - case 1: + switch (n) { + case 0: firstPassApply(variant, readsContext, referenceContext, featureContext); break; - case 2: + case 1: secondPassApply(variant, readsContext, referenceContext, featureContext); break; - case 3: + case 2: thirdPassApply(variant, readsContext, referenceContext, featureContext); break; } } + @Override + protected void afterNthPass(int n) { + switch (n) { + case 0: + processCollectedVariants(); + break; + case 1: + vcfWriter = createVCFWriter(outputVcf); + vcfWriter.writeHeader(getHeaderForVariants()); + break; + } + } + public void firstPassApply(final VariantContext variant, final ReadsContext readsContext, final ReferenceContext referenceContext, final FeatureContext featureContext) { - return; + final String svType = variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); + final boolean isDelDup = svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DUP) || svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL); + final boolean isLarge = Math.abs(variant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)) >= 5000; + if (isDelDup && isLarge) { + overlappingVariantsBuffer.removeIf(vc -> vc.getEnd() < variant.getStart()); + for (VariantContext bufferedVariant : overlappingVariantsBuffer) { + if (overlaps(bufferedVariant, variant)) { + processOverlap(bufferedVariant, variant); + } + } + overlappingVariantsBuffer.add(variant); + } } public void secondPassApply(final VariantContext 
variant, final ReadsContext readsContext, final ReferenceContext referenceContext, final FeatureContext featureContext) { - if (shouldInitializeRdCn(variant)) { + if (revisedEventsFiltered.containsKey(variant.getID())) { initializeRdCn(variant); } } public void thirdPassApply(final VariantContext variant, final ReadsContext readsContext, final ReferenceContext referenceContext, final FeatureContext featureContext) { VariantContextBuilder builder = new VariantContextBuilder(variant); - if (shouldProcessVariant(variant)) { + if (revisedEventsAll.containsKey(variant.getID())) { processVariant(builder, variant); } - if (shouldProcessCnvs(variant)) { + + final String svType = variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); + final boolean isDelDup = svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DUP) || svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL); + final boolean isLarge = variant.getEnd() - variant.getStart() >= 1000; + if (isDelDup && isLarge) { processCnvs(variant); } vcfWriter.add(builder.make()); } - private boolean shouldInitializeRdCn(final VariantContext variant) { - return revisedEventsFiltered.containsKey(variant.getID()); - } + private void processOverlap(VariantContext v1, VariantContext v2) { + // Get overlap data + VariantContext wider; + VariantContext narrower; + if (v1.getLengthOnReference() > v2.getLengthOnReference()) { + wider = v1; + narrower = v2; + } else if (v2.getLengthOnReference() > v1.getLengthOnReference()) { + wider = v2; + narrower = v1; + } else { + return; + } + String widerID = wider.getID(); + String narrowerID = narrower.getID(); + + // Skip processing if same variant ID, SV type or samples + String widerSvType = wider.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); + String narrowerSvType = narrower.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); + Set widerSamples = getNonReferenceSamples(wider); + Set narrowerSamples = getNonReferenceSamples(narrower); + if (widerID.equals(narrowerID) || 
widerSvType.equals(narrowerSvType) || widerSamples.equals(narrowerSamples)) { + return; + } + + // Get samples present in wider but not in narrower + Set nonCommonSamples = new HashSet<>(widerSamples); + nonCommonSamples.removeAll(narrowerSamples); + if (nonCommonSamples.isEmpty()) { + return; + } - private boolean shouldProcessVariant(final VariantContext variant) { - return revisedEventsAll.containsKey(variant.getID()); + // Revise variant if coverage exceeds threshold + double coverage = getCoverage(wider, narrower); + if (coverage >= 0.5) { + for (String sample : nonCommonSamples) { + revisedEventsAll.computeIfAbsent(narrowerID, k -> new HashMap<>()) + .put(sample, new ImmutablePair<>(widerID, widerSvType)); + } + } } - private boolean shouldProcessCnvs(final VariantContext variant) { - final String svType = variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); - final boolean isDelDup = svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DUP) || svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL); - final boolean isLarge = variant.getEnd() - variant.getStart() >= 1000; - return isDelDup && isLarge; + private void processCollectedVariants() { + for (final Map.Entry>> entry : revisedEventsAll.entrySet()) { + for (final Map.Entry> innerEntry : entry.getValue().entrySet()) { + // Identifies variant-sample pairs we need RD_CN values for to improve speed + final String sampleName = innerEntry.getKey(); + final String variantId = entry.getKey(); + final String widerVariantId = innerEntry.getValue().getLeft(); + final String svType = innerEntry.getValue().getRight(); + if (svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DUP) || svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL)) { + revisedEventsFiltered.computeIfAbsent(variantId, k -> new HashSet<>()).add(sampleName); + revisedEventsFiltered.computeIfAbsent(widerVariantId, k -> new HashSet<>()).add(sampleName); + } + } + } } private void initializeRdCn(final VariantContext variant) { @@ -219,8 +271,8 @@ 
private void initializeRdCn(final VariantContext variant) { // Initialize revisedRdCn value for each variant for (final String sampleName : samples) { final Genotype genotype = variant.getGenotype(sampleName); - final int rdCn = (int) genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN); - variantRdCn.put(sampleName, rdCn); + final String rdCn = (String) genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN); + variantRdCn.put(sampleName, Integer.parseInt(rdCn)); } revisedRdCn.put(variantId, variantRdCn); } @@ -271,7 +323,8 @@ private void processCnvs(VariantContext variant) { final boolean isDel = variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, "").equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL); for (String sample : variant.getSampleNamesOrderedByName()) { final Genotype genotype = variant.getGenotype(sample); - final int rdCn = (int) genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN); + final String rdCnString = (String) genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN); + final int rdCn = Integer.parseInt(rdCnString); if ((isDel && rdCn > 3) || (!isDel && (rdCn < 1 || rdCn > 4))) { multiCnvs.add(variant.getID()); break; @@ -279,63 +332,30 @@ private void processCnvs(VariantContext variant) { } } - private void processBedFile() { - try { - String line; - final BufferedReader reader = new BufferedReader(new InputStreamReader(new GZIPInputStream(Files.newInputStream(bedFile.toPath())))); - while ((line = reader.readLine()) != null) { - final String[] fields = line.split("\t"); - if (fields.length < 12) continue; - - final String[] wider = Integer.parseInt(fields[2]) - Integer.parseInt(fields[1]) >= Integer.parseInt(fields[8]) - Integer.parseInt(fields[7]) - ? Arrays.copyOfRange(fields, 0, 6) - : Arrays.copyOfRange(fields, 6, 12); - final String[] narrower = Integer.parseInt(fields[2]) - Integer.parseInt(fields[1]) >= Integer.parseInt(fields[8]) - Integer.parseInt(fields[7]) - ? 
Arrays.copyOfRange(fields, 6, 12) - : Arrays.copyOfRange(fields, 0, 6); - if (wider[5].equals(GATKSVVCFConstants.BLANK_SAMPLES)) continue; - - double coverage = getCoverage(wider, narrower); - if (coverage >= 0.5) { - Set widerSamples = new HashSet<>(Arrays.asList(wider[5].split(","))); - Set narrowerSamples = new HashSet<>(Arrays.asList(narrower[5].split(","))); - Set uniqueSamples = new HashSet<>(widerSamples); - uniqueSamples.removeAll(narrowerSamples); - - for (String sample : uniqueSamples) { - revisedEventsAll.computeIfAbsent(narrower[3], k -> new HashMap<>()) - .put(sample, new ImmutablePair<>(wider[3], wider[4])); - } - } - } - reader.close(); - - for (final Map.Entry>> entry : revisedEventsAll.entrySet()) { - for (final Map.Entry> innerEntry : entry.getValue().entrySet()) { - final String sampleName = innerEntry.getKey(); - final String variantId = entry.getKey(); - final String widerVariantId = innerEntry.getValue().getLeft(); - final String svType = innerEntry.getValue().getRight(); - if (svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DUP) || svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL)) { - revisedEventsFiltered.computeIfAbsent(variantId, k -> new HashSet<>()).add(sampleName); - revisedEventsFiltered.computeIfAbsent(widerVariantId, k -> new HashSet<>()).add(sampleName); - } - } + private boolean overlaps(VariantContext v1, VariantContext v2) { + return v1.getContig().equals(v2.getContig()) && v1.getStart() <= v2.getEnd() && v2.getStart() <= v1.getEnd(); + } + + private Set getNonReferenceSamples(VariantContext variant) { + Set samples = new HashSet<>(); + for (String sampleName : variant.getSampleNames()) { + Genotype genotype = variant.getGenotype(sampleName); + if (genotype.isCalled() && !genotype.isHomRef()) { + samples.add(sampleName); } - } catch (IOException e) { - throw new RuntimeException("Error reading bed file", e); } + return samples; } - private double getCoverage(String[] wider, String[] narrower) { - final int nStart = 
Integer.parseInt(narrower[1]); - final int nStop = Integer.parseInt(narrower[2]); - final int wStart = Integer.parseInt(wider[1]); - final int wStop = Integer.parseInt(wider[2]); + private double getCoverage(VariantContext wider, VariantContext narrower) { + int nStart = narrower.getStart(); + int nStop = narrower.getEnd(); + int wStart = wider.getStart(); + int wStop = wider.getEnd(); if (wStart <= nStop && nStart <= wStop) { - final int intersectionSize = Math.min(nStop, wStop) - Math.max(nStart, wStart); - return (double) intersectionSize / (nStop - nStart); + int intersectionSize = Math.min(nStop, wStop) - Math.max(nStart, wStart) + 1; + return (double) intersectionSize / (nStop - nStart + 1); } return 0.0; } From 50c4eaf40941cfbd1d4631f63cc5730a791ae012 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Fri, 18 Oct 2024 12:55:08 -0400 Subject: [PATCH 13/58] Cleaned up scripts... --- .../tools/walkers/sv/SVCleanPt1a.java | 27 +++++----------- .../tools/walkers/sv/SVCleanPt1b.java | 32 +++++++------------ 2 files changed, 19 insertions(+), 40 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java index 4731e4c8017..b302d337e1e 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java @@ -50,62 +50,52 @@ *

Output

*
    *
  • - * Annotated VCF. + * Cleansed VCF. *
  • *
* *

Usage Example

*
  *     gatk SVCleanPt1a \
- *       -V structural.vcf.gz \
- *       -O cleansed.vcf.gz
- *       --ped-file pedigree.ped
- *       --chrX chrX
- *       --chrY chrY
+ *       -V input.vcf.gz \
+ *       -O output.vcf.gz
  *       --fail-list background_fail.txt
  *       --pass-list bothsides_pass.txt
- *       --sample-list sample_list.txt
- *       --revised-list revised_list.txt
  * 
* *

Cleaning Steps

*
    *
  1. - * Adds new FILTER and INFO tags to header. - *
  2. - *
  3. * TODO *
  4. *
*/ @CommandLineProgramProperties( - summary = "Clean and format structural variant VCFs per Step 1a", - oneLineSummary = "Clean and format structural variant VCFs per Step 1a", + summary = "Clean and format structural variant VCFs", + oneLineSummary = "Clean and format structural variant VCFs", programGroup = StructuralVariantDiscoveryProgramGroup.class ) @BetaFeature @DocumentedFeature public final class SVCleanPt1a extends VariantWalker { - public static final String PED_FILE_LONG_NAME = "ped-file"; public static final String CHRX_LONG_NAME = "chrX"; public static final String CHRY_LONG_NAME = "chrY"; public static final String FAIL_LIST_LONG_NAME = "fail-list"; public static final String PASS_LIST_LONG_NAME = "pass-list"; - public static final String OUTPUT_REVISED_EVENTS_LIST_LONG_NAME = "revised-list"; @Argument( fullName = CHRX_LONG_NAME, doc = "chrX column name", optional = true ) - private String chrX = "chrX"; + private final String chrX = "chrX"; @Argument( fullName = CHRY_LONG_NAME, doc = "chrY column name", optional = true ) - private String chrY = "chrY"; + private final String chrY = "chrY"; @Argument( fullName = FAIL_LIST_LONG_NAME, @@ -232,8 +222,7 @@ private void processAllosomesGenotype(final VariantContext variant, final Genoty if ((svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL) || svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DUP)) && (variant.getEnd() - variant.getStart() >= MIN_ALLOSOME_EVENT_SIZE)) { final boolean isY = chromosome.equals(chrY); - final int chrCN = (int) genotype.getExtendedAttribute(GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT); - final int sex = chrCN == 1 ? 
1 : 2; + final int sex = (int) genotype.getExtendedAttribute(GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT); if (sex == 1 && isRevisableEvent(variant, isY, sex)) { // Male revisedSet.add(variant.getID()); diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1b.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1b.java index d43a1173921..3d2e234b640 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1b.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1b.java @@ -26,7 +26,6 @@ import java.util.Map; import java.util.HashSet; import java.util.HashMap; -import java.util.Comparator; import org.apache.commons.lang3.tuple.ImmutablePair; import org.apache.commons.lang3.tuple.Pair; @@ -47,24 +46,20 @@ *

Output

*
    *
  • - * Annotated VCF. + * Cleansed VCF. *
  • *
* *

Usage Example

*
  *     gatk SVCleanPt1b \
- *       -V structural.vcf.gz \
- *       -O cleansed.vcf.gz
- *       --bed-file overlap.bed
+ *       -V input.vcf.gz \
+ *       -O output.vcf.gz
  * 
* *

Cleaning Steps

*
    *
  1. - * Calculates new copy numbers for variant genotypes that match an overlapping variant. - *
  2. - *
  3. * TODO *
  4. *
@@ -84,7 +79,7 @@ public class SVCleanPt1b extends MultiplePassVariantWalker { doc = "Output CNVs file name", optional = true ) - private GATKPath outputCnvs = new GATKPath(GATKSVVCFConstants.CNVS_DEFAULT_FILE); + private final GATKPath outputCnvs = new GATKPath(GATKSVVCFConstants.CNVS_DEFAULT_FILE); @Argument( fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, @@ -96,7 +91,7 @@ public class SVCleanPt1b extends MultiplePassVariantWalker { private VariantContextWriter vcfWriter; private BufferedWriter cnvsWriter; - private List overlappingVariantsBuffer = new ArrayList<>(); + private final List overlappingVariantsBuffer = new ArrayList<>(); final private Set multiCnvs = new HashSet<>(); final private Map>> revisedEventsAll = new HashMap<>(); final private Map> revisedEventsFiltered = new HashMap<>(); @@ -107,11 +102,6 @@ protected int numberOfPasses() { return 3; } - @Override - public void onTraversalStart() { - return; - } - @Override public Object onTraversalSuccess() { try { @@ -144,13 +134,13 @@ public void closeTool() { protected void nthPassApply(VariantContext variant, ReadsContext readsContext, ReferenceContext referenceContext, FeatureContext featureContext, int n) { switch (n) { case 0: - firstPassApply(variant, readsContext, referenceContext, featureContext); + firstPassApply(variant); break; case 1: - secondPassApply(variant, readsContext, referenceContext, featureContext); + secondPassApply(variant); break; case 2: - thirdPassApply(variant, readsContext, referenceContext, featureContext); + thirdPassApply(variant); break; } } @@ -168,7 +158,7 @@ protected void afterNthPass(int n) { } } - public void firstPassApply(final VariantContext variant, final ReadsContext readsContext, final ReferenceContext referenceContext, final FeatureContext featureContext) { + public void firstPassApply(final VariantContext variant) { final String svType = variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); final boolean isDelDup = 
svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DUP) || svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL); final boolean isLarge = Math.abs(variant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)) >= 5000; @@ -183,13 +173,13 @@ public void firstPassApply(final VariantContext variant, final ReadsContext read } } - public void secondPassApply(final VariantContext variant, final ReadsContext readsContext, final ReferenceContext referenceContext, final FeatureContext featureContext) { + public void secondPassApply(final VariantContext variant) { if (revisedEventsFiltered.containsKey(variant.getID())) { initializeRdCn(variant); } } - public void thirdPassApply(final VariantContext variant, final ReadsContext readsContext, final ReferenceContext referenceContext, final FeatureContext featureContext) { + public void thirdPassApply(final VariantContext variant) { VariantContextBuilder builder = new VariantContextBuilder(variant); if (revisedEventsAll.containsKey(variant.getID())) { processVariant(builder, variant); From 74de1b04a64171a4be484999206796bab530e1cc Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Fri, 18 Oct 2024 18:07:07 -0400 Subject: [PATCH 14/58] SVCleanPt2 WIP --- .../tools/walkers/sv/SVCleanPt1a.java | 4 +- .../tools/walkers/sv/SVCleanPt1b.java | 6 +- .../tools/walkers/sv/SVCleanPt2.java | 493 ++++++++++++++++++ 3 files changed, 499 insertions(+), 4 deletions(-) create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt2.java diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java index b302d337e1e..1c8d108fa1e 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java @@ -71,8 +71,8 @@ * */ @CommandLineProgramProperties( - summary = "Clean and format structural variant VCFs", - 
oneLineSummary = "Clean and format structural variant VCFs", + summary = "Clean and format SV VCF", + oneLineSummary = "Clean and format SV VCF", programGroup = StructuralVariantDiscoveryProgramGroup.class ) @BetaFeature diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1b.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1b.java index 3d2e234b640..7e90899f640 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1b.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1b.java @@ -65,8 +65,8 @@ * */ @CommandLineProgramProperties( - summary = "Clean and format structural variant VCFs per Step 1b", - oneLineSummary = "Clean and format structural variant VCFs per Step 1b", + summary = "Clean and format SV VCF", + oneLineSummary = "Clean and format SV VCF", programGroup = StructuralVariantDiscoveryProgramGroup.class ) @BetaFeature @@ -142,6 +142,8 @@ protected void nthPassApply(VariantContext variant, ReadsContext readsContext, R case 2: thirdPassApply(variant); break; + default: + throw new IllegalArgumentException("Invalid pass number: " + n); } } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt2.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt2.java new file mode 100644 index 00000000000..a7f1451b024 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt2.java @@ -0,0 +1,493 @@ +package org.broadinstitute.hellbender.tools.walkers.sv; + +import htsjdk.variant.variantcontext.Allele; +import htsjdk.variant.variantcontext.Genotype; +import htsjdk.variant.variantcontext.GenotypeBuilder; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.variantcontext.VariantContextBuilder; + +import org.broadinstitute.barclay.argparser.Argument; +import org.broadinstitute.barclay.argparser.BetaFeature; +import 
org.broadinstitute.barclay.argparser.CommandLineProgramProperties; +import org.broadinstitute.barclay.help.DocumentedFeature; +import org.broadinstitute.hellbender.cmdline.programgroups.StructuralVariantDiscoveryProgramGroup; +import org.broadinstitute.hellbender.engine.*; +import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants; + +import java.io.BufferedWriter; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Paths; + +import java.util.Arrays; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Set; +import java.util.Map; +import java.util.HashSet; +import java.util.HashMap; + +/** + * Completes an initial series of cleaning steps for a VCF produced by the GATK-SV pipeline. + * + *

Inputs

+ *
    + *
  • + * VCF containing structural variant (SV) records from the GATK-SV pipeline. + *
  • + *
  • + * TODO + *
  • + *
+ * + *

Output

+ *
    + *
  • + * Cleansed VCF. + *
  • + *
+ * + *

Usage Example

+ *
+ *     gatk SVCleanPt2 \
+ *       -V input.vcf.gz \
+ *       --sample-list samples.txt \
+ *       --multi-cnv-list multi.cnvs.txt \
+ *       --output-prefix result
+ * 
+ * + *

Cleaning Steps

+ *
    + *
  1. + * TODO + *
  2. + *
+ */ +@CommandLineProgramProperties( + summary = "Clean and format SV VCF", + oneLineSummary = "Clean and format SV VCF", + programGroup = StructuralVariantDiscoveryProgramGroup.class +) +@BetaFeature +@DocumentedFeature +public class SVCleanPt2 extends MultiplePassVariantWalker { + public static final String SAMPLE_LIST_LONG_NAME = "sample-list"; + public static final String MULTI_CNV_LONG_NAME = "multi-cnv-list"; + public static final String OUTPUT_PREFIX_LONG_NAME = "output-prefix"; + + @Argument( + fullName = SAMPLE_LIST_LONG_NAME, + doc = "Samples to include" + ) + private GATKPath sampleListPath; + + @Argument( + fullName = MULTI_CNV_LONG_NAME, + doc = "List of multiallelic CNVs" + ) + private GATKPath multiCnvPath; + + @Argument( + fullName = OUTPUT_PREFIX_LONG_NAME, + doc = "Prefix for output files" + ) + private String outputPrefix; + + private BufferedWriter revisedCnWriter; + + private Set sampleWhitelist; + private Set multiallelicCnvs; + + private Set wasRevisedToNormal = new HashSet<>(); + private Map> revisedCopyNumbers = new HashMap<>(); + private final Map> variantToSamplesWithAbnormalCN = new HashMap<>(); + private final List variantBuffer = new ArrayList<>(); + private final Map variantLengths = new HashMap<>(); + + @Override + protected int numberOfPasses() { + return 3; + } + + @Override + public void onTraversalStart() { + try { + sampleWhitelist = new HashSet<>(Files.readAllLines(sampleListPath.toPath())); + multiallelicCnvs = new HashSet<>(Files.readAllLines(multiCnvPath.toPath())); + } catch (IOException e) { + throw new RuntimeException("Error reading input file", e); + } + } + + @Override + public Object onTraversalSuccess() { + try { + for (Map.Entry> entry : revisedCopyNumbers.entrySet()) { + String variantID = entry.getKey(); + for (Map.Entry sampleEntry : entry.getValue().entrySet()) { + String sample = sampleEntry.getKey(); + int rdCn = sampleEntry.getValue(); + revisedCnWriter.write(variantID + "\t" + sample + "\t" + rdCn); + 
revisedCnWriter.newLine(); + } + } + + if (revisedCnWriter != null) { + revisedCnWriter.close(); + } + + return null; + } catch (IOException e) { + throw new RuntimeException("Error writing multiallelic CNVs", e); + } + } + + @Override + protected void nthPassApply(VariantContext variant, ReadsContext readsContext, ReferenceContext referenceContext, FeatureContext featureContext, int n) { + switch (n) { + case 0: + firstPassApply(variant); + break; + case 1: + secondPassApply(variant); + break; + case 2: + thirdPassApply(variant); + break; + default: + throw new IllegalArgumentException("Invalid pass number: " + n); + } + } + + @Override + protected void afterNthPass(int n) { + if (n == 2) { + try { + revisedCnWriter = Files.newBufferedWriter(Paths.get(outputPrefix + ".txt")); + } catch (IOException e) { + throw new RuntimeException("Error creating output files", e); + } + } + } + + private void firstPassApply(VariantContext variant) { + // Skip variants not in DEL or DUP + String svType = variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); + if (!svType.equals("") && !svType.equals("")) { + return; + } + + // Process each sample + for (String sample : variant.getSampleNames()) { + if (!sampleWhitelist.contains(sample)) { + continue; + } + Genotype genotype = variant.getGenotype(sample); + if (!genotype.isCalled()) { + continue; + } + Integer rdCn = genotype.hasExtendedAttribute("RD_CN") ? 
+ Integer.parseInt(genotype.getExtendedAttribute("RD_CN").toString()) : null; + if (rdCn == null || rdCn == 2) { + continue; + } + if ((svType.equals("") && rdCn < 2) || (svType.equals("") && rdCn > 2)) { + variantToSamplesWithAbnormalCN.computeIfAbsent(variant.getID(), k -> new HashSet<>()).add(sample); + } + } + + // Store variant length + int variantLength = Math.abs(variant.getAttributeAsInt("SVLEN", 0)); + variantLengths.put(variant.getID(), variantLength); + + // Add to variant buffer for overlap detection in the next pass + variantBuffer.add(variant); + } + + private void secondPassApply(VariantContext variant) { + String variantID = variant.getID(); + VariantContext currentVariant = variantBuffer.stream() + .filter(vc -> vc.getID().equals(variantID)) + .findFirst() + .orElse(null); + if (currentVariant == null) { + return; + } + + // Find overlapping variants + for (VariantContext otherVariant : variantBuffer) { + if (variantID.equals(otherVariant.getID())) { + continue; + } + if (variantsOverlap(currentVariant, otherVariant)) { + // Apply the logic from the script to adjust RD_CN values + adjustCopyNumbers(currentVariant, otherVariant); + } + } + } + + private void thirdPassApply(VariantContext variant) { + VariantContextBuilder builder = new VariantContextBuilder(variant); + String variantID = variant.getID(); + Map revisedRdCnForVariant = revisedCopyNumbers.getOrDefault(variantID, Collections.emptyMap()); + List newGenotypes = new ArrayList<>(); + + // Build the set of alleles for the variant + List variantAlleles = new ArrayList<>(variant.getAlleles()); + boolean variantAllelesModified = false; + + for (Genotype genotype : variant.getGenotypes()) { + String sample = genotype.getSampleName(); + Integer revisedRdCn = revisedRdCnForVariant.get(sample); + if (revisedRdCn != null) { + // Create a new genotype with the revised RD_CN + GenotypeBuilder gb = new GenotypeBuilder(genotype); + gb.attribute("RD_CN", revisedRdCn); + + // Adjust GT and alleles if 
necessary + if (revisedRdCn == 2) { + // Homozygous reference + gb.alleles(Arrays.asList(variant.getReference(), variant.getReference())); + gb.GQ(99); // Example GQ value for homozygous reference + } else { + // Heterozygous or other genotype + Allele altAllele; + if (variant.getAlternateAlleles().isEmpty()) { + // Need to create ALT allele + String svType = variant.getAttributeAsString("SVTYPE", null); + if (svType == null) { + throw new IllegalArgumentException("SVTYPE is missing for variant " + variantID); + } + altAllele = Allele.create("<" + svType + ">", false); + variantAlleles.add(altAllele); + variantAllelesModified = true; + } else { + altAllele = variant.getAlternateAllele(0); + } + gb.alleles(Arrays.asList(variant.getReference(), altAllele)); + } + + newGenotypes.add(gb.make()); + } else { + newGenotypes.add(genotype); + } + } + + // Update the variant's alleles if modified + if (variantAllelesModified) { + builder.alleles(variantAlleles); + } + + builder.genotypes(newGenotypes); + VariantContext updatedVariant = builder.make(); + identifyMultiallelicCnvs(updatedVariant); + } + + private boolean variantsOverlap(VariantContext v1, VariantContext v2) { + return v1.getContig().equals(v2.getContig()) && + v1.getStart() <= v2.getEnd() && + v2.getStart() <= v1.getEnd(); + } + + private void adjustCopyNumbers(VariantContext v1, VariantContext v2) { + // Determine larger and smaller variants + VariantContext largerVariant = (variantLengths.get(v1.getID()) >= variantLengths.get(v2.getID())) ? v1 : v2; + VariantContext smallerVariant = (largerVariant == v1) ? 
v2 : v1; + + // Calculate overlap + int overlapStart = Math.max(largerVariant.getStart(), smallerVariant.getStart()); + int overlapEnd = Math.min(largerVariant.getEnd(), smallerVariant.getEnd()); + int overlapLength = overlapEnd - overlapStart + 1; + double overlapPercentageSmaller = (double) overlapLength / (smallerVariant.getEnd() - smallerVariant.getStart() + 1); + double overlapPercentageLarger = (double) overlapLength / (largerVariant.getEnd() - largerVariant.getStart() + 1); + + // Apply logic based on support type and other conditions + // (Implementation of specific conditions from the script) + // For brevity, let's assume we have a method that applies these conditions + applyAdjustmentLogic(largerVariant, smallerVariant, overlapPercentageSmaller, overlapPercentageLarger); + } + + private void applyAdjustmentLogic(VariantContext largerVariant, VariantContext smallerVariant, + double overlapSmaller, double overlapLarger) { + + String smallerVariantID = smallerVariant.getID(); + String largerVariantID = largerVariant.getID(); + Map smallerVariantRdCn = getRdCnForVariant(smallerVariant); + Map largerVariantRdCn = getRdCnForVariant(largerVariant); + Map smallerVariantSupport = getSupportForVariant(smallerVariant); + Map largerVariantSupport = getSupportForVariant(largerVariant); + Map smallerVariantGT = getGTForVariant(smallerVariant); + Map largerVariantGT = getGTForVariant(largerVariant); + String svtype1 = smallerVariant.getAttributeAsString("SVTYPE", ""); + String svtype2 = largerVariant.getAttributeAsString("SVTYPE", ""); + + // Lengths of the variants + int length1 = smallerVariant.getEnd() - smallerVariant.getStart(); + int length2 = largerVariant.getEnd() - largerVariant.getStart(); + + // Iterate over samples present in both variants + Set samples = new HashSet<>(smallerVariant.getSampleNames()); + samples.retainAll(largerVariant.getSampleNames()); + + for (String sample : samples) { + String id1 = smallerVariantID + "@" + sample; + String id2 = 
largerVariantID + "@" + sample; + + // Check if id1 has already been revised to normal + if (wasRevisedToNormal.contains(id1)) { + continue; + } + + // Retrieve or update RD_CN values if they have been revised already + Integer RD_CN1 = revisedCopyNumbers.getOrDefault(smallerVariantID, Collections.emptyMap()).getOrDefault(sample, smallerVariantRdCn.get(sample)); + Integer RD_CN2 = revisedCopyNumbers.getOrDefault(largerVariantID, Collections.emptyMap()).getOrDefault(sample, largerVariantRdCn.get(sample)); + + String support1 = smallerVariantSupport.get(sample); + String support2 = largerVariantSupport.get(sample); + String GT1 = smallerVariantGT.get(sample); + String GT2 = largerVariantGT.get(sample); + + // Ensure RD_CN values are not null + if (RD_CN1 == null || RD_CN2 == null) { + continue; + } + + // Calculate overlaps + boolean smallOverlap50 = overlapSmaller > 0.5; + boolean largeOverlap50 = overlapLarger > 0.5; + + // Apply the conditions from the shell script + + // Condition 1: Smaller depth call is being driven by larger + if (support1.contains("RD") && !support1.equals("RD") && support2.equals("RD") && + smallOverlap50 && !multiallelicCnvs.contains(smallerVariantID)) { + + if (RD_CN1 == 0) { + makeRevision(id2, RD_CN2 + 2); + } else if (RD_CN1 == 1) { + makeRevision(id2, RD_CN2 + RD_CN1); + } else if (RD_CN1 > 1) { + int newCN = RD_CN2 - RD_CN1 + 2; + newCN = Math.max(newCN, 0); + makeRevision(id2, newCN); + } + } + + // Condition 2: Smaller CNV driving larger CNV genotype + else if (support1.equals("RD") && support2.contains("RD") && !support2.equals("RD") && + smallOverlap50 && !multiallelicCnvs.contains(largerVariantID) && + !GT2.equals("0/0") && largeOverlap50) { + + if (RD_CN2 == 0) { + makeRevision(id1, RD_CN1 + 2); + } else if (RD_CN2 == 1) { + makeRevision(id1, RD_CN1 + RD_CN2); + } else if (RD_CN2 > 1) { + int newCN = RD_CN1 - RD_CN2 + 2; + newCN = Math.max(newCN, 0); + makeRevision(id1, newCN); + } + } + + // Condition 3: Depth-only calls where 
smaller call is being driven by larger + else if (support1.equals("RD") && support2.equals("RD") && smallOverlap50 && + svtype1.equals(svtype2) && !multiallelicCnvs.contains(smallerVariantID)) { + + if (RD_CN1 == 0 && !RD_CN1.equals(RD_CN2)) { + makeRevision(id2, RD_CN2 + 2); + } else if (RD_CN1 == 1 && RD_CN1 > RD_CN2) { + makeRevision(id2, 1); + } else if (RD_CN1 > 1 && RD_CN1 < RD_CN2) { + int newCN = RD_CN2 - RD_CN1 + 2; + newCN = Math.max(newCN, 0); + makeRevision(id2, newCN); + } else { + makeRevision(id2, 2); + } + } + + // Condition 4: Any other time a larger call is driving a smaller call + else if (support1.contains("RD") && smallOverlap50 && length2 > 5000 && + !multiallelicCnvs.contains(smallerVariantID)) { + + if (RD_CN1 == 0) { + makeRevision(id2, RD_CN2 + 2); + } else if (RD_CN1 == 1) { + makeRevision(id2, RD_CN2 + RD_CN1); + } else if (RD_CN1 > 1) { + int newCN = RD_CN2 - RD_CN1 + 2; + newCN = Math.max(newCN, 0); + makeRevision(id2, newCN); + } + } + } + } + + private void makeRevision(String id, int val) { + // id is in the format variantID@sample + String[] tokens = id.split("@"); + String variantID = tokens[0]; + String sample = tokens[1]; + revisedCopyNumbers.computeIfAbsent(variantID, k -> new HashMap<>()).put(sample, val); + if (val == 2) { + wasRevisedToNormal.add(id); + } + } + + private Map getSupportForVariant(VariantContext variant) { + Map supportMap = new HashMap<>(); + for (String sample : variant.getSampleNames()) { + Genotype genotype = variant.getGenotype(sample); + String support = genotype.hasExtendedAttribute("EV") ? + genotype.getExtendedAttribute("EV").toString() : ""; + supportMap.put(sample, support); + } + return supportMap; + } + + private Map getGTForVariant(VariantContext variant) { + Map gtMap = new HashMap<>(); + for (String sample : variant.getSampleNames()) { + Genotype genotype = variant.getGenotype(sample); + String gt = genotype.isCalled() ? 
genotype.getGenotypeString() : "./."; + gtMap.put(sample, gt); + } + return gtMap; + } + + private Map getRdCnForVariant(VariantContext variant) { + Map rdCnMap = new HashMap<>(); + for (String sample : variant.getSampleNames()) { + Genotype genotype = variant.getGenotype(sample); + if (genotype.hasExtendedAttribute("RD_CN")) { + rdCnMap.put(sample, Integer.parseInt(genotype.getExtendedAttribute("RD_CN").toString())); + } + } + return rdCnMap; + } + + private void identifyMultiallelicCnvs(VariantContext variant) { + String svType = variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); + boolean isDel = svType.equals(""); + boolean isDup = svType.equals(""); + int variantLength = variantLengths.getOrDefault(variant.getID(), 0); + if ((isDel || isDup) && variantLength >= 5000) { + for (Genotype genotype : variant.getGenotypes()) { + Integer rdCn = genotype.hasExtendedAttribute("RD_CN") ? + Integer.parseInt(genotype.getExtendedAttribute("RD_CN").toString()) : null; + if (rdCn != null) { + if (isDel && rdCn > 3) { + // Multiallelic deletion + multiallelicCnvs.add(variant.getID()); + break; + } else if (isDup && (rdCn < 1 || rdCn > 4)) { + // Multiallelic duplication + multiallelicCnvs.add(variant.getID()); + break; + } + } + } + } + } +} From 74f0d7301649690f08e266eec29c952ba9c9d442 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Tue, 22 Oct 2024 15:38:09 -0400 Subject: [PATCH 15/58] Working version of SVCleanPt2 --- .../tools/walkers/sv/SVCleanPt2.java | 314 ++++++++---------- 1 file changed, 130 insertions(+), 184 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt2.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt2.java index a7f1451b024..9fc6aca2a4b 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt2.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt2.java @@ -5,6 +5,7 @@ import 
htsjdk.variant.variantcontext.GenotypeBuilder; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.variantcontext.VariantContextBuilder; +import htsjdk.samtools.util.OverlapDetector; import org.broadinstitute.barclay.argparser.Argument; import org.broadinstitute.barclay.argparser.BetaFeature; @@ -99,11 +100,12 @@ public class SVCleanPt2 extends MultiplePassVariantWalker { private Set sampleWhitelist; private Set multiallelicCnvs; - private Set wasRevisedToNormal = new HashSet<>(); - private Map> revisedCopyNumbers = new HashMap<>(); - private final Map> variantToSamplesWithAbnormalCN = new HashMap<>(); - private final List variantBuffer = new ArrayList<>(); - private final Map variantLengths = new HashMap<>(); + private final Map> abnormalRdCn = new HashMap<>(); + private OverlapDetector overlapDetector = new OverlapDetector<>(0, 0); + private final Map> revisedCopyNumbers = new HashMap<>(); // STATUS: To Be Verified + private final Set revisedComplete = new HashSet<>(); // STATUS: To Be Verified + + private static final int MIN_VARIANT_SIZE = 5000; @Override protected int numberOfPasses() { @@ -113,6 +115,8 @@ protected int numberOfPasses() { @Override public void onTraversalStart() { try { + revisedCnWriter = Files.newBufferedWriter(Paths.get(outputPrefix + ".txt")); + sampleWhitelist = new HashSet<>(Files.readAllLines(sampleListPath.toPath())); multiallelicCnvs = new HashSet<>(Files.readAllLines(multiCnvPath.toPath())); } catch (IOException e) { @@ -162,67 +166,44 @@ protected void nthPassApply(VariantContext variant, ReadsContext readsContext, R @Override protected void afterNthPass(int n) { - if (n == 2) { - try { - revisedCnWriter = Files.newBufferedWriter(Paths.get(outputPrefix + ".txt")); - } catch (IOException e) { - throw new RuntimeException("Error creating output files", e); - } - } + return; } private void firstPassApply(VariantContext variant) { - // Skip variants not in DEL or DUP - String svType = 
variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); - if (!svType.equals("") && !svType.equals("")) { + // Skip if not expected SVTYPE or SVLEN + if (!isDelDup(variant) || !isLargeVariant(variant, MIN_VARIANT_SIZE)) { return; } - // Process each sample + // Flag sample as having abnormal copy number if it passes various conditions for (String sample : variant.getSampleNames()) { - if (!sampleWhitelist.contains(sample)) { - continue; - } Genotype genotype = variant.getGenotype(sample); - if (!genotype.isCalled()) { + Integer rdCn = genotype.hasExtendedAttribute("RD_CN") ? Integer.parseInt(genotype.getExtendedAttribute("RD_CN").toString()) : null; + if (!sampleWhitelist.contains(sample) || !genotype.isCalled() || rdCn == null || rdCn == 2) { continue; } - Integer rdCn = genotype.hasExtendedAttribute("RD_CN") ? - Integer.parseInt(genotype.getExtendedAttribute("RD_CN").toString()) : null; - if (rdCn == null || rdCn == 2) { - continue; - } - if ((svType.equals("") && rdCn < 2) || (svType.equals("") && rdCn > 2)) { - variantToSamplesWithAbnormalCN.computeIfAbsent(variant.getID(), k -> new HashSet<>()).add(sample); + + String svType = variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); + if ((svType.equals("DEL") && rdCn < 2) || (svType.equals("DUP") && rdCn > 2)) { + abnormalRdCn.computeIfAbsent(variant.getID(), k -> new HashSet<>()).add(sample); } } - // Store variant length - int variantLength = Math.abs(variant.getAttributeAsInt("SVLEN", 0)); - variantLengths.put(variant.getID(), variantLength); - - // Add to variant buffer for overlap detection in the next pass - variantBuffer.add(variant); + // Add variant to overlap detector + overlapDetector.addLhs(variant, variant); } private void secondPassApply(VariantContext variant) { - String variantID = variant.getID(); - VariantContext currentVariant = variantBuffer.stream() - .filter(vc -> vc.getID().equals(variantID)) - .findFirst() - .orElse(null); - if (currentVariant == null) { + // Skip if not 
expected SVTYPE or SVLEN + if (!isDelDup(variant) || !isLargeVariant(variant, MIN_VARIANT_SIZE)) { return; } - // Find overlapping variants - for (VariantContext otherVariant : variantBuffer) { - if (variantID.equals(otherVariant.getID())) { - continue; - } - if (variantsOverlap(currentVariant, otherVariant)) { - // Apply the logic from the script to adjust RD_CN values - adjustCopyNumbers(currentVariant, otherVariant); + // Adjust copy numbers for overlapping variants + Set overlappingVariants = overlapDetector.getOverlaps(variant); + for (VariantContext otherVariant : overlappingVariants) { + if (!variant.getID().equals(otherVariant.getID())) { + adjustCopyNumbers(variant, otherVariant); } } } @@ -247,9 +228,8 @@ private void thirdPassApply(VariantContext variant) { // Adjust GT and alleles if necessary if (revisedRdCn == 2) { - // Homozygous reference gb.alleles(Arrays.asList(variant.getReference(), variant.getReference())); - gb.GQ(99); // Example GQ value for homozygous reference + gb.GQ(99); } else { // Heterozygous or other genotype Allele altAllele; @@ -284,122 +264,84 @@ private void thirdPassApply(VariantContext variant) { identifyMultiallelicCnvs(updatedVariant); } - private boolean variantsOverlap(VariantContext v1, VariantContext v2) { - return v1.getContig().equals(v2.getContig()) && - v1.getStart() <= v2.getEnd() && - v2.getStart() <= v1.getEnd(); - } - private void adjustCopyNumbers(VariantContext v1, VariantContext v2) { - // Determine larger and smaller variants - VariantContext largerVariant = (variantLengths.get(v1.getID()) >= variantLengths.get(v2.getID())) ? v1 : v2; - VariantContext smallerVariant = (largerVariant == v1) ? 
v2 : v1; + // Define data structures to store metadata + String variantId1 = v1.getID(); + String variantId2 = v2.getID(); + Map variantRdCn1 = getRdCnForVariant(v1); + Map variantRdCn2 = getRdCnForVariant(v2); + Map> variantSupport1 = getSupportForVariant(v1); + Map> variantSupport2 = getSupportForVariant(v2); + Map variantGt1 = getGTForVariant(v1); + Map variantGt2 = getGTForVariant(v2); + String svtype1 = v1.getAttributeAsString("SVTYPE", ""); + String svtype2 = v2.getAttributeAsString("SVTYPE", ""); // Calculate overlap - int overlapStart = Math.max(largerVariant.getStart(), smallerVariant.getStart()); - int overlapEnd = Math.min(largerVariant.getEnd(), smallerVariant.getEnd()); - int overlapLength = overlapEnd - overlapStart + 1; - double overlapPercentageSmaller = (double) overlapLength / (smallerVariant.getEnd() - smallerVariant.getStart() + 1); - double overlapPercentageLarger = (double) overlapLength / (largerVariant.getEnd() - largerVariant.getStart() + 1); - - // Apply logic based on support type and other conditions - // (Implementation of specific conditions from the script) - // For brevity, let's assume we have a method that applies these conditions - applyAdjustmentLogic(largerVariant, smallerVariant, overlapPercentageSmaller, overlapPercentageLarger); - } - - private void applyAdjustmentLogic(VariantContext largerVariant, VariantContext smallerVariant, - double overlapSmaller, double overlapLarger) { + int length1 = v1.getEnd() - v1.getStart(); + int length2 = v2.getEnd() - v2.getStart(); + int lengthOverlap = Math.min(v2.getEnd(), v1.getEnd()) - Math.max(v1.getStart(), v2.getStart()); + double overlap1 = (double) lengthOverlap / (double) length1; + double overlap2 = (double) lengthOverlap / (double) length2; - String smallerVariantID = smallerVariant.getID(); - String largerVariantID = largerVariant.getID(); - Map smallerVariantRdCn = getRdCnForVariant(smallerVariant); - Map largerVariantRdCn = getRdCnForVariant(largerVariant); - Map 
smallerVariantSupport = getSupportForVariant(smallerVariant); - Map largerVariantSupport = getSupportForVariant(largerVariant); - Map smallerVariantGT = getGTForVariant(smallerVariant); - Map largerVariantGT = getGTForVariant(largerVariant); - String svtype1 = smallerVariant.getAttributeAsString("SVTYPE", ""); - String svtype2 = largerVariant.getAttributeAsString("SVTYPE", ""); - - // Lengths of the variants - int length1 = smallerVariant.getEnd() - smallerVariant.getStart(); - int length2 = largerVariant.getEnd() - largerVariant.getStart(); - - // Iterate over samples present in both variants - Set samples = new HashSet<>(smallerVariant.getSampleNames()); - samples.retainAll(largerVariant.getSampleNames()); + // Get samples with abnormal CN across both variants + Set samples = new HashSet<>(abnormalRdCn.getOrDefault(variantId1, Collections.emptySet())); + samples.retainAll(abnormalRdCn.getOrDefault(variantId2, Collections.emptySet())); + // Iterate through samples to test against conditions for (String sample : samples) { - String id1 = smallerVariantID + "@" + sample; - String id2 = largerVariantID + "@" + sample; - - // Check if id1 has already been revised to normal - if (wasRevisedToNormal.contains(id1)) { - continue; - } - - // Retrieve or update RD_CN values if they have been revised already - Integer RD_CN1 = revisedCopyNumbers.getOrDefault(smallerVariantID, Collections.emptyMap()).getOrDefault(sample, smallerVariantRdCn.get(sample)); - Integer RD_CN2 = revisedCopyNumbers.getOrDefault(largerVariantID, Collections.emptyMap()).getOrDefault(sample, largerVariantRdCn.get(sample)); - - String support1 = smallerVariantSupport.get(sample); - String support2 = largerVariantSupport.get(sample); - String GT1 = smallerVariantGT.get(sample); - String GT2 = largerVariantGT.get(sample); - - // Ensure RD_CN values are not null - if (RD_CN1 == null || RD_CN2 == null) { + // Validate baseline filters + String id1 = variantId1 + "@" + sample; + String id2 = variantId2 + "@" 
+ sample; + Integer rdCn1 = revisedCopyNumbers.getOrDefault(variantId1, Collections.emptyMap()).getOrDefault(sample, variantRdCn1.get(sample)); + Integer rdCn2 = revisedCopyNumbers.getOrDefault(variantId2, Collections.emptyMap()).getOrDefault(sample, variantRdCn2.get(sample)); + if (revisedComplete.contains(id1) || rdCn1 == null || rdCn2 == null) { continue; } - // Calculate overlaps - boolean smallOverlap50 = overlapSmaller > 0.5; - boolean largeOverlap50 = overlapLarger > 0.5; - - // Apply the conditions from the shell script - - // Condition 1: Smaller depth call is being driven by larger - if (support1.contains("RD") && !support1.equals("RD") && support2.equals("RD") && - smallOverlap50 && !multiallelicCnvs.contains(smallerVariantID)) { - - if (RD_CN1 == 0) { - makeRevision(id2, RD_CN2 + 2); - } else if (RD_CN1 == 1) { - makeRevision(id2, RD_CN2 + RD_CN1); - } else if (RD_CN1 > 1) { - int newCN = RD_CN2 - RD_CN1 + 2; + // Initialize fields for evaluation + Set support1 = variantSupport1.get(sample); + Set support2 = variantSupport2.get(sample); + Genotype genotype1 = variantGt1.get(sample); + Genotype genotype2 = variantGt2.get(sample); + + // Condition 1: Smaller depth call is being driven by a larger call + if (support1.contains("RD") && support1.size() > 1 && support2.equals(Collections.singleton("RD")) + && overlap2 > 0.5 && !multiallelicCnvs.contains(variantId1)) { + if (rdCn1 == 0) { + makeRevision(id2, rdCn2 + 2); + } else if (rdCn1 == 1) { + makeRevision(id2, rdCn2 + rdCn1); + } else if (rdCn1 > 1) { + int newCN = rdCn2 - rdCn1 + 2; newCN = Math.max(newCN, 0); makeRevision(id2, newCN); } } - // Condition 2: Smaller CNV driving larger CNV genotype - else if (support1.equals("RD") && support2.contains("RD") && !support2.equals("RD") && - smallOverlap50 && !multiallelicCnvs.contains(largerVariantID) && - !GT2.equals("0/0") && largeOverlap50) { - - if (RD_CN2 == 0) { - makeRevision(id1, RD_CN1 + 2); - } else if (RD_CN2 == 1) { - makeRevision(id1, RD_CN1 + 
RD_CN2); - } else if (RD_CN2 > 1) { - int newCN = RD_CN1 - RD_CN2 + 2; + // Condition 2: Smaller CNV is driven by a larger CNV genotype + else if (support1.equals(Collections.singleton("RD")) && support2.contains("RD") && support2.size() > 1 + && overlap1 > 0.5 && overlap2 > 0.5 && !multiallelicCnvs.contains(variantId2) && !genotype2.isHomRef()) { + if (rdCn2 == 0) { + makeRevision(id1, rdCn1 + 2); + } else if (rdCn2 == 1) { + makeRevision(id1, rdCn1 + rdCn2); + } else if (rdCn2 > 1) { + int newCN = rdCn1 - rdCn2 + 2; newCN = Math.max(newCN, 0); makeRevision(id1, newCN); } } - // Condition 3: Depth-only calls where smaller call is being driven by larger - else if (support1.equals("RD") && support2.equals("RD") && smallOverlap50 && - svtype1.equals(svtype2) && !multiallelicCnvs.contains(smallerVariantID)) { - - if (RD_CN1 == 0 && !RD_CN1.equals(RD_CN2)) { - makeRevision(id2, RD_CN2 + 2); - } else if (RD_CN1 == 1 && RD_CN1 > RD_CN2) { + // Condition 3: Depth-only calls where smaller call is driven by a larger call + else if (support1.equals(Collections.singleton("RD")) && support2.equals(Collections.singleton("RD")) + && overlap2 > 0.5 && !multiallelicCnvs.contains(variantId1) && svtype1.equals(svtype2)) { + if (rdCn1 == 0 && !rdCn1.equals(rdCn2)) { + makeRevision(id2, rdCn2 + 2); + } else if (rdCn1 == 1 && rdCn1 > rdCn2) { makeRevision(id2, 1); - } else if (RD_CN1 > 1 && RD_CN1 < RD_CN2) { - int newCN = RD_CN2 - RD_CN1 + 2; + } else if (rdCn1 > 1 && rdCn1 < rdCn2) { + int newCN = rdCn2 - rdCn1 + 2; newCN = Math.max(newCN, 0); makeRevision(id2, newCN); } else { @@ -407,16 +349,15 @@ else if (support1.equals("RD") && support2.equals("RD") && smallOverlap50 && } } - // Condition 4: Any other time a larger call is driving a smaller call - else if (support1.contains("RD") && smallOverlap50 && length2 > 5000 && - !multiallelicCnvs.contains(smallerVariantID)) { - - if (RD_CN1 == 0) { - makeRevision(id2, RD_CN2 + 2); - } else if (RD_CN1 == 1) { - makeRevision(id2, RD_CN2 + 
RD_CN1); - } else if (RD_CN1 > 1) { - int newCN = RD_CN2 - RD_CN1 + 2; + // Condition 4: Any other time a larger call drives a smaller call + else if (support1.contains("RD") && overlap2 > 0.5 && !multiallelicCnvs.contains(variantId1) + && length2 > MIN_VARIANT_SIZE) { + if (rdCn1 == 0) { + makeRevision(id2, rdCn2 + 2); + } else if (rdCn1 == 1) { + makeRevision(id2, rdCn2 + rdCn1); + } else if (rdCn1 > 1) { + int newCN = rdCn2 - rdCn1 + 2; newCN = Math.max(newCN, 0); makeRevision(id2, newCN); } @@ -424,34 +365,35 @@ else if (support1.contains("RD") && smallOverlap50 && length2 > 5000 && } } - private void makeRevision(String id, int val) { - // id is in the format variantID@sample - String[] tokens = id.split("@"); - String variantID = tokens[0]; - String sample = tokens[1]; - revisedCopyNumbers.computeIfAbsent(variantID, k -> new HashMap<>()).put(sample, val); - if (val == 2) { - wasRevisedToNormal.add(id); - } + private boolean isDelDup(VariantContext variant) { + String svType = variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); + return svType.equals("DEL") || svType.equals("DUP"); } - private Map getSupportForVariant(VariantContext variant) { - Map supportMap = new HashMap<>(); + private boolean isLargeVariant(VariantContext variant, int minSize) { + int variantLength = Math.abs(variant.getAttributeAsInt("SVLEN", 0)); + return variantLength >= minSize; + } + + private Map> getSupportForVariant(VariantContext variant) { + Map> supportMap = new HashMap<>(); for (String sample : variant.getSampleNames()) { Genotype genotype = variant.getGenotype(sample); - String support = genotype.hasExtendedAttribute("EV") ? - genotype.getExtendedAttribute("EV").toString() : ""; - supportMap.put(sample, support); + String supportStr = genotype.hasExtendedAttribute("EV") ? 
genotype.getExtendedAttribute("EV").toString() : ""; + Set supportSet = new HashSet<>(); + if (!supportStr.isEmpty()) { + supportSet.addAll(Arrays.asList(supportStr.split(","))); + } + supportMap.put(sample, supportSet); } return supportMap; } - private Map getGTForVariant(VariantContext variant) { - Map gtMap = new HashMap<>(); + private Map getGTForVariant(VariantContext variant) { + Map gtMap = new HashMap<>(); for (String sample : variant.getSampleNames()) { Genotype genotype = variant.getGenotype(sample); - String gt = genotype.isCalled() ? genotype.getGenotypeString() : "./."; - gtMap.put(sample, gt); + gtMap.put(sample, genotype); } return gtMap; } @@ -467,22 +409,26 @@ private Map getRdCnForVariant(VariantContext variant) { return rdCnMap; } + private void makeRevision(String id, int val) { + String[] tokens = id.split("@"); + String variantId = tokens[0]; + String sample = tokens[1]; + revisedCopyNumbers.computeIfAbsent(variantId, k -> new HashMap<>()).put(sample, val); + if (val == 2) { + revisedComplete.add(id); + } + } + private void identifyMultiallelicCnvs(VariantContext variant) { - String svType = variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); - boolean isDel = svType.equals(""); - boolean isDup = svType.equals(""); - int variantLength = variantLengths.getOrDefault(variant.getID(), 0); - if ((isDel || isDup) && variantLength >= 5000) { + if (isDelDup(variant) && isLargeVariant(variant, MIN_VARIANT_SIZE)) { for (Genotype genotype : variant.getGenotypes()) { - Integer rdCn = genotype.hasExtendedAttribute("RD_CN") ? - Integer.parseInt(genotype.getExtendedAttribute("RD_CN").toString()) : null; + Integer rdCn = genotype.hasExtendedAttribute("RD_CN") ? 
Integer.parseInt(genotype.getExtendedAttribute("RD_CN").toString()) : null; if (rdCn != null) { - if (isDel && rdCn > 3) { - // Multiallelic deletion + String svType = variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); + if (svType.equals("DEL") && rdCn > 3) { multiallelicCnvs.add(variant.getID()); break; - } else if (isDup && (rdCn < 1 || rdCn > 4)) { - // Multiallelic duplication + } else if (svType.equals("DUP") && (rdCn < 1 || rdCn > 4)) { multiallelicCnvs.add(variant.getID()); break; } From 54f5f968b512cd8537290b96a6b5f36940541796 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Tue, 22 Oct 2024 15:48:34 -0400 Subject: [PATCH 16/58] Code cleanup --- .../tools/walkers/sv/SVCleanPt2.java | 125 ++---------------- 1 file changed, 13 insertions(+), 112 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt2.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt2.java index 9fc6aca2a4b..340dd24588f 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt2.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt2.java @@ -1,10 +1,7 @@ package org.broadinstitute.hellbender.tools.walkers.sv; -import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.Genotype; -import htsjdk.variant.variantcontext.GenotypeBuilder; import htsjdk.variant.variantcontext.VariantContext; -import htsjdk.variant.variantcontext.VariantContextBuilder; import htsjdk.samtools.util.OverlapDetector; import org.broadinstitute.barclay.argparser.Argument; @@ -21,9 +18,7 @@ import java.nio.file.Paths; import java.util.Arrays; -import java.util.ArrayList; import java.util.Collections; -import java.util.List; import java.util.Set; import java.util.Map; import java.util.HashSet; @@ -109,7 +104,7 @@ public class SVCleanPt2 extends MultiplePassVariantWalker { @Override protected int numberOfPasses() { - return 3; + return 2; } @Override @@ -149,6 +144,11 @@ 
public Object onTraversalSuccess() { @Override protected void nthPassApply(VariantContext variant, ReadsContext readsContext, ReferenceContext referenceContext, FeatureContext featureContext, int n) { + // Skip if not expected SVTYPE or below SVLEN threshold + if (!isDelDup(variant) || !isLargeVariant(variant, MIN_VARIANT_SIZE)) { + return; + } + switch (n) { case 0: firstPassApply(variant); @@ -156,9 +156,6 @@ protected void nthPassApply(VariantContext variant, ReadsContext readsContext, R case 1: secondPassApply(variant); break; - case 2: - thirdPassApply(variant); - break; default: throw new IllegalArgumentException("Invalid pass number: " + n); } @@ -170,12 +167,7 @@ protected void afterNthPass(int n) { } private void firstPassApply(VariantContext variant) { - // Skip if not expected SVTYPE or SVLEN - if (!isDelDup(variant) || !isLargeVariant(variant, MIN_VARIANT_SIZE)) { - return; - } - - // Flag sample as having abnormal copy number if it passes various conditions + // Flag sample as having an abnormal copy number if it passes certain conditions for (String sample : variant.getSampleNames()) { Genotype genotype = variant.getGenotype(sample); Integer rdCn = genotype.hasExtendedAttribute("RD_CN") ? 
Integer.parseInt(genotype.getExtendedAttribute("RD_CN").toString()) : null; @@ -194,90 +186,27 @@ private void firstPassApply(VariantContext variant) { } private void secondPassApply(VariantContext variant) { - // Skip if not expected SVTYPE or SVLEN - if (!isDelDup(variant) || !isLargeVariant(variant, MIN_VARIANT_SIZE)) { - return; - } - - // Adjust copy numbers for overlapping variants + // Check if copy number needs to be adjusted for samples within overlapping variants Set overlappingVariants = overlapDetector.getOverlaps(variant); for (VariantContext otherVariant : overlappingVariants) { if (!variant.getID().equals(otherVariant.getID())) { - adjustCopyNumbers(variant, otherVariant); + adjustCopyNumber(variant, otherVariant); } } } - private void thirdPassApply(VariantContext variant) { - VariantContextBuilder builder = new VariantContextBuilder(variant); - String variantID = variant.getID(); - Map revisedRdCnForVariant = revisedCopyNumbers.getOrDefault(variantID, Collections.emptyMap()); - List newGenotypes = new ArrayList<>(); - - // Build the set of alleles for the variant - List variantAlleles = new ArrayList<>(variant.getAlleles()); - boolean variantAllelesModified = false; - - for (Genotype genotype : variant.getGenotypes()) { - String sample = genotype.getSampleName(); - Integer revisedRdCn = revisedRdCnForVariant.get(sample); - if (revisedRdCn != null) { - // Create a new genotype with the revised RD_CN - GenotypeBuilder gb = new GenotypeBuilder(genotype); - gb.attribute("RD_CN", revisedRdCn); - - // Adjust GT and alleles if necessary - if (revisedRdCn == 2) { - gb.alleles(Arrays.asList(variant.getReference(), variant.getReference())); - gb.GQ(99); - } else { - // Heterozygous or other genotype - Allele altAllele; - if (variant.getAlternateAlleles().isEmpty()) { - // Need to create ALT allele - String svType = variant.getAttributeAsString("SVTYPE", null); - if (svType == null) { - throw new IllegalArgumentException("SVTYPE is missing for variant " + 
variantID); - } - altAllele = Allele.create("<" + svType + ">", false); - variantAlleles.add(altAllele); - variantAllelesModified = true; - } else { - altAllele = variant.getAlternateAllele(0); - } - gb.alleles(Arrays.asList(variant.getReference(), altAllele)); - } - - newGenotypes.add(gb.make()); - } else { - newGenotypes.add(genotype); - } - } - - // Update the variant's alleles if modified - if (variantAllelesModified) { - builder.alleles(variantAlleles); - } - - builder.genotypes(newGenotypes); - VariantContext updatedVariant = builder.make(); - identifyMultiallelicCnvs(updatedVariant); - } - - private void adjustCopyNumbers(VariantContext v1, VariantContext v2) { - // Define data structures to store metadata + private void adjustCopyNumber(VariantContext v1, VariantContext v2) { + // Track metadata through data structures String variantId1 = v1.getID(); String variantId2 = v2.getID(); Map variantRdCn1 = getRdCnForVariant(v1); Map variantRdCn2 = getRdCnForVariant(v2); Map> variantSupport1 = getSupportForVariant(v1); Map> variantSupport2 = getSupportForVariant(v2); - Map variantGt1 = getGTForVariant(v1); - Map variantGt2 = getGTForVariant(v2); String svtype1 = v1.getAttributeAsString("SVTYPE", ""); String svtype2 = v2.getAttributeAsString("SVTYPE", ""); - // Calculate overlap + // Calculate overlap metadata int length1 = v1.getEnd() - v1.getStart(); int length2 = v2.getEnd() - v2.getStart(); int lengthOverlap = Math.min(v2.getEnd(), v1.getEnd()) - Math.max(v1.getStart(), v2.getStart()); @@ -302,8 +231,7 @@ private void adjustCopyNumbers(VariantContext v1, VariantContext v2) { // Initialize fields for evaluation Set support1 = variantSupport1.get(sample); Set support2 = variantSupport2.get(sample); - Genotype genotype1 = variantGt1.get(sample); - Genotype genotype2 = variantGt2.get(sample); + Genotype genotype2 = v2.getGenotype(sample); // Condition 1: Smaller depth call is being driven by a larger call if (support1.contains("RD") && support1.size() > 1 && 
support2.equals(Collections.singleton("RD")) @@ -389,15 +317,6 @@ private Map> getSupportForVariant(VariantContext variant) { return supportMap; } - private Map getGTForVariant(VariantContext variant) { - Map gtMap = new HashMap<>(); - for (String sample : variant.getSampleNames()) { - Genotype genotype = variant.getGenotype(sample); - gtMap.put(sample, genotype); - } - return gtMap; - } - private Map getRdCnForVariant(VariantContext variant) { Map rdCnMap = new HashMap<>(); for (String sample : variant.getSampleNames()) { @@ -418,22 +337,4 @@ private void makeRevision(String id, int val) { revisedComplete.add(id); } } - - private void identifyMultiallelicCnvs(VariantContext variant) { - if (isDelDup(variant) && isLargeVariant(variant, MIN_VARIANT_SIZE)) { - for (Genotype genotype : variant.getGenotypes()) { - Integer rdCn = genotype.hasExtendedAttribute("RD_CN") ? Integer.parseInt(genotype.getExtendedAttribute("RD_CN").toString()) : null; - if (rdCn != null) { - String svType = variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); - if (svType.equals("DEL") && rdCn > 3) { - multiallelicCnvs.add(variant.getID()); - break; - } else if (svType.equals("DUP") && (rdCn < 1 || rdCn > 4)) { - multiallelicCnvs.add(variant.getID()); - break; - } - } - } - } - } } From 18da350c245084a1a22a32e8d6c587fdeb91b991 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Tue, 22 Oct 2024 16:03:20 -0400 Subject: [PATCH 17/58] Added sorting and better formatting of outputs --- .../spark/sv/utils/GATKSVVCFConstants.java | 2 +- .../tools/walkers/sv/SVCleanPt1a.java | 4 +- .../tools/walkers/sv/SVCleanPt2.java | 47 ++++++++++++------- 3 files changed, 32 insertions(+), 21 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java index 200a448e717..bad1fb6235a 100644 --- 
a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java @@ -152,7 +152,7 @@ public enum ComplexVariantSubtype { // CleanPt1a public static final String EV = "EV"; - public static final List evValues = Arrays.asList( + public static final List EV_VALUES = Arrays.asList( null, "RD", "PE", "RD,PE", "SR", "RD,SR", "PE,SR", "RD,PE,SR" ); public static final String ME = "ME"; diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java index 1c8d108fa1e..c460ff281ff 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java @@ -194,8 +194,8 @@ private void processEVGenotype(final Genotype genotype, final GenotypeBuilder ge if (genotype.hasExtendedAttribute(GATKSVVCFConstants.EV)) { String evAttribute = (String) genotype.getExtendedAttribute(GATKSVVCFConstants.EV); final int evIndex = Integer.parseInt(evAttribute); - if (evIndex >= 0 && evIndex < GATKSVVCFConstants.evValues.size()) { - genotypeBuilder.attribute(GATKSVVCFConstants.EV, GATKSVVCFConstants.evValues.get(evIndex)); + if (evIndex >= 0 && evIndex < GATKSVVCFConstants.EV_VALUES.size()) { + genotypeBuilder.attribute(GATKSVVCFConstants.EV, GATKSVVCFConstants.EV_VALUES.get(evIndex)); } } } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt2.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt2.java index 340dd24588f..0d6ef449a35 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt2.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt2.java @@ -18,11 +18,13 @@ import java.nio.file.Paths; import java.util.Arrays; -import java.util.Collections; +import 
java.util.List; +import java.util.ArrayList; import java.util.Set; import java.util.Map; import java.util.HashSet; import java.util.HashMap; +import java.util.Collections; /** * Completes an initial series of cleaning steps for a VCF produced by the GATK-SV pipeline. @@ -97,8 +99,8 @@ public class SVCleanPt2 extends MultiplePassVariantWalker { private final Map> abnormalRdCn = new HashMap<>(); private OverlapDetector overlapDetector = new OverlapDetector<>(0, 0); - private final Map> revisedCopyNumbers = new HashMap<>(); // STATUS: To Be Verified - private final Set revisedComplete = new HashSet<>(); // STATUS: To Be Verified + private final Map> revisedCopyNumbers = new HashMap<>(); + private final Set revisedComplete = new HashSet<>(); private static final int MIN_VARIANT_SIZE = 5000; @@ -122,11 +124,17 @@ public void onTraversalStart() { @Override public Object onTraversalSuccess() { try { - for (Map.Entry> entry : revisedCopyNumbers.entrySet()) { - String variantID = entry.getKey(); - for (Map.Entry sampleEntry : entry.getValue().entrySet()) { - String sample = sampleEntry.getKey(); - int rdCn = sampleEntry.getValue(); + List variantIDs = new ArrayList<>(revisedCopyNumbers.keySet()); + Collections.sort(variantIDs); + + for (String variantID : variantIDs) { + Map sampleMap = revisedCopyNumbers.get(variantID); + + List samples = new ArrayList<>(sampleMap.keySet()); + Collections.sort(samples); + + for (String sample : samples) { + int rdCn = sampleMap.get(sample); revisedCnWriter.write(variantID + "\t" + sample + "\t" + rdCn); revisedCnWriter.newLine(); } @@ -176,7 +184,7 @@ private void firstPassApply(VariantContext variant) { } String svType = variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); - if ((svType.equals("DEL") && rdCn < 2) || (svType.equals("DUP") && rdCn > 2)) { + if ((svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL) && rdCn < 2) || (svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DUP) && rdCn > 2)) { 
abnormalRdCn.computeIfAbsent(variant.getID(), k -> new HashSet<>()).add(sample); } } @@ -234,7 +242,8 @@ private void adjustCopyNumber(VariantContext v1, VariantContext v2) { Genotype genotype2 = v2.getGenotype(sample); // Condition 1: Smaller depth call is being driven by a larger call - if (support1.contains("RD") && support1.size() > 1 && support2.equals(Collections.singleton("RD")) + if (support1.contains(GATKSVVCFConstants.EV_VALUES.get(1)) && support1.size() > 1 + && support2.equals(Collections.singleton(GATKSVVCFConstants.EV_VALUES.get(1))) && overlap2 > 0.5 && !multiallelicCnvs.contains(variantId1)) { if (rdCn1 == 0) { makeRevision(id2, rdCn2 + 2); @@ -248,7 +257,8 @@ private void adjustCopyNumber(VariantContext v1, VariantContext v2) { } // Condition 2: Smaller CNV is driven by a larger CNV genotype - else if (support1.equals(Collections.singleton("RD")) && support2.contains("RD") && support2.size() > 1 + else if (support1.equals(Collections.singleton(GATKSVVCFConstants.EV_VALUES.get(1))) + && support2.contains(GATKSVVCFConstants.EV_VALUES.get(1)) && support2.size() > 1 && overlap1 > 0.5 && overlap2 > 0.5 && !multiallelicCnvs.contains(variantId2) && !genotype2.isHomRef()) { if (rdCn2 == 0) { makeRevision(id1, rdCn1 + 2); @@ -262,7 +272,8 @@ else if (support1.equals(Collections.singleton("RD")) && support2.contains("RD") } // Condition 3: Depth-only calls where smaller call is driven by a larger call - else if (support1.equals(Collections.singleton("RD")) && support2.equals(Collections.singleton("RD")) + else if (support1.equals(Collections.singleton(GATKSVVCFConstants.EV_VALUES.get(1))) + && support2.equals(Collections.singleton(GATKSVVCFConstants.EV_VALUES.get(1))) && overlap2 > 0.5 && !multiallelicCnvs.contains(variantId1) && svtype1.equals(svtype2)) { if (rdCn1 == 0 && !rdCn1.equals(rdCn2)) { makeRevision(id2, rdCn2 + 2); @@ -278,7 +289,7 @@ else if (support1.equals(Collections.singleton("RD")) && support2.equals(Collect } // Condition 4: Any other time 
a larger call drives a smaller call - else if (support1.contains("RD") && overlap2 > 0.5 && !multiallelicCnvs.contains(variantId1) + else if (support1.contains(GATKSVVCFConstants.EV_VALUES.get(1)) && overlap2 > 0.5 && !multiallelicCnvs.contains(variantId1) && length2 > MIN_VARIANT_SIZE) { if (rdCn1 == 0) { makeRevision(id2, rdCn2 + 2); @@ -295,11 +306,11 @@ else if (support1.contains("RD") && overlap2 > 0.5 && !multiallelicCnvs.contains private boolean isDelDup(VariantContext variant) { String svType = variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); - return svType.equals("DEL") || svType.equals("DUP"); + return svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL) || svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DUP); } private boolean isLargeVariant(VariantContext variant, int minSize) { - int variantLength = Math.abs(variant.getAttributeAsInt("SVLEN", 0)); + int variantLength = Math.abs(variant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)); return variantLength >= minSize; } @@ -307,7 +318,7 @@ private Map> getSupportForVariant(VariantContext variant) { Map> supportMap = new HashMap<>(); for (String sample : variant.getSampleNames()) { Genotype genotype = variant.getGenotype(sample); - String supportStr = genotype.hasExtendedAttribute("EV") ? genotype.getExtendedAttribute("EV").toString() : ""; + String supportStr = genotype.hasExtendedAttribute(GATKSVVCFConstants.EV) ? 
genotype.getExtendedAttribute(GATKSVVCFConstants.EV).toString() : ""; Set supportSet = new HashSet<>(); if (!supportStr.isEmpty()) { supportSet.addAll(Arrays.asList(supportStr.split(","))); @@ -321,8 +332,8 @@ private Map getRdCnForVariant(VariantContext variant) { Map rdCnMap = new HashMap<>(); for (String sample : variant.getSampleNames()) { Genotype genotype = variant.getGenotype(sample); - if (genotype.hasExtendedAttribute("RD_CN")) { - rdCnMap.put(sample, Integer.parseInt(genotype.getExtendedAttribute("RD_CN").toString())); + if (genotype.hasExtendedAttribute(GATKSVVCFConstants.RD_CN)) { + rdCnMap.put(sample, Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN).toString())); } } return rdCnMap; From f7e14c6cdf5dcb50b551a05287a2c08962f64bd9 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Wed, 23 Oct 2024 12:55:28 -0400 Subject: [PATCH 18/58] Initial commit for CleanPt4 --- .../spark/sv/utils/GATKSVVCFConstants.java | 6 + .../tools/walkers/sv/SVCleanPt2.java | 12 +- .../tools/walkers/sv/SVCleanPt4.java | 269 ++++++++++++++++++ 3 files changed, 281 insertions(+), 6 deletions(-) create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java index bad1fb6235a..21e8638ebcf 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java @@ -168,6 +168,12 @@ public enum ComplexVariantSubtype { public static final String RD_GQ = "RD_GQ"; public static final String CNVS_DEFAULT_FILE = "multi.cnvs.txt"; + // CleanPt4 + public static final String PE_GT = "PE_GT"; + public static final String SR_GT = "SR_GT"; + public static final String PE_GQ = "PE_GQ"; + public static final String SR_GQ = 
"SR_GQ"; + // Clustering public static final String CLUSTER_MEMBER_IDS_KEY = "MEMBERS"; diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt2.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt2.java index 0d6ef449a35..e4207c539af 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt2.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt2.java @@ -178,8 +178,8 @@ private void firstPassApply(VariantContext variant) { // Flag sample as having an abnormal copy number if it passes certain conditions for (String sample : variant.getSampleNames()) { Genotype genotype = variant.getGenotype(sample); - Integer rdCn = genotype.hasExtendedAttribute("RD_CN") ? Integer.parseInt(genotype.getExtendedAttribute("RD_CN").toString()) : null; - if (!sampleWhitelist.contains(sample) || !genotype.isCalled() || rdCn == null || rdCn == 2) { + int rdCn = Integer.parseInt(genotype.getExtendedAttribute("RD_CN").toString()); + if (!sampleWhitelist.contains(sample) || !genotype.isCalled() || rdCn == 2) { continue; } @@ -230,9 +230,9 @@ private void adjustCopyNumber(VariantContext v1, VariantContext v2) { // Validate baseline filters String id1 = variantId1 + "@" + sample; String id2 = variantId2 + "@" + sample; - Integer rdCn1 = revisedCopyNumbers.getOrDefault(variantId1, Collections.emptyMap()).getOrDefault(sample, variantRdCn1.get(sample)); - Integer rdCn2 = revisedCopyNumbers.getOrDefault(variantId2, Collections.emptyMap()).getOrDefault(sample, variantRdCn2.get(sample)); - if (revisedComplete.contains(id1) || rdCn1 == null || rdCn2 == null) { + int rdCn1 = revisedCopyNumbers.getOrDefault(variantId1, Collections.emptyMap()).getOrDefault(sample, variantRdCn1.get(sample)); + int rdCn2 = revisedCopyNumbers.getOrDefault(variantId2, Collections.emptyMap()).getOrDefault(sample, variantRdCn2.get(sample)); + if (revisedComplete.contains(id1)) { continue; } @@ -275,7 +275,7 @@ else if 
(support1.equals(Collections.singleton(GATKSVVCFConstants.EV_VALUES.get( else if (support1.equals(Collections.singleton(GATKSVVCFConstants.EV_VALUES.get(1))) && support2.equals(Collections.singleton(GATKSVVCFConstants.EV_VALUES.get(1))) && overlap2 > 0.5 && !multiallelicCnvs.contains(variantId1) && svtype1.equals(svtype2)) { - if (rdCn1 == 0 && !rdCn1.equals(rdCn2)) { + if (rdCn1 == 0 && rdCn1 != rdCn2) { makeRevision(id2, rdCn2 + 2); } else if (rdCn1 == 1 && rdCn1 > rdCn2) { makeRevision(id2, 1); diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java new file mode 100644 index 00000000000..22df7cada76 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java @@ -0,0 +1,269 @@ +package org.broadinstitute.hellbender.tools.walkers.sv; + +import htsjdk.variant.variantcontext.Genotype; +import htsjdk.variant.variantcontext.GenotypeBuilder; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.variantcontext.VariantContextBuilder; +import htsjdk.variant.variantcontext.writer.VariantContextWriter; + +import org.broadinstitute.barclay.argparser.Argument; +import org.broadinstitute.barclay.argparser.BetaFeature; +import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; +import org.broadinstitute.barclay.help.DocumentedFeature; +import org.broadinstitute.hellbender.cmdline.programgroups.StructuralVariantDiscoveryProgramGroup; +import org.broadinstitute.hellbender.engine.*; +import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants; + +import java.io.BufferedReader; +import java.io.BufferedWriter; +import java.io.FileReader; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Paths; + +import java.util.Arrays; +import java.util.List; +import java.util.ArrayList; +import java.util.Set; +import java.util.Map; +import java.util.HashSet; 
+import java.util.HashMap; +import java.util.Collections; +import java.util.zip.GZIPOutputStream; + +/** + * Completes an initial series of cleaning steps for a VCF produced by the GATK-SV pipeline. + * + *

Inputs

+ *
    + *
  • + * VCF containing structural variant (SV) records from the GATK-SV pipeline. + *
  • + *
  • + * TODO + *
  • + *
+ * + *

Output

+ *
    + *
  • + * Cleansed VCF. + *
  • + *
+ * + *

Usage Example

+ *
+ *     gatk SVCleanPt4 \
+ *       -V input.vcf.gz \
+ *       --revised-cn-list revised.txt \
+ * 	     --output-prefix result
+ * 
+ * + *

Cleaning Steps

+ *
    + *
  1. + * TODO + *
  2. + *
+ */ +@CommandLineProgramProperties( + summary = "SClean and format SV VCF", + oneLineSummary = "Clean and format SV VCF", + programGroup = StructuralVariantDiscoveryProgramGroup.class +) +@BetaFeature +@DocumentedFeature +public class SVCleanPt4 extends VariantWalker { + public static final String REVISED_CN_LIST_LONG_NAME = "revised-cn-list"; + public static final String OUTPUT_PREFIX_LONG_NAME = "output-prefix"; + + @Argument( + fullName = REVISED_CN_LIST_LONG_NAME, + doc = "File with variant IDs, sample IDs, and RD_CN values" + ) + private GATKPath cnReviseList; + + @Argument( + fullName = OUTPUT_PREFIX_LONG_NAME, + doc = "Prefix for output files" + ) + private String outputPrefix; + + private VariantContextWriter vcfWriter; + private BufferedWriter multiGenoWriter; + + private Map> revisedCopyNumbers; + private final Set multiGenoIds = new HashSet<>(); + + private double recordStart; + private double recordEnd; + private int maxVF; + private long recordIdx; + + @Override + public void onTraversalStart() { + // Read revised copy numbers + revisedCopyNumbers = readRevisedEvents(cnReviseList); + + // Parse batchNum and totalBatch from file name + String cnReviseListFileName = cnReviseList.toPath().getFileName().toString(); + String[] regenoFileNameTokens = cnReviseListFileName.split("\\."); + String[] batchTokens = regenoFileNameTokens[1].split("_"); + int batchNum = Math.max(Integer.parseInt(batchTokens[0]), 1); + int totalBatch = Math.max(Integer.parseInt(batchTokens[1]), 1); + + // Get variant count + long totalNumVariants = 0; + String inputVcfPath = getDrivingVariantsFeatureInput().getFeaturePath(); + try (FeatureDataSource dataSource = new FeatureDataSource<>(inputVcfPath)) { + for (VariantContext vc : dataSource) { + totalNumVariants++; + } + } + + // Initialize metadata variables + double segments = totalNumVariants / (double) totalBatch; + recordStart = (batchNum - 1) * segments; + recordEnd = batchNum * segments; + maxVF = Math.max((int) 
(getHeaderForVariants().getGenotypeSamples().size() * 0.01), 2); + recordIdx = 0; + + // Create output writers + try { + vcfWriter = createVCFWriter(Paths.get(outputPrefix + ".revised_vcf_lines.txt")); + vcfWriter.writeHeader(getHeaderForVariants()); + + multiGenoWriter = Files.newBufferedWriter(Paths.get(outputPrefix + ".multi_geno_ids.txt")); + } catch (IOException e) { + throw new RuntimeException("Error creating output file", e); + } + } + + @Override + public Object onTraversalSuccess() { + try { + List variantIDs = new ArrayList<>(multiGenoIds); + Collections.sort(variantIDs); + for (String variantID : variantIDs) { + multiGenoWriter.write(variantID); + multiGenoWriter.newLine(); + } + return null; + } catch (IOException e) { + throw new RuntimeException("Error writing to output file ", e); + } + } + + public void closeTool() { + try { + if (vcfWriter != null) { + vcfWriter.close(); + } + if (multiGenoWriter != null) { + multiGenoWriter.close(); + } + } catch (IOException e) { + throw new RuntimeException("Error closing output file ", e); + } + } + + @Override + public void apply(VariantContext variant, ReadsContext readsContext, ReferenceContext referenceContext, FeatureContext featureContext) { + // Initialize data structures + recordIdx++; + VariantContextBuilder variantBuilder = new VariantContextBuilder(variant); + List genotypes = variant.getGenotypes(); + + // Modify genotypes if variant appears in revise list + if (revisedCopyNumbers.containsKey(variant.getID())) { + Map sampleCnMap = revisedCopyNumbers.get(variant.getID()); + List newGenotypes = new ArrayList<>(); + for (Genotype genotype : genotypes) { + String sampleName = genotype.getSampleName(); + if (sampleCnMap.containsKey(sampleName)) { + GenotypeBuilder gb = new GenotypeBuilder(genotype); + gb.alleles(Arrays.asList(variant.getReference(), variant.getAlternateAllele(0))); + gb.attribute(GATKSVVCFConstants.RD_CN, sampleCnMap.get(sampleName)); + newGenotypes.add(gb.make()); + } else { + 
newGenotypes.add(genotype); + } + } + variantBuilder.genotypes(newGenotypes); + vcfWriter.add(variantBuilder.make()); + } + + // Identify multiple genotypes if within recordStart and recordEnd + if (recordIdx >= recordStart && recordIdx < recordEnd) { + int numGtOver2 = 0; + for (Genotype genotype : genotypes) { + Integer peGt = genotype.hasExtendedAttribute(GATKSVVCFConstants.PE_GT) ? + Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.PE_GT).toString()) : null; + Integer srGt = genotype.hasExtendedAttribute(GATKSVVCFConstants.SR_GT) ? + Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.SR_GT).toString()) : null; + Integer gt = null; + if (peGt == null) { + continue; + } else if (srGt == null) { + gt = peGt; + } else if (peGt > 0 && srGt == 0) { + gt = peGt; + } else if (peGt == 0) { + gt = srGt; + } else { + Integer peGq = genotype.hasExtendedAttribute(GATKSVVCFConstants.PE_GQ) ? + Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.PE_GQ).toString()) : null; + Integer srGq = genotype.hasExtendedAttribute(GATKSVVCFConstants.SR_GQ) ? 
+ Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.SR_GQ).toString()) : null; + if (peGq != null && srGq != null && peGq >= srGq) { + gt = peGt; + } else { + gt = srGt; + } + } + if (gt > 2) { + numGtOver2++; + } + } + if (numGtOver2 > maxVF) { + multiGenoIds.add(variant.getID()); + } + } + } + + private Integer getIntegerAttribute(Genotype genotype, String attributeName) { + if (genotype.hasExtendedAttribute(attributeName)) { + Object attr = genotype.getExtendedAttribute(attributeName); + if (attr instanceof Integer) { + return (Integer) attr; + } else if (attr instanceof String) { + try { + return Integer.parseInt((String) attr); + } catch (NumberFormatException e) { + return null; + } + } + } + return null; + } + + private Map> readRevisedEvents(final GATKPath filePath) { + try (BufferedReader reader = new BufferedReader(new FileReader(filePath.toPath().toFile()))) { + final Map> result = new HashMap<>(); + String line; + while ((line = reader.readLine()) != null) { + String[] fields = line.split("\t"); + if (fields.length < 3) continue; + + String variantId = fields[0]; + String sampleId = fields[1]; + int rdCn = Integer.parseInt(fields[2]); + + result.computeIfAbsent(variantId, k -> new HashMap<>()).put(sampleId, rdCn); + } + return result; + } catch (IOException e) { + throw new RuntimeException("Error reading input file", e); + } + } +} From 5f5a6cdd66a6243e12e5841b10cb4ca83076a388 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Thu, 24 Oct 2024 16:08:07 -0400 Subject: [PATCH 19/58] WIP --- .../spark/sv/utils/GATKSVVCFConstants.java | 3 +- .../tools/walkers/sv/SVCleanPt1a.java | 10 +- .../tools/walkers/sv/SVCleanPt1b.java | 67 +++----- .../tools/walkers/sv/SVCleanPt2.java | 57 +++---- .../tools/walkers/sv/SVCleanPt4.java | 81 +++++----- .../tools/walkers/sv/SVCleanPt5.java | 146 ++++++++++++++++++ 6 files changed, 228 insertions(+), 136 deletions(-) create mode 100644 
src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt5.java diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java index 21e8638ebcf..a81758169de 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java @@ -166,9 +166,10 @@ public enum ComplexVariantSubtype { // CleanPt1b public static final String RD_GQ = "RD_GQ"; - public static final String CNVS_DEFAULT_FILE = "multi.cnvs.txt"; + public static final String MULTI_CNV = "MULTI_CNV"; // CleanPt4 + public static final String MULTI_GENO = "MULTI_GENO"; // TODO: Delete? public static final String PE_GT = "PE_GT"; public static final String SR_GT = "SR_GT"; public static final String PE_GQ = "PE_GQ"; diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java index c460ff281ff..f0a4b326504 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java @@ -63,7 +63,7 @@ * --pass-list bothsides_pass.txt * * - *

Cleaning Steps

+ *

Processing Steps

*
    *
  1. * TODO @@ -142,8 +142,8 @@ public void onTraversalStart() { // Add new header lines VCFHeader newHeader = new VCFHeader(newHeaderLines, header.getGenotypeSamples()); - newHeader.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.HIGH_SR_BACKGROUND, 0, VCFHeaderLineType.Flag, "High number of SR splits in background samples indicating messy region")); newHeader.addMetaDataLine(new VCFFilterHeaderLine(GATKSVVCFConstants.UNRESOLVED, "Variant is unresolved")); + newHeader.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.HIGH_SR_BACKGROUND, 0, VCFHeaderLineType.Flag, "High number of SR splits in background samples indicating messy region")); newHeader.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.BOTHSIDES_SUPPORT, 0, VCFHeaderLineType.Flag, "Variant has read-level support for both sides of breakpoint")); newHeader.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.REVISED_EVENT, 0, VCFHeaderLineType.Flag, "Variant has been revised due to a copy number mismatch")); @@ -203,7 +203,7 @@ private void processEVGenotype(final Genotype genotype, final GenotypeBuilder ge private void processSVTypeGenotype(final VariantContext variant, final GenotypeBuilder genotypeBuilder) { final String svType = variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, null); boolean hasMobileElement = variant.getAlleles().stream() - .map(allele -> GATKSVVariantContextUtils.getSymbolicAlleleSymbols(allele)) + .map(GATKSVVariantContextUtils::getSymbolicAlleleSymbols) .flatMap(Arrays::stream) .anyMatch(symbol -> symbol.equals(GATKSVVCFConstants.ME)); if (svType != null && !hasMobileElement) { @@ -343,9 +343,7 @@ private Set readLastColumn(final GATKPath filePath) { try { final Path path = filePath.toPath(); final TableReader reader = TableUtils.reader(path, (columns, exceptionFactory) -> - (dataline) -> { - return dataline.get(columns.columnCount() - 1); - } + (dataline) -> dataline.get(columns.columnCount() - 1) ); Set result = 
reader.stream().collect(Collectors.toSet()); diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1b.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1b.java index 7e90899f640..1579848bfe6 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1b.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1b.java @@ -6,6 +6,9 @@ import htsjdk.variant.variantcontext.VariantContextBuilder; import htsjdk.variant.variantcontext.writer.VariantContextWriter; +import htsjdk.variant.vcf.VCFHeader; +import htsjdk.variant.vcf.VCFHeaderLineType; +import htsjdk.variant.vcf.VCFInfoHeaderLine; import org.broadinstitute.barclay.argparser.Argument; import org.broadinstitute.barclay.argparser.BetaFeature; import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; @@ -15,10 +18,6 @@ import org.broadinstitute.hellbender.engine.*; import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants; -import java.io.BufferedWriter; -import java.io.FileWriter; -import java.io.IOException; - import java.util.Arrays; import java.util.List; import java.util.ArrayList; @@ -57,7 +56,7 @@ * -O output.vcf.gz * * - *

    Cleaning Steps

    + *

    Processing Steps

    *
      *
    1. * TODO @@ -72,15 +71,6 @@ @BetaFeature @DocumentedFeature public class SVCleanPt1b extends MultiplePassVariantWalker { - public static final String CNV_FILE_LONG_NAME = "cnv-file"; - - @Argument( - fullName = CNV_FILE_LONG_NAME, - doc = "Output CNVs file name", - optional = true - ) - private final GATKPath outputCnvs = new GATKPath(GATKSVVCFConstants.CNVS_DEFAULT_FILE); - @Argument( fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, @@ -89,10 +79,8 @@ public class SVCleanPt1b extends MultiplePassVariantWalker { private GATKPath outputVcf; private VariantContextWriter vcfWriter; - private BufferedWriter cnvsWriter; private final List overlappingVariantsBuffer = new ArrayList<>(); - final private Set multiCnvs = new HashSet<>(); final private Map>> revisedEventsAll = new HashMap<>(); final private Map> revisedEventsFiltered = new HashMap<>(); final private Map> revisedRdCn = new HashMap<>(); @@ -102,36 +90,15 @@ protected int numberOfPasses() { return 3; } - @Override - public Object onTraversalSuccess() { - try { - cnvsWriter = new BufferedWriter(new FileWriter(outputCnvs.toPath().toFile())); - for (String variantId : multiCnvs) { - cnvsWriter.write(variantId); - cnvsWriter.newLine(); - } - return null; - } catch (IOException e) { - throw new RuntimeException("Error creating CNVs file", e); - } - } - @Override public void closeTool() { - try { - if (vcfWriter != null) { - vcfWriter.close(); - } - if (cnvsWriter != null) { - cnvsWriter.close(); - } - } catch (IOException e) { - throw new RuntimeException("Error closing output file", e); + if (vcfWriter != null) { + vcfWriter.close(); } } @Override - protected void nthPassApply(VariantContext variant, ReadsContext readsContext, ReferenceContext referenceContext, FeatureContext featureContext, int n) { + protected void nthPassApply(final VariantContext variant, final ReadsContext readsContext, final ReferenceContext referenceContext, final 
FeatureContext featureContext, final int n) { switch (n) { case 0: firstPassApply(variant); @@ -148,14 +115,16 @@ protected void nthPassApply(VariantContext variant, ReadsContext readsContext, R } @Override - protected void afterNthPass(int n) { + protected void afterNthPass(final int n) { switch (n) { case 0: processCollectedVariants(); break; case 1: vcfWriter = createVCFWriter(outputVcf); - vcfWriter.writeHeader(getHeaderForVariants()); + final VCFHeader header = getHeaderForVariants(); + header.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.MULTI_CNV, 0, VCFHeaderLineType.Flag, "Variant is a multiallelic CNV")); + vcfWriter.writeHeader(header); break; } } @@ -191,12 +160,12 @@ public void thirdPassApply(final VariantContext variant) { final boolean isDelDup = svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DUP) || svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL); final boolean isLarge = variant.getEnd() - variant.getStart() >= 1000; if (isDelDup && isLarge) { - processCnvs(variant); + processCnvs(builder, variant); } vcfWriter.add(builder.make()); } - private void processOverlap(VariantContext v1, VariantContext v2) { + private void processOverlap(final VariantContext v1, final VariantContext v2) { // Get overlap data VariantContext wider; VariantContext narrower; @@ -311,24 +280,24 @@ private void processVariant(final VariantContextBuilder builder, final VariantCo builder.genotypes(newGenotypes); } - private void processCnvs(VariantContext variant) { + private void processCnvs(final VariantContextBuilder builder, final VariantContext variant) { final boolean isDel = variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, "").equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL); for (String sample : variant.getSampleNamesOrderedByName()) { final Genotype genotype = variant.getGenotype(sample); final String rdCnString = (String) genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN); final int rdCn = Integer.parseInt(rdCnString); if ((isDel && rdCn > 
3) || (!isDel && (rdCn < 1 || rdCn > 4))) { - multiCnvs.add(variant.getID()); + builder.attribute(GATKSVVCFConstants.MULTI_CNV, true); break; } } } - private boolean overlaps(VariantContext v1, VariantContext v2) { + private boolean overlaps(final VariantContext v1, final VariantContext v2) { return v1.getContig().equals(v2.getContig()) && v1.getStart() <= v2.getEnd() && v2.getStart() <= v1.getEnd(); } - private Set getNonReferenceSamples(VariantContext variant) { + private Set getNonReferenceSamples(final VariantContext variant) { Set samples = new HashSet<>(); for (String sampleName : variant.getSampleNames()) { Genotype genotype = variant.getGenotype(sampleName); @@ -339,7 +308,7 @@ private Set getNonReferenceSamples(VariantContext variant) { return samples; } - private double getCoverage(VariantContext wider, VariantContext narrower) { + private double getCoverage(final VariantContext wider, final VariantContext narrower) { int nStart = narrower.getStart(); int nStop = narrower.getEnd(); int wStart = wider.getStart(); diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt2.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt2.java index e4207c539af..cdfc972e04f 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt2.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt2.java @@ -51,11 +51,10 @@ * gatk SVCleanPt2 \ * -V input.vcf.gz \ * --sample-list samples.txt \ - * --multi-cnv-list multi.cnvs.txt * --output-prefix result * * - *

      Cleaning Steps

      + *

      Processing Steps

      *
        *
      1. * TODO @@ -71,7 +70,6 @@ @DocumentedFeature public class SVCleanPt2 extends MultiplePassVariantWalker { public static final String SAMPLE_LIST_LONG_NAME = "sample-list"; - public static final String MULTI_CNV_LONG_NAME = "multi-cnv-list"; public static final String OUTPUT_PREFIX_LONG_NAME = "output-prefix"; @Argument( @@ -80,25 +78,18 @@ public class SVCleanPt2 extends MultiplePassVariantWalker { ) private GATKPath sampleListPath; - @Argument( - fullName = MULTI_CNV_LONG_NAME, - doc = "List of multiallelic CNVs" - ) - private GATKPath multiCnvPath; - @Argument( fullName = OUTPUT_PREFIX_LONG_NAME, doc = "Prefix for output files" ) - private String outputPrefix; + private GATKPath outputPrefix; private BufferedWriter revisedCnWriter; private Set sampleWhitelist; - private Set multiallelicCnvs; private final Map> abnormalRdCn = new HashMap<>(); - private OverlapDetector overlapDetector = new OverlapDetector<>(0, 0); + private final OverlapDetector overlapDetector = new OverlapDetector<>(0, 0); private final Map> revisedCopyNumbers = new HashMap<>(); private final Set revisedComplete = new HashSet<>(); @@ -112,10 +103,9 @@ protected int numberOfPasses() { @Override public void onTraversalStart() { try { - revisedCnWriter = Files.newBufferedWriter(Paths.get(outputPrefix + ".txt")); + revisedCnWriter = Files.newBufferedWriter(Paths.get(outputPrefix.toString() + ".txt")); sampleWhitelist = new HashSet<>(Files.readAllLines(sampleListPath.toPath())); - multiallelicCnvs = new HashSet<>(Files.readAllLines(multiCnvPath.toPath())); } catch (IOException e) { throw new RuntimeException("Error reading input file", e); } @@ -146,12 +136,12 @@ public Object onTraversalSuccess() { return null; } catch (IOException e) { - throw new RuntimeException("Error writing multiallelic CNVs", e); + throw new RuntimeException("Error writing output file", e); } } @Override - protected void nthPassApply(VariantContext variant, ReadsContext readsContext, ReferenceContext referenceContext, 
FeatureContext featureContext, int n) { + protected void nthPassApply(final VariantContext variant, final ReadsContext readsContext, ReferenceContext referenceContext, FeatureContext featureContext, int n) { // Skip if not expected SVTYPE or below SVLEN threshold if (!isDelDup(variant) || !isLargeVariant(variant, MIN_VARIANT_SIZE)) { return; @@ -170,11 +160,9 @@ protected void nthPassApply(VariantContext variant, ReadsContext readsContext, R } @Override - protected void afterNthPass(int n) { - return; - } + protected void afterNthPass(final int n) {} - private void firstPassApply(VariantContext variant) { + private void firstPassApply(final VariantContext variant) { // Flag sample as having an abnormal copy number if it passes certain conditions for (String sample : variant.getSampleNames()) { Genotype genotype = variant.getGenotype(sample); @@ -193,7 +181,7 @@ private void firstPassApply(VariantContext variant) { overlapDetector.addLhs(variant, variant); } - private void secondPassApply(VariantContext variant) { + private void secondPassApply(final VariantContext variant) { // Check if copy number needs to be adjusted for samples within overlapping variants Set overlappingVariants = overlapDetector.getOverlaps(variant); for (VariantContext otherVariant : overlappingVariants) { @@ -203,7 +191,7 @@ private void secondPassApply(VariantContext variant) { } } - private void adjustCopyNumber(VariantContext v1, VariantContext v2) { + private void adjustCopyNumber(final VariantContext v1, final VariantContext v2) { // Track metadata through data structures String variantId1 = v1.getID(); String variantId2 = v2.getID(); @@ -244,7 +232,7 @@ private void adjustCopyNumber(VariantContext v1, VariantContext v2) { // Condition 1: Smaller depth call is being driven by a larger call if (support1.contains(GATKSVVCFConstants.EV_VALUES.get(1)) && support1.size() > 1 && support2.equals(Collections.singleton(GATKSVVCFConstants.EV_VALUES.get(1))) - && overlap2 > 0.5 && 
!multiallelicCnvs.contains(variantId1)) { + && overlap2 > 0.5 && !v1.hasAttribute(GATKSVVCFConstants.MULTI_CNV)) { if (rdCn1 == 0) { makeRevision(id2, rdCn2 + 2); } else if (rdCn1 == 1) { @@ -259,7 +247,8 @@ private void adjustCopyNumber(VariantContext v1, VariantContext v2) { // Condition 2: Smaller CNV is driven by a larger CNV genotype else if (support1.equals(Collections.singleton(GATKSVVCFConstants.EV_VALUES.get(1))) && support2.contains(GATKSVVCFConstants.EV_VALUES.get(1)) && support2.size() > 1 - && overlap1 > 0.5 && overlap2 > 0.5 && !multiallelicCnvs.contains(variantId2) && !genotype2.isHomRef()) { + && overlap1 > 0.5 && overlap2 > 0.5 && !v2.hasAttribute(GATKSVVCFConstants.MULTI_CNV) + && !genotype2.isHomRef()) { if (rdCn2 == 0) { makeRevision(id1, rdCn1 + 2); } else if (rdCn2 == 1) { @@ -274,23 +263,21 @@ else if (support1.equals(Collections.singleton(GATKSVVCFConstants.EV_VALUES.get( // Condition 3: Depth-only calls where smaller call is driven by a larger call else if (support1.equals(Collections.singleton(GATKSVVCFConstants.EV_VALUES.get(1))) && support2.equals(Collections.singleton(GATKSVVCFConstants.EV_VALUES.get(1))) - && overlap2 > 0.5 && !multiallelicCnvs.contains(variantId1) && svtype1.equals(svtype2)) { + && overlap2 > 0.5 && !v1.hasAttribute(GATKSVVCFConstants.MULTI_CNV) && svtype1.equals(svtype2)) { if (rdCn1 == 0 && rdCn1 != rdCn2) { makeRevision(id2, rdCn2 + 2); } else if (rdCn1 == 1 && rdCn1 > rdCn2) { makeRevision(id2, 1); } else if (rdCn1 > 1 && rdCn1 < rdCn2) { - int newCN = rdCn2 - rdCn1 + 2; - newCN = Math.max(newCN, 0); - makeRevision(id2, newCN); + makeRevision(id2, Math.max(rdCn2 - rdCn1 + 2, 0)); } else { makeRevision(id2, 2); } } // Condition 4: Any other time a larger call drives a smaller call - else if (support1.contains(GATKSVVCFConstants.EV_VALUES.get(1)) && overlap2 > 0.5 && !multiallelicCnvs.contains(variantId1) - && length2 > MIN_VARIANT_SIZE) { + else if (support1.contains(GATKSVVCFConstants.EV_VALUES.get(1)) + && 
overlap2 > 0.5 && !v1.hasAttribute(GATKSVVCFConstants.MULTI_CNV) && length2 > MIN_VARIANT_SIZE) { if (rdCn1 == 0) { makeRevision(id2, rdCn2 + 2); } else if (rdCn1 == 1) { @@ -304,17 +291,17 @@ else if (support1.contains(GATKSVVCFConstants.EV_VALUES.get(1)) && overlap2 > 0. } } - private boolean isDelDup(VariantContext variant) { + private boolean isDelDup(final VariantContext variant) { String svType = variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); return svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL) || svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DUP); } - private boolean isLargeVariant(VariantContext variant, int minSize) { + private boolean isLargeVariant(final VariantContext variant, final int minSize) { int variantLength = Math.abs(variant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)); return variantLength >= minSize; } - private Map> getSupportForVariant(VariantContext variant) { + private Map> getSupportForVariant(final VariantContext variant) { Map> supportMap = new HashMap<>(); for (String sample : variant.getSampleNames()) { Genotype genotype = variant.getGenotype(sample); @@ -328,7 +315,7 @@ private Map> getSupportForVariant(VariantContext variant) { return supportMap; } - private Map getRdCnForVariant(VariantContext variant) { + private Map getRdCnForVariant(final VariantContext variant) { Map rdCnMap = new HashMap<>(); for (String sample : variant.getSampleNames()) { Genotype genotype = variant.getGenotype(sample); @@ -339,7 +326,7 @@ private Map getRdCnForVariant(VariantContext variant) { return rdCnMap; } - private void makeRevision(String id, int val) { + private void makeRevision(final String id, final int val) { String[] tokens = id.split("@"); String variantId = tokens[0]; String sample = tokens[1]; diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java index 22df7cada76..4aac2aed5d7 100644 --- 
a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java @@ -6,10 +6,15 @@ import htsjdk.variant.variantcontext.VariantContextBuilder; import htsjdk.variant.variantcontext.writer.VariantContextWriter; +import htsjdk.variant.vcf.VCFFilterHeaderLine; +import htsjdk.variant.vcf.VCFHeader; +import htsjdk.variant.vcf.VCFHeaderLineType; +import htsjdk.variant.vcf.VCFInfoHeaderLine; import org.broadinstitute.barclay.argparser.Argument; import org.broadinstitute.barclay.argparser.BetaFeature; import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; import org.broadinstitute.barclay.help.DocumentedFeature; +import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; import org.broadinstitute.hellbender.cmdline.programgroups.StructuralVariantDiscoveryProgramGroup; import org.broadinstitute.hellbender.engine.*; import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants; @@ -18,18 +23,13 @@ import java.io.BufferedWriter; import java.io.FileReader; import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Paths; import java.util.Arrays; import java.util.List; import java.util.ArrayList; -import java.util.Set; import java.util.Map; -import java.util.HashSet; import java.util.HashMap; import java.util.Collections; -import java.util.zip.GZIPOutputStream; /** * Completes an initial series of cleaning steps for a VCF produced by the GATK-SV pipeline. @@ -56,10 +56,9 @@ * gatk SVCleanPt4 \ * -V input.vcf.gz \ * --revised-cn-list revised.txt \ - * --output-prefix result * * - *

        Cleaning Steps

        + *

        Processing Steps

        *
          *
        1. * TODO @@ -75,7 +74,6 @@ @DocumentedFeature public class SVCleanPt4 extends VariantWalker { public static final String REVISED_CN_LIST_LONG_NAME = "revised-cn-list"; - public static final String OUTPUT_PREFIX_LONG_NAME = "output-prefix"; @Argument( fullName = REVISED_CN_LIST_LONG_NAME, @@ -84,16 +82,16 @@ public class SVCleanPt4 extends VariantWalker { private GATKPath cnReviseList; @Argument( - fullName = OUTPUT_PREFIX_LONG_NAME, - doc = "Prefix for output files" + fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, + shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, + doc = "Output VCF name" ) - private String outputPrefix; + private GATKPath outputVcf; private VariantContextWriter vcfWriter; private BufferedWriter multiGenoWriter; private Map> revisedCopyNumbers; - private final Set multiGenoIds = new HashSet<>(); private double recordStart; private double recordEnd; @@ -112,11 +110,11 @@ public void onTraversalStart() { int batchNum = Math.max(Integer.parseInt(batchTokens[0]), 1); int totalBatch = Math.max(Integer.parseInt(batchTokens[1]), 1); - // Get variant count + // Get VCF length (note: didn't seem to warrant long totalNumVariants = 0; String inputVcfPath = getDrivingVariantsFeatureInput().getFeaturePath(); try (FeatureDataSource dataSource = new FeatureDataSource<>(inputVcfPath)) { - for (VariantContext vc : dataSource) { + for (VariantContext ignored : dataSource) { totalNumVariants++; } } @@ -128,30 +126,12 @@ public void onTraversalStart() { maxVF = Math.max((int) (getHeaderForVariants().getGenotypeSamples().size() * 0.01), 2); recordIdx = 0; - // Create output writers - try { - vcfWriter = createVCFWriter(Paths.get(outputPrefix + ".revised_vcf_lines.txt")); - vcfWriter.writeHeader(getHeaderForVariants()); - - multiGenoWriter = Files.newBufferedWriter(Paths.get(outputPrefix + ".multi_geno_ids.txt")); - } catch (IOException e) { - throw new RuntimeException("Error creating output file", e); - } - } - - @Override - public Object 
onTraversalSuccess() { - try { - List variantIDs = new ArrayList<>(multiGenoIds); - Collections.sort(variantIDs); - for (String variantID : variantIDs) { - multiGenoWriter.write(variantID); - multiGenoWriter.newLine(); - } - return null; - } catch (IOException e) { - throw new RuntimeException("Error writing to output file ", e); - } + // Create output writer + vcfWriter = createVCFWriter(outputVcf); + final VCFHeader header = getHeaderForVariants(); + header.addMetaDataLine(new VCFFilterHeaderLine(GATKSVVCFConstants.UNRESOLVED, "Variant is unresolved")); + header.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.HIGH_SR_BACKGROUND, 0, VCFHeaderLineType.Flag, "High number of SR splits in background samples indicating messy region")); + vcfWriter.writeHeader(header); } public void closeTool() { @@ -168,10 +148,12 @@ public void closeTool() { } @Override - public void apply(VariantContext variant, ReadsContext readsContext, ReferenceContext referenceContext, FeatureContext featureContext) { + public void apply(final VariantContext variant, final ReadsContext readsContext, final ReferenceContext referenceContext, final FeatureContext featureContext) { // Initialize data structures + boolean isRevisedEvent = false; + boolean isMultiGeno = false; recordIdx++; - VariantContextBuilder variantBuilder = new VariantContextBuilder(variant); + VariantContextBuilder builder = new VariantContextBuilder(variant); List genotypes = variant.getGenotypes(); // Modify genotypes if variant appears in revise list @@ -189,8 +171,8 @@ public void apply(VariantContext variant, ReadsContext readsContext, ReferenceCo newGenotypes.add(genotype); } } - variantBuilder.genotypes(newGenotypes); - vcfWriter.add(variantBuilder.make()); + builder.genotypes(newGenotypes); + isRevisedEvent = true; } // Identify multiple genotypes if within recordStart and recordEnd @@ -201,7 +183,7 @@ public void apply(VariantContext variant, ReadsContext readsContext, ReferenceCo 
Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.PE_GT).toString()) : null; Integer srGt = genotype.hasExtendedAttribute(GATKSVVCFConstants.SR_GT) ? Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.SR_GT).toString()) : null; - Integer gt = null; + int gt; if (peGt == null) { continue; } else if (srGt == null) { @@ -226,12 +208,21 @@ public void apply(VariantContext variant, ReadsContext readsContext, ReferenceCo } } if (numGtOver2 > maxVF) { - multiGenoIds.add(variant.getID()); + isMultiGeno = true; } } + + if (isRevisedEvent) { + if (isMultiGeno) { + builder.attribute(GATKSVVCFConstants.MULTI_GENO, true); + } + vcfWriter.add(builder.make()); + } + + // TODO: Sex Revisions } - private Integer getIntegerAttribute(Genotype genotype, String attributeName) { + private Integer getIntegerAttribute(final Genotype genotype, final String attributeName) { if (genotype.hasExtendedAttribute(attributeName)) { Object attr = genotype.getExtendedAttribute(attributeName); if (attr instanceof Integer) { diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt5.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt5.java new file mode 100644 index 00000000000..b84d3f95633 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt5.java @@ -0,0 +1,146 @@ +package org.broadinstitute.hellbender.tools.walkers.sv; + +import htsjdk.variant.variantcontext.Genotype; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.samtools.util.OverlapDetector; + +import org.broadinstitute.barclay.argparser.Argument; +import org.broadinstitute.barclay.argparser.BetaFeature; +import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; +import org.broadinstitute.barclay.help.DocumentedFeature; +import org.broadinstitute.hellbender.cmdline.programgroups.StructuralVariantDiscoveryProgramGroup; +import org.broadinstitute.hellbender.engine.*; +import 
org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants; + +import java.io.BufferedWriter; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Paths; + +import java.util.Arrays; +import java.util.List; +import java.util.ArrayList; +import java.util.Set; +import java.util.Map; +import java.util.HashSet; +import java.util.HashMap; +import java.util.Collections; + +/** + * Completes an initial series of cleaning steps for a VCF produced by the GATK-SV pipeline. + * + *

          Inputs

          + *
            + *
          • + * VCF containing structural variant (SV) records from the GATK-SV pipeline. + *
          • + *
          • + * TODO + *
          • + *
          + * + *

          Output

          + *
            + *
          • + * Cleansed VCF. + *
          • + *
          + * + *

          Usage Example

          + *
          + *     gatk SVCleanPt5 \
          + *       -V input.vcf.gz \
          + *       --sample-list samples.txt \
          + * 	     --multi-cnv-list multi.cnvs.txt
          + * 	     --output-prefix result
          + * 
          + * + *

          Processing Steps

          + *
            + *
          1. + * TODO + *
          2. + *
          + */ +@CommandLineProgramProperties( + summary = "Clean and format SV VCF", + oneLineSummary = "Clean and format SV VCF", + programGroup = StructuralVariantDiscoveryProgramGroup.class +) +@BetaFeature +@DocumentedFeature +public class SVCleanPt5 extends MultiplePassVariantWalker { // MultiVariantWalker? + @Override + protected int numberOfPasses() { + return 2; + } + + @Override + public void onTraversalStart() { + /* + try { + revisedCnWriter = Files.newBufferedWriter(Paths.get(outputPrefix + ".txt")); + + sampleWhitelist = new HashSet<>(Files.readAllLines(sampleListPath.toPath())); + multiallelicCnvs = new HashSet<>(Files.readAllLines(multiCnvPath.toPath())); + } catch (IOException e) { + throw new RuntimeException("Error reading input file", e); + } + */ + return; + } + + @Override + public Object onTraversalSuccess() { + /* + try { + List variantIDs = new ArrayList<>(revisedCopyNumbers.keySet()); + Collections.sort(variantIDs); + + for (String variantID : variantIDs) { + Map sampleMap = revisedCopyNumbers.get(variantID); + + List samples = new ArrayList<>(sampleMap.keySet()); + Collections.sort(samples); + + for (String sample : samples) { + int rdCn = sampleMap.get(sample); + revisedCnWriter.write(variantID + "\t" + sample + "\t" + rdCn); + revisedCnWriter.newLine(); + } + } + + if (revisedCnWriter != null) { + revisedCnWriter.close(); + } + + return null; + } catch (IOException e) { + throw new RuntimeException("Error writing multiallelic CNVs", e); + } + */ + return null; + } + + @Override + protected void nthPassApply(VariantContext variant, ReadsContext readsContext, ReferenceContext referenceContext, FeatureContext featureContext, int n) { + /* + switch (n) { + case 0: + firstPassApply(variant); + break; + case 1: + secondPassApply(variant); + break; + default: + throw new IllegalArgumentException("Invalid pass number: " + n); + } + */ + return; + } + + @Override + protected void afterNthPass(int n) { + return; + } +} From 
52a904981fdadc676f419eba59400acd81a9ce6f Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Fri, 25 Oct 2024 18:18:18 -0400 Subject: [PATCH 20/58] WIP - up till CleanVcf5 (first task complete) --- .../spark/sv/utils/GATKSVVCFConstants.java | 5 +- .../tools/walkers/sv/SVCleanPt1a.java | 53 +-- .../tools/walkers/sv/SVCleanPt2.java | 2 +- .../tools/walkers/sv/SVCleanPt4.java | 363 +++++++++++++----- 4 files changed, 289 insertions(+), 134 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java index a81758169de..6dc216090af 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java @@ -169,11 +169,14 @@ public enum ComplexVariantSubtype { public static final String MULTI_CNV = "MULTI_CNV"; // CleanPt4 - public static final String MULTI_GENO = "MULTI_GENO"; // TODO: Delete? 
+ public static final String PESR_GT_OVERDISPERSION = "PESR_GT_OVERDISPERSION"; + public static final String GT = "GT"; + public static final String GQ = "GQ"; public static final String PE_GT = "PE_GT"; public static final String SR_GT = "SR_GT"; public static final String PE_GQ = "PE_GQ"; public static final String SR_GQ = "SR_GQ"; + public static final String CNV = "CNV"; // Clustering public static final String CLUSTER_MEMBER_IDS_KEY = "MEMBERS"; diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java index f0a4b326504..f0f10fe7aa9 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java @@ -99,13 +99,13 @@ public final class SVCleanPt1a extends VariantWalker { @Argument( fullName = FAIL_LIST_LONG_NAME, - doc = "List of complex variants failing the background test" + doc = "File with complex variants failing the background test" ) private GATKPath failList; @Argument( fullName = PASS_LIST_LONG_NAME, - doc = "List of complex variants passing both sides" + doc = "Fail with complex variants passing both sides" ) private GATKPath passList; @@ -124,7 +124,6 @@ public final class SVCleanPt1a extends VariantWalker { private static final int MIN_ALLOSOME_EVENT_SIZE = 5000; - @Override public void onTraversalStart() { // Read supporting files @@ -135,7 +134,10 @@ public void onTraversalStart() { final VCFHeader header = getHeaderForVariants(); final Set newHeaderLines = new LinkedHashSet<>(); for (final VCFHeaderLine line : header.getMetaDataInInputOrder()) { - if (!(line instanceof VCFInfoHeaderLine) || !((VCFInfoHeaderLine) line).getID().equals(GATKSVVCFConstants.UNRESOLVED)) { + if (!(line instanceof VCFInfoHeaderLine) + || (!((VCFInfoHeaderLine) line).getID().equals(GATKSVVCFConstants.UNRESOLVED) + && !((VCFInfoHeaderLine) 
line).getID().equals(GATKSVVCFConstants.MULTIALLELIC) + && !((VCFInfoHeaderLine) line).getID().equals(GATKSVVCFConstants.VAR_GQ))) { newHeaderLines.add(line); } } @@ -161,11 +163,11 @@ public void closeTool() { @Override public void apply(final VariantContext variant, final ReadsContext readsContext, final ReferenceContext referenceContext, final FeatureContext featureContext) { - VariantContextBuilder variantBuilder = new VariantContextBuilder(variant); + VariantContextBuilder builder = new VariantContextBuilder(variant); final List processedGenotypes = processGenotypes(variant); - variantBuilder.genotypes(processedGenotypes); - processVariant(variant, variantBuilder); - vcfWriter.add(variantBuilder.make()); + builder.genotypes(processedGenotypes); + processVariant(variant, builder); + vcfWriter.add(builder.make()); } private List processGenotypes(final VariantContext variant) { @@ -173,7 +175,6 @@ private List processGenotypes(final VariantContext variant) { .map(genotype -> { GenotypeBuilder genotypeBuilder = new GenotypeBuilder(genotype); processEVGenotype(genotype, genotypeBuilder); - // processSVTypeGenotype(variant, genotypeBuilder); processAllosomesGenotype(variant, genotype, genotypeBuilder); return genotypeBuilder.make(); }) @@ -181,9 +182,8 @@ private List processGenotypes(final VariantContext variant) { } private void processVariant(final VariantContext variant, final VariantContextBuilder builder) { - // processSVType(variant, builder); processVarGQ(variant, builder); - processMultiallelic(builder); + processMultiallelic(variant, builder); processUnresolved(variant, builder); processNoisyEvents(variant, builder); processBothsidesSupportEvents(variant, builder); @@ -200,21 +200,6 @@ private void processEVGenotype(final Genotype genotype, final GenotypeBuilder ge } } - private void processSVTypeGenotype(final VariantContext variant, final GenotypeBuilder genotypeBuilder) { - final String svType = variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, 
null); - boolean hasMobileElement = variant.getAlleles().stream() - .map(GATKSVVariantContextUtils::getSymbolicAlleleSymbols) - .flatMap(Arrays::stream) - .anyMatch(symbol -> symbol.equals(GATKSVVCFConstants.ME)); - if (svType != null && !hasMobileElement) { - List newGenotypeAlleles = Arrays.asList( - variant.getReference(), - Allele.create("<" + svType + ">", false) - ); - genotypeBuilder.alleles(newGenotypeAlleles); - } - } - private void processAllosomesGenotype(final VariantContext variant, final Genotype genotype, final GenotypeBuilder genotypeBuilder) { final String chromosome = variant.getContig(); if (chromosome.equals(chrX) || chromosome.equals(chrY)) { @@ -292,16 +277,6 @@ private int calcMedianDistribution(final int[] counts) { throw new RuntimeException("Error calculating median"); } - private void processSVType(final VariantContext variant, final VariantContextBuilder builder) { - final String svType = variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, null); - if (svType != null && variant.getAlleles().stream().noneMatch(allele -> allele.getDisplayString().contains(GATKSVVCFConstants.ME))) { - final Allele refAllele = variant.getReference(); - final Allele altAllele = Allele.create("<" + svType + ">", false); - List newAlleles = Arrays.asList(refAllele, altAllele); - builder.alleles(newAlleles); - } - } - private void processVarGQ(final VariantContext variant, final VariantContextBuilder builder) { if (variant.hasAttribute(GATKSVVCFConstants.VAR_GQ)) { final double varGQ = variant.getAttributeAsDouble(GATKSVVCFConstants.VAR_GQ, 0); @@ -310,8 +285,10 @@ private void processVarGQ(final VariantContext variant, final VariantContextBuil } } - private void processMultiallelic(final VariantContextBuilder builder) { - builder.rmAttribute(GATKSVVCFConstants.MULTIALLELIC); + private void processMultiallelic(final VariantContext variant, final VariantContextBuilder builder) { + if (variant.hasAttribute(GATKSVVCFConstants.MULTIALLELIC)) { + 
builder.rmAttribute(GATKSVVCFConstants.MULTIALLELIC); + } } private void processUnresolved(final VariantContext variant, final VariantContextBuilder builder) { diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt2.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt2.java index cdfc972e04f..409c3d418f7 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt2.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt2.java @@ -74,7 +74,7 @@ public class SVCleanPt2 extends MultiplePassVariantWalker { @Argument( fullName = SAMPLE_LIST_LONG_NAME, - doc = "Samples to include" + doc = "File with samples to include" ) private GATKPath sampleListPath; diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java index 4aac2aed5d7..0842e586c9a 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java @@ -1,15 +1,13 @@ package org.broadinstitute.hellbender.tools.walkers.sv; +import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.Genotype; import htsjdk.variant.variantcontext.GenotypeBuilder; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.variantcontext.VariantContextBuilder; import htsjdk.variant.variantcontext.writer.VariantContextWriter; -import htsjdk.variant.vcf.VCFFilterHeaderLine; -import htsjdk.variant.vcf.VCFHeader; -import htsjdk.variant.vcf.VCFHeaderLineType; -import htsjdk.variant.vcf.VCFInfoHeaderLine; +import htsjdk.variant.vcf.*; import org.broadinstitute.barclay.argparser.Argument; import org.broadinstitute.barclay.argparser.BetaFeature; import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; @@ -18,18 +16,16 @@ import 
org.broadinstitute.hellbender.cmdline.programgroups.StructuralVariantDiscoveryProgramGroup; import org.broadinstitute.hellbender.engine.*; import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants; +import org.broadinstitute.hellbender.utils.variant.GATKSVVariantContextUtils; +import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants; import java.io.BufferedReader; -import java.io.BufferedWriter; import java.io.FileReader; import java.io.IOException; -import java.util.Arrays; -import java.util.List; -import java.util.ArrayList; -import java.util.Map; -import java.util.HashMap; -import java.util.Collections; +import java.nio.file.Files; +import java.nio.file.Paths; +import java.util.*; /** * Completes an initial series of cleaning steps for a VCF produced by the GATK-SV pipeline. @@ -74,6 +70,9 @@ @DocumentedFeature public class SVCleanPt4 extends VariantWalker { public static final String REVISED_CN_LIST_LONG_NAME = "revised-cn-list"; + public static final String OUTLIERS_LIST_LONG_NAME = "outliers-list"; + public static final String OUTPUT_MULTIALLELIC_VCF_LONG_NAME = "output-multiallelic-vcf"; + public static final String OUTPUT_NO_CALLS_VCF_LONG_NAME = "output-no-calls-vcf"; @Argument( fullName = REVISED_CN_LIST_LONG_NAME, @@ -81,6 +80,13 @@ public class SVCleanPt4 extends VariantWalker { ) private GATKPath cnReviseList; + @Argument( + fullName = OUTLIERS_LIST_LONG_NAME, + doc = "File with outlier samples", + optional = true + ) + private GATKPath outliersListPath; + @Argument( fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, @@ -88,29 +94,54 @@ public class SVCleanPt4 extends VariantWalker { ) private GATKPath outputVcf; + /* + @Argument( + fullName = OUTPUT_MULTIALLELIC_VCF_LONG_NAME, + doc = "Output multiallelic VCF name" + ) + private GATKPath outputVcfMultiallelic; + + @Argument( + fullName = OUTPUT_NO_CALLS_VCF_LONG_NAME, + doc = "Output no sample calls VCF 
name" + ) + private GATKPath outputVcfNoCalls; + */ + private VariantContextWriter vcfWriter; - private BufferedWriter multiGenoWriter; + private VariantContextWriter vcfWriterMultiallelic; + private VariantContextWriter vcfWriterNoCalls; private Map> revisedCopyNumbers; + private Set outlierSamples; private double recordStart; private double recordEnd; - private int maxVF; private long recordIdx; + private int maxVF; + + private static final int MIN_LARGE_EVENT_SIZE = 1000; + private static final int MIN_MULTIALLELIC_EVENT_SIZE = 5000; @Override public void onTraversalStart() { - // Read revised copy numbers - revisedCopyNumbers = readRevisedEvents(cnReviseList); + // Read and parse input files + try { + revisedCopyNumbers = readRevisedEvents(cnReviseList); + outlierSamples = new HashSet<>(); + if (outliersListPath != null) { + outlierSamples = new HashSet<>(Files.readAllLines(outliersListPath.toPath())); + } + } catch (IOException e) { + throw new RuntimeException("Error reading input file", e); + } - // Parse batchNum and totalBatch from file name + // Parse batch-level metadata String cnReviseListFileName = cnReviseList.toPath().getFileName().toString(); String[] regenoFileNameTokens = cnReviseListFileName.split("\\."); String[] batchTokens = regenoFileNameTokens[1].split("_"); int batchNum = Math.max(Integer.parseInt(batchTokens[0]), 1); int totalBatch = Math.max(Integer.parseInt(batchTokens[1]), 1); - - // Get VCF length (note: didn't seem to warrant long totalNumVariants = 0; String inputVcfPath = getDrivingVariantsFeatureInput().getFeaturePath(); try (FeatureDataSource dataSource = new FeatureDataSource<>(inputVcfPath)) { @@ -118,124 +149,268 @@ public void onTraversalStart() { totalNumVariants++; } } - - // Initialize metadata variables double segments = totalNumVariants / (double) totalBatch; recordStart = (batchNum - 1) * segments; recordEnd = batchNum * segments; - maxVF = Math.max((int) (getHeaderForVariants().getGenotypeSamples().size() * 0.01), 2); + 
maxVF = Math.max((int) ((getHeaderForVariants().getGenotypeSamples().size() - outlierSamples.size()) * 0.01), 2); recordIdx = 0; - // Create output writer + // Create primary output VCF vcfWriter = createVCFWriter(outputVcf); final VCFHeader header = getHeaderForVariants(); - header.addMetaDataLine(new VCFFilterHeaderLine(GATKSVVCFConstants.UNRESOLVED, "Variant is unresolved")); - header.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.HIGH_SR_BACKGROUND, 0, VCFHeaderLineType.Flag, "High number of SR splits in background samples indicating messy region")); + header.addMetaDataLine(new VCFFilterHeaderLine(GATKSVVCFConstants.MULTIALLELIC, "Multiallelic site")); + header.addMetaDataLine(new VCFFormatHeaderLine(GATKSVVCFConstants.COPY_NUMBER_FORMAT, 1, VCFHeaderLineType.Integer, "Predicted copy state")); + header.addMetaDataLine(new VCFFormatHeaderLine(GATKSVVCFConstants.COPY_NUMBER_QUALITY_FORMAT, 1, VCFHeaderLineType.Integer, "Read-depth genotype quality")); + header.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.PESR_GT_OVERDISPERSION, 0, VCFHeaderLineType.Flag, "High PESR dispersion count")); vcfWriter.writeHeader(header); + + // Create supporting output VCFs + /* + vcfWriterMultiallelic = createVCFWriter(outputVcfMultiallelic); + vcfWriterMultiallelic.writeHeader(header); + vcfWriterNoCalls = createVCFWriter(outputVcfNoCalls); + vcfWriterNoCalls.writeHeader(header); + */ } public void closeTool() { - try { - if (vcfWriter != null) { - vcfWriter.close(); - } - if (multiGenoWriter != null) { - multiGenoWriter.close(); - } - } catch (IOException e) { - throw new RuntimeException("Error closing output file ", e); + if (vcfWriter != null) { + vcfWriter.close(); + } + + if (vcfWriterMultiallelic != null) { + vcfWriterMultiallelic.close(); + } + + if (vcfWriterNoCalls != null) { + vcfWriterNoCalls.close(); } } @Override public void apply(final VariantContext variant, final ReadsContext readsContext, final ReferenceContext referenceContext, final 
FeatureContext featureContext) { // Initialize data structures - boolean isRevisedEvent = false; - boolean isMultiGeno = false; recordIdx++; VariantContextBuilder builder = new VariantContextBuilder(variant); + + // Exit if outside batch range // TODO: Does this have to move to after processRevisedCn? + if (recordIdx < recordStart || recordIdx >= recordEnd) { + vcfWriter.add(builder.make()); + return; + } + + // Process variants List genotypes = variant.getGenotypes(); + genotypes = processRevisedCn(variant, genotypes); + processMultiallelic(builder, genotypes); + genotypes = processLargeDeletions(variant, builder, genotypes); + genotypes = processLargeDuplications(variant, builder, genotypes); + genotypes = processRevisedSex(variant, genotypes); - // Modify genotypes if variant appears in revise list - if (revisedCopyNumbers.containsKey(variant.getID())) { - Map sampleCnMap = revisedCopyNumbers.get(variant.getID()); - List newGenotypes = new ArrayList<>(); - for (Genotype genotype : genotypes) { - String sampleName = genotype.getSampleName(); - if (sampleCnMap.containsKey(sampleName)) { - GenotypeBuilder gb = new GenotypeBuilder(genotype); - gb.alleles(Arrays.asList(variant.getReference(), variant.getAlternateAllele(0))); - gb.attribute(GATKSVVCFConstants.RD_CN, sampleCnMap.get(sampleName)); - newGenotypes.add(gb.make()); + // Build genotypes + builder.genotypes(genotypes); + vcfWriter.add(builder.make()); + } + + private List processRevisedCn(final VariantContext variant, final List genotypes) { + if (!revisedCopyNumbers.containsKey(variant.getID())) { + return genotypes; + } + + List updatedGenotypes = new ArrayList<>(genotypes.size()); + Map sampleCnMap = revisedCopyNumbers.get(variant.getID()); + for (Genotype genotype : genotypes) { + String sampleName = genotype.getSampleName(); + if (sampleCnMap.containsKey(sampleName)) { + GenotypeBuilder gb = new GenotypeBuilder(genotype) + .alleles(Arrays.asList(variant.getReference(), variant.getAlternateAllele(0))) + 
.attribute(GATKSVVCFConstants.RD_CN, sampleCnMap.get(sampleName)); + updatedGenotypes.add(gb.make()); + } else { + updatedGenotypes.add(genotype); + } + } + return updatedGenotypes; + } + + private void processMultiallelic(final VariantContextBuilder builder, final List genotypes) { + int numGtOver2 = 0; + for (Genotype genotype : genotypes) { + Integer peGt = genotype.hasExtendedAttribute(GATKSVVCFConstants.PE_GT) ? + Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.PE_GT).toString()) : null; + Integer srGt = genotype.hasExtendedAttribute(GATKSVVCFConstants.SR_GT) ? + Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.SR_GT).toString()) : null; + int gt; + if (peGt == null) { + continue; + } else if (srGt == null) { + gt = peGt; + } else if (peGt > 0 && srGt == 0) { + gt = peGt; + } else if (peGt == 0) { + gt = srGt; + } else { + Integer peGq = genotype.hasExtendedAttribute(GATKSVVCFConstants.PE_GQ) ? + Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.PE_GQ).toString()) : null; + Integer srGq = genotype.hasExtendedAttribute(GATKSVVCFConstants.SR_GQ) ? 
+ Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.SR_GQ).toString()) : null; + if (peGq != null && srGq != null && peGq >= srGq) { + gt = peGt; } else { - newGenotypes.add(genotype); + gt = srGt; } } - builder.genotypes(newGenotypes); - isRevisedEvent = true; + if (gt > 2) { + numGtOver2++; + } + } + if (numGtOver2 > maxVF) { + builder.attribute(GATKSVVCFConstants.PESR_GT_OVERDISPERSION, true); + } + } + + private List processLargeDeletions(final VariantContext variant, final VariantContextBuilder builder, List genotypes) { + if (!variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, "").equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL)) { + return genotypes; } - // Identify multiple genotypes if within recordStart and recordEnd - if (recordIdx >= recordStart && recordIdx < recordEnd) { - int numGtOver2 = 0; + boolean multiallelicFilter = false; + if (variant.getEnd() - variant.getStart() >= MIN_LARGE_EVENT_SIZE) { + Map sampleRdCn = new HashMap<>(); for (Genotype genotype : genotypes) { - Integer peGt = genotype.hasExtendedAttribute(GATKSVVCFConstants.PE_GT) ? - Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.PE_GT).toString()) : null; - Integer srGt = genotype.hasExtendedAttribute(GATKSVVCFConstants.SR_GT) ? 
- Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.SR_GT).toString()) : null; - int gt; - if (peGt == null) { - continue; - } else if (srGt == null) { - gt = peGt; - } else if (peGt > 0 && srGt == 0) { - gt = peGt; - } else if (peGt == 0) { - gt = srGt; + if (!outlierSamples.contains(genotype.getSampleName()) && genotype.hasExtendedAttribute(GATKSVVCFConstants.RD_CN)) { + sampleRdCn.put(genotype.getSampleName(), Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN).toString())); + } + } + if (sampleRdCn.values().stream().filter(value -> value > 3).count() > maxVF) { + multiallelicFilter = true; + } + } + + boolean gt5kbFilter = false; + if (!genotypes.stream().allMatch(g -> g.getAlleles().size() > 2)) { // TODO: Verify logic for allele count > 2 + gt5kbFilter = true; + } else if (variant.getEnd() - variant.getStart() >= MIN_MULTIALLELIC_EVENT_SIZE && !multiallelicFilter) { + gt5kbFilter = true; + } + + List updatedGenotypes = new ArrayList<>(genotypes.size()); + if (gt5kbFilter) { + for (Genotype genotype : genotypes) { + GenotypeBuilder gb = new GenotypeBuilder(genotype); + if (Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN, 0).toString()) >= 2) { // TODO: Verify that removal of sample_obj[GQ] is None condition is okay + gb.alleles(Arrays.asList(variant.getReference(), variant.getReference())); + } else if (Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN, 0).toString()) == 1) { + gb.alleles(Arrays.asList(variant.getReference(), variant.getAlternateAllele(0))); } else { - Integer peGq = genotype.hasExtendedAttribute(GATKSVVCFConstants.PE_GQ) ? - Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.PE_GQ).toString()) : null; - Integer srGq = genotype.hasExtendedAttribute(GATKSVVCFConstants.SR_GQ) ? 
- Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.SR_GQ).toString()) : null; - if (peGq != null && srGq != null && peGq >= srGq) { - gt = peGt; - } else { - gt = srGt; - } + gb.alleles(Arrays.asList(variant.getAlternateAllele(0), variant.getAlternateAllele(0))); } - if (gt > 2) { - numGtOver2++; + updatedGenotypes.add(gb.make()); + } + genotypes = updatedGenotypes; + } + + updatedGenotypes = new ArrayList<>(genotypes.size()); + if (multiallelicFilter) { + builder.filter(GATKSVVCFConstants.MULTIALLELIC); + for (Genotype genotype : genotypes) { + GenotypeBuilder gb = new GenotypeBuilder(genotype); + gb.noGQ(); + gb.alleles(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL)); + gb.attribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT, genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN)); + gb.attribute(GATKSVVCFConstants.COPY_NUMBER_QUALITY_FORMAT, genotype.getExtendedAttribute(GATKSVVCFConstants.RD_GQ)); + updatedGenotypes.add(gb.make()); + } + builder.alleles(Arrays.asList(variant.getReference(), Allele.create("<" + GATKSVVCFConstants.CNV + ">", false))); + builder.attribute(GATKSVVCFConstants.SVTYPE, GATKSVVCFConstants.CNV); + genotypes = updatedGenotypes; + } + + return genotypes; + } + + private List processLargeDuplications(final VariantContext variant, final VariantContextBuilder builder, List genotypes) { + if (!variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, "").equals(GATKSVVCFConstants.SYMB_ALT_STRING_DUP)) { + return genotypes; + } + + boolean multiallelicFilter = false; + if (variant.getEnd() - variant.getStart() >= MIN_LARGE_EVENT_SIZE) { + Map sampleRdCn = new HashMap<>(); + for (Genotype genotype : genotypes) { + if (!outlierSamples.contains(genotype.getSampleName()) && genotype.hasExtendedAttribute(GATKSVVCFConstants.RD_CN)) { + sampleRdCn.put(genotype.getSampleName(), Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN).toString())); } } - if (numGtOver2 > maxVF) { - isMultiGeno = true; + if 
(sampleRdCn.values().stream().filter(value -> value > 4).count() > maxVF + && sampleRdCn.values().stream().filter(value -> (value < 1 || value > 4)).count() > maxVF + && sampleRdCn.values().stream().filter(value -> (value < 1 || value > 4)).count() > 4) { + multiallelicFilter = true; } } - if (isRevisedEvent) { - if (isMultiGeno) { - builder.attribute(GATKSVVCFConstants.MULTI_GENO, true); + boolean gt5kbFilter = false; + if (!genotypes.stream().allMatch(g -> g.getAlleles().size() > 2)) { + gt5kbFilter = true; + } else if (variant.getEnd() - variant.getStart() >= MIN_MULTIALLELIC_EVENT_SIZE && !multiallelicFilter) { + gt5kbFilter = true; + } + + List updatedGenotypes = new ArrayList<>(genotypes.size()); + if (gt5kbFilter) { + for (Genotype genotype : genotypes) { + GenotypeBuilder gb = new GenotypeBuilder(genotype); + if (Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN, 0).toString()) <= 2) { + gb.alleles(Arrays.asList(variant.getReference(), variant.getReference())); + } else if (Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN, 0).toString()) == 3) { + gb.alleles(Arrays.asList(variant.getReference(), variant.getAlternateAllele(0))); + } else { + gb.alleles(Arrays.asList(variant.getAlternateAllele(0), variant.getAlternateAllele(0))); + } + updatedGenotypes.add(gb.make()); } - vcfWriter.add(builder.make()); + genotypes = updatedGenotypes; + } + + updatedGenotypes = new ArrayList<>(genotypes.size()); + if (multiallelicFilter) { + builder.filter(GATKSVVCFConstants.MULTIALLELIC); + for (Genotype genotype : genotypes) { + GenotypeBuilder gb = new GenotypeBuilder(genotype); + gb.noGQ(); + gb.alleles(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL)); + gb.attribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT, genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN)); + gb.attribute(GATKSVVCFConstants.COPY_NUMBER_QUALITY_FORMAT, genotype.getExtendedAttribute(GATKSVVCFConstants.RD_GQ)); + updatedGenotypes.add(gb.make()); + } + 
builder.alleles(Arrays.asList(variant.getReference(), Allele.create("<" + GATKSVVCFConstants.CNV + ">", false))); + builder.attribute(GATKSVVCFConstants.SVTYPE, GATKSVVCFConstants.CNV); + genotypes = updatedGenotypes; } - // TODO: Sex Revisions + return genotypes; } - private Integer getIntegerAttribute(final Genotype genotype, final String attributeName) { - if (genotype.hasExtendedAttribute(attributeName)) { - Object attr = genotype.getExtendedAttribute(attributeName); - if (attr instanceof Integer) { - return (Integer) attr; - } else if (attr instanceof String) { - try { - return Integer.parseInt((String) attr); - } catch (NumberFormatException e) { - return null; + private List processRevisedSex(final VariantContext variant, List genotypes) { + if (!variant.getAttributeAsBoolean(GATKSVVCFConstants.REVISED_EVENT, false)) { + return genotypes; + } + + List updatedGenotypes = new ArrayList<>(genotypes.size()); + for (Genotype genotype : genotypes) { + if (Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN, 0).toString()) > 0) { + int newRdCn = Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN).toString()) - 1; + GenotypeBuilder gb = new GenotypeBuilder(genotype); + gb.attribute(GATKSVVCFConstants.RD_CN, newRdCn); + if (genotype.hasExtendedAttribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT)) { + gb.attribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT, newRdCn); } + updatedGenotypes.add(gb.make()); + } else { + updatedGenotypes.add(genotype); } } - return null; + return updatedGenotypes; } private Map> readRevisedEvents(final GATKPath filePath) { From 31f70321562031a05e02894bbb9f903a95daa67e Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Sat, 26 Oct 2024 09:32:51 -0400 Subject: [PATCH 21/58] Reformatting & restructuring --- .../hellbender/tools/walkers/sv/SVCleanPt1a.java | 7 +------ .../hellbender/tools/walkers/sv/SVCleanPt1b.java | 4 +--- .../hellbender/tools/walkers/sv/SVCleanPt2.java | 5 +---- 
.../hellbender/tools/walkers/sv/SVCleanPt4.java | 6 +----- 4 files changed, 4 insertions(+), 18 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java index f0f10fe7aa9..7504969632e 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java @@ -22,7 +22,6 @@ import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants; import org.broadinstitute.hellbender.utils.tsv.TableUtils; import org.broadinstitute.hellbender.utils.tsv.TableReader; -import org.broadinstitute.hellbender.utils.variant.GATKSVVariantContextUtils; import java.io.IOException; import java.nio.file.Path; @@ -56,11 +55,7 @@ * *

          Usage Example

          *
          - *     gatk SVCleanPt1a \
          - *       -V input.vcf.gz \
          - *       -O output.vcf.gz
          - *       --fail-list background_fail.txt
          - *       --pass-list bothsides_pass.txt
          + *     TODO
            * 
          * *

          Processing Steps

          diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1b.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1b.java index 1579848bfe6..26cc47b2c82 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1b.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1b.java @@ -51,9 +51,7 @@ * *

          Usage Example

          *
          - *     gatk SVCleanPt1b \
          - *       -V input.vcf.gz \
          - *       -O output.vcf.gz
          + *     TODO
            * 
          * *

          Processing Steps

          diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt2.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt2.java index 409c3d418f7..e6010b24405 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt2.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt2.java @@ -48,10 +48,7 @@ * *

          Usage Example

          *
          - *     gatk SVCleanPt2 \
          - *       -V input.vcf.gz \
          - *       --sample-list samples.txt \
          - * 	     --output-prefix result
          + *     TODO
            * 
          * *

          Processing Steps

          diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java index 0842e586c9a..45e02aedfd7 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java @@ -16,8 +16,6 @@ import org.broadinstitute.hellbender.cmdline.programgroups.StructuralVariantDiscoveryProgramGroup; import org.broadinstitute.hellbender.engine.*; import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants; -import org.broadinstitute.hellbender.utils.variant.GATKSVVariantContextUtils; -import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants; import java.io.BufferedReader; import java.io.FileReader; @@ -49,9 +47,7 @@ * *

          Usage Example

          *
          - *     gatk SVCleanPt4 \
          - *       -V input.vcf.gz \
          - *       --revised-cn-list revised.txt \
          + *     TODO
            * 
          * *

          Processing Steps

          From 2b97b7b4f06236bba4b9b0570ffbd2ed9ed99667 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Mon, 28 Oct 2024 13:35:00 -0400 Subject: [PATCH 22/58] Completed CleanVcf4 / implemented skeleton walker for CleanVcf5 --- .../spark/sv/utils/GATKSVVCFConstants.java | 10 ++- .../tools/walkers/sv/SVCleanPt4.java | 76 ++++++++++++++----- .../tools/walkers/sv/SVCleanPt5.java | 66 +++------------- 3 files changed, 73 insertions(+), 79 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java index 6dc216090af..f4d1f1be600 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java @@ -5,8 +5,9 @@ import java.util.Arrays; import java.util.List; +import java.util.Set; +import java.util.HashSet; import java.util.Map; - import static java.util.Map.entry; @@ -170,6 +171,7 @@ public enum ComplexVariantSubtype { // CleanPt4 public static final String PESR_GT_OVERDISPERSION = "PESR_GT_OVERDISPERSION"; + public static final String NO_CALLED_SAMPLES = "NO_CALLED_SAMPLES"; public static final String GT = "GT"; public static final String GQ = "GQ"; public static final String PE_GT = "PE_GT"; @@ -177,6 +179,12 @@ public enum ComplexVariantSubtype { public static final String PE_GQ = "PE_GQ"; public static final String SR_GQ = "SR_GQ"; public static final String CNV = "CNV"; + public static final Set> BIALLELIC_GTS = new HashSet<>(Arrays.asList( + Arrays.asList(0, 0), + Arrays.asList(1, 1), + Arrays.asList(0, 1), + Arrays.asList(null, null) + )); // Clustering public static final String CLUSTER_MEMBER_IDS_KEY = "MEMBERS"; diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java index 
45e02aedfd7..41bcdc97cbe 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java @@ -1,10 +1,6 @@ package org.broadinstitute.hellbender.tools.walkers.sv; -import htsjdk.variant.variantcontext.Allele; -import htsjdk.variant.variantcontext.Genotype; -import htsjdk.variant.variantcontext.GenotypeBuilder; -import htsjdk.variant.variantcontext.VariantContext; -import htsjdk.variant.variantcontext.VariantContextBuilder; +import htsjdk.variant.variantcontext.*; import htsjdk.variant.variantcontext.writer.VariantContextWriter; import htsjdk.variant.vcf.*; @@ -16,6 +12,7 @@ import org.broadinstitute.hellbender.cmdline.programgroups.StructuralVariantDiscoveryProgramGroup; import org.broadinstitute.hellbender.engine.*; import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants; +import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants; import java.io.BufferedReader; import java.io.FileReader; @@ -23,7 +20,15 @@ import java.nio.file.Files; import java.nio.file.Paths; -import java.util.*; + +import java.util.Arrays; +import java.util.List; +import java.util.ArrayList; +import java.util.Set; +import java.util.Map; +import java.util.HashSet; +import java.util.HashMap; +import java.util.stream.Collectors; /** * Completes an initial series of cleaning steps for a VCF produced by the GATK-SV pipeline. 
@@ -68,7 +73,6 @@ public class SVCleanPt4 extends VariantWalker { public static final String REVISED_CN_LIST_LONG_NAME = "revised-cn-list"; public static final String OUTLIERS_LIST_LONG_NAME = "outliers-list"; public static final String OUTPUT_MULTIALLELIC_VCF_LONG_NAME = "output-multiallelic-vcf"; - public static final String OUTPUT_NO_CALLS_VCF_LONG_NAME = "output-no-calls-vcf"; @Argument( fullName = REVISED_CN_LIST_LONG_NAME, @@ -96,17 +100,10 @@ public class SVCleanPt4 extends VariantWalker { doc = "Output multiallelic VCF name" ) private GATKPath outputVcfMultiallelic; - - @Argument( - fullName = OUTPUT_NO_CALLS_VCF_LONG_NAME, - doc = "Output no sample calls VCF name" - ) - private GATKPath outputVcfNoCalls; */ private VariantContextWriter vcfWriter; private VariantContextWriter vcfWriterMultiallelic; - private VariantContextWriter vcfWriterNoCalls; private Map> revisedCopyNumbers; private Set outlierSamples; @@ -158,17 +155,17 @@ public void onTraversalStart() { header.addMetaDataLine(new VCFFormatHeaderLine(GATKSVVCFConstants.COPY_NUMBER_FORMAT, 1, VCFHeaderLineType.Integer, "Predicted copy state")); header.addMetaDataLine(new VCFFormatHeaderLine(GATKSVVCFConstants.COPY_NUMBER_QUALITY_FORMAT, 1, VCFHeaderLineType.Integer, "Read-depth genotype quality")); header.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.PESR_GT_OVERDISPERSION, 0, VCFHeaderLineType.Flag, "High PESR dispersion count")); + header.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.NO_CALLED_SAMPLES, 0, VCFHeaderLineType.Flag, "No samples called")); vcfWriter.writeHeader(header); // Create supporting output VCFs /* vcfWriterMultiallelic = createVCFWriter(outputVcfMultiallelic); vcfWriterMultiallelic.writeHeader(header); - vcfWriterNoCalls = createVCFWriter(outputVcfNoCalls); - vcfWriterNoCalls.writeHeader(header); */ } + @Override public void closeTool() { if (vcfWriter != null) { vcfWriter.close(); @@ -177,10 +174,6 @@ public void closeTool() { if (vcfWriterMultiallelic != 
null) { vcfWriterMultiallelic.close(); } - - if (vcfWriterNoCalls != null) { - vcfWriterNoCalls.close(); - } } @Override @@ -202,6 +195,7 @@ public void apply(final VariantContext variant, final ReadsContext readsContext, genotypes = processLargeDeletions(variant, builder, genotypes); genotypes = processLargeDuplications(variant, builder, genotypes); genotypes = processRevisedSex(variant, genotypes); + processNoCalls(variant, builder, genotypes); // Build genotypes builder.genotypes(genotypes); @@ -284,7 +278,7 @@ private List processLargeDeletions(final VariantContext variant, final } boolean gt5kbFilter = false; - if (!genotypes.stream().allMatch(g -> g.getAlleles().size() > 2)) { // TODO: Verify logic for allele count > 2 + if (genotypes.stream().anyMatch(g -> !isBiallelic(g))) { gt5kbFilter = true; } else if (variant.getEnd() - variant.getStart() >= MIN_MULTIALLELIC_EVENT_SIZE && !multiallelicFilter) { gt5kbFilter = true; @@ -409,6 +403,46 @@ private List processRevisedSex(final VariantContext variant, List genotypes) { + boolean hasCalledSample = false; + + for (Genotype genotype : genotypes) { + if (!isNoCallGt(genotype.getAlleles())) { + hasCalledSample = true; + break; + } + } + + if (!hasCalledSample && builder.getAttributes().getOrDefault(GATKSVVCFConstants.SVTYPE, "").equals(GATKSVVCFConstants.CNV)) { + for (Genotype genotype : genotypes) { + Integer cn = Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.CNV, 2).toString()); + if (cn != null && cn != 2) { + + hasCalledSample = true; + break; + } + } + } + + if (!hasCalledSample) { + builder.attribute(GATKSVVCFConstants.NO_CALLED_SAMPLES, true); + } + } + + private boolean isBiallelic(Genotype genotype) { + List gt = genotype.getAlleles().stream() + .map(allele -> allele.isNoCall() ? null : allele.getDisplayString().equals("1") ? 
1 : 0) + .toList(); + return GATKSVVCFConstants.BIALLELIC_GTS.contains(gt); + } + + private boolean isNoCallGt(List alleles) { + if (alleles.size() == 1 && alleles.get(0).isReference()) return true; + if (alleles.size() == 2 && alleles.get(0).isReference() && alleles.get(1).isReference()) return true; + if (alleles.size() == 1 && alleles.get(0).isNoCall()) return true; + return false; + } + private Map> readRevisedEvents(final GATKPath filePath) { try (BufferedReader reader = new BufferedReader(new FileReader(filePath.toPath().toFile()))) { final Map> result = new HashMap<>(); diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt5.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt5.java index b84d3f95633..e71a3d4655d 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt5.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt5.java @@ -4,6 +4,7 @@ import htsjdk.variant.variantcontext.VariantContext; import htsjdk.samtools.util.OverlapDetector; +import htsjdk.variant.variantcontext.VariantContextBuilder; import org.broadinstitute.barclay.argparser.Argument; import org.broadinstitute.barclay.argparser.BetaFeature; import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; @@ -48,11 +49,7 @@ * *

          Usage Example

          *
          - *     gatk SVCleanPt5 \
          - *       -V input.vcf.gz \
          - *       --sample-list samples.txt \
          - * 	     --multi-cnv-list multi.cnvs.txt
          - * 	     --output-prefix result
          + *     TODO
            * 
          * *

          Processing Steps

          @@ -69,7 +66,8 @@ ) @BetaFeature @DocumentedFeature -public class SVCleanPt5 extends MultiplePassVariantWalker { // MultiVariantWalker? +public class SVCleanPt5 extends MultiplePassVariantWalker { + @Override protected int numberOfPasses() { return 2; @@ -77,70 +75,24 @@ protected int numberOfPasses() { @Override public void onTraversalStart() { - /* - try { - revisedCnWriter = Files.newBufferedWriter(Paths.get(outputPrefix + ".txt")); - - sampleWhitelist = new HashSet<>(Files.readAllLines(sampleListPath.toPath())); - multiallelicCnvs = new HashSet<>(Files.readAllLines(multiCnvPath.toPath())); - } catch (IOException e) { - throw new RuntimeException("Error reading input file", e); - } - */ return; } @Override public Object onTraversalSuccess() { - /* - try { - List variantIDs = new ArrayList<>(revisedCopyNumbers.keySet()); - Collections.sort(variantIDs); - - for (String variantID : variantIDs) { - Map sampleMap = revisedCopyNumbers.get(variantID); - - List samples = new ArrayList<>(sampleMap.keySet()); - Collections.sort(samples); - - for (String sample : samples) { - int rdCn = sampleMap.get(sample); - revisedCnWriter.write(variantID + "\t" + sample + "\t" + rdCn); - revisedCnWriter.newLine(); - } - } - - if (revisedCnWriter != null) { - revisedCnWriter.close(); - } - - return null; - } catch (IOException e) { - throw new RuntimeException("Error writing multiallelic CNVs", e); - } - */ return null; } @Override - protected void nthPassApply(VariantContext variant, ReadsContext readsContext, ReferenceContext referenceContext, FeatureContext featureContext, int n) { - /* - switch (n) { - case 0: - firstPassApply(variant); - break; - case 1: - secondPassApply(variant); - break; - default: - throw new IllegalArgumentException("Invalid pass number: " + n); - } - */ + public void closeTool() { return; } @Override - protected void afterNthPass(int n) { + public void nthPassApply(final VariantContext variant, final ReadsContext readsContext, final 
ReferenceContext referenceContext, final FeatureContext featureContext, int n) { return; } + + @Override + protected void afterNthPass(final int n) {} } From 45443f9e95f6e0901d8c8989ab8f6b7e19115a14 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Wed, 30 Oct 2024 08:39:57 -0400 Subject: [PATCH 23/58] Revert SVCleanPt2 to use overlap buffer --- .../spark/sv/utils/GATKSVVCFConstants.java | 2 +- .../tools/walkers/sv/SVCleanPt1a.java | 2 +- .../tools/walkers/sv/SVCleanPt1b.java | 72 ++++++++++--------- .../tools/walkers/sv/SVCleanPt2.java | 53 +++++--------- .../tools/walkers/sv/SVCleanPt4.java | 72 +++++++++++-------- 5 files changed, 100 insertions(+), 101 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java index f4d1f1be600..37c3d0b4f39 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java @@ -8,6 +8,7 @@ import java.util.Set; import java.util.HashSet; import java.util.Map; + import static java.util.Map.entry; @@ -171,7 +172,6 @@ public enum ComplexVariantSubtype { // CleanPt4 public static final String PESR_GT_OVERDISPERSION = "PESR_GT_OVERDISPERSION"; - public static final String NO_CALLED_SAMPLES = "NO_CALLED_SAMPLES"; public static final String GT = "GT"; public static final String GQ = "GQ"; public static final String PE_GT = "PE_GT"; diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java index 7504969632e..58285fa12d7 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java @@ -125,7 +125,7 @@ public void onTraversalStart() { failSet = 
readLastColumn(failList); passSet = readLastColumn(passList); - // Create header without the 'UNRESOLVED' INFO line + // Filter specific header lines final VCFHeader header = getHeaderForVariants(); final Set newHeaderLines = new LinkedHashSet<>(); for (final VCFHeaderLine line : header.getMetaDataInInputOrder()) { diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1b.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1b.java index 26cc47b2c82..3b5cb7343f8 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1b.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1b.java @@ -83,6 +83,9 @@ public class SVCleanPt1b extends MultiplePassVariantWalker { final private Map> revisedEventsFiltered = new HashMap<>(); final private Map> revisedRdCn = new HashMap<>(); + private static final int MIN_VARIANT_SIZE_CNV = 1000; + private static final int MIN_VARIANT_SIZE = 5000; + @Override protected int numberOfPasses() { return 3; @@ -128,24 +131,37 @@ protected void afterNthPass(final int n) { } public void firstPassApply(final VariantContext variant) { - final String svType = variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); - final boolean isDelDup = svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DUP) || svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL); - final boolean isLarge = Math.abs(variant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)) >= 5000; - if (isDelDup && isLarge) { - overlappingVariantsBuffer.removeIf(vc -> vc.getEnd() < variant.getStart()); - for (VariantContext bufferedVariant : overlappingVariantsBuffer) { - if (overlaps(bufferedVariant, variant)) { - processOverlap(bufferedVariant, variant); - } + if (!isDelDup(variant) || !isLarge(variant, MIN_VARIANT_SIZE)) { + return; + } + + // Process overlaps with variants in the buffer + overlappingVariantsBuffer.removeIf(vc -> !vc.getContig().equals(variant.getContig()) || vc.getEnd() 
< variant.getStart()); + for (VariantContext bufferedVariant : overlappingVariantsBuffer) { + if (overlaps(bufferedVariant, variant)) { + processOverlap(bufferedVariant, variant); } - overlappingVariantsBuffer.add(variant); } + overlappingVariantsBuffer.add(variant); } public void secondPassApply(final VariantContext variant) { - if (revisedEventsFiltered.containsKey(variant.getID())) { - initializeRdCn(variant); + if (!revisedEventsFiltered.containsKey(variant.getID())) { + return; } + + // Initialize data structures + final String variantId = variant.getID(); + final Set samples = revisedEventsFiltered.get(variantId); + final Map variantRdCn = new HashMap<>(); + + // Initialize revisedRdCn value for each variant + for (final String sampleName : samples) { + final Genotype genotype = variant.getGenotype(sampleName); + final String rdCn = (String) genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN); + variantRdCn.put(sampleName, Integer.parseInt(rdCn)); + } + revisedRdCn.put(variantId, variantRdCn); } public void thirdPassApply(final VariantContext variant) { @@ -153,11 +169,7 @@ public void thirdPassApply(final VariantContext variant) { if (revisedEventsAll.containsKey(variant.getID())) { processVariant(builder, variant); } - - final String svType = variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); - final boolean isDelDup = svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DUP) || svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL); - final boolean isLarge = variant.getEnd() - variant.getStart() >= 1000; - if (isDelDup && isLarge) { + if (isDelDup(variant) && isLarge(variant, MIN_VARIANT_SIZE_CNV)) { processCnvs(builder, variant); } vcfWriter.add(builder.make()); @@ -221,21 +233,6 @@ private void processCollectedVariants() { } } - private void initializeRdCn(final VariantContext variant) { - // Initialize data structures - final String variantId = variant.getID(); - final Set samples = revisedEventsFiltered.get(variantId); - final Map variantRdCn 
= new HashMap<>(); - - // Initialize revisedRdCn value for each variant - for (final String sampleName : samples) { - final Genotype genotype = variant.getGenotype(sampleName); - final String rdCn = (String) genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN); - variantRdCn.put(sampleName, Integer.parseInt(rdCn)); - } - revisedRdCn.put(variantId, variantRdCn); - } - private void processVariant(final VariantContextBuilder builder, final VariantContext variant) { // Initialize data structures final String variantId = variant.getID(); @@ -264,6 +261,7 @@ private void processVariant(final VariantContextBuilder builder, final VariantCo } if (newVal != -1) { + System.out.println(variantId); final GenotypeBuilder gb = new GenotypeBuilder(oldGenotype); gb.alleles(Arrays.asList(variant.getReference(), variant.getAlternateAllele(0))); gb.GQ(Integer.parseInt((String) oldGenotype.getExtendedAttribute(GATKSVVCFConstants.RD_GQ))); @@ -291,6 +289,16 @@ private void processCnvs(final VariantContextBuilder builder, final VariantConte } } + private boolean isDelDup(final VariantContext variant) { + String svType = variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); + return svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL) || svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DUP); + } + + private boolean isLarge(final VariantContext variant, final int minSize) { + int variantLength = Math.abs(variant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)); + return variantLength >= minSize; + } + private boolean overlaps(final VariantContext v1, final VariantContext v2) { return v1.getContig().equals(v2.getContig()) && v1.getStart() <= v2.getEnd() && v2.getStart() <= v1.getEnd(); } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt2.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt2.java index e6010b24405..4dc71825008 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt2.java +++ 
b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt2.java @@ -2,7 +2,6 @@ import htsjdk.variant.variantcontext.Genotype; import htsjdk.variant.variantcontext.VariantContext; -import htsjdk.samtools.util.OverlapDetector; import org.broadinstitute.barclay.argparser.Argument; import org.broadinstitute.barclay.argparser.BetaFeature; @@ -65,7 +64,7 @@ ) @BetaFeature @DocumentedFeature -public class SVCleanPt2 extends MultiplePassVariantWalker { +public class SVCleanPt2 extends VariantWalker { public static final String SAMPLE_LIST_LONG_NAME = "sample-list"; public static final String OUTPUT_PREFIX_LONG_NAME = "output-prefix"; @@ -86,17 +85,12 @@ public class SVCleanPt2 extends MultiplePassVariantWalker { private Set sampleWhitelist; private final Map> abnormalRdCn = new HashMap<>(); - private final OverlapDetector overlapDetector = new OverlapDetector<>(0, 0); + private final List overlappingVariantsBuffer = new ArrayList<>(); private final Map> revisedCopyNumbers = new HashMap<>(); private final Set revisedComplete = new HashSet<>(); private static final int MIN_VARIANT_SIZE = 5000; - @Override - protected int numberOfPasses() { - return 2; - } - @Override public void onTraversalStart() { try { @@ -138,28 +132,12 @@ public Object onTraversalSuccess() { } @Override - protected void nthPassApply(final VariantContext variant, final ReadsContext readsContext, ReferenceContext referenceContext, FeatureContext featureContext, int n) { + public void apply(final VariantContext variant, final ReadsContext readsContext, ReferenceContext referenceContext, FeatureContext featureContext) { // Skip if not expected SVTYPE or below SVLEN threshold - if (!isDelDup(variant) || !isLargeVariant(variant, MIN_VARIANT_SIZE)) { + if (!isDelDup(variant) || !isLarge(variant, MIN_VARIANT_SIZE)) { return; } - switch (n) { - case 0: - firstPassApply(variant); - break; - case 1: - secondPassApply(variant); - break; - default: - throw new IllegalArgumentException("Invalid pass 
number: " + n); - } - } - - @Override - protected void afterNthPass(final int n) {} - - private void firstPassApply(final VariantContext variant) { // Flag sample as having an abnormal copy number if it passes certain conditions for (String sample : variant.getSampleNames()) { Genotype genotype = variant.getGenotype(sample); @@ -174,18 +152,15 @@ private void firstPassApply(final VariantContext variant) { } } - // Add variant to overlap detector - overlapDetector.addLhs(variant, variant); - } - - private void secondPassApply(final VariantContext variant) { - // Check if copy number needs to be adjusted for samples within overlapping variants - Set overlappingVariants = overlapDetector.getOverlaps(variant); - for (VariantContext otherVariant : overlappingVariants) { - if (!variant.getID().equals(otherVariant.getID())) { - adjustCopyNumber(variant, otherVariant); + // Process overlaps with variants in the buffer + overlappingVariantsBuffer.removeIf(vc -> !vc.getContig().equals(variant.getContig()) || vc.getEnd() < variant.getStart()); + for (VariantContext bufferedVariant : overlappingVariantsBuffer) { + if (overlaps(variant, bufferedVariant)) { + adjustCopyNumber(bufferedVariant, variant); + adjustCopyNumber(variant, bufferedVariant); } } + overlappingVariantsBuffer.add(variant); } private void adjustCopyNumber(final VariantContext v1, final VariantContext v2) { @@ -288,12 +263,16 @@ else if (support1.contains(GATKSVVCFConstants.EV_VALUES.get(1)) } } + private boolean overlaps(final VariantContext v1, final VariantContext v2) { + return v1.getContig().equals(v2.getContig()) && v1.getStart() <= v2.getEnd() && v2.getStart() <= v1.getEnd(); + } + private boolean isDelDup(final VariantContext variant) { String svType = variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); return svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL) || svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DUP); } - private boolean isLargeVariant(final VariantContext variant, final int 
minSize) { + private boolean isLarge(final VariantContext variant, final int minSize) { int variantLength = Math.abs(variant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)); return variantLength >= minSize; } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java index 41bcdc97cbe..e2800c67ba3 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java @@ -12,6 +12,7 @@ import org.broadinstitute.hellbender.cmdline.programgroups.StructuralVariantDiscoveryProgramGroup; import org.broadinstitute.hellbender.engine.*; import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants; +import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFHeaderLines; import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants; import java.io.BufferedReader; @@ -21,13 +22,7 @@ import java.nio.file.Files; import java.nio.file.Paths; -import java.util.Arrays; -import java.util.List; -import java.util.ArrayList; -import java.util.Set; -import java.util.Map; -import java.util.HashSet; -import java.util.HashMap; +import java.util.*; import java.util.stream.Collectors; /** @@ -148,15 +143,27 @@ public void onTraversalStart() { maxVF = Math.max((int) ((getHeaderForVariants().getGenotypeSamples().size() - outlierSamples.size()) * 0.01), 2); recordIdx = 0; - // Create primary output VCF - vcfWriter = createVCFWriter(outputVcf); + // Filter specific header lines final VCFHeader header = getHeaderForVariants(); - header.addMetaDataLine(new VCFFilterHeaderLine(GATKSVVCFConstants.MULTIALLELIC, "Multiallelic site")); - header.addMetaDataLine(new VCFFormatHeaderLine(GATKSVVCFConstants.COPY_NUMBER_FORMAT, 1, VCFHeaderLineType.Integer, "Predicted copy state")); - header.addMetaDataLine(new 
VCFFormatHeaderLine(GATKSVVCFConstants.COPY_NUMBER_QUALITY_FORMAT, 1, VCFHeaderLineType.Integer, "Read-depth genotype quality")); - header.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.PESR_GT_OVERDISPERSION, 0, VCFHeaderLineType.Flag, "High PESR dispersion count")); - header.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.NO_CALLED_SAMPLES, 0, VCFHeaderLineType.Flag, "No samples called")); - vcfWriter.writeHeader(header); + final Set newHeaderLines = new LinkedHashSet<>(); + for (final VCFHeaderLine line : header.getMetaDataInInputOrder()) { + if (!(line instanceof VCFInfoHeaderLine) + || (!((VCFInfoHeaderLine) line).getID().equals(GATKSVVCFConstants.MULTI_CNV) + && !((VCFInfoHeaderLine) line).getID().equals(GATKSVVCFConstants.REVISED_EVENT))) { + newHeaderLines.add(line); + } + } + + // Add new header lines + VCFHeader newHeader = new VCFHeader(newHeaderLines, header.getGenotypeSamples()); + newHeader.addMetaDataLine(new VCFFilterHeaderLine(GATKSVVCFConstants.MULTIALLELIC, "Multiallelic site")); + newHeader.addMetaDataLine(new VCFFormatHeaderLine(GATKSVVCFConstants.COPY_NUMBER_FORMAT, 1, VCFHeaderLineType.Integer, "Predicted copy state")); + newHeader.addMetaDataLine(new VCFFormatHeaderLine(GATKSVVCFConstants.COPY_NUMBER_QUALITY_FORMAT, 1, VCFHeaderLineType.Integer, "Read-depth genotype quality")); + newHeader.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.PESR_GT_OVERDISPERSION, 0, VCFHeaderLineType.Flag, "High PESR dispersion count")); + + // Write header + vcfWriter = createVCFWriter(outputVcf); + vcfWriter.writeHeader(newHeader); // Create supporting output VCFs /* @@ -182,7 +189,7 @@ public void apply(final VariantContext variant, final ReadsContext readsContext, recordIdx++; VariantContextBuilder builder = new VariantContextBuilder(variant); - // Exit if outside batch range // TODO: Does this have to move to after processRevisedCn? 
+ // Exit if outside batch range if (recordIdx < recordStart || recordIdx >= recordEnd) { vcfWriter.add(builder.make()); return; @@ -195,11 +202,13 @@ public void apply(final VariantContext variant, final ReadsContext readsContext, genotypes = processLargeDeletions(variant, builder, genotypes); genotypes = processLargeDuplications(variant, builder, genotypes); genotypes = processRevisedSex(variant, genotypes); - processNoCalls(variant, builder, genotypes); + processInfoFields(builder); // Build genotypes - builder.genotypes(genotypes); - vcfWriter.add(builder.make()); + if (isCalled(variant, builder, genotypes)) { + builder.genotypes(genotypes); + vcfWriter.add(builder.make()); + } } private List processRevisedCn(final VariantContext variant, final List genotypes) { @@ -403,30 +412,33 @@ private List processRevisedSex(final VariantContext variant, List genotypes) { - boolean hasCalledSample = false; + private void processInfoFields(final VariantContextBuilder builder) { + Map attributes = builder.getAttributes(); + if (attributes.containsKey(GATKSVVCFConstants.MULTI_CNV)) { + builder.rmAttribute(GATKSVVCFConstants.MULTI_CNV); + } + if (attributes.containsKey(GATKSVVCFConstants.REVISED_EVENT)) { + builder.rmAttribute(GATKSVVCFConstants.REVISED_EVENT); + } + } + public boolean isCalled(final VariantContext variant, final VariantContextBuilder builder, final List genotypes) { for (Genotype genotype : genotypes) { if (!isNoCallGt(genotype.getAlleles())) { - hasCalledSample = true; - break; + return true; } } - if (!hasCalledSample && builder.getAttributes().getOrDefault(GATKSVVCFConstants.SVTYPE, "").equals(GATKSVVCFConstants.CNV)) { + if (builder.getAttributes().getOrDefault(GATKSVVCFConstants.SVTYPE, "").equals(GATKSVVCFConstants.CNV)) { for (Genotype genotype : genotypes) { Integer cn = Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.CNV, 2).toString()); if (cn != null && cn != 2) { - - hasCalledSample = true; - break; + return true; } } } - if 
(!hasCalledSample) { - builder.attribute(GATKSVVCFConstants.NO_CALLED_SAMPLES, true); - } + return false; } private boolean isBiallelic(Genotype genotype) { From 7fbc3a2f43bc7e51053fe13322472dad9882d094 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Wed, 30 Oct 2024 14:03:20 -0400 Subject: [PATCH 24/58] Working implementation of SVCleanPt5 --- .../spark/sv/utils/GATKSVVCFConstants.java | 20 ++- .../tools/walkers/sv/SVCleanPt1a.java | 24 +-- .../tools/walkers/sv/SVCleanPt1b.java | 8 +- .../tools/walkers/sv/SVCleanPt4.java | 54 +++--- .../tools/walkers/sv/SVCleanPt5.java | 161 +++++++++++++++--- 5 files changed, 178 insertions(+), 89 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java index 37c3d0b4f39..ab2e03130a9 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java @@ -152,7 +152,7 @@ public enum ComplexVariantSubtype { public static final String LOW_QS_SCORE_FILTER_KEY = "LOW_QS"; public static final String FREQUENCY_FILTER_KEY = "FREQ"; - // CleanPt1a + // SVCleanPt1a public static final String EV = "EV"; public static final List EV_VALUES = Arrays.asList( null, "RD", "PE", "RD,PE", "SR", "RD,SR", "PE,SR", "RD,PE,SR" @@ -166,25 +166,35 @@ public enum ComplexVariantSubtype { public static final String REVISED_EVENT = "REVISED_EVENT"; public static final String RD_CN = "RD_CN"; - // CleanPt1b + // SVCleanPt1b public static final String RD_GQ = "RD_GQ"; public static final String MULTI_CNV = "MULTI_CNV"; - // CleanPt4 + // SVCleanPt4 public static final String PESR_GT_OVERDISPERSION = "PESR_GT_OVERDISPERSION"; - public static final String GT = "GT"; - public static final String GQ = "GQ"; public static final String PE_GT = "PE_GT"; public static final String SR_GT = 
"SR_GT"; public static final String PE_GQ = "PE_GQ"; public static final String SR_GQ = "SR_GQ"; public static final String CNV = "CNV"; + + // SVCleanPt5 + public static final String UNR = "UNR"; + public static final String EVENT = "EVENT"; public static final Set> BIALLELIC_GTS = new HashSet<>(Arrays.asList( Arrays.asList(0, 0), Arrays.asList(1, 1), Arrays.asList(0, 1), Arrays.asList(null, null) )); + public static final Set FILTER_VCF_LINES = new HashSet<>(Arrays.asList( + "CIPOS", "CIEND", "RMSSTD", "source", "bcftools", "GATKCommandLine", "#CHROM" + )); + + public static final Set FILTER_VCF_INFO_LINES = new HashSet<>(Arrays.asList( + GATKSVVCFConstants.UNRESOLVED, GATKSVVCFConstants.MULTIALLELIC, GATKSVVCFConstants.VAR_GQ, + GATKSVVCFConstants.MULTI_CNV, GATKSVVCFConstants.REVISED_EVENT, GATKSVVCFConstants.EVENT + )); // Clustering public static final String CLUSTER_MEMBER_IDS_KEY = "MEMBERS"; diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java index 58285fa12d7..2981c9dca72 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java @@ -125,28 +125,16 @@ public void onTraversalStart() { failSet = readLastColumn(failList); passSet = readLastColumn(passList); - // Filter specific header lines - final VCFHeader header = getHeaderForVariants(); - final Set newHeaderLines = new LinkedHashSet<>(); - for (final VCFHeaderLine line : header.getMetaDataInInputOrder()) { - if (!(line instanceof VCFInfoHeaderLine) - || (!((VCFInfoHeaderLine) line).getID().equals(GATKSVVCFConstants.UNRESOLVED) - && !((VCFInfoHeaderLine) line).getID().equals(GATKSVVCFConstants.MULTIALLELIC) - && !((VCFInfoHeaderLine) line).getID().equals(GATKSVVCFConstants.VAR_GQ))) { - newHeaderLines.add(line); - } - } - // Add new header lines - VCFHeader newHeader = new 
VCFHeader(newHeaderLines, header.getGenotypeSamples()); - newHeader.addMetaDataLine(new VCFFilterHeaderLine(GATKSVVCFConstants.UNRESOLVED, "Variant is unresolved")); - newHeader.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.HIGH_SR_BACKGROUND, 0, VCFHeaderLineType.Flag, "High number of SR splits in background samples indicating messy region")); - newHeader.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.BOTHSIDES_SUPPORT, 0, VCFHeaderLineType.Flag, "Variant has read-level support for both sides of breakpoint")); - newHeader.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.REVISED_EVENT, 0, VCFHeaderLineType.Flag, "Variant has been revised due to a copy number mismatch")); + VCFHeader header = getHeaderForVariants(); + header.addMetaDataLine(new VCFFilterHeaderLine(GATKSVVCFConstants.UNRESOLVED, "Variant is unresolved")); + header.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.HIGH_SR_BACKGROUND, 0, VCFHeaderLineType.Flag, "High number of SR splits in background samples indicating messy region")); + header.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.BOTHSIDES_SUPPORT, 0, VCFHeaderLineType.Flag, "Variant has read-level support for both sides of breakpoint")); + header.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.REVISED_EVENT, 0, VCFHeaderLineType.Flag, "Variant has been revised due to a copy number mismatch")); // Write header vcfWriter = createVCFWriter(outputVcf); - vcfWriter.writeHeader(newHeader); + vcfWriter.writeHeader(header); } @Override diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1b.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1b.java index 3b5cb7343f8..7399d06cf9d 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1b.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1b.java @@ -5,10 +5,10 @@ import htsjdk.variant.variantcontext.VariantContext; import 
htsjdk.variant.variantcontext.VariantContextBuilder; import htsjdk.variant.variantcontext.writer.VariantContextWriter; - import htsjdk.variant.vcf.VCFHeader; import htsjdk.variant.vcf.VCFHeaderLineType; import htsjdk.variant.vcf.VCFInfoHeaderLine; + import org.broadinstitute.barclay.argparser.Argument; import org.broadinstitute.barclay.argparser.BetaFeature; import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; @@ -79,9 +79,9 @@ public class SVCleanPt1b extends MultiplePassVariantWalker { private VariantContextWriter vcfWriter; private final List overlappingVariantsBuffer = new ArrayList<>(); - final private Map>> revisedEventsAll = new HashMap<>(); - final private Map> revisedEventsFiltered = new HashMap<>(); - final private Map> revisedRdCn = new HashMap<>(); + private final Map>> revisedEventsAll = new HashMap<>(); + private final Map> revisedEventsFiltered = new HashMap<>(); + private final Map> revisedRdCn = new HashMap<>(); private static final int MIN_VARIANT_SIZE_CNV = 1000; private static final int MIN_VARIANT_SIZE = 5000; diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java index e2800c67ba3..d8e15b213bf 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java @@ -12,18 +12,19 @@ import org.broadinstitute.hellbender.cmdline.programgroups.StructuralVariantDiscoveryProgramGroup; import org.broadinstitute.hellbender.engine.*; import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants; -import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFHeaderLines; -import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants; import java.io.BufferedReader; import java.io.FileReader; import java.io.IOException; - import java.nio.file.Files; -import java.nio.file.Paths; -import 
java.util.*; -import java.util.stream.Collectors; +import java.util.Arrays; +import java.util.List; +import java.util.ArrayList; +import java.util.Set; +import java.util.Map; +import java.util.HashSet; +import java.util.HashMap; /** * Completes an initial series of cleaning steps for a VCF produced by the GATK-SV pipeline. @@ -67,7 +68,6 @@ public class SVCleanPt4 extends VariantWalker { public static final String REVISED_CN_LIST_LONG_NAME = "revised-cn-list"; public static final String OUTLIERS_LIST_LONG_NAME = "outliers-list"; - public static final String OUTPUT_MULTIALLELIC_VCF_LONG_NAME = "output-multiallelic-vcf"; @Argument( fullName = REVISED_CN_LIST_LONG_NAME, @@ -89,16 +89,7 @@ public class SVCleanPt4 extends VariantWalker { ) private GATKPath outputVcf; - /* - @Argument( - fullName = OUTPUT_MULTIALLELIC_VCF_LONG_NAME, - doc = "Output multiallelic VCF name" - ) - private GATKPath outputVcfMultiallelic; - */ - private VariantContextWriter vcfWriter; - private VariantContextWriter vcfWriterMultiallelic; private Map> revisedCopyNumbers; private Set outlierSamples; @@ -145,13 +136,16 @@ public void onTraversalStart() { // Filter specific header lines final VCFHeader header = getHeaderForVariants(); - final Set newHeaderLines = new LinkedHashSet<>(); + final Set newHeaderLines = new HashSet<>(); for (final VCFHeaderLine line : header.getMetaDataInInputOrder()) { - if (!(line instanceof VCFInfoHeaderLine) - || (!((VCFInfoHeaderLine) line).getID().equals(GATKSVVCFConstants.MULTI_CNV) - && !((VCFInfoHeaderLine) line).getID().equals(GATKSVVCFConstants.REVISED_EVENT))) { - newHeaderLines.add(line); + if (line instanceof VCFInfoHeaderLine) { + String id = ((VCFInfoHeaderLine) line).getID(); + if (id.equals(GATKSVVCFConstants.MULTI_CNV) || + id.equals(GATKSVVCFConstants.REVISED_EVENT)) { + continue; + } } + newHeaderLines.add(line); } // Add new header lines @@ -164,12 +158,6 @@ public void onTraversalStart() { // Write header vcfWriter = createVCFWriter(outputVcf); 
vcfWriter.writeHeader(newHeader); - - // Create supporting output VCFs - /* - vcfWriterMultiallelic = createVCFWriter(outputVcfMultiallelic); - vcfWriterMultiallelic.writeHeader(header); - */ } @Override @@ -177,10 +165,6 @@ public void closeTool() { if (vcfWriter != null) { vcfWriter.close(); } - - if (vcfWriterMultiallelic != null) { - vcfWriterMultiallelic.close(); - } } @Override @@ -205,7 +189,7 @@ public void apply(final VariantContext variant, final ReadsContext readsContext, processInfoFields(builder); // Build genotypes - if (isCalled(variant, builder, genotypes)) { + if (isCalled(builder, genotypes)) { builder.genotypes(genotypes); vcfWriter.add(builder.make()); } @@ -422,7 +406,7 @@ private void processInfoFields(final VariantContextBuilder builder) { } } - public boolean isCalled(final VariantContext variant, final VariantContextBuilder builder, final List genotypes) { + public boolean isCalled(final VariantContextBuilder builder, final List genotypes) { for (Genotype genotype : genotypes) { if (!isNoCallGt(genotype.getAlleles())) { return true; @@ -450,8 +434,8 @@ private boolean isBiallelic(Genotype genotype) { private boolean isNoCallGt(List alleles) { if (alleles.size() == 1 && alleles.get(0).isReference()) return true; - if (alleles.size() == 2 && alleles.get(0).isReference() && alleles.get(1).isReference()) return true; - if (alleles.size() == 1 && alleles.get(0).isNoCall()) return true; + else if (alleles.size() == 2 && alleles.get(0).isReference() && alleles.get(1).isReference()) return true; + else if (alleles.size() == 1 && alleles.get(0).isNoCall()) return true; return false; } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt5.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt5.java index e71a3d4655d..4edfac2d8b7 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt5.java +++ 
b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt5.java @@ -1,31 +1,24 @@ package org.broadinstitute.hellbender.tools.walkers.sv; +import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.variantcontext.Genotype; +import htsjdk.variant.variantcontext.GenotypeBuilder; import htsjdk.variant.variantcontext.VariantContext; -import htsjdk.samtools.util.OverlapDetector; - import htsjdk.variant.variantcontext.VariantContextBuilder; +import htsjdk.variant.variantcontext.writer.VariantContextWriter; + +import htsjdk.variant.vcf.*; import org.broadinstitute.barclay.argparser.Argument; import org.broadinstitute.barclay.argparser.BetaFeature; import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; import org.broadinstitute.barclay.help.DocumentedFeature; +import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; import org.broadinstitute.hellbender.cmdline.programgroups.StructuralVariantDiscoveryProgramGroup; import org.broadinstitute.hellbender.engine.*; import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants; +import org.broadinstitute.hellbender.utils.variant.GATKSVVariantContextUtils; -import java.io.BufferedWriter; -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Paths; - -import java.util.Arrays; -import java.util.List; -import java.util.ArrayList; -import java.util.Set; -import java.util.Map; -import java.util.HashSet; -import java.util.HashMap; -import java.util.Collections; +import java.util.*; /** * Completes an initial series of cleaning steps for a VCF produced by the GATK-SV pipeline. 
@@ -67,32 +60,146 @@ @BetaFeature @DocumentedFeature public class SVCleanPt5 extends MultiplePassVariantWalker { + @Argument( + fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, + shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, + doc = "Output VCF name" + ) + private GATKPath outputVcf; + + private VariantContextWriter vcfWriter; + + private final List overlappingVariantsBuffer = new ArrayList<>(); + private final Set filteredVariantIds = new HashSet<>(); @Override - protected int numberOfPasses() { - return 2; - } + protected int numberOfPasses() { return 2; } @Override public void onTraversalStart() { - return; - } + final VCFHeader header = getHeaderForVariants(); + final Set originalHeaderLines = header.getMetaDataInInputOrder(); - @Override - public Object onTraversalSuccess() { - return null; + // Add new header lines + final Set newHeaderLines = new HashSet<>(); + for (final VCFHeaderLine line : header.getMetaDataInInputOrder()) { + if (line instanceof VCFInfoHeaderLine) { + if (GATKSVVCFConstants.FILTER_VCF_INFO_LINES.contains(((VCFInfoHeaderLine) line).getID())) { + continue; + } + } else if (line instanceof VCFAltHeaderLine) { + if (((VCFAltHeaderLine) line).getID().equals(GATKSVVCFConstants.UNR)) { + continue; + } + } + if (GATKSVVCFConstants.FILTER_VCF_LINES.stream().anyMatch(line.toString()::contains)) { + continue; + } + newHeaderLines.add(line); + } + + // Write header + VCFHeader newHeader = new VCFHeader(newHeaderLines, header.getGenotypeSamples()); + vcfWriter = createVCFWriter(outputVcf); + vcfWriter.writeHeader(newHeader); } @Override public void closeTool() { - return; + if (vcfWriter != null) { + vcfWriter.close(); + } } @Override - public void nthPassApply(final VariantContext variant, final ReadsContext readsContext, final ReferenceContext referenceContext, final FeatureContext featureContext, int n) { - return; + protected void nthPassApply(final VariantContext variant, final ReadsContext readsContext, final 
ReferenceContext referenceContext, final FeatureContext featureContext, int n) { + switch (n) { + case 0: + firstPassApply(variant); + break; + case 1: + secondPassApply(variant); + break; + } } @Override - protected void afterNthPass(final int n) {} + protected void afterNthPass(int n) {} + + public void firstPassApply(final VariantContext variant) { + if (!variant.getFilters().contains(GATKSVVCFConstants.MULTIALLELIC)) { + return; + } + + overlappingVariantsBuffer.removeIf(vc -> !vc.getContig().equals(variant.getContig()) || vc.getEnd() < variant.getStart()); + for (VariantContext bufferedVariant : overlappingVariantsBuffer) { + if (overlaps(bufferedVariant, variant)) { + processVariantPair(bufferedVariant, variant); + processVariantPair(variant, bufferedVariant); + } + } + overlappingVariantsBuffer.add(variant); + } + + public void secondPassApply(final VariantContext variant) { + if (filteredVariantIds.contains(variant.getID())) { + System.out.println(variant.getID()); + return; + } + + VariantContextBuilder builder = new VariantContextBuilder(variant); + processSvType(variant, builder); + vcfWriter.add(builder.make()); + } + + private void processVariantPair(VariantContext largerVariant, VariantContext smallerVariant) { + int lengthLarger = largerVariant.getEnd() - largerVariant.getStart() + 1; + int lengthSmaller = smallerVariant.getEnd() - smallerVariant.getStart() + 1; + if (lengthLarger < lengthSmaller) { + return; + } + + int overlapStart = Math.max(largerVariant.getStart(), smallerVariant.getStart()); + int overlapEnd = Math.min(largerVariant.getEnd(), smallerVariant.getEnd()); + int overlapLength = overlapEnd - overlapStart + 1; + if (overlapLength <= 0) { + return; + } + + double smallCoverage = (double) overlapLength / lengthSmaller; + if (smallCoverage > 0.5) { + if (!filteredVariantIds.contains(largerVariant.getID())) { + filteredVariantIds.add(smallerVariant.getID()); + } + } + } + + private void processSvType(final VariantContext variant, final 
VariantContextBuilder builder) { + final String svType = variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, null); + boolean hasMobileElement = variant.getAlleles().stream() + .map(GATKSVVariantContextUtils::getSymbolicAlleleSymbols) + .flatMap(Arrays::stream) + .anyMatch(symbol -> symbol.equals(GATKSVVCFConstants.ME)); + if (svType == null || hasMobileElement) { + return; + } + + List genotypes = variant.getGenotypes(); + List updatedGenotypes = new ArrayList<>(genotypes.size()); + for (Genotype genotype : genotypes) { + GenotypeBuilder gb = new GenotypeBuilder(genotype); + gb.alleles(Arrays.asList(variant.getReference(), Allele.create("<" + svType + ">", false))); + updatedGenotypes.add(gb.make()); + } + + final Allele refAllele = variant.getReference(); + final Allele altAllele = Allele.create("<" + svType + ">", false); + List newAlleles = Arrays.asList(refAllele, altAllele); + builder.alleles(newAlleles); + builder.genotypes(updatedGenotypes); + } + + private boolean overlaps(final VariantContext v1, final VariantContext v2) { + return v1.getContig().equals(v2.getContig()) && v1.getStart() <= v2.getEnd() && v2.getStart() <= v1.getEnd(); + } } From 181b3524b5ecedd6d3224bb659d012433d1e7fbb Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Wed, 30 Oct 2024 20:02:37 -0400 Subject: [PATCH 25/58] Modified param name for chrX/chrY --- .../hellbender/tools/walkers/sv/SVCleanPt1a.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java index 2981c9dca72..749e514d9ab 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java @@ -73,8 +73,8 @@ @BetaFeature @DocumentedFeature public final class SVCleanPt1a extends VariantWalker { - public static final String CHRX_LONG_NAME = 
"chrX"; - public static final String CHRY_LONG_NAME = "chrY"; + public static final String CHRX_LONG_NAME = "chr-X"; + public static final String CHRY_LONG_NAME = "chr-Y"; public static final String FAIL_LIST_LONG_NAME = "fail-list"; public static final String PASS_LIST_LONG_NAME = "pass-list"; From 3eb5c3d38d6c8c65e71f29abe9346c98bfbb1cbe Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Thu, 31 Oct 2024 09:03:38 -0400 Subject: [PATCH 26/58] Changes to test --- .../tools/walkers/sv/SVCleanPt1a.java | 27 ++++++++++++++++--- .../tools/walkers/sv/SVCleanPt1b.java | 4 --- .../tools/walkers/sv/SVCleanPt5.java | 1 - 3 files changed, 24 insertions(+), 8 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java index 749e514d9ab..584de4cd7a4 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java @@ -7,7 +7,6 @@ import htsjdk.variant.variantcontext.VariantContextBuilder; import htsjdk.variant.variantcontext.writer.VariantContextWriter; import htsjdk.variant.vcf.VCFHeader; -import htsjdk.variant.vcf.VCFHeaderLine; import htsjdk.variant.vcf.VCFInfoHeaderLine; import htsjdk.variant.vcf.VCFFilterHeaderLine; import htsjdk.variant.vcf.VCFHeaderLineType; @@ -23,6 +22,8 @@ import org.broadinstitute.hellbender.utils.tsv.TableUtils; import org.broadinstitute.hellbender.utils.tsv.TableReader; +import java.io.BufferedWriter; +import java.io.FileWriter; import java.io.IOException; import java.nio.file.Path; @@ -77,6 +78,7 @@ public final class SVCleanPt1a extends VariantWalker { public static final String CHRY_LONG_NAME = "chr-Y"; public static final String FAIL_LIST_LONG_NAME = "fail-list"; public static final String PASS_LIST_LONG_NAME = "pass-list"; + public static final String OUTPUT_SAMPLES_LIST_LONG_NAME = "output-samples-list"; 
@Argument( fullName = CHRX_LONG_NAME, @@ -94,16 +96,22 @@ public final class SVCleanPt1a extends VariantWalker { @Argument( fullName = FAIL_LIST_LONG_NAME, - doc = "File with complex variants failing the background test" + doc = "File with variants failing the background test" ) private GATKPath failList; @Argument( fullName = PASS_LIST_LONG_NAME, - doc = "Fail with complex variants passing both sides" + doc = "File with variants passing both sides" ) private GATKPath passList; + @Argument( + fullName = OUTPUT_SAMPLES_LIST_LONG_NAME, + doc = "Output file with samples" + ) + private GATKPath outputSamplesList; + @Argument( fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, @@ -112,6 +120,7 @@ public final class SVCleanPt1a extends VariantWalker { private GATKPath outputVcf; private VariantContextWriter vcfWriter; + private BufferedWriter samplesWriter = null; private Set failSet; private Set passSet; @@ -135,6 +144,18 @@ public void onTraversalStart() { // Write header vcfWriter = createVCFWriter(outputVcf); vcfWriter.writeHeader(header); + + // Write samples list + try { + samplesWriter = new BufferedWriter(new FileWriter(outputSamplesList.toPath().toFile())); + for (String sample : header.getGenotypeSamples()) { + samplesWriter.write(sample); + samplesWriter.newLine(); + } + samplesWriter.flush(); + } catch (IOException e) { + throw new RuntimeException("Can't create output file", e); + } } @Override diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1b.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1b.java index 7399d06cf9d..a7cb920d6a2 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1b.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1b.java @@ -249,9 +249,6 @@ private void processVariant(final VariantContextBuilder builder, final VariantCo final String widerSvType = 
event.getRight(); final int currentRdCn = revisedRdCn.get(variantId).getOrDefault(sample, 0); final int widerRdCn = revisedRdCn.getOrDefault(widerVariantId, new HashMap<>()).getOrDefault(sample, 0); - if (!revisedEventsFiltered.getOrDefault(widerVariantId, new HashSet<>()).contains(sample)) { - System.err.println(sample + " " + widerVariantId); - } int newVal = -1; if (widerSvType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DUP) && currentRdCn == 2 && widerRdCn == 3) { @@ -261,7 +258,6 @@ private void processVariant(final VariantContextBuilder builder, final VariantCo } if (newVal != -1) { - System.out.println(variantId); final GenotypeBuilder gb = new GenotypeBuilder(oldGenotype); gb.alleles(Arrays.asList(variant.getReference(), variant.getAlternateAllele(0))); gb.GQ(Integer.parseInt((String) oldGenotype.getExtendedAttribute(GATKSVVCFConstants.RD_GQ))); diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt5.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt5.java index 4edfac2d8b7..5f1b822bee5 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt5.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt5.java @@ -143,7 +143,6 @@ public void firstPassApply(final VariantContext variant) { public void secondPassApply(final VariantContext variant) { if (filteredVariantIds.contains(variant.getID())) { - System.out.println(variant.getID()); return; } From 52daa217fc84df59e3500a3c0624a9c537bff400 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Thu, 31 Oct 2024 11:38:18 -0400 Subject: [PATCH 27/58] Changes to test --- .../hellbender/tools/walkers/sv/SVCleanPt2.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt2.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt2.java index 4dc71825008..d79689e60e4 100644 --- 
a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt2.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt2.java @@ -66,7 +66,7 @@ @DocumentedFeature public class SVCleanPt2 extends VariantWalker { public static final String SAMPLE_LIST_LONG_NAME = "sample-list"; - public static final String OUTPUT_PREFIX_LONG_NAME = "output-prefix"; + public static final String OUTPUT_REVISED_LIST_LONG_NAME = "output-revised-list"; @Argument( fullName = SAMPLE_LIST_LONG_NAME, @@ -75,10 +75,10 @@ public class SVCleanPt2 extends VariantWalker { private GATKPath sampleListPath; @Argument( - fullName = OUTPUT_PREFIX_LONG_NAME, + fullName = OUTPUT_REVISED_LIST_LONG_NAME, doc = "Prefix for output files" ) - private GATKPath outputPrefix; + private GATKPath outputRevisedList; private BufferedWriter revisedCnWriter; @@ -94,7 +94,7 @@ public class SVCleanPt2 extends VariantWalker { @Override public void onTraversalStart() { try { - revisedCnWriter = Files.newBufferedWriter(Paths.get(outputPrefix.toString() + ".txt")); + revisedCnWriter = Files.newBufferedWriter(Paths.get(outputRevisedList.toString())); sampleWhitelist = new HashSet<>(Files.readAllLines(sampleListPath.toPath())); } catch (IOException e) { From d438bb955bd17383fcf7d014a467c5eb519a68e6 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Thu, 31 Oct 2024 12:54:11 -0400 Subject: [PATCH 28/58] CleanVcf4 added exit if not in range --- .../broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java | 1 - 1 file changed, 1 deletion(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java index d8e15b213bf..21f05e2b655 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java @@ -175,7 +175,6 @@ public void apply(final VariantContext variant, final 
ReadsContext readsContext, // Exit if outside batch range if (recordIdx < recordStart || recordIdx >= recordEnd) { - vcfWriter.add(builder.make()); return; } From 9aaf2f16cb71aa6356714ddc2c791cd5014422ee Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Thu, 31 Oct 2024 16:33:22 -0400 Subject: [PATCH 29/58] Updated type of EV format field --- .../hellbender/tools/walkers/sv/SVCleanPt4.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java index 21f05e2b655..4b72ecd9566 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java @@ -141,7 +141,8 @@ public void onTraversalStart() { if (line instanceof VCFInfoHeaderLine) { String id = ((VCFInfoHeaderLine) line).getID(); if (id.equals(GATKSVVCFConstants.MULTI_CNV) || - id.equals(GATKSVVCFConstants.REVISED_EVENT)) { + id.equals(GATKSVVCFConstants.REVISED_EVENT) || + id.equals(GATKSVVCFConstants.EV)) { continue; } } @@ -153,6 +154,7 @@ public void onTraversalStart() { newHeader.addMetaDataLine(new VCFFilterHeaderLine(GATKSVVCFConstants.MULTIALLELIC, "Multiallelic site")); newHeader.addMetaDataLine(new VCFFormatHeaderLine(GATKSVVCFConstants.COPY_NUMBER_FORMAT, 1, VCFHeaderLineType.Integer, "Predicted copy state")); newHeader.addMetaDataLine(new VCFFormatHeaderLine(GATKSVVCFConstants.COPY_NUMBER_QUALITY_FORMAT, 1, VCFHeaderLineType.Integer, "Read-depth genotype quality")); + newHeader.addMetaDataLine(new VCFFormatHeaderLine(GATKSVVCFConstants.EV, 0, VCFHeaderLineType.String, "Classes of evidence supporting final genotype")); newHeader.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.PESR_GT_OVERDISPERSION, 0, VCFHeaderLineType.Flag, "High PESR dispersion count")); // Write header From 1de3e2ff51a700f2dfce53dbea628ff58d7cb702 Mon 
Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Thu, 31 Oct 2024 18:04:25 -0400 Subject: [PATCH 30/58] Minor changes - replaced EV type --- .../tools/walkers/sv/SVCleanPt1b.java | 14 +++++----- .../tools/walkers/sv/SVCleanPt4.java | 26 ++++--------------- .../tools/walkers/sv/SVCleanPt5.java | 9 ++++++- 3 files changed, 21 insertions(+), 28 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1b.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1b.java index a7cb920d6a2..35096b6ca08 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1b.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1b.java @@ -91,6 +91,14 @@ protected int numberOfPasses() { return 3; } + @Override + public void onTraversalStart() { + vcfWriter = createVCFWriter(outputVcf); + final VCFHeader header = getHeaderForVariants(); + header.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.MULTI_CNV, 0, VCFHeaderLineType.Flag, "Variant is a multiallelic CNV")); + vcfWriter.writeHeader(header); + } + @Override public void closeTool() { if (vcfWriter != null) { @@ -121,12 +129,6 @@ protected void afterNthPass(final int n) { case 0: processCollectedVariants(); break; - case 1: - vcfWriter = createVCFWriter(outputVcf); - final VCFHeader header = getHeaderForVariants(); - header.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.MULTI_CNV, 0, VCFHeaderLineType.Flag, "Variant is a multiallelic CNV")); - vcfWriter.writeHeader(header); - break; } } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java index 4b72ecd9566..a8cfa984e33 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java @@ -136,30 +136,14 @@ public void onTraversalStart() { 
// Filter specific header lines final VCFHeader header = getHeaderForVariants(); - final Set newHeaderLines = new HashSet<>(); - for (final VCFHeaderLine line : header.getMetaDataInInputOrder()) { - if (line instanceof VCFInfoHeaderLine) { - String id = ((VCFInfoHeaderLine) line).getID(); - if (id.equals(GATKSVVCFConstants.MULTI_CNV) || - id.equals(GATKSVVCFConstants.REVISED_EVENT) || - id.equals(GATKSVVCFConstants.EV)) { - continue; - } - } - newHeaderLines.add(line); - } - - // Add new header lines - VCFHeader newHeader = new VCFHeader(newHeaderLines, header.getGenotypeSamples()); - newHeader.addMetaDataLine(new VCFFilterHeaderLine(GATKSVVCFConstants.MULTIALLELIC, "Multiallelic site")); - newHeader.addMetaDataLine(new VCFFormatHeaderLine(GATKSVVCFConstants.COPY_NUMBER_FORMAT, 1, VCFHeaderLineType.Integer, "Predicted copy state")); - newHeader.addMetaDataLine(new VCFFormatHeaderLine(GATKSVVCFConstants.COPY_NUMBER_QUALITY_FORMAT, 1, VCFHeaderLineType.Integer, "Read-depth genotype quality")); - newHeader.addMetaDataLine(new VCFFormatHeaderLine(GATKSVVCFConstants.EV, 0, VCFHeaderLineType.String, "Classes of evidence supporting final genotype")); - newHeader.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.PESR_GT_OVERDISPERSION, 0, VCFHeaderLineType.Flag, "High PESR dispersion count")); + header.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.PESR_GT_OVERDISPERSION, 0, VCFHeaderLineType.Flag, "High PESR dispersion count")); + header.addMetaDataLine(new VCFFormatHeaderLine(GATKSVVCFConstants.COPY_NUMBER_FORMAT, 1, VCFHeaderLineType.Integer, "Predicted copy state")); + header.addMetaDataLine(new VCFFormatHeaderLine(GATKSVVCFConstants.COPY_NUMBER_QUALITY_FORMAT, 1, VCFHeaderLineType.Integer, "Read-depth genotype quality")); + header.addMetaDataLine(new VCFFilterHeaderLine(GATKSVVCFConstants.MULTIALLELIC, "Multiallelic site")); // Write header vcfWriter = createVCFWriter(outputVcf); - vcfWriter.writeHeader(newHeader); + vcfWriter.writeHeader(header); } 
@Override diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt5.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt5.java index 5f1b822bee5..23875c7866e 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt5.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt5.java @@ -87,6 +87,10 @@ public void onTraversalStart() { if (GATKSVVCFConstants.FILTER_VCF_INFO_LINES.contains(((VCFInfoHeaderLine) line).getID())) { continue; } + } else if (line instanceof VCFFormatHeaderLine) { + if (((VCFFormatHeaderLine) line).getID().equals(GATKSVVCFConstants.EV)) { + continue; + } } else if (line instanceof VCFAltHeaderLine) { if (((VCFAltHeaderLine) line).getID().equals(GATKSVVCFConstants.UNR)) { continue; @@ -98,8 +102,11 @@ public void onTraversalStart() { newHeaderLines.add(line); } - // Write header + // Add new header lines VCFHeader newHeader = new VCFHeader(newHeaderLines, header.getGenotypeSamples()); + newHeader.addMetaDataLine(new VCFFormatHeaderLine(GATKSVVCFConstants.EV, 1, VCFHeaderLineType.String, "Classes of evidence supporting final genotype")); + + // Write header vcfWriter = createVCFWriter(outputVcf); vcfWriter.writeHeader(newHeader); } From a57601f51738dbf8168139d51cdc92ca5402a962 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Fri, 1 Nov 2024 12:57:14 -0400 Subject: [PATCH 31/58] Skip no-call genotypes --- .../tools/walkers/sv/SVCleanPt4.java | 28 +++++++++++-------- 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java index a8cfa984e33..7310d981cbd 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java @@ -266,12 +266,14 @@ private List processLargeDeletions(final 
VariantContext variant, final if (gt5kbFilter) { for (Genotype genotype : genotypes) { GenotypeBuilder gb = new GenotypeBuilder(genotype); - if (Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN, 0).toString()) >= 2) { // TODO: Verify that removal of sample_obj[GQ] is None condition is okay - gb.alleles(Arrays.asList(variant.getReference(), variant.getReference())); - } else if (Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN, 0).toString()) == 1) { - gb.alleles(Arrays.asList(variant.getReference(), variant.getAlternateAllele(0))); - } else { - gb.alleles(Arrays.asList(variant.getAlternateAllele(0), variant.getAlternateAllele(0))); + if (!genotype.isNoCall()) { + if (genotype.hasGQ() && Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN, 0).toString()) >= 2) { // TODO: Verify that removal of sample_obj[GQ] is None condition is okay + gb.alleles(Arrays.asList(variant.getReference(), variant.getReference())); + } else if (genotype.hasGQ() && Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN, 0).toString()) == 1) { + gb.alleles(Arrays.asList(variant.getReference(), variant.getAlternateAllele(0))); + } else if (genotype.hasGQ()) { + gb.alleles(Arrays.asList(variant.getAlternateAllele(0), variant.getAlternateAllele(0))); + } } updatedGenotypes.add(gb.make()); } @@ -328,12 +330,14 @@ private List processLargeDuplications(final VariantContext variant, fi if (gt5kbFilter) { for (Genotype genotype : genotypes) { GenotypeBuilder gb = new GenotypeBuilder(genotype); - if (Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN, 0).toString()) <= 2) { - gb.alleles(Arrays.asList(variant.getReference(), variant.getReference())); - } else if (Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN, 0).toString()) == 3) { - gb.alleles(Arrays.asList(variant.getReference(), variant.getAlternateAllele(0))); - } else { - 
gb.alleles(Arrays.asList(variant.getAlternateAllele(0), variant.getAlternateAllele(0))); + if (!genotype.isNoCall()) { + if (genotype.hasGQ() && Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN, 3).toString()) <= 2) { + gb.alleles(Arrays.asList(variant.getReference(), variant.getReference())); + } else if (genotype.hasGQ() && Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN, 0).toString()) == 3) { + gb.alleles(Arrays.asList(variant.getReference(), variant.getAlternateAllele(0))); + } else if (genotype.hasGQ()) { + gb.alleles(Arrays.asList(variant.getAlternateAllele(0), variant.getAlternateAllele(0))); + } } updatedGenotypes.add(gb.make()); } From 2f17eecd68052f556e902a32c50d0dbe4e057e36 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Mon, 4 Nov 2024 18:25:22 -0500 Subject: [PATCH 32/58] Changes post-debugging: modify .getEnd() to use SVLEN --- .../tools/walkers/sv/SVCleanPt1a.java | 163 +++++++++--------- .../tools/walkers/sv/SVCleanPt4.java | 31 ++-- .../tools/walkers/sv/SVCleanPt5.java | 16 +- 3 files changed, 102 insertions(+), 108 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java index 584de4cd7a4..462cab0a6b4 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java @@ -27,11 +27,7 @@ import java.io.IOException; import java.nio.file.Path; -import java.util.Arrays; -import java.util.List; -import java.util.Set; -import java.util.LinkedHashSet; -import java.util.HashSet; +import java.util.*; import java.util.stream.Collectors; /** @@ -124,7 +120,6 @@ public final class SVCleanPt1a extends VariantWalker { private Set failSet; private Set passSet; - private final Set revisedSet = new HashSet<>(); private static final int MIN_ALLOSOME_EVENT_SIZE = 5000; @@ -167,78 
+162,119 @@ public void closeTool() { @Override public void apply(final VariantContext variant, final ReadsContext readsContext, final ReferenceContext referenceContext, final FeatureContext featureContext) { + // Create core data VariantContextBuilder builder = new VariantContextBuilder(variant); - final List processedGenotypes = processGenotypes(variant); - builder.genotypes(processedGenotypes); - processVariant(variant, builder); - vcfWriter.add(builder.make()); - } + List genotypes = variant.getGenotypes(); - private List processGenotypes(final VariantContext variant) { - return variant.getGenotypes().stream() - .map(genotype -> { - GenotypeBuilder genotypeBuilder = new GenotypeBuilder(genotype); - processEVGenotype(genotype, genotypeBuilder); - processAllosomesGenotype(variant, genotype, genotypeBuilder); - return genotypeBuilder.make(); - }) - .collect(Collectors.toList()); - } - - private void processVariant(final VariantContext variant, final VariantContextBuilder builder) { + // Process variant + genotypes = processEV(genotypes); processVarGQ(variant, builder); processMultiallelic(variant, builder); processUnresolved(variant, builder); processNoisyEvents(variant, builder); processBothsidesSupportEvents(variant, builder); - processAllosomes(variant, builder); + genotypes = processAllosomes(variant, builder, genotypes); + + builder.genotypes(genotypes); + vcfWriter.add(builder.make()); } - private void processEVGenotype(final Genotype genotype, final GenotypeBuilder genotypeBuilder) { - if (genotype.hasExtendedAttribute(GATKSVVCFConstants.EV)) { - String evAttribute = (String) genotype.getExtendedAttribute(GATKSVVCFConstants.EV); - final int evIndex = Integer.parseInt(evAttribute); - if (evIndex >= 0 && evIndex < GATKSVVCFConstants.EV_VALUES.size()) { - genotypeBuilder.attribute(GATKSVVCFConstants.EV, GATKSVVCFConstants.EV_VALUES.get(evIndex)); + private List processEV(final List genotypes) { + List updatedGenotypes = new ArrayList<>(genotypes.size()); + for 
(Genotype genotype : genotypes) { + if (genotype.hasExtendedAttribute(GATKSVVCFConstants.EV)) { + GenotypeBuilder gb = new GenotypeBuilder(genotype); + String evAttribute = (String) genotype.getExtendedAttribute(GATKSVVCFConstants.EV); + final int evIndex = Integer.parseInt(evAttribute); + if (evIndex >= 0 && evIndex < GATKSVVCFConstants.EV_VALUES.size()) { + gb.attribute(GATKSVVCFConstants.EV, GATKSVVCFConstants.EV_VALUES.get(evIndex)); + } + updatedGenotypes.add(gb.make()); + } else { + updatedGenotypes.add(genotype); } } + return updatedGenotypes; + } + + private void processVarGQ(final VariantContext variant, final VariantContextBuilder builder) { + if (variant.hasAttribute(GATKSVVCFConstants.VAR_GQ)) { + final double varGQ = variant.getAttributeAsDouble(GATKSVVCFConstants.VAR_GQ, 0); + builder.rmAttribute(GATKSVVCFConstants.VAR_GQ); + builder.log10PError(varGQ / -10.0); + } + } + + private void processMultiallelic(final VariantContext variant, final VariantContextBuilder builder) { + if (variant.hasAttribute(GATKSVVCFConstants.MULTIALLELIC)) { + builder.rmAttribute(GATKSVVCFConstants.MULTIALLELIC); + } + } + + private void processUnresolved(final VariantContext variant, final VariantContextBuilder builder) { + if (variant.hasAttribute(GATKSVVCFConstants.UNRESOLVED)) { + builder.rmAttribute(GATKSVVCFConstants.UNRESOLVED); + builder.filter(GATKSVVCFConstants.UNRESOLVED); + } + } + + private void processNoisyEvents(final VariantContext variant, final VariantContextBuilder builder) { + if (failSet.contains(variant.getID())) { + builder.attribute(GATKSVVCFConstants.HIGH_SR_BACKGROUND, true); + } + } + + private void processBothsidesSupportEvents(final VariantContext variant, final VariantContextBuilder builder) { + if (passSet.contains(variant.getID())) { + builder.attribute(GATKSVVCFConstants.BOTHSIDES_SUPPORT, true); + } } - private void processAllosomesGenotype(final VariantContext variant, final Genotype genotype, final GenotypeBuilder genotypeBuilder) { + 
private List processAllosomes(final VariantContext variant, final VariantContextBuilder builder, final List genotypes) { final String chromosome = variant.getContig(); - if (chromosome.equals(chrX) || chromosome.equals(chrY)) { + if (!chromosome.equals(chrX) && !chromosome.equals(chrY)) { + return genotypes; + } + + List updatedGenotypes = new ArrayList<>(genotypes.size()); + for (Genotype genotype : genotypes) { final String svType = variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); if ((svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL) || svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DUP)) && (variant.getEnd() - variant.getStart() >= MIN_ALLOSOME_EVENT_SIZE)) { + GenotypeBuilder gb = new GenotypeBuilder(genotype); final boolean isY = chromosome.equals(chrY); final int sex = (int) genotype.getExtendedAttribute(GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT); - if (sex == 1 && isRevisableEvent(variant, isY, sex)) { // Male - revisedSet.add(variant.getID()); - adjustMaleGenotype(genotype, genotypeBuilder, svType); + builder.attribute(GATKSVVCFConstants.REVISED_EVENT, true); + adjustMaleGenotype(genotype, gb, svType); } else if (sex == 2 && isY) { // Female - genotypeBuilder.alleles(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL)); + gb.alleles(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL)); } else if (sex == 0) { // Unknown - genotypeBuilder.alleles(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL)); + gb.alleles(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL)); } + updatedGenotypes.add(gb.make()); + } + else { + updatedGenotypes.add(genotype); } } + return updatedGenotypes; } - private void adjustMaleGenotype(final Genotype genotype, final GenotypeBuilder genotypeBuilder, final String svType) { + private void adjustMaleGenotype(final Genotype genotype, final GenotypeBuilder gb, final String svType) { if (genotype.hasExtendedAttribute(GATKSVVCFConstants.RD_CN)) { final int rdCN = 
Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN).toString()); - genotypeBuilder.attribute(GATKSVVCFConstants.RD_CN, rdCN + 1); + gb.attribute(GATKSVVCFConstants.RD_CN, rdCN + 1); final Allele refAllele = genotype.getAllele(0); final Allele altAllele = genotype.getAllele(1); if (svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL)) { - if (rdCN >= 1) genotypeBuilder.alleles(Arrays.asList(refAllele, refAllele)); - else if (rdCN == 0) genotypeBuilder.alleles(Arrays.asList(refAllele, altAllele)); + if (rdCN >= 1) gb.alleles(Arrays.asList(refAllele, refAllele)); + else if (rdCN == 0) gb.alleles(Arrays.asList(refAllele, altAllele)); } else if (svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DUP)) { - if (rdCN <= 1) genotypeBuilder.alleles(Arrays.asList(refAllele, refAllele)); - else if (rdCN == 2) genotypeBuilder.alleles(Arrays.asList(refAllele, altAllele)); - else genotypeBuilder.alleles(Arrays.asList(altAllele, altAllele)); + if (rdCN <= 1) gb.alleles(Arrays.asList(refAllele, refAllele)); + else if (rdCN == 2) gb.alleles(Arrays.asList(refAllele, altAllele)); + else gb.alleles(Arrays.asList(altAllele, altAllele)); } } } @@ -281,45 +317,6 @@ private int calcMedianDistribution(final int[] counts) { throw new RuntimeException("Error calculating median"); } - private void processVarGQ(final VariantContext variant, final VariantContextBuilder builder) { - if (variant.hasAttribute(GATKSVVCFConstants.VAR_GQ)) { - final double varGQ = variant.getAttributeAsDouble(GATKSVVCFConstants.VAR_GQ, 0); - builder.rmAttribute(GATKSVVCFConstants.VAR_GQ); - builder.log10PError(varGQ / -10.0); - } - } - - private void processMultiallelic(final VariantContext variant, final VariantContextBuilder builder) { - if (variant.hasAttribute(GATKSVVCFConstants.MULTIALLELIC)) { - builder.rmAttribute(GATKSVVCFConstants.MULTIALLELIC); - } - } - - private void processUnresolved(final VariantContext variant, final VariantContextBuilder builder) { - if 
(variant.hasAttribute(GATKSVVCFConstants.UNRESOLVED)) { - builder.rmAttribute(GATKSVVCFConstants.UNRESOLVED); - builder.filter(GATKSVVCFConstants.UNRESOLVED); - } - } - - private void processNoisyEvents(final VariantContext variant, final VariantContextBuilder builder) { - if (failSet.contains(variant.getID())) { - builder.attribute(GATKSVVCFConstants.HIGH_SR_BACKGROUND, true); - } - } - - private void processBothsidesSupportEvents(final VariantContext variant, final VariantContextBuilder builder) { - if (passSet.contains(variant.getID())) { - builder.attribute(GATKSVVCFConstants.BOTHSIDES_SUPPORT, true); - } - } - - private void processAllosomes(final VariantContext variant, final VariantContextBuilder builder) { - if (revisedSet.contains(variant.getID())) { - builder.attribute(GATKSVVCFConstants.REVISED_EVENT, true); - } - } - private Set readLastColumn(final GATKPath filePath) { try { final Path path = filePath.toPath(); diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java index 7310d981cbd..25134323d8b 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java @@ -171,7 +171,6 @@ public void apply(final VariantContext variant, final ReadsContext readsContext, genotypes = processLargeDeletions(variant, builder, genotypes); genotypes = processLargeDuplications(variant, builder, genotypes); genotypes = processRevisedSex(variant, genotypes); - processInfoFields(builder); // Build genotypes if (isCalled(builder, genotypes)) { @@ -181,18 +180,18 @@ public void apply(final VariantContext variant, final ReadsContext readsContext, } private List processRevisedCn(final VariantContext variant, final List genotypes) { - if (!revisedCopyNumbers.containsKey(variant.getID())) { + final String variantID = variant.getID(); + if 
(!revisedCopyNumbers.containsKey(variantID)) { return genotypes; } List updatedGenotypes = new ArrayList<>(genotypes.size()); - Map sampleCnMap = revisedCopyNumbers.get(variant.getID()); for (Genotype genotype : genotypes) { String sampleName = genotype.getSampleName(); - if (sampleCnMap.containsKey(sampleName)) { - GenotypeBuilder gb = new GenotypeBuilder(genotype) - .alleles(Arrays.asList(variant.getReference(), variant.getAlternateAllele(0))) - .attribute(GATKSVVCFConstants.RD_CN, sampleCnMap.get(sampleName)); + if (revisedCopyNumbers.get(variantID).containsKey(sampleName)) { + GenotypeBuilder gb = new GenotypeBuilder(genotype); + gb.alleles(Arrays.asList(variant.getReference(), variant.getAlternateAllele(0))); + gb.attribute(GATKSVVCFConstants.RD_CN, revisedCopyNumbers.get(variantID).get(sampleName)); updatedGenotypes.add(gb.make()); } else { updatedGenotypes.add(genotype); @@ -243,7 +242,7 @@ private List processLargeDeletions(final VariantContext variant, final } boolean multiallelicFilter = false; - if (variant.getEnd() - variant.getStart() >= MIN_LARGE_EVENT_SIZE) { + if (variant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0) >= MIN_LARGE_EVENT_SIZE) { Map sampleRdCn = new HashMap<>(); for (Genotype genotype : genotypes) { if (!outlierSamples.contains(genotype.getSampleName()) && genotype.hasExtendedAttribute(GATKSVVCFConstants.RD_CN)) { @@ -258,7 +257,7 @@ private List processLargeDeletions(final VariantContext variant, final boolean gt5kbFilter = false; if (genotypes.stream().anyMatch(g -> !isBiallelic(g))) { gt5kbFilter = true; - } else if (variant.getEnd() - variant.getStart() >= MIN_MULTIALLELIC_EVENT_SIZE && !multiallelicFilter) { + } else if (variant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0) >= MIN_MULTIALLELIC_EVENT_SIZE && !multiallelicFilter) { gt5kbFilter = true; } @@ -305,7 +304,7 @@ private List processLargeDuplications(final VariantContext variant, fi } boolean multiallelicFilter = false; - if (variant.getEnd() - variant.getStart() >= 
MIN_LARGE_EVENT_SIZE) { + if (variant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0) >= MIN_LARGE_EVENT_SIZE) { Map sampleRdCn = new HashMap<>(); for (Genotype genotype : genotypes) { if (!outlierSamples.contains(genotype.getSampleName()) && genotype.hasExtendedAttribute(GATKSVVCFConstants.RD_CN)) { @@ -322,7 +321,7 @@ private List processLargeDuplications(final VariantContext variant, fi boolean gt5kbFilter = false; if (!genotypes.stream().allMatch(g -> g.getAlleles().size() > 2)) { gt5kbFilter = true; - } else if (variant.getEnd() - variant.getStart() >= MIN_MULTIALLELIC_EVENT_SIZE && !multiallelicFilter) { + } else if (variant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0) >= MIN_MULTIALLELIC_EVENT_SIZE && !multiallelicFilter) { gt5kbFilter = true; } @@ -385,16 +384,6 @@ private List processRevisedSex(final VariantContext variant, List attributes = builder.getAttributes(); - if (attributes.containsKey(GATKSVVCFConstants.MULTI_CNV)) { - builder.rmAttribute(GATKSVVCFConstants.MULTI_CNV); - } - if (attributes.containsKey(GATKSVVCFConstants.REVISED_EVENT)) { - builder.rmAttribute(GATKSVVCFConstants.REVISED_EVENT); - } - } - public boolean isCalled(final VariantContextBuilder builder, final List genotypes) { for (Genotype genotype : genotypes) { if (!isNoCallGt(genotype.getAlleles())) { diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt5.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt5.java index 23875c7866e..e78f0899cef 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt5.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt5.java @@ -77,11 +77,9 @@ public class SVCleanPt5 extends MultiplePassVariantWalker { @Override public void onTraversalStart() { - final VCFHeader header = getHeaderForVariants(); - final Set originalHeaderLines = header.getMetaDataInInputOrder(); - - // Add new header lines + // Remove unnecessary header lines final Set 
newHeaderLines = new HashSet<>(); + final VCFHeader header = getHeaderForVariants(); for (final VCFHeaderLine line : header.getMetaDataInInputOrder()) { if (line instanceof VCFInfoHeaderLine) { if (GATKSVVCFConstants.FILTER_VCF_INFO_LINES.contains(((VCFInfoHeaderLine) line).getID())) { @@ -155,6 +153,7 @@ public void secondPassApply(final VariantContext variant) { VariantContextBuilder builder = new VariantContextBuilder(variant); processSvType(variant, builder); + cleanseInfoFields(builder); vcfWriter.add(builder.make()); } @@ -205,6 +204,15 @@ private void processSvType(final VariantContext variant, final VariantContextBui builder.genotypes(updatedGenotypes); } + private void cleanseInfoFields(final VariantContextBuilder builder) { + Map attributes = builder.getAttributes(); + for (String field : GATKSVVCFConstants.FILTER_VCF_INFO_LINES) { + if (attributes.containsKey(field)) { + builder.rmAttribute(GATKSVVCFConstants.MULTI_CNV); + } + } + } + private boolean overlaps(final VariantContext v1, final VariantContext v2) { return v1.getContig().equals(v2.getContig()) && v1.getStart() <= v2.getEnd() && v2.getStart() <= v1.getEnd(); } From 4c60608ea2c4a00fbfd55f7e36109743f03a78b4 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Mon, 4 Nov 2024 19:13:37 -0500 Subject: [PATCH 33/58] Undo use of getEnd() --- .../tools/walkers/sv/SVCleanPt1a.java | 2 +- .../tools/walkers/sv/SVCleanPt1b.java | 7 ++++-- .../tools/walkers/sv/SVCleanPt2.java | 15 ++++++++---- .../tools/walkers/sv/SVCleanPt5.java | 23 +++++++++++-------- 4 files changed, 29 insertions(+), 18 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java index 462cab0a6b4..b6261d1c4bc 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java @@ -240,7 +240,7 @@ private List 
processAllosomes(final VariantContext variant, final Vari for (Genotype genotype : genotypes) { final String svType = variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); if ((svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL) || svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DUP)) && - (variant.getEnd() - variant.getStart() >= MIN_ALLOSOME_EVENT_SIZE)) { + (variant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0) >= MIN_ALLOSOME_EVENT_SIZE)) { GenotypeBuilder gb = new GenotypeBuilder(genotype); final boolean isY = chromosome.equals(chrY); final int sex = (int) genotype.getExtendedAttribute(GATKSVVCFConstants.EXPECTED_COPY_NUMBER_FORMAT); diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1b.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1b.java index 35096b6ca08..9c6d2769c34 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1b.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1b.java @@ -138,7 +138,8 @@ public void firstPassApply(final VariantContext variant) { } // Process overlaps with variants in the buffer - overlappingVariantsBuffer.removeIf(vc -> !vc.getContig().equals(variant.getContig()) || vc.getEnd() < variant.getStart()); + overlappingVariantsBuffer.removeIf(vc -> !vc.getContig().equals(variant.getContig()) + || (vc.getStart() + vc.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)) < variant.getStart()); for (VariantContext bufferedVariant : overlappingVariantsBuffer) { if (overlaps(bufferedVariant, variant)) { processOverlap(bufferedVariant, variant); @@ -298,7 +299,9 @@ private boolean isLarge(final VariantContext variant, final int minSize) { } private boolean overlaps(final VariantContext v1, final VariantContext v2) { - return v1.getContig().equals(v2.getContig()) && v1.getStart() <= v2.getEnd() && v2.getStart() <= v1.getEnd(); + return v1.getContig().equals(v2.getContig()) + && v1.getStart() <= (v2.getStart() + 
v2.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)) + && v2.getStart() <= (v1.getStart() + v1.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)); } private Set getNonReferenceSamples(final VariantContext variant) { diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt2.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt2.java index d79689e60e4..85b4e0d1b51 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt2.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt2.java @@ -153,7 +153,8 @@ public void apply(final VariantContext variant, final ReadsContext readsContext, } // Process overlaps with variants in the buffer - overlappingVariantsBuffer.removeIf(vc -> !vc.getContig().equals(variant.getContig()) || vc.getEnd() < variant.getStart()); + overlappingVariantsBuffer.removeIf(vc -> !vc.getContig().equals(variant.getContig()) + || (vc.getStart() + vc.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)) < variant.getStart()); for (VariantContext bufferedVariant : overlappingVariantsBuffer) { if (overlaps(variant, bufferedVariant)) { adjustCopyNumber(bufferedVariant, variant); @@ -175,9 +176,11 @@ private void adjustCopyNumber(final VariantContext v1, final VariantContext v2) String svtype2 = v2.getAttributeAsString("SVTYPE", ""); // Calculate overlap metadata - int length1 = v1.getEnd() - v1.getStart(); - int length2 = v2.getEnd() - v2.getStart(); - int lengthOverlap = Math.min(v2.getEnd(), v1.getEnd()) - Math.max(v1.getStart(), v2.getStart()); + int length1 = v1.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0);; + int length2 = v2.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0); + int minEnd = Math.min(v1.getStart() + v1.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0), v2.getStart() + v2.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)); + int maxStart = Math.max(v1.getStart(), v2.getStart()); + int lengthOverlap = minEnd - maxStart; double overlap1 = (double) 
lengthOverlap / (double) length1; double overlap2 = (double) lengthOverlap / (double) length2; @@ -264,7 +267,9 @@ else if (support1.contains(GATKSVVCFConstants.EV_VALUES.get(1)) } private boolean overlaps(final VariantContext v1, final VariantContext v2) { - return v1.getContig().equals(v2.getContig()) && v1.getStart() <= v2.getEnd() && v2.getStart() <= v1.getEnd(); + return v1.getContig().equals(v2.getContig()) + && v1.getStart() <= (v2.getStart() + v2.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)) + && v2.getStart() <= (v1.getStart() + v1.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)); } private boolean isDelDup(final VariantContext variant) { diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt5.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt5.java index e78f0899cef..a1642adab8f 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt5.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt5.java @@ -136,7 +136,8 @@ public void firstPassApply(final VariantContext variant) { return; } - overlappingVariantsBuffer.removeIf(vc -> !vc.getContig().equals(variant.getContig()) || vc.getEnd() < variant.getStart()); + overlappingVariantsBuffer.removeIf(vc -> !vc.getContig().equals(variant.getContig()) + || (vc.getStart() + vc.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)) < variant.getStart()); for (VariantContext bufferedVariant : overlappingVariantsBuffer) { if (overlaps(bufferedVariant, variant)) { processVariantPair(bufferedVariant, variant); @@ -157,24 +158,24 @@ public void secondPassApply(final VariantContext variant) { vcfWriter.add(builder.make()); } - private void processVariantPair(VariantContext largerVariant, VariantContext smallerVariant) { - int lengthLarger = largerVariant.getEnd() - largerVariant.getStart() + 1; - int lengthSmaller = smallerVariant.getEnd() - smallerVariant.getStart() + 1; + private void processVariantPair(VariantContext 
v1, VariantContext v2) { + int lengthLarger = v1.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0); + int lengthSmaller = v2.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0); if (lengthLarger < lengthSmaller) { return; } - int overlapStart = Math.max(largerVariant.getStart(), smallerVariant.getStart()); - int overlapEnd = Math.min(largerVariant.getEnd(), smallerVariant.getEnd()); - int overlapLength = overlapEnd - overlapStart + 1; + int minEnd = Math.min(v1.getStart() + v1.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0), v2.getStart() + v2.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)); + int maxStart = Math.max(v1.getStart(), v2.getStart()); + int overlapLength = minEnd - maxStart + 1; if (overlapLength <= 0) { return; } double smallCoverage = (double) overlapLength / lengthSmaller; if (smallCoverage > 0.5) { - if (!filteredVariantIds.contains(largerVariant.getID())) { - filteredVariantIds.add(smallerVariant.getID()); + if (!filteredVariantIds.contains(v1.getID())) { + filteredVariantIds.add(v2.getID()); } } } @@ -214,6 +215,8 @@ private void cleanseInfoFields(final VariantContextBuilder builder) { } private boolean overlaps(final VariantContext v1, final VariantContext v2) { - return v1.getContig().equals(v2.getContig()) && v1.getStart() <= v2.getEnd() && v2.getStart() <= v1.getEnd(); + return v1.getContig().equals(v2.getContig()) + && v1.getStart() <= (v2.getStart() + v2.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)) + && v2.getStart() <= (v1.getStart() + v1.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)); } } From 56ecaeb5a00c64ec62cbad5730989ccd69996c52 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Tue, 5 Nov 2024 10:21:10 -0500 Subject: [PATCH 34/58] Furhter debugging: modified SVTYPE update & corresponding genotype assignment --- .../tools/walkers/sv/SVCleanPt1b.java | 8 ++++--- .../tools/walkers/sv/SVCleanPt2.java | 6 ++--- .../tools/walkers/sv/SVCleanPt5.java | 24 ++++++++++++------- 3 files changed, 23 insertions(+), 15 deletions(-) diff --git 
a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1b.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1b.java index 9c6d2769c34..894cf156536 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1b.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1b.java @@ -182,10 +182,12 @@ private void processOverlap(final VariantContext v1, final VariantContext v2) { // Get overlap data VariantContext wider; VariantContext narrower; - if (v1.getLengthOnReference() > v2.getLengthOnReference()) { + final int length1 = v1.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0); + final int length2 = v2.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0); + if (length1 > length2) { wider = v1; narrower = v2; - } else if (v2.getLengthOnReference() > v1.getLengthOnReference()) { + } else if (length2 > length1) { wider = v2; narrower = v1; } else { @@ -221,9 +223,9 @@ private void processOverlap(final VariantContext v1, final VariantContext v2) { } private void processCollectedVariants() { + // Prunes variant-sample pairs we need RD_CN values for for (final Map.Entry>> entry : revisedEventsAll.entrySet()) { for (final Map.Entry> innerEntry : entry.getValue().entrySet()) { - // Identifies variant-sample pairs we need RD_CN values for to improve speed final String sampleName = innerEntry.getKey(); final String variantId = entry.getKey(); final String widerVariantId = innerEntry.getValue().getLeft(); diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt2.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt2.java index 85b4e0d1b51..807030b61c6 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt2.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt2.java @@ -172,8 +172,8 @@ private void adjustCopyNumber(final VariantContext v1, final VariantContext v2) Map variantRdCn2 = 
getRdCnForVariant(v2); Map> variantSupport1 = getSupportForVariant(v1); Map> variantSupport2 = getSupportForVariant(v2); - String svtype1 = v1.getAttributeAsString("SVTYPE", ""); - String svtype2 = v2.getAttributeAsString("SVTYPE", ""); + String svType1 = v1.getAttributeAsString("SVTYPE", ""); + String svType2 = v2.getAttributeAsString("SVTYPE", ""); // Calculate overlap metadata int length1 = v1.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0);; @@ -238,7 +238,7 @@ else if (support1.equals(Collections.singleton(GATKSVVCFConstants.EV_VALUES.get( // Condition 3: Depth-only calls where smaller call is driven by a larger call else if (support1.equals(Collections.singleton(GATKSVVCFConstants.EV_VALUES.get(1))) && support2.equals(Collections.singleton(GATKSVVCFConstants.EV_VALUES.get(1))) - && overlap2 > 0.5 && !v1.hasAttribute(GATKSVVCFConstants.MULTI_CNV) && svtype1.equals(svtype2)) { + && overlap2 > 0.5 && !v1.hasAttribute(GATKSVVCFConstants.MULTI_CNV) && svType1.equals(svType2)) { if (rdCn1 == 0 && rdCn1 != rdCn2) { makeRevision(id2, rdCn2 + 2); } else if (rdCn1 == 1 && rdCn1 > rdCn2) { diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt5.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt5.java index a1642adab8f..bb5ffb4e853 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt5.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt5.java @@ -159,9 +159,9 @@ public void secondPassApply(final VariantContext variant) { } private void processVariantPair(VariantContext v1, VariantContext v2) { - int lengthLarger = v1.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0); - int lengthSmaller = v2.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0); - if (lengthLarger < lengthSmaller) { + int length1 = v1.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0); + int length2 = v2.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0); + if (length1 < length2) { return; } @@ -172,7 +172,7 
@@ private void processVariantPair(VariantContext v1, VariantContext v2) { return; } - double smallCoverage = (double) overlapLength / lengthSmaller; + double smallCoverage = (double) overlapLength / length2; if (smallCoverage > 0.5) { if (!filteredVariantIds.contains(v1.getID())) { filteredVariantIds.add(v2.getID()); @@ -190,17 +190,23 @@ private void processSvType(final VariantContext variant, final VariantContextBui return; } + final Allele refAllele = variant.getReference(); + final Allele altAllele = Allele.create("<" + svType + ">", false); + List newAlleles = Arrays.asList(refAllele, altAllele); + List genotypes = variant.getGenotypes(); List updatedGenotypes = new ArrayList<>(genotypes.size()); for (Genotype genotype : genotypes) { GenotypeBuilder gb = new GenotypeBuilder(genotype); - gb.alleles(Arrays.asList(variant.getReference(), Allele.create("<" + svType + ">", false))); + long altCount = genotype.getAlleles().stream().filter(allele -> !allele.isReference()).count(); + if (altCount == 1) { // Heterozygous case (0/1) + gb.alleles(Arrays.asList(refAllele, altAllele)); + } else if (altCount == 2) { // Homozygous alternate case (1/1) + gb.alleles(Arrays.asList(altAllele, altAllele)); + } updatedGenotypes.add(gb.make()); } - final Allele refAllele = variant.getReference(); - final Allele altAllele = Allele.create("<" + svType + ">", false); - List newAlleles = Arrays.asList(refAllele, altAllele); builder.alleles(newAlleles); builder.genotypes(updatedGenotypes); } @@ -209,7 +215,7 @@ private void cleanseInfoFields(final VariantContextBuilder builder) { Map attributes = builder.getAttributes(); for (String field : GATKSVVCFConstants.FILTER_VCF_INFO_LINES) { if (attributes.containsKey(field)) { - builder.rmAttribute(GATKSVVCFConstants.MULTI_CNV); + builder.rmAttribute(field); } } } From 82ecea816406d974dd951aae9811f5535b613ffe Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Tue, 5 Nov 2024 11:16:32 -0500 Subject: [PATCH 35/58] Handled no-call rewriting bug 
--- .../hellbender/tools/walkers/sv/SVCleanPt4.java | 4 ++-- .../hellbender/tools/walkers/sv/SVCleanPt5.java | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java index 25134323d8b..eff184181e3 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java @@ -393,8 +393,8 @@ public boolean isCalled(final VariantContextBuilder builder, final List updatedGenotypes = new ArrayList<>(genotypes.size()); for (Genotype genotype : genotypes) { GenotypeBuilder gb = new GenotypeBuilder(genotype); - long altCount = genotype.getAlleles().stream().filter(allele -> !allele.isReference()).count(); + long altCount = genotype.getAlleles().stream().filter(allele -> allele.isCalled() && !allele.isReference()).count(); if (altCount == 1) { // Heterozygous case (0/1) gb.alleles(Arrays.asList(refAllele, altAllele)); } else if (altCount == 2) { // Homozygous alternate case (1/1) From d96d810080c7bde881bfd073da211c2cb6d0bbe3 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Wed, 6 Nov 2024 10:37:55 -0500 Subject: [PATCH 36/58] Modifications to large del/dup event logic --- .../tools/walkers/sv/SVCleanPt4.java | 29 ++++++++++++------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java index eff184181e3..aaacbbca359 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java @@ -97,7 +97,7 @@ public class SVCleanPt4 extends VariantWalker { private double recordStart; private double recordEnd; private long recordIdx; - private int maxVF; 
+ private double maxVF; private static final int MIN_LARGE_EVENT_SIZE = 1000; private static final int MIN_MULTIALLELIC_EVENT_SIZE = 5000; @@ -131,7 +131,7 @@ public void onTraversalStart() { double segments = totalNumVariants / (double) totalBatch; recordStart = (batchNum - 1) * segments; recordEnd = batchNum * segments; - maxVF = Math.max((int) ((getHeaderForVariants().getGenotypeSamples().size() - outlierSamples.size()) * 0.01), 2); + maxVF = Math.max((getHeaderForVariants().getGenotypeSamples().size() - outlierSamples.size()) * 0.01, 2); recordIdx = 0; // Filter specific header lines @@ -255,7 +255,10 @@ private List processLargeDeletions(final VariantContext variant, final } boolean gt5kbFilter = false; - if (genotypes.stream().anyMatch(g -> !isBiallelic(g))) { + List allowedAlleles = Arrays.asList( + Allele.NO_CALL.getBaseString(), variant.getReference().getBaseString(), variant.getAlternateAllele(0).getBaseString() + ); + if (genotypes.stream().anyMatch(g -> g.getAlleles().stream().anyMatch(a -> !allowedAlleles.contains(a.getBaseString())))) { gt5kbFilter = true; } else if (variant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0) >= MIN_MULTIALLELIC_EVENT_SIZE && !multiallelicFilter) { gt5kbFilter = true; @@ -266,7 +269,7 @@ private List processLargeDeletions(final VariantContext variant, final for (Genotype genotype : genotypes) { GenotypeBuilder gb = new GenotypeBuilder(genotype); if (!genotype.isNoCall()) { - if (genotype.hasGQ() && Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN, 0).toString()) >= 2) { // TODO: Verify that removal of sample_obj[GQ] is None condition is okay + if (genotype.hasGQ() && Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN, 0).toString()) >= 2) { gb.alleles(Arrays.asList(variant.getReference(), variant.getReference())); } else if (genotype.hasGQ() && Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN, 0).toString()) == 1) { 
gb.alleles(Arrays.asList(variant.getReference(), variant.getAlternateAllele(0))); @@ -285,7 +288,7 @@ private List processLargeDeletions(final VariantContext variant, final for (Genotype genotype : genotypes) { GenotypeBuilder gb = new GenotypeBuilder(genotype); gb.noGQ(); - gb.alleles(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL)); + gb.alleles(Arrays.asList(Allele.NO_CALL)); gb.attribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT, genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN)); gb.attribute(GATKSVVCFConstants.COPY_NUMBER_QUALITY_FORMAT, genotype.getExtendedAttribute(GATKSVVCFConstants.RD_GQ)); updatedGenotypes.add(gb.make()); @@ -311,15 +314,21 @@ private List processLargeDuplications(final VariantContext variant, fi sampleRdCn.put(genotype.getSampleName(), Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN).toString())); } } - if (sampleRdCn.values().stream().filter(value -> value > 4).count() > maxVF - && sampleRdCn.values().stream().filter(value -> (value < 1 || value > 4)).count() > maxVF - && sampleRdCn.values().stream().filter(value -> (value < 1 || value > 4)).count() > 4) { + if (sampleRdCn.values().stream().filter(value -> value > 4).count() > maxVF) { multiallelicFilter = true; } + if (sampleRdCn.values().stream().filter(value -> (value < 1 || value > 4)).count() > 4) { + if (sampleRdCn.values().stream().filter(value -> (value < 1 || value > 4)).count() > maxVF) { + multiallelicFilter = true; + } + } } boolean gt5kbFilter = false; - if (!genotypes.stream().allMatch(g -> g.getAlleles().size() > 2)) { + List allowedAlleles = Arrays.asList( + Allele.NO_CALL.getBaseString(), variant.getReference().getBaseString(), variant.getAlternateAllele(0).getBaseString() + ); + if (genotypes.stream().anyMatch(g -> g.getAlleles().stream().anyMatch(a -> !allowedAlleles.contains(a.getBaseString())))) { gt5kbFilter = true; } else if (variant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0) >= MIN_MULTIALLELIC_EVENT_SIZE && !multiallelicFilter) { 
gt5kbFilter = true; @@ -349,7 +358,7 @@ private List processLargeDuplications(final VariantContext variant, fi for (Genotype genotype : genotypes) { GenotypeBuilder gb = new GenotypeBuilder(genotype); gb.noGQ(); - gb.alleles(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL)); + gb.alleles(Arrays.asList(Allele.NO_CALL)); gb.attribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT, genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN)); gb.attribute(GATKSVVCFConstants.COPY_NUMBER_QUALITY_FORMAT, genotype.getExtendedAttribute(GATKSVVCFConstants.RD_GQ)); updatedGenotypes.add(gb.make()); From 0fa77382240aee3a711d21a5abd971d6530d8bef Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Wed, 6 Nov 2024 12:25:00 -0500 Subject: [PATCH 37/58] Modified conditions for biallelic filtering --- .../spark/sv/utils/GATKSVVCFConstants.java | 1 - .../tools/walkers/sv/SVCleanPt1a.java | 5 ++++- .../tools/walkers/sv/SVCleanPt4.java | 22 +++++-------------- .../tools/walkers/sv/SVCleanPt5.java | 7 +++++- 4 files changed, 16 insertions(+), 19 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java index ab2e03130a9..26e71323554 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java @@ -11,7 +11,6 @@ import static java.util.Map.entry; - public final class GATKSVVCFConstants { // todo: add these and the other standard SV info fields from the VCF spec to htsjdk VCFStandardHeaderLines diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java index b6261d1c4bc..07a29d95f5a 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java +++ 
b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1a.java @@ -27,7 +27,10 @@ import java.io.IOException; import java.nio.file.Path; -import java.util.*; +import java.util.Arrays; +import java.util.List; +import java.util.ArrayList; +import java.util.Set; import java.util.stream.Collectors; /** diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java index aaacbbca359..fa401aa70ba 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java @@ -25,6 +25,7 @@ import java.util.Map; import java.util.HashSet; import java.util.HashMap; +import java.util.stream.Collectors; /** * Completes an initial series of cleaning steps for a VCF produced by the GATK-SV pipeline. @@ -255,10 +256,8 @@ private List processLargeDeletions(final VariantContext variant, final } boolean gt5kbFilter = false; - List allowedAlleles = Arrays.asList( - Allele.NO_CALL.getBaseString(), variant.getReference().getBaseString(), variant.getAlternateAllele(0).getBaseString() - ); - if (genotypes.stream().anyMatch(g -> g.getAlleles().stream().anyMatch(a -> !allowedAlleles.contains(a.getBaseString())))) { + List allowedAlleleIndices = Arrays.asList(-1, 0, 1); + if (genotypes.stream().anyMatch(g -> g.getAlleles().stream().anyMatch(a -> !allowedAlleleIndices.contains(variant.getAlleleIndex(a))))) { gt5kbFilter = true; } else if (variant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0) >= MIN_MULTIALLELIC_EVENT_SIZE && !multiallelicFilter) { gt5kbFilter = true; @@ -325,10 +324,8 @@ private List processLargeDuplications(final VariantContext variant, fi } boolean gt5kbFilter = false; - List allowedAlleles = Arrays.asList( - Allele.NO_CALL.getBaseString(), variant.getReference().getBaseString(), variant.getAlternateAllele(0).getBaseString() - ); - if 
(genotypes.stream().anyMatch(g -> g.getAlleles().stream().anyMatch(a -> !allowedAlleles.contains(a.getBaseString())))) { + List allowedAlleleIndices = Arrays.asList(-1, 0, 1); + if (genotypes.stream().anyMatch(g -> g.getAlleles().stream().anyMatch(a -> !allowedAlleleIndices.contains(variant.getAlleleIndex(a))))) { gt5kbFilter = true; } else if (variant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0) >= MIN_MULTIALLELIC_EVENT_SIZE && !multiallelicFilter) { gt5kbFilter = true; @@ -402,7 +399,7 @@ public boolean isCalled(final VariantContextBuilder builder, final List gt = genotype.getAlleles().stream() - .map(allele -> allele.isNoCall() ? null : allele.getDisplayString().equals("1") ? 1 : 0) - .toList(); - return GATKSVVCFConstants.BIALLELIC_GTS.contains(gt); - } - private boolean isNoCallGt(List alleles) { if (alleles.size() == 1 && alleles.get(0).isReference()) return true; else if (alleles.size() == 2 && alleles.get(0).isReference() && alleles.get(1).isReference()) return true; diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt5.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt5.java index 334e7285e2f..ab30b4b443e 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt5.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt5.java @@ -18,7 +18,12 @@ import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants; import org.broadinstitute.hellbender.utils.variant.GATKSVVariantContextUtils; -import java.util.*; +import java.util.Arrays; +import java.util.List; +import java.util.ArrayList; +import java.util.Set; +import java.util.Map; +import java.util.HashSet; /** * Completes an initial series of cleaning steps for a VCF produced by the GATK-SV pipeline. 
From a281d65e49db8b8ba2ef94bc388a9ad76093c138 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Wed, 6 Nov 2024 13:48:26 -0500 Subject: [PATCH 38/58] Modified filter to only use distinct pairs --- .../hellbender/tools/walkers/sv/SVCleanPt4.java | 16 +++++++++------- .../hellbender/tools/walkers/sv/SVCleanPt5.java | 4 ++-- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java index fa401aa70ba..a5550fc7ed4 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java @@ -283,7 +283,6 @@ private List processLargeDeletions(final VariantContext variant, final updatedGenotypes = new ArrayList<>(genotypes.size()); if (multiallelicFilter) { - builder.filter(GATKSVVCFConstants.MULTIALLELIC); for (Genotype genotype : genotypes) { GenotypeBuilder gb = new GenotypeBuilder(genotype); gb.noGQ(); @@ -292,9 +291,11 @@ private List processLargeDeletions(final VariantContext variant, final gb.attribute(GATKSVVCFConstants.COPY_NUMBER_QUALITY_FORMAT, genotype.getExtendedAttribute(GATKSVVCFConstants.RD_GQ)); updatedGenotypes.add(gb.make()); } - builder.alleles(Arrays.asList(variant.getReference(), Allele.create("<" + GATKSVVCFConstants.CNV + ">", false))); - builder.attribute(GATKSVVCFConstants.SVTYPE, GATKSVVCFConstants.CNV); genotypes = updatedGenotypes; + + builder.filter(GATKSVVCFConstants.MULTIALLELIC); + builder.attribute(GATKSVVCFConstants.SVTYPE, GATKSVVCFConstants.CNV); + builder.alleles(Arrays.asList(variant.getReference(), Allele.create("<" + GATKSVVCFConstants.CNV + ">", false))); } return genotypes; @@ -317,7 +318,7 @@ private List processLargeDuplications(final VariantContext variant, fi multiallelicFilter = true; } if (sampleRdCn.values().stream().filter(value -> (value < 1 || value > 4)).count() 
> 4) { - if (sampleRdCn.values().stream().filter(value -> (value < 1 || value > 4)).count() > maxVF) { + if (sampleRdCn.values().stream().filter(value -> (value < 1 || value > 4)).distinct().count() > maxVF) { multiallelicFilter = true; } } @@ -351,7 +352,6 @@ private List processLargeDuplications(final VariantContext variant, fi updatedGenotypes = new ArrayList<>(genotypes.size()); if (multiallelicFilter) { - builder.filter(GATKSVVCFConstants.MULTIALLELIC); for (Genotype genotype : genotypes) { GenotypeBuilder gb = new GenotypeBuilder(genotype); gb.noGQ(); @@ -360,9 +360,11 @@ private List processLargeDuplications(final VariantContext variant, fi gb.attribute(GATKSVVCFConstants.COPY_NUMBER_QUALITY_FORMAT, genotype.getExtendedAttribute(GATKSVVCFConstants.RD_GQ)); updatedGenotypes.add(gb.make()); } - builder.alleles(Arrays.asList(variant.getReference(), Allele.create("<" + GATKSVVCFConstants.CNV + ">", false))); - builder.attribute(GATKSVVCFConstants.SVTYPE, GATKSVVCFConstants.CNV); genotypes = updatedGenotypes; + + builder.filter(GATKSVVCFConstants.MULTIALLELIC); + builder.attribute(GATKSVVCFConstants.SVTYPE, GATKSVVCFConstants.CNV); + builder.alleles(Arrays.asList(variant.getReference(), Allele.create("<" + GATKSVVCFConstants.CNV + ">", false))); } return genotypes; diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt5.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt5.java index ab30b4b443e..e3302f2536e 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt5.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt5.java @@ -204,9 +204,9 @@ private void processSvType(final VariantContext variant, final VariantContextBui for (Genotype genotype : genotypes) { GenotypeBuilder gb = new GenotypeBuilder(genotype); long altCount = genotype.getAlleles().stream().filter(allele -> allele.isCalled() && !allele.isReference()).count(); - if (altCount == 1) { // 
Heterozygous case (0/1) + if (altCount == 1) { // Heterozygous (0/1) gb.alleles(Arrays.asList(refAllele, altAllele)); - } else if (altCount == 2) { // Homozygous alternate case (1/1) + } else if (altCount == 2) { // Homozygous Alternate (1/1) gb.alleles(Arrays.asList(altAllele, altAllele)); } updatedGenotypes.add(gb.make()); From f7215eee52f40d20168c7a9bdc27ee8b3dc0f2aa Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Thu, 7 Nov 2024 12:00:25 +0000 Subject: [PATCH 39/58] Moved svtype modification to cleanpt4 --- .../tools/walkers/sv/SVCleanPt4.java | 32 +++++++++++++++++++ .../tools/walkers/sv/SVCleanPt5.java | 2 +- 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java index a5550fc7ed4..212897654c7 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java @@ -12,6 +12,7 @@ import org.broadinstitute.hellbender.cmdline.programgroups.StructuralVariantDiscoveryProgramGroup; import org.broadinstitute.hellbender.engine.*; import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants; +import org.broadinstitute.hellbender.utils.variant.GATKSVVariantContextUtils; import java.io.BufferedReader; import java.io.FileReader; @@ -172,6 +173,7 @@ public void apply(final VariantContext variant, final ReadsContext readsContext, genotypes = processLargeDeletions(variant, builder, genotypes); genotypes = processLargeDuplications(variant, builder, genotypes); genotypes = processRevisedSex(variant, genotypes); + genotypes = processSvType(variant, builder, genotypes); // Build genotypes if (isCalled(builder, genotypes)) { @@ -392,6 +394,36 @@ private List processRevisedSex(final VariantContext variant, List processSvType(final VariantContext variant, final VariantContextBuilder builder, final 
List genotypes) { + final String svType = variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, null); + boolean hasMobileElement = variant.getAlleles().stream() + .map(GATKSVVariantContextUtils::getSymbolicAlleleSymbols) + .flatMap(Arrays::stream) + .anyMatch(symbol -> symbol.equals(GATKSVVCFConstants.ME)); + if (svType == null || hasMobileElement) { + return genotypes; + } + + final Allele refAllele = builder.make().getReference(); + final Allele altAllele = Allele.create("<" + svType + ">", false); + List newAlleles = Arrays.asList(refAllele, altAllele); + + List updatedGenotypes = new ArrayList<>(genotypes.size()); + for (Genotype genotype : genotypes) { + GenotypeBuilder gb = new GenotypeBuilder(genotype); + long altCount = genotype.getAlleles().stream().filter(allele -> allele.isCalled() && !allele.isReference()).count(); + if (altCount == 1) { // Heterozygous (0/1) + gb.alleles(Arrays.asList(refAllele, altAllele)); + } else if (altCount == 2) { // Homozygous Alternate (1/1) + gb.alleles(Arrays.asList(altAllele, altAllele)); + } + updatedGenotypes.add(gb.make()); + } + + builder.alleles(newAlleles); + return updatedGenotypes; + } + public boolean isCalled(final VariantContextBuilder builder, final List genotypes) { for (Genotype genotype : genotypes) { if (!isNoCallGt(genotype.getAlleles())) { diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt5.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt5.java index e3302f2536e..b3a1f97f7c7 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt5.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt5.java @@ -158,7 +158,7 @@ public void secondPassApply(final VariantContext variant) { } VariantContextBuilder builder = new VariantContextBuilder(variant); - processSvType(variant, builder); + // processSvType(variant, builder); cleanseInfoFields(builder); vcfWriter.add(builder.make()); } From 
383a01cf206b564946b939d2fad7e82f8b9ac14b Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Thu, 7 Nov 2024 07:42:24 -0500 Subject: [PATCH 40/58] Reverted svtype changes --- .../tools/walkers/sv/SVCleanPt4.java | 31 ------------------- .../tools/walkers/sv/SVCleanPt5.java | 2 +- 2 files changed, 1 insertion(+), 32 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java index 212897654c7..f23823ff3b8 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java @@ -173,7 +173,6 @@ public void apply(final VariantContext variant, final ReadsContext readsContext, genotypes = processLargeDeletions(variant, builder, genotypes); genotypes = processLargeDuplications(variant, builder, genotypes); genotypes = processRevisedSex(variant, genotypes); - genotypes = processSvType(variant, builder, genotypes); // Build genotypes if (isCalled(builder, genotypes)) { @@ -394,36 +393,6 @@ private List processRevisedSex(final VariantContext variant, List processSvType(final VariantContext variant, final VariantContextBuilder builder, final List genotypes) { - final String svType = variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, null); - boolean hasMobileElement = variant.getAlleles().stream() - .map(GATKSVVariantContextUtils::getSymbolicAlleleSymbols) - .flatMap(Arrays::stream) - .anyMatch(symbol -> symbol.equals(GATKSVVCFConstants.ME)); - if (svType == null || hasMobileElement) { - return genotypes; - } - - final Allele refAllele = builder.make().getReference(); - final Allele altAllele = Allele.create("<" + svType + ">", false); - List newAlleles = Arrays.asList(refAllele, altAllele); - - List updatedGenotypes = new ArrayList<>(genotypes.size()); - for (Genotype genotype : genotypes) { - GenotypeBuilder gb = new GenotypeBuilder(genotype); - 
long altCount = genotype.getAlleles().stream().filter(allele -> allele.isCalled() && !allele.isReference()).count(); - if (altCount == 1) { // Heterozygous (0/1) - gb.alleles(Arrays.asList(refAllele, altAllele)); - } else if (altCount == 2) { // Homozygous Alternate (1/1) - gb.alleles(Arrays.asList(altAllele, altAllele)); - } - updatedGenotypes.add(gb.make()); - } - - builder.alleles(newAlleles); - return updatedGenotypes; - } - public boolean isCalled(final VariantContextBuilder builder, final List genotypes) { for (Genotype genotype : genotypes) { if (!isNoCallGt(genotype.getAlleles())) { diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt5.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt5.java index b3a1f97f7c7..e3302f2536e 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt5.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt5.java @@ -158,7 +158,7 @@ public void secondPassApply(final VariantContext variant) { } VariantContextBuilder builder = new VariantContextBuilder(variant); - // processSvType(variant, builder); + processSvType(variant, builder); cleanseInfoFields(builder); vcfWriter.add(builder.make()); } From ee1603793ea48292f2afe7ddb11d396e66e0bb6a Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Tue, 12 Nov 2024 11:26:12 -0500 Subject: [PATCH 41/58] Overlap logic change to align with GATK-SV outputs --- .../hellbender/tools/walkers/sv/SVCleanPt2.java | 14 ++++++++------ .../hellbender/tools/walkers/sv/SVCleanPt4.java | 11 ++++++----- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt2.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt2.java index 807030b61c6..5cd38d56be3 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt2.java +++ 
b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt2.java @@ -141,7 +141,7 @@ public void apply(final VariantContext variant, final ReadsContext readsContext, // Flag sample as having an abnormal copy number if it passes certain conditions for (String sample : variant.getSampleNames()) { Genotype genotype = variant.getGenotype(sample); - int rdCn = Integer.parseInt(genotype.getExtendedAttribute("RD_CN").toString()); + int rdCn = Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN).toString()); if (!sampleWhitelist.contains(sample) || !genotype.isCalled() || rdCn == 2) { continue; } @@ -172,8 +172,8 @@ private void adjustCopyNumber(final VariantContext v1, final VariantContext v2) Map variantRdCn2 = getRdCnForVariant(v2); Map> variantSupport1 = getSupportForVariant(v1); Map> variantSupport2 = getSupportForVariant(v2); - String svType1 = v1.getAttributeAsString("SVTYPE", ""); - String svType2 = v2.getAttributeAsString("SVTYPE", ""); + String svType1 = v1.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); + String svType2 = v2.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); // Calculate overlap metadata int length1 = v1.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0);; @@ -195,7 +195,7 @@ private void adjustCopyNumber(final VariantContext v1, final VariantContext v2) String id2 = variantId2 + "@" + sample; int rdCn1 = revisedCopyNumbers.getOrDefault(variantId1, Collections.emptyMap()).getOrDefault(sample, variantRdCn1.get(sample)); int rdCn2 = revisedCopyNumbers.getOrDefault(variantId2, Collections.emptyMap()).getOrDefault(sample, variantRdCn2.get(sample)); - if (revisedComplete.contains(id1)) { + if (revisedComplete.contains(id1) || revisedComplete.contains(id2)) { continue; } @@ -278,8 +278,7 @@ private boolean isDelDup(final VariantContext variant) { } private boolean isLarge(final VariantContext variant, final int minSize) { - int variantLength = Math.abs(variant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)); - 
return variantLength >= minSize; + return Math.abs(variant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)) >= minSize; } private Map> getSupportForVariant(final VariantContext variant) { @@ -308,6 +307,9 @@ private Map getRdCnForVariant(final VariantContext variant) { } private void makeRevision(final String id, final int val) { + if (id.contains("brainvar_all_samples_DUP_chr1_890")) { + System.out.println(id + " --> " + Integer.toString(val)) ; + } String[] tokens = id.split("@"); String variantId = tokens[0]; String sample = tokens[1]; diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java index f23823ff3b8..32707bc1307 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java @@ -157,17 +157,18 @@ public void closeTool() { @Override public void apply(final VariantContext variant, final ReadsContext readsContext, final ReferenceContext referenceContext, final FeatureContext featureContext) { - // Initialize data structures - recordIdx++; - VariantContextBuilder builder = new VariantContextBuilder(variant); - // Exit if outside batch range if (recordIdx < recordStart || recordIdx >= recordEnd) { + recordIdx++; return; } + recordIdx++; - // Process variants + // Initialize data structures + VariantContextBuilder builder = new VariantContextBuilder(variant); List genotypes = variant.getGenotypes(); + + // Process variants genotypes = processRevisedCn(variant, genotypes); processMultiallelic(builder, genotypes); genotypes = processLargeDeletions(variant, builder, genotypes); From 3f901f4e3d5f604afd644d0b9f05f08c80d71b2a Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Tue, 12 Nov 2024 14:19:10 -0500 Subject: [PATCH 42/58] Modified overlap logic to only be unidirectional --- .../tools/walkers/sv/SVCleanPt2.java | 83 +++++++++++-------- 1 
file changed, 48 insertions(+), 35 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt2.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt2.java index 5cd38d56be3..f65c51524fb 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt2.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt2.java @@ -157,7 +157,6 @@ public void apply(final VariantContext variant, final ReadsContext readsContext, || (vc.getStart() + vc.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)) < variant.getStart()); for (VariantContext bufferedVariant : overlappingVariantsBuffer) { if (overlaps(variant, bufferedVariant)) { - adjustCopyNumber(bufferedVariant, variant); adjustCopyNumber(variant, bufferedVariant); } } @@ -165,24 +164,40 @@ public void apply(final VariantContext variant, final ReadsContext readsContext, } private void adjustCopyNumber(final VariantContext v1, final VariantContext v2) { - // Track metadata through data structures - String variantId1 = v1.getID(); - String variantId2 = v2.getID(); - Map variantRdCn1 = getRdCnForVariant(v1); - Map variantRdCn2 = getRdCnForVariant(v2); - Map> variantSupport1 = getSupportForVariant(v1); - Map> variantSupport2 = getSupportForVariant(v2); - String svType1 = v1.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); - String svType2 = v2.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); - - // Calculate overlap metadata - int length1 = v1.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0);; + // Determine larger variant + VariantContext largerVariant = v1; + VariantContext smallerVariant = v2; + int length1 = v1.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0); int length2 = v2.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0); - int minEnd = Math.min(v1.getStart() + v1.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0), v2.getStart() + v2.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)); - int maxStart = Math.max(v1.getStart(), 
v2.getStart()); - int lengthOverlap = minEnd - maxStart; - double overlap1 = (double) lengthOverlap / (double) length1; - double overlap2 = (double) lengthOverlap / (double) length2; + int largerLength = length1; + int smallerLength = length2; + + // Swap variants if necessary + if (length2 > length1) { + largerVariant = v2; + smallerVariant = v1; + largerLength = length2; + smallerLength = length1; + } + + // Get IDs and other attributes + String variantId1 = largerVariant.getID(); + String variantId2 = smallerVariant.getID(); + Map variantRdCn1 = getRdCnForVariant(largerVariant); + Map variantRdCn2 = getRdCnForVariant(smallerVariant); + Map> variantSupport1 = getSupportForVariant(largerVariant); + Map> variantSupport2 = getSupportForVariant(smallerVariant); + String svType1 = largerVariant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); + String svType2 = smallerVariant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); + + // Calculate overlap + int end1 = largerVariant.getEnd(); + int end2 = smallerVariant.getEnd(); + int minEnd = Math.min(end1, end2); + int maxStart = Math.max(largerVariant.getStart(), smallerVariant.getStart()); + int lengthOverlap = minEnd - maxStart + 1; + double overlap1 = (double) lengthOverlap / (double) largerLength; + double overlap2 = (double) lengthOverlap / (double) smallerLength; // Get samples with abnormal CN across both variants Set samples = new HashSet<>(abnormalRdCn.getOrDefault(variantId1, Collections.emptySet())); @@ -190,24 +205,23 @@ private void adjustCopyNumber(final VariantContext v1, final VariantContext v2) // Iterate through samples to test against conditions for (String sample : samples) { - // Validate baseline filters String id1 = variantId1 + "@" + sample; String id2 = variantId2 + "@" + sample; - int rdCn1 = revisedCopyNumbers.getOrDefault(variantId1, Collections.emptyMap()).getOrDefault(sample, variantRdCn1.get(sample)); - int rdCn2 = revisedCopyNumbers.getOrDefault(variantId2, 
Collections.emptyMap()).getOrDefault(sample, variantRdCn2.get(sample)); - if (revisedComplete.contains(id1) || revisedComplete.contains(id2)) { + if (revisedComplete.contains(id1)) { continue; } - // Initialize fields for evaluation + // Initialize variables for evaluation + int rdCn1 = revisedCopyNumbers.getOrDefault(variantId1, Collections.emptyMap()).getOrDefault(sample, variantRdCn1.get(sample)); + int rdCn2 = revisedCopyNumbers.getOrDefault(variantId2, Collections.emptyMap()).getOrDefault(sample, variantRdCn2.get(sample)); Set support1 = variantSupport1.get(sample); Set support2 = variantSupport2.get(sample); - Genotype genotype2 = v2.getGenotype(sample); + Genotype genotype2 = smallerVariant.getGenotype(sample); - // Condition 1: Smaller depth call is being driven by a larger call + // Condition 1: Smaller depth call is being driven by larger call if (support1.contains(GATKSVVCFConstants.EV_VALUES.get(1)) && support1.size() > 1 && support2.equals(Collections.singleton(GATKSVVCFConstants.EV_VALUES.get(1))) - && overlap2 > 0.5 && !v1.hasAttribute(GATKSVVCFConstants.MULTI_CNV)) { + && overlap2 > 0.5 && !largerVariant.hasAttribute(GATKSVVCFConstants.MULTI_CNV)) { if (rdCn1 == 0) { makeRevision(id2, rdCn2 + 2); } else if (rdCn1 == 1) { @@ -219,10 +233,10 @@ private void adjustCopyNumber(final VariantContext v1, final VariantContext v2) } } - // Condition 2: Smaller CNV is driven by a larger CNV genotype + // Condition 2: Smaller CNV is driven by larger CNV genotype else if (support1.equals(Collections.singleton(GATKSVVCFConstants.EV_VALUES.get(1))) && support2.contains(GATKSVVCFConstants.EV_VALUES.get(1)) && support2.size() > 1 - && overlap1 > 0.5 && overlap2 > 0.5 && !v2.hasAttribute(GATKSVVCFConstants.MULTI_CNV) + && overlap1 > 0.5 && overlap2 > 0.5 && !smallerVariant.hasAttribute(GATKSVVCFConstants.MULTI_CNV) && !genotype2.isHomRef()) { if (rdCn2 == 0) { makeRevision(id1, rdCn1 + 2); @@ -235,16 +249,18 @@ else if 
(support1.equals(Collections.singleton(GATKSVVCFConstants.EV_VALUES.get( } } - // Condition 3: Depth-only calls where smaller call is driven by a larger call + // Condition 3: Depth-only calls where smaller call is driven by larger call else if (support1.equals(Collections.singleton(GATKSVVCFConstants.EV_VALUES.get(1))) && support2.equals(Collections.singleton(GATKSVVCFConstants.EV_VALUES.get(1))) - && overlap2 > 0.5 && !v1.hasAttribute(GATKSVVCFConstants.MULTI_CNV) && svType1.equals(svType2)) { + && overlap2 > 0.5 && !largerVariant.hasAttribute(GATKSVVCFConstants.MULTI_CNV) && svType1.equals(svType2)) { if (rdCn1 == 0 && rdCn1 != rdCn2) { makeRevision(id2, rdCn2 + 2); } else if (rdCn1 == 1 && rdCn1 > rdCn2) { makeRevision(id2, 1); } else if (rdCn1 > 1 && rdCn1 < rdCn2) { - makeRevision(id2, Math.max(rdCn2 - rdCn1 + 2, 0)); + int newCN = rdCn2 - rdCn1 + 2; + newCN = Math.max(newCN, 0); + makeRevision(id2, newCN); } else { makeRevision(id2, 2); } @@ -252,7 +268,7 @@ else if (support1.equals(Collections.singleton(GATKSVVCFConstants.EV_VALUES.get( // Condition 4: Any other time a larger call drives a smaller call else if (support1.contains(GATKSVVCFConstants.EV_VALUES.get(1)) - && overlap2 > 0.5 && !v1.hasAttribute(GATKSVVCFConstants.MULTI_CNV) && length2 > MIN_VARIANT_SIZE) { + && overlap2 > 0.5 && !largerVariant.hasAttribute(GATKSVVCFConstants.MULTI_CNV) && largerLength > MIN_VARIANT_SIZE) { if (rdCn1 == 0) { makeRevision(id2, rdCn2 + 2); } else if (rdCn1 == 1) { @@ -307,9 +323,6 @@ private Map getRdCnForVariant(final VariantContext variant) { } private void makeRevision(final String id, final int val) { - if (id.contains("brainvar_all_samples_DUP_chr1_890")) { - System.out.println(id + " --> " + Integer.toString(val)) ; - } String[] tokens = id.split("@"); String variantId = tokens[0]; String sample = tokens[1]; From e8ce4c57909a407c02cc630ed04674e2438cfbe4 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Tue, 12 Nov 2024 14:55:33 -0500 Subject: [PATCH 43/58] 
Standardized variant overlap code --- .../spark/sv/utils/GATKSVVCFConstants.java | 6 ----- .../tools/walkers/sv/SVCleanPt2.java | 9 +++---- .../tools/walkers/sv/SVCleanPt5.java | 24 +++++++++++++------ 3 files changed, 22 insertions(+), 17 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java index 26e71323554..6f6c93b1a86 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java @@ -180,12 +180,6 @@ public enum ComplexVariantSubtype { // SVCleanPt5 public static final String UNR = "UNR"; public static final String EVENT = "EVENT"; - public static final Set> BIALLELIC_GTS = new HashSet<>(Arrays.asList( - Arrays.asList(0, 0), - Arrays.asList(1, 1), - Arrays.asList(0, 1), - Arrays.asList(null, null) - )); public static final Set FILTER_VCF_LINES = new HashSet<>(Arrays.asList( "CIPOS", "CIEND", "RMSSTD", "source", "bcftools", "GATKCommandLine", "#CHROM" )); diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt2.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt2.java index f65c51524fb..ba32a181a01 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt2.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt2.java @@ -180,7 +180,7 @@ private void adjustCopyNumber(final VariantContext v1, final VariantContext v2) smallerLength = length1; } - // Get IDs and other attributes + // Get variant attributes String variantId1 = largerVariant.getID(); String variantId2 = smallerVariant.getID(); Map variantRdCn1 = getRdCnForVariant(largerVariant); @@ -191,9 +191,10 @@ private void adjustCopyNumber(final VariantContext v1, final VariantContext v2) String svType2 = 
smallerVariant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); // Calculate overlap - int end1 = largerVariant.getEnd(); - int end2 = smallerVariant.getEnd(); - int minEnd = Math.min(end1, end2); + int minEnd = Math.min( + largerVariant.getStart() + largerVariant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0), + smallerVariant.getStart() + smallerVariant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0) + ); int maxStart = Math.max(largerVariant.getStart(), smallerVariant.getStart()); int lengthOverlap = minEnd - maxStart + 1; double overlap1 = (double) lengthOverlap / (double) largerLength; diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt5.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt5.java index e3302f2536e..1ef4e21d17a 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt5.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt5.java @@ -146,7 +146,6 @@ public void firstPassApply(final VariantContext variant) { for (VariantContext bufferedVariant : overlappingVariantsBuffer) { if (overlaps(bufferedVariant, variant)) { processVariantPair(bufferedVariant, variant); - processVariantPair(variant, bufferedVariant); } } overlappingVariantsBuffer.add(variant); @@ -164,21 +163,32 @@ public void secondPassApply(final VariantContext variant) { } private void processVariantPair(VariantContext v1, VariantContext v2) { + // Determine larger variant + VariantContext largerVariant = v1; + VariantContext smallerVariant = v2; int length1 = v1.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0); int length2 = v2.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0); - if (length1 < length2) { - return; + + // Swap variants if necessary + if (length2 > length1) { + largerVariant = v2; + smallerVariant = v1; } - int minEnd = Math.min(v1.getStart() + v1.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0), v2.getStart() + v2.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)); - 
int maxStart = Math.max(v1.getStart(), v2.getStart()); + // Calculate overlap + int minEnd = Math.min( + largerVariant.getStart() + largerVariant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0), + smallerVariant.getStart() + smallerVariant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0) + ); + int maxStart = Math.max(largerVariant.getStart(), smallerVariant.getStart()); int overlapLength = minEnd - maxStart + 1; if (overlapLength <= 0) { return; } - double smallCoverage = (double) overlapLength / length2; - if (smallCoverage > 0.5) { + // Filter variant based on conditions + double coverage = (double) overlapLength / length2; + if (coverage > 0.5) { if (!filteredVariantIds.contains(v1.getID())) { filteredVariantIds.add(v2.getID()); } From 29dc96f4620140e378413ca7a9c0c1c07cc15331 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Tue, 12 Nov 2024 15:24:57 -0500 Subject: [PATCH 44/58] Modified overlap logic - minor bug --- .../hellbender/tools/walkers/sv/SVCleanPt5.java | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt5.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt5.java index 1ef4e21d17a..92871c5f168 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt5.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt5.java @@ -168,11 +168,13 @@ private void processVariantPair(VariantContext v1, VariantContext v2) { VariantContext smallerVariant = v2; int length1 = v1.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0); int length2 = v2.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0); + int smallerLength = length2; // Swap variants if necessary if (length2 > length1) { largerVariant = v2; smallerVariant = v1; + smallerLength = length1; } // Calculate overlap @@ -187,11 +189,9 @@ private void processVariantPair(VariantContext v1, VariantContext v2) { } // Filter variant based on conditions - double 
coverage = (double) overlapLength / length2; - if (coverage > 0.5) { - if (!filteredVariantIds.contains(v1.getID())) { - filteredVariantIds.add(v2.getID()); - } + double coverage = (double) overlapLength / smallerLength; + if (coverage > 0.5 && !filteredVariantIds.contains(largerVariant.getID())) { + filteredVariantIds.add(smallerVariant.getID()); } } From a68dcc62d12426e70dba4a17ed8c016f9c91e0e7 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Thu, 14 Nov 2024 12:08:10 -0500 Subject: [PATCH 45/58] Modified CleanPt4 imports --- .../broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java index 32707bc1307..c8ebce29dd3 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java @@ -12,7 +12,6 @@ import org.broadinstitute.hellbender.cmdline.programgroups.StructuralVariantDiscoveryProgramGroup; import org.broadinstitute.hellbender.engine.*; import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants; -import org.broadinstitute.hellbender.utils.variant.GATKSVVariantContextUtils; import java.io.BufferedReader; import java.io.FileReader; @@ -26,7 +25,6 @@ import java.util.Map; import java.util.HashSet; import java.util.HashMap; -import java.util.stream.Collectors; /** * Completes an initial series of cleaning steps for a VCF produced by the GATK-SV pipeline. 
From 8ba660ef61e6385ed4bcb910fbf5058d80bfa6bc Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Mon, 18 Nov 2024 16:35:02 -0500 Subject: [PATCH 46/58] Minor change to remove redundant size/svtype check --- .../hellbender/tools/walkers/sv/SVCleanPt1b.java | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1b.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1b.java index 894cf156536..872f840c9e5 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1b.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1b.java @@ -83,7 +83,6 @@ public class SVCleanPt1b extends MultiplePassVariantWalker { private final Map> revisedEventsFiltered = new HashMap<>(); private final Map> revisedRdCn = new HashMap<>(); - private static final int MIN_VARIANT_SIZE_CNV = 1000; private static final int MIN_VARIANT_SIZE = 5000; @Override @@ -172,9 +171,7 @@ public void thirdPassApply(final VariantContext variant) { if (revisedEventsAll.containsKey(variant.getID())) { processVariant(builder, variant); } - if (isDelDup(variant) && isLarge(variant, MIN_VARIANT_SIZE_CNV)) { - processCnvs(builder, variant); - } + processCnvs(builder, variant); vcfWriter.add(builder.make()); } From 707ada67fc06bdf2781786b6f0ba2be2014f7be7 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Mon, 25 Nov 2024 15:58:51 -0500 Subject: [PATCH 47/58] Minor file creation --- .../hellbender/tools/walkers/sv/SVReviseOverlappingCNV.java | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseOverlappingCNV.java diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseOverlappingCNV.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseOverlappingCNV.java new file mode 100644 index 00000000000..92f3412f6ee --- /dev/null +++ 
b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseOverlappingCNV.java @@ -0,0 +1,4 @@ +package org.broadinstitute.hellbender.tools.walkers.sv; + +public class SVReviseOverlappingCNV { +} From 1e30b4b7d62c7db309e00fa668e15481335fe3e4 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Wed, 11 Dec 2024 10:28:30 -0500 Subject: [PATCH 48/58] New tools - modular implementations --- .../tools/walkers/sv/SVCleanPt4.java | 10 +- .../walkers/sv/SVReviseAbnormalAllosomes.java | 106 ++++++ .../tools/walkers/sv/SVReviseLargeCnvs.java | 346 ++++++++++++++++++ .../walkers/sv/SVReviseMultiallelicCnvs.java | 169 +++++++++ .../walkers/sv/SVReviseOverlappingCNV.java | 4 - .../walkers/sv/SVReviseOverlappingCnvCns.java | 337 +++++++++++++++++ .../walkers/sv/SVReviseOverlappingCnvGts.java | 339 +++++++++++++++++ 7 files changed, 1306 insertions(+), 5 deletions(-) create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseAbnormalAllosomes.java create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseLargeCnvs.java create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseMultiallelicCnvs.java delete mode 100644 src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseOverlappingCNV.java create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseOverlappingCnvCns.java create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseOverlappingCnvGts.java diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java index c8ebce29dd3..af96fd295df 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt4.java @@ -1,7 +1,15 @@ package org.broadinstitute.hellbender.tools.walkers.sv; -import 
htsjdk.variant.variantcontext.*; +import htsjdk.variant.variantcontext.Allele; + +import htsjdk.variant.variantcontext.Genotype; +import htsjdk.variant.variantcontext.GenotypeBuilder; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.variantcontext.VariantContextBuilder; import htsjdk.variant.variantcontext.writer.VariantContextWriter; +import htsjdk.variant.vcf.VCFHeader; +import htsjdk.variant.vcf.VCFHeaderLineType; +import htsjdk.variant.vcf.VCFInfoHeaderLine; import htsjdk.variant.vcf.*; import org.broadinstitute.barclay.argparser.Argument; diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseAbnormalAllosomes.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseAbnormalAllosomes.java new file mode 100644 index 00000000000..30dcf803201 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseAbnormalAllosomes.java @@ -0,0 +1,106 @@ +package org.broadinstitute.hellbender.tools.walkers.sv; + +import htsjdk.variant.variantcontext.*; +import htsjdk.variant.variantcontext.writer.VariantContextWriter; +import org.broadinstitute.barclay.argparser.Argument; +import org.broadinstitute.barclay.argparser.BetaFeature; +import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; +import org.broadinstitute.barclay.help.DocumentedFeature; +import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; +import org.broadinstitute.hellbender.cmdline.programgroups.StructuralVariantDiscoveryProgramGroup; +import org.broadinstitute.hellbender.engine.*; +import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants; + +import java.util.List; +import java.util.ArrayList; + +/** + * Completes an initial series of cleaning steps for a VCF produced by the GATK-SV pipeline. + * + *

          Inputs

          + *
            + *
          • + * VCF containing structural variant (SV) records from the GATK-SV pipeline. + *
          • + *
          • + * TODO + *
          • + *
          + * + *

          Output

          + *
            + *
          • + * Cleansed VCF. + *
          • + *
          + * + *

          Usage Example

          + *
          + *     TODO
          + * 
          + * + *

          Processing Steps

          + *
            + *
          1. + * TODO + *
          2. + *
          + */ +@CommandLineProgramProperties( + summary = "Clean and format SV VCF", + oneLineSummary = "Clean and format SV VCF", + programGroup = StructuralVariantDiscoveryProgramGroup.class +) +@BetaFeature +@DocumentedFeature +public class SVReviseAbnormalAllosomes extends VariantWalker { + @Argument( + fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, + shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, + doc = "Output VCF name" + ) + private GATKPath outputVcf; + + private VariantContextWriter vcfWriter; + + @Override + public void onTraversalStart() { + vcfWriter = createVCFWriter(outputVcf); + vcfWriter.writeHeader(getHeaderForVariants()); + } + + @Override + public void closeTool() { + if (vcfWriter != null) { + vcfWriter.close(); + } + } + + @Override + public void apply(final VariantContext variant, final ReadsContext readsContext, final ReferenceContext referenceContext, final FeatureContext featureContext) { + VariantContextBuilder builder = new VariantContextBuilder(variant); + if (!variant.getAttributeAsBoolean(GATKSVVCFConstants.REVISED_EVENT, false)) { + processRevisedSex(variant, builder); + } + vcfWriter.add(builder.make()); + } + + private void processRevisedSex(final VariantContext variant, final VariantContextBuilder builder) { + List genotypes = variant.getGenotypes(); + List updatedGenotypes = new ArrayList<>(genotypes.size()); + for (Genotype genotype : genotypes) { + if (Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN, 0).toString()) > 0) { + int newRdCn = Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN).toString()) - 1; + GenotypeBuilder gb = new GenotypeBuilder(genotype); + gb.attribute(GATKSVVCFConstants.RD_CN, newRdCn); + if (genotype.hasExtendedAttribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT)) { + gb.attribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT, newRdCn); + } + updatedGenotypes.add(gb.make()); + } else { + updatedGenotypes.add(genotype); + } + } + 
builder.genotypes(genotypes); + } +} diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseLargeCnvs.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseLargeCnvs.java new file mode 100644 index 00000000000..dd1637adfdf --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseLargeCnvs.java @@ -0,0 +1,346 @@ +package org.broadinstitute.hellbender.tools.walkers.sv; + +import htsjdk.variant.variantcontext.Allele; + +import htsjdk.variant.variantcontext.Genotype; +import htsjdk.variant.variantcontext.GenotypeBuilder; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.variantcontext.VariantContextBuilder; +import htsjdk.variant.variantcontext.writer.VariantContextWriter; +import htsjdk.variant.vcf.VCFHeader; +import htsjdk.variant.vcf.VCFHeaderLineType; +import htsjdk.variant.vcf.VCFInfoHeaderLine; + +import htsjdk.variant.vcf.*; +import org.broadinstitute.barclay.argparser.Argument; +import org.broadinstitute.barclay.argparser.BetaFeature; +import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; +import org.broadinstitute.barclay.help.DocumentedFeature; +import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; +import org.broadinstitute.hellbender.cmdline.programgroups.StructuralVariantDiscoveryProgramGroup; +import org.broadinstitute.hellbender.engine.*; +import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants; + +import java.io.IOException; +import java.nio.file.Files; + +import java.util.Arrays; +import java.util.List; +import java.util.ArrayList; +import java.util.Set; +import java.util.Map; +import java.util.HashSet; +import java.util.HashMap; + +/** + * Completes an initial series of cleaning steps for a VCF produced by the GATK-SV pipeline. + * + *

          Inputs

          + *
            + *
          • + * VCF containing structural variant (SV) records from the GATK-SV pipeline. + *
          • + *
          • + * TODO + *
          • + *
          + * + *

          Output

          + *
            + *
          • + * Cleansed VCF. + *
          • + *
          + * + *

          Usage Example

          + *
          + *     TODO
          + * 
          + * + *

          Processing Steps

          + *
            + *
          1. + * TODO + *
          2. + *
          + */ +@CommandLineProgramProperties( + summary = "Clean and format SV VCF", + oneLineSummary = "Clean and format SV VCF", + programGroup = StructuralVariantDiscoveryProgramGroup.class +) +@BetaFeature +@DocumentedFeature +public class SVReviseLargeCnvs extends VariantWalker { + public static final String OUTLIERS_LIST_LONG_NAME = "outliers-list"; + + @Argument( + fullName = OUTLIERS_LIST_LONG_NAME, + doc = "File with outlier samples", + optional = true + ) + private GATKPath outliersListPath; + + @Argument( + fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, + shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, + doc = "Output VCF name" + ) + private GATKPath outputVcf; + + private VariantContextWriter vcfWriter; + + private Set outlierSamples; + + private double maxVF; + + private static final int MIN_LARGE_EVENT_SIZE = 1000; + private static final int MIN_MULTIALLELIC_EVENT_SIZE = 5000; + + @Override + public void onTraversalStart() { + // Read and parse input files + try { + outlierSamples = new HashSet<>(); + if (outliersListPath != null) { + outlierSamples = new HashSet<>(Files.readAllLines(outliersListPath.toPath())); + } + } catch (IOException e) { + throw new RuntimeException("Error reading input file", e); + } + + // Populate maxVf based on sample information + maxVF = Math.max((getHeaderForVariants().getGenotypeSamples().size() - outlierSamples.size()) * 0.01, 2); + + // Filter specific header lines + final VCFHeader header = getHeaderForVariants(); + header.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.PESR_GT_OVERDISPERSION, 0, VCFHeaderLineType.Flag, "High PESR dispersion count")); + header.addMetaDataLine(new VCFFormatHeaderLine(GATKSVVCFConstants.COPY_NUMBER_FORMAT, 1, VCFHeaderLineType.Integer, "Predicted copy state")); + header.addMetaDataLine(new VCFFormatHeaderLine(GATKSVVCFConstants.COPY_NUMBER_QUALITY_FORMAT, 1, VCFHeaderLineType.Integer, "Read-depth genotype quality")); + header.addMetaDataLine(new 
VCFFilterHeaderLine(GATKSVVCFConstants.MULTIALLELIC, "Multiallelic site")); + + // Write header + vcfWriter = createVCFWriter(outputVcf); + vcfWriter.writeHeader(header); + } + + @Override + public void closeTool() { + if (vcfWriter != null) { + vcfWriter.close(); + } + } + + @Override + public void apply(final VariantContext variant, final ReadsContext readsContext, final ReferenceContext referenceContext, final FeatureContext featureContext) { + // Initialize data structures + VariantContextBuilder builder = new VariantContextBuilder(variant); + List genotypes = variant.getGenotypes(); + + // Process variants + processMultiallelic(builder, genotypes); + genotypes = processLargeDeletions(variant, builder, genotypes); + genotypes = processLargeDuplications(variant, builder, genotypes); + + // Build genotypes + if (isCalled(builder, genotypes)) { + builder.genotypes(genotypes); + vcfWriter.add(builder.make()); + } + } + + private void processMultiallelic(final VariantContextBuilder builder, final List genotypes) { + int numGtOver2 = 0; + for (Genotype genotype : genotypes) { + Integer peGt = genotype.hasExtendedAttribute(GATKSVVCFConstants.PE_GT) ? + Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.PE_GT).toString()) : null; + Integer srGt = genotype.hasExtendedAttribute(GATKSVVCFConstants.SR_GT) ? + Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.SR_GT).toString()) : null; + int gt; + if (peGt == null) { + continue; + } else if (srGt == null) { + gt = peGt; + } else if (peGt > 0 && srGt == 0) { + gt = peGt; + } else if (peGt == 0) { + gt = srGt; + } else { + Integer peGq = genotype.hasExtendedAttribute(GATKSVVCFConstants.PE_GQ) ? + Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.PE_GQ).toString()) : null; + Integer srGq = genotype.hasExtendedAttribute(GATKSVVCFConstants.SR_GQ) ? 
+ Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.SR_GQ).toString()) : null; + if (peGq != null && srGq != null && peGq >= srGq) { + gt = peGt; + } else { + gt = srGt; + } + } + if (gt > 2) { + numGtOver2++; + } + } + if (numGtOver2 > maxVF) { + builder.attribute(GATKSVVCFConstants.PESR_GT_OVERDISPERSION, true); + } + } + + private List processLargeDeletions(final VariantContext variant, final VariantContextBuilder builder, List genotypes) { + if (!variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, "").equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL)) { + return genotypes; + } + + boolean multiallelicFilter = false; + if (variant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0) >= MIN_LARGE_EVENT_SIZE) { + Map sampleRdCn = new HashMap<>(); + for (Genotype genotype : genotypes) { + if (!outlierSamples.contains(genotype.getSampleName()) && genotype.hasExtendedAttribute(GATKSVVCFConstants.RD_CN)) { + sampleRdCn.put(genotype.getSampleName(), Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN).toString())); + } + } + if (sampleRdCn.values().stream().filter(value -> value > 3).count() > maxVF) { + multiallelicFilter = true; + } + } + + boolean gt5kbFilter = false; + List allowedAlleleIndices = Arrays.asList(-1, 0, 1); + if (genotypes.stream().anyMatch(g -> g.getAlleles().stream().anyMatch(a -> !allowedAlleleIndices.contains(variant.getAlleleIndex(a))))) { + gt5kbFilter = true; + } else if (variant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0) >= MIN_MULTIALLELIC_EVENT_SIZE && !multiallelicFilter) { + gt5kbFilter = true; + } + + List updatedGenotypes = new ArrayList<>(genotypes.size()); + if (gt5kbFilter) { + for (Genotype genotype : genotypes) { + GenotypeBuilder gb = new GenotypeBuilder(genotype); + if (!genotype.isNoCall()) { + if (genotype.hasGQ() && Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN, 0).toString()) >= 2) { + gb.alleles(Arrays.asList(variant.getReference(), variant.getReference())); + } 
else if (genotype.hasGQ() && Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN, 0).toString()) == 1) { + gb.alleles(Arrays.asList(variant.getReference(), variant.getAlternateAllele(0))); + } else if (genotype.hasGQ()) { + gb.alleles(Arrays.asList(variant.getAlternateAllele(0), variant.getAlternateAllele(0))); + } + } + updatedGenotypes.add(gb.make()); + } + genotypes = updatedGenotypes; + } + + updatedGenotypes = new ArrayList<>(genotypes.size()); + if (multiallelicFilter) { + for (Genotype genotype : genotypes) { + GenotypeBuilder gb = new GenotypeBuilder(genotype); + gb.noGQ(); + gb.alleles(Arrays.asList(Allele.NO_CALL)); + gb.attribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT, genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN)); + gb.attribute(GATKSVVCFConstants.COPY_NUMBER_QUALITY_FORMAT, genotype.getExtendedAttribute(GATKSVVCFConstants.RD_GQ)); + updatedGenotypes.add(gb.make()); + } + genotypes = updatedGenotypes; + + builder.filter(GATKSVVCFConstants.MULTIALLELIC); + builder.attribute(GATKSVVCFConstants.SVTYPE, GATKSVVCFConstants.CNV); + builder.alleles(Arrays.asList(variant.getReference(), Allele.create("<" + GATKSVVCFConstants.CNV + ">", false))); + } + + return genotypes; + } + + private List processLargeDuplications(final VariantContext variant, final VariantContextBuilder builder, List genotypes) { + if (!variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, "").equals(GATKSVVCFConstants.SYMB_ALT_STRING_DUP)) { + return genotypes; + } + + boolean multiallelicFilter = false; + if (variant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0) >= MIN_LARGE_EVENT_SIZE) { + Map sampleRdCn = new HashMap<>(); + for (Genotype genotype : genotypes) { + if (!outlierSamples.contains(genotype.getSampleName()) && genotype.hasExtendedAttribute(GATKSVVCFConstants.RD_CN)) { + sampleRdCn.put(genotype.getSampleName(), Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN).toString())); + } + } + if 
(sampleRdCn.values().stream().filter(value -> value > 4).count() > maxVF) { + multiallelicFilter = true; + } + if (sampleRdCn.values().stream().filter(value -> (value < 1 || value > 4)).count() > 4) { + if (sampleRdCn.values().stream().filter(value -> (value < 1 || value > 4)).distinct().count() > maxVF) { + multiallelicFilter = true; + } + } + } + + boolean gt5kbFilter = false; + List allowedAlleleIndices = Arrays.asList(-1, 0, 1); + if (genotypes.stream().anyMatch(g -> g.getAlleles().stream().anyMatch(a -> !allowedAlleleIndices.contains(variant.getAlleleIndex(a))))) { + gt5kbFilter = true; + } else if (variant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0) >= MIN_MULTIALLELIC_EVENT_SIZE && !multiallelicFilter) { + gt5kbFilter = true; + } + + List updatedGenotypes = new ArrayList<>(genotypes.size()); + if (gt5kbFilter) { + for (Genotype genotype : genotypes) { + GenotypeBuilder gb = new GenotypeBuilder(genotype); + if (!genotype.isNoCall()) { + if (genotype.hasGQ() && Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN, 3).toString()) <= 2) { + gb.alleles(Arrays.asList(variant.getReference(), variant.getReference())); + } else if (genotype.hasGQ() && Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN, 0).toString()) == 3) { + gb.alleles(Arrays.asList(variant.getReference(), variant.getAlternateAllele(0))); + } else if (genotype.hasGQ()) { + gb.alleles(Arrays.asList(variant.getAlternateAllele(0), variant.getAlternateAllele(0))); + } + } + updatedGenotypes.add(gb.make()); + } + genotypes = updatedGenotypes; + } + + updatedGenotypes = new ArrayList<>(genotypes.size()); + if (multiallelicFilter) { + for (Genotype genotype : genotypes) { + GenotypeBuilder gb = new GenotypeBuilder(genotype); + gb.noGQ(); + gb.alleles(Arrays.asList(Allele.NO_CALL)); + gb.attribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT, genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN)); + gb.attribute(GATKSVVCFConstants.COPY_NUMBER_QUALITY_FORMAT, 
genotype.getExtendedAttribute(GATKSVVCFConstants.RD_GQ)); + updatedGenotypes.add(gb.make()); + } + genotypes = updatedGenotypes; + + builder.filter(GATKSVVCFConstants.MULTIALLELIC); + builder.attribute(GATKSVVCFConstants.SVTYPE, GATKSVVCFConstants.CNV); + builder.alleles(Arrays.asList(variant.getReference(), Allele.create("<" + GATKSVVCFConstants.CNV + ">", false))); + } + + return genotypes; + } + + public boolean isCalled(final VariantContextBuilder builder, final List genotypes) { + for (Genotype genotype : genotypes) { + if (!isNoCallGt(genotype.getAlleles())) { + return true; + } + } + + if (builder.getAttributes().getOrDefault(GATKSVVCFConstants.SVTYPE, "").equals(GATKSVVCFConstants.CNV)) { + for (Genotype genotype : genotypes) { + final int cn = Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT, 2).toString()); + if (cn != 2) { + return true; + } + } + } + + return false; + } + + private boolean isNoCallGt(List alleles) { + if (alleles.size() == 1 && alleles.get(0).isReference()) return true; + else if (alleles.size() == 2 && alleles.get(0).isReference() && alleles.get(1).isReference()) return true; + else if (alleles.size() == 1 && alleles.get(0).isNoCall()) return true; + return false; + } +} diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseMultiallelicCnvs.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseMultiallelicCnvs.java new file mode 100644 index 00000000000..3690e28470d --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseMultiallelicCnvs.java @@ -0,0 +1,169 @@ +package org.broadinstitute.hellbender.tools.walkers.sv; + +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.variantcontext.VariantContextBuilder; +import htsjdk.variant.variantcontext.writer.VariantContextWriter; + +import htsjdk.variant.vcf.*; +import org.broadinstitute.barclay.argparser.Argument; +import 
org.broadinstitute.barclay.argparser.BetaFeature; +import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; +import org.broadinstitute.barclay.help.DocumentedFeature; +import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; +import org.broadinstitute.hellbender.cmdline.programgroups.StructuralVariantDiscoveryProgramGroup; +import org.broadinstitute.hellbender.engine.*; +import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants; + +import java.util.List; +import java.util.ArrayList; +import java.util.Set; +import java.util.HashSet; + +/** + * Completes an initial series of cleaning steps for a VCF produced by the GATK-SV pipeline. + * + *

+ * <h3>Inputs</h3>
+ *
+ * <ul>
+ *     <li>
+ *         VCF containing structural variant (SV) records from the GATK-SV pipeline.
+ *     </li>
+ *     <li>
+ *         TODO
+ *     </li>
+ * </ul>
+ *
+ * <h3>Output</h3>
+ *
+ * <ul>
+ *     <li>
+ *         Cleansed VCF.
+ *     </li>
+ * </ul>
+ *
+ * <h3>Usage Example</h3>
+ *
+ * <pre>
+ *     TODO
+ * </pre>
+ *
+ * <h3>Processing Steps</h3>
+ *
+ * <ol>
+ *     <li>
+ *         TODO
+ *     </li>
+ * </ol>
          + */ +@CommandLineProgramProperties( + summary = "Clean and format SV VCF", + oneLineSummary = "Clean and format SV VCF", + programGroup = StructuralVariantDiscoveryProgramGroup.class +) +@BetaFeature +@DocumentedFeature +public class SVReviseMultiallelicCnvs extends MultiplePassVariantWalker { + @Argument( + fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, + shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, + doc = "Output VCF name" + ) + private GATKPath outputVcf; + + private VariantContextWriter vcfWriter; + + private final List overlappingVariantsBuffer = new ArrayList<>(); + private final Set filteredVariantIds = new HashSet<>(); + + @Override + protected int numberOfPasses() { return 2; } + + @Override + protected void afterNthPass(int n) {} + + @Override + public void onTraversalStart() { + final VCFHeader header = getHeaderForVariants(); + header.addMetaDataLine(new VCFFormatHeaderLine(GATKSVVCFConstants.EV, 1, VCFHeaderLineType.String, "Classes of evidence supporting final genotype")); + vcfWriter = createVCFWriter(outputVcf); + vcfWriter.writeHeader(header); + } + + @Override + public void closeTool() { + if (vcfWriter != null) { + vcfWriter.close(); + } + } + + @Override + protected void nthPassApply(final VariantContext variant, final ReadsContext readsContext, final ReferenceContext referenceContext, final FeatureContext featureContext, int n) { + switch (n) { + case 0: + firstPassApply(variant); + break; + case 1: + secondPassApply(variant); + break; + } + } + + public void firstPassApply(final VariantContext variant) { + if (!variant.getFilters().contains(GATKSVVCFConstants.MULTIALLELIC)) { + return; + } + + overlappingVariantsBuffer.removeIf(vc -> !vc.getContig().equals(variant.getContig()) + || (vc.getStart() + vc.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)) < variant.getStart()); + for (VariantContext bufferedVariant : overlappingVariantsBuffer) { + if (overlaps(bufferedVariant, variant)) { + 
processVariantPair(bufferedVariant, variant); + } + } + overlappingVariantsBuffer.add(variant); + } + + public void secondPassApply(final VariantContext variant) { + if (filteredVariantIds.contains(variant.getID())) { + return; + } + + VariantContextBuilder builder = new VariantContextBuilder(variant); + vcfWriter.add(builder.make()); + } + + private void processVariantPair(VariantContext v1, VariantContext v2) { + // Determine larger variant + VariantContext largerVariant = v1; + VariantContext smallerVariant = v2; + int length1 = v1.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0); + int length2 = v2.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0); + int smallerLength = length2; + + // Swap variants if necessary + if (length2 > length1) { + largerVariant = v2; + smallerVariant = v1; + smallerLength = length1; + } + + // Calculate overlap + int minEnd = Math.min( + largerVariant.getStart() + largerVariant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0), + smallerVariant.getStart() + smallerVariant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0) + ); + int maxStart = Math.max(largerVariant.getStart(), smallerVariant.getStart()); + int overlapLength = minEnd - maxStart + 1; + if (overlapLength <= 0) { + return; + } + + // Filter variant based on conditions + double coverage = (double) overlapLength / smallerLength; + if (coverage > 0.5 && !filteredVariantIds.contains(largerVariant.getID())) { + filteredVariantIds.add(smallerVariant.getID()); + } + } + + private boolean overlaps(final VariantContext v1, final VariantContext v2) { + return v1.getContig().equals(v2.getContig()) + && v1.getStart() <= (v2.getStart() + v2.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)) + && v2.getStart() <= (v1.getStart() + v1.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)); + } +} diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseOverlappingCNV.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseOverlappingCNV.java deleted file mode 
100644 index 92f3412f6ee..00000000000 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseOverlappingCNV.java +++ /dev/null @@ -1,4 +0,0 @@ -package org.broadinstitute.hellbender.tools.walkers.sv; - -public class SVReviseOverlappingCNV { -} diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseOverlappingCnvCns.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseOverlappingCnvCns.java new file mode 100644 index 00000000000..219647e5467 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseOverlappingCnvCns.java @@ -0,0 +1,337 @@ +package org.broadinstitute.hellbender.tools.walkers.sv; + +import htsjdk.variant.variantcontext.Genotype; +import htsjdk.variant.variantcontext.GenotypeBuilder; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.variantcontext.VariantContextBuilder; +import htsjdk.variant.variantcontext.writer.VariantContextWriter; +import htsjdk.variant.vcf.VCFHeader; +import htsjdk.variant.vcf.VCFHeaderLineType; +import htsjdk.variant.vcf.VCFInfoHeaderLine; +import org.apache.commons.lang3.tuple.ImmutablePair; +import org.apache.commons.lang3.tuple.Pair; +import org.broadinstitute.barclay.argparser.Argument; +import org.broadinstitute.barclay.argparser.BetaFeature; +import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; +import org.broadinstitute.barclay.help.DocumentedFeature; +import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; +import org.broadinstitute.hellbender.cmdline.programgroups.StructuralVariantDiscoveryProgramGroup; +import org.broadinstitute.hellbender.engine.*; +import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants; + +import java.util.*; + +/** + * Completes a series of cleaning steps for a VCF produced by the GATK-SV pipeline. + * + *

+ * <h3>Inputs</h3>
+ *
+ * <ul>
+ *     <li>
+ *         TODO
+ *     </li>
+ * </ul>
+ *
+ * <h3>Output</h3>
+ *
+ * <ul>
+ *     <li>
+ *         TODO
+ *     </li>
+ * </ul>
+ *
+ * <h3>Usage Example</h3>
+ *
+ * <pre>
+ *     TODO
+ * </pre>
+ *
+ * <h3>Processing Steps</h3>
+ *
+ * <ol>
+ *     <li>
+ *         TODO
+ *     </li>
+ * </ol>
          + */ +@CommandLineProgramProperties( + summary = "Clean and format SV VCF", + oneLineSummary = "Clean and format SV VCF", + programGroup = StructuralVariantDiscoveryProgramGroup.class +) +@BetaFeature +@DocumentedFeature +public class SVReviseOverlappingCnvCns extends MultiplePassVariantWalker { + @Argument( + fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, + shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, + doc = "Output VCF name" + ) + private GATKPath outputVcf; + + private VariantContextWriter vcfWriter; + + // Data structures to hold accumulated data across variants + private static final List overlappingVariantsBuffer = new ArrayList<>(); + + private static final Map> abnormalRdCn = new HashMap<>(); + private static final Map> revisedCopyNumbers = new HashMap<>(); + private static final Set revisedComplete = new HashSet<>(); + + private static final int MIN_VARIANT_SIZE = 5000; + + @Override + public void onTraversalStart() { + vcfWriter = createVCFWriter(outputVcf); + final VCFHeader header = getHeaderForVariants(); + header.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.MULTI_CNV, 0, VCFHeaderLineType.Flag, "Variant is a multiallelic CNV")); + vcfWriter.writeHeader(header); + } + + @Override + public void closeTool() { + if (vcfWriter != null) { + vcfWriter.close(); + } + } + + @Override + protected int numberOfPasses() { return 2; } + + @Override + protected void afterNthPass(int n) {} + + @Override + protected void nthPassApply(VariantContext variant, ReadsContext readsContext, ReferenceContext referenceContext, FeatureContext featureContext, int n) { + switch (n) { + case 0: + firstPassApply(variant); + break; + case 1: + secondPassApply(variant); + break; + } + } + + public void firstPassApply(final VariantContext variant) { + // Skip if not expected SVTYPE or below SVLEN threshold + if (!isDelDup(variant) || !isLarge(variant, MIN_VARIANT_SIZE)) { + return; + } + + // Flag sample as having an abnormal copy number 
if it passes certain conditions + for (String sample : variant.getSampleNames()) { + Genotype genotype = variant.getGenotype(sample); + int rdCn = Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN).toString()); + + String svType = variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); + if ((svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL) && rdCn < 2) || (svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DUP) && rdCn > 2)) { + abnormalRdCn.computeIfAbsent(variant.getID(), k -> new HashSet<>()).add(sample); + } + } + + // Process overlaps with variants in the buffer + overlappingVariantsBuffer.removeIf(vc -> !vc.getContig().equals(variant.getContig()) + || (vc.getStart() + vc.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)) < variant.getStart()); + for (VariantContext bufferedVariant : overlappingVariantsBuffer) { + if (overlaps(variant, bufferedVariant)) { + adjustCopyNumber(variant, bufferedVariant); + } + } + overlappingVariantsBuffer.add(variant); + } + + public void secondPassApply(final VariantContext variant) { + VariantContextBuilder builder = new VariantContextBuilder(variant); + if (revisedCopyNumbers.containsKey(variant.getID())) { + processRevisedCn(builder, variant); + } + vcfWriter.add(builder.make()); + } + + private void adjustCopyNumber(final VariantContext v1, final VariantContext v2) { + // Determine larger variant + VariantContext largerVariant = v1; + VariantContext smallerVariant = v2; + int length1 = v1.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0); + int length2 = v2.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0); + int largerLength = length1; + int smallerLength = length2; + + // Swap variants if necessary + if (length2 > length1) { + largerVariant = v2; + smallerVariant = v1; + largerLength = length2; + smallerLength = length1; + } + + // Get variant attributes + String variantId1 = largerVariant.getID(); + String variantId2 = smallerVariant.getID(); + Map variantRdCn1 = getRdCnForVariant(largerVariant); + 
Map variantRdCn2 = getRdCnForVariant(smallerVariant); + Map> variantSupport1 = getSupportForVariant(largerVariant); + Map> variantSupport2 = getSupportForVariant(smallerVariant); + String svType1 = largerVariant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); + String svType2 = smallerVariant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); + + // Calculate overlap + int minEnd = Math.min( + largerVariant.getStart() + largerVariant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0), + smallerVariant.getStart() + smallerVariant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0) + ); + int maxStart = Math.max(largerVariant.getStart(), smallerVariant.getStart()); + int lengthOverlap = minEnd - maxStart + 1; + double overlap1 = (double) lengthOverlap / (double) largerLength; + double overlap2 = (double) lengthOverlap / (double) smallerLength; + + // Get samples with abnormal CN across both variants + Set samples = new HashSet<>(abnormalRdCn.getOrDefault(variantId1, Collections.emptySet())); + samples.retainAll(abnormalRdCn.getOrDefault(variantId2, Collections.emptySet())); + + // Iterate through samples to test against conditions + for (String sample : samples) { + String id1 = variantId1 + "@" + sample; + String id2 = variantId2 + "@" + sample; + if (revisedComplete.contains(id1)) { + continue; + } + + // Initialize variables for evaluation + int rdCn1 = revisedCopyNumbers.getOrDefault(variantId1, Collections.emptyMap()).getOrDefault(sample, variantRdCn1.get(sample)); + int rdCn2 = revisedCopyNumbers.getOrDefault(variantId2, Collections.emptyMap()).getOrDefault(sample, variantRdCn2.get(sample)); + Set support1 = variantSupport1.get(sample); + Set support2 = variantSupport2.get(sample); + Genotype genotype2 = smallerVariant.getGenotype(sample); + + // Condition 1: Smaller depth call is being driven by larger call + if (support1.contains(GATKSVVCFConstants.EV_VALUES.get(1)) && support1.size() > 1 + && 
support2.equals(Collections.singleton(GATKSVVCFConstants.EV_VALUES.get(1))) + && overlap2 > 0.5 && !largerVariant.hasAttribute(GATKSVVCFConstants.MULTI_CNV)) { + if (rdCn1 == 0) { + makeRevision(id2, rdCn2 + 2); + } else if (rdCn1 == 1) { + makeRevision(id2, rdCn2 + rdCn1); + } else if (rdCn1 > 1) { + int newCN = rdCn2 - rdCn1 + 2; + newCN = Math.max(newCN, 0); + makeRevision(id2, newCN); + } + } + + // Condition 2: Smaller CNV is driven by larger CNV genotype + else if (support1.equals(Collections.singleton(GATKSVVCFConstants.EV_VALUES.get(1))) + && support2.contains(GATKSVVCFConstants.EV_VALUES.get(1)) && support2.size() > 1 + && overlap1 > 0.5 && overlap2 > 0.5 && !smallerVariant.hasAttribute(GATKSVVCFConstants.MULTI_CNV) + && !genotype2.isHomRef()) { + if (rdCn2 == 0) { + makeRevision(id1, rdCn1 + 2); + } else if (rdCn2 == 1) { + makeRevision(id1, rdCn1 + rdCn2); + } else if (rdCn2 > 1) { + int newCN = rdCn1 - rdCn2 + 2; + newCN = Math.max(newCN, 0); + makeRevision(id1, newCN); + } + } + + // Condition 3: Depth-only calls where smaller call is driven by larger call + else if (support1.equals(Collections.singleton(GATKSVVCFConstants.EV_VALUES.get(1))) + && support2.equals(Collections.singleton(GATKSVVCFConstants.EV_VALUES.get(1))) + && overlap2 > 0.5 && !largerVariant.hasAttribute(GATKSVVCFConstants.MULTI_CNV) && svType1.equals(svType2)) { + if (rdCn1 == 0 && rdCn1 != rdCn2) { + makeRevision(id2, rdCn2 + 2); + } else if (rdCn1 == 1 && rdCn1 > rdCn2) { + makeRevision(id2, 1); + } else if (rdCn1 > 1 && rdCn1 < rdCn2) { + makeRevision(id2, Math.max(rdCn2 - rdCn1 + 2, 0)); + } else { + makeRevision(id2, 2); + } + } + + // Condition 4: Any other time a larger call drives a smaller call + else if (support1.contains(GATKSVVCFConstants.EV_VALUES.get(1)) + && overlap2 > 0.5 && !largerVariant.hasAttribute(GATKSVVCFConstants.MULTI_CNV) && largerLength > MIN_VARIANT_SIZE) { + if (rdCn1 == 0) { + makeRevision(id2, rdCn2 + 2); + } else if (rdCn1 == 1) { + makeRevision(id2, 
rdCn2 + rdCn1); + } else if (rdCn1 > 1) { + int newCN = rdCn2 - rdCn1 + 2; + newCN = Math.max(newCN, 0); + makeRevision(id2, newCN); + } + } + } + } + + private void processRevisedCn(final VariantContextBuilder builder, final VariantContext variant) { + // Initialize data structures + final String variantID = variant.getID(); + List genotypes = builder.getGenotypes(); + List updatedGenotypes = new ArrayList<>(genotypes.size()); + + // Replace revised alleles and copy numbers + for (Genotype genotype : genotypes) { + String sampleName = genotype.getSampleName(); + if (revisedCopyNumbers.get(variantID).containsKey(sampleName)) { + GenotypeBuilder gb = new GenotypeBuilder(genotype); + gb.alleles(Arrays.asList(variant.getReference(), variant.getAlternateAllele(0))); + gb.attribute(GATKSVVCFConstants.RD_CN, revisedCopyNumbers.get(variantID).get(sampleName)); + updatedGenotypes.add(gb.make()); + } else { + updatedGenotypes.add(genotype); + } + } + builder.genotypes(updatedGenotypes); + } + + private void makeRevision(final String id, final int val) { + String[] tokens = id.split("@"); + String variantId = tokens[0]; + String sample = tokens[1]; + revisedCopyNumbers.computeIfAbsent(variantId, k -> new HashMap<>()).put(sample, val); + if (val == 2) { + revisedComplete.add(id); + } + } + + private Map> getSupportForVariant(final VariantContext variant) { + Map> supportMap = new HashMap<>(); + for (String sample : variant.getSampleNames()) { + Genotype genotype = variant.getGenotype(sample); + String supportStr = genotype.hasExtendedAttribute(GATKSVVCFConstants.EV) ? 
genotype.getExtendedAttribute(GATKSVVCFConstants.EV).toString() : ""; + Set supportSet = new HashSet<>(); + if (!supportStr.isEmpty()) { + supportSet.addAll(Arrays.asList(supportStr.split(","))); + } + supportMap.put(sample, supportSet); + } + return supportMap; + } + + private Map getRdCnForVariant(final VariantContext variant) { + Map rdCnMap = new HashMap<>(); + for (String sample : variant.getSampleNames()) { + Genotype genotype = variant.getGenotype(sample); + if (genotype.hasExtendedAttribute(GATKSVVCFConstants.RD_CN)) { + rdCnMap.put(sample, Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN).toString())); + } + } + return rdCnMap; + } + + private boolean isDelDup(final VariantContext variant) { + String svType = variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); + return svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL) || svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DUP); + } + + private boolean isLarge(final VariantContext variant, final int minSize) { + int variantLength = Math.abs(variant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)); + return variantLength >= minSize; + } + + private boolean overlaps(final VariantContext v1, final VariantContext v2) { + return v1.getContig().equals(v2.getContig()) + && v1.getStart() <= (v2.getStart() + v2.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)) + && v2.getStart() <= (v1.getStart() + v1.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)); + } +} \ No newline at end of file diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseOverlappingCnvGts.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseOverlappingCnvGts.java new file mode 100644 index 00000000000..a2a392a1a8b --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseOverlappingCnvGts.java @@ -0,0 +1,339 @@ +package org.broadinstitute.hellbender.tools.walkers.sv; + +import htsjdk.variant.variantcontext.Genotype; +import 
htsjdk.variant.variantcontext.GenotypeBuilder; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.variantcontext.VariantContextBuilder; +import htsjdk.variant.variantcontext.writer.VariantContextWriter; +import htsjdk.variant.vcf.VCFHeader; +import htsjdk.variant.vcf.VCFHeaderLineType; +import htsjdk.variant.vcf.VCFInfoHeaderLine; + +import org.apache.commons.lang3.tuple.ImmutablePair; +import org.broadinstitute.barclay.argparser.Argument; +import org.broadinstitute.barclay.argparser.BetaFeature; +import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; +import org.broadinstitute.barclay.help.DocumentedFeature; +import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; +import org.broadinstitute.hellbender.cmdline.programgroups.StructuralVariantDiscoveryProgramGroup; +import org.broadinstitute.hellbender.engine.*; +import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants; + +import java.util.Arrays; +import java.util.List; +import java.util.ArrayList; +import java.util.Set; +import java.util.Map; +import java.util.HashSet; +import java.util.HashMap; +import java.util.Collections; +import org.apache.commons.lang3.tuple.Pair; + +/** + * Completes a series of cleaning steps for a VCF produced by the GATK-SV pipeline. + * + *

+ * <h3>Inputs</h3>
+ *
+ * <ul>
+ *     <li>
+ *         TODO
+ *     </li>
+ * </ul>
+ *
+ * <h3>Output</h3>
+ *
+ * <ul>
+ *     <li>
+ *         TODO
+ *     </li>
+ * </ul>
+ *
+ * <h3>Usage Example</h3>
+ *
+ * <pre>
+ *     TODO
+ * </pre>
+ *
+ * <h3>Processing Steps</h3>
+ *
+ * <ol>
+ *     <li>
+ *         TODO
+ *     </li>
+ * </ol>
          + */ +@CommandLineProgramProperties( + summary = "Clean and format SV VCF", + oneLineSummary = "Clean and format SV VCF", + programGroup = StructuralVariantDiscoveryProgramGroup.class +) +@BetaFeature +@DocumentedFeature +public class SVReviseOverlappingCnvGts extends MultiplePassVariantWalker { + @Argument( + fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, + shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, + doc = "Output VCF name" + ) + private GATKPath outputVcf; + + private VariantContextWriter vcfWriter; + + // Data structures to hold accumulated data across variants + private static final List overlappingVariantsBuffer = new ArrayList<>(); + + private static final Map>> revisedEventsAll = new HashMap<>(); + private static final Map> revisedEventsFiltered = new HashMap<>(); + private static final Map> currentCopyNumbers = new HashMap<>(); + + private static final int MIN_VARIANT_SIZE = 5000; + + @Override + protected int numberOfPasses() { return 3; } + + @Override + protected void afterNthPass(final int n) { + if (n == 0) { + processCollectedVariants(); + } + } + + @Override + public void onTraversalStart() { + vcfWriter = createVCFWriter(outputVcf); + final VCFHeader header = getHeaderForVariants(); + header.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.MULTI_CNV, 0, VCFHeaderLineType.Flag, "Variant is a multiallelic CNV")); + vcfWriter.writeHeader(header); + } + + @Override + public void closeTool() { + if (vcfWriter != null) { + vcfWriter.close(); + } + } + + @Override + protected void nthPassApply(final VariantContext variant, final ReadsContext readsContext, + final ReferenceContext referenceContext, final FeatureContext featureContext, final int n) { + switch (n) { + case 0: + firstPassApply(variant); + break; + case 1: + secondPassApply(variant); + break; + case 2: + thirdPassApply(variant); + break; + } + } + + public void firstPassApply(final VariantContext variant) { + if (!isDelDup(variant) || 
!isLarge(variant, MIN_VARIANT_SIZE)) { + return; + } + + // Process overlaps with variants in the buffer + overlappingVariantsBuffer.removeIf(vc -> !vc.getContig().equals(variant.getContig()) + || (vc.getStart() + vc.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)) < variant.getStart()); + for (VariantContext bufferedVariant : overlappingVariantsBuffer) { + if (overlaps(bufferedVariant, variant)) { + processOverlap(bufferedVariant, variant); + } + } + overlappingVariantsBuffer.add(variant); + } + + public void secondPassApply(final VariantContext variant) { + if (!revisedEventsFiltered.containsKey(variant.getID())) { + return; + } + + // Initialize data structures + final String variantId = variant.getID(); + final Set samples = revisedEventsFiltered.get(variantId); + final Map variantRdCn = new HashMap<>(); + + // Initialize revisedRdCn value for each variant + for (final String sampleName : samples) { + final Genotype genotype = variant.getGenotype(sampleName); + final String rdCn = (String) genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN); + variantRdCn.put(sampleName, Integer.parseInt(rdCn)); + } + currentCopyNumbers.put(variantId, variantRdCn); + } + + public void thirdPassApply(final VariantContext variant) { + VariantContextBuilder builder = new VariantContextBuilder(variant); + if (revisedEventsAll.containsKey(variant.getID())) { + processRevisedEvent(builder, variant); + } + if (isDelDup(variant)) { + processCnvs(builder, variant); + } + vcfWriter.add(builder.make()); + } + + private void processCollectedVariants() { + // Prune variant-sample pairs we need RD_CN values for + for (final Map.Entry>> entry : revisedEventsAll.entrySet()) { + for (final Map.Entry> innerEntry : entry.getValue().entrySet()) { + final String sampleName = innerEntry.getKey(); + final String variantId = entry.getKey(); + final String widerVariantId = innerEntry.getValue().getLeft(); + final String svType = innerEntry.getValue().getRight(); + if 
(svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DUP) || svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL)) { + revisedEventsFiltered.computeIfAbsent(variantId, k -> new HashSet<>()).add(sampleName); + revisedEventsFiltered.computeIfAbsent(widerVariantId, k -> new HashSet<>()).add(sampleName); + } + } + } + } + + private void processOverlap(final VariantContext v1, final VariantContext v2) { + // Get overlap data + VariantContext wider; + VariantContext narrower; + final int length1 = v1.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0); + final int length2 = v2.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0); + if (length1 > length2) { + wider = v1; + narrower = v2; + } else if (length2 > length1) { + wider = v2; + narrower = v1; + } else { + return; + } + String widerID = wider.getID(); + String narrowerID = narrower.getID(); + + // Skip processing if same variant ID, SV type or samples + String widerSvType = wider.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); + String narrowerSvType = narrower.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); + Set widerSamples = getNonReferenceSamples(wider); + Set narrowerSamples = getNonReferenceSamples(narrower); + if (widerID.equals(narrowerID) || widerSvType.equals(narrowerSvType) || widerSamples.equals(narrowerSamples)) { + return; + } + + // Get samples present in wider but not in narrower + Set nonCommonSamples = new HashSet<>(widerSamples); + nonCommonSamples.removeAll(narrowerSamples); + if (nonCommonSamples.isEmpty()) { + return; + } + + // Revise variant if coverage exceeds threshold + double coverage = getCoverage(wider, narrower); + if (coverage >= 0.5) { + for (String sample : nonCommonSamples) { + revisedEventsAll.computeIfAbsent(narrowerID, k -> new HashMap<>()) + .put(sample, new ImmutablePair<>(widerID, widerSvType)); + } + } + } + + private void processRevisedEvent(final VariantContextBuilder builder, final VariantContext variant) { + // Initialize data structures + final String variantId = 
variant.getID(); + final Map> variantEvents = revisedEventsAll.get(variantId); + final List newGenotypes = new ArrayList<>(); + + // Create updated genotypes + for (String sample : variant.getSampleNamesOrderedByName()) { + final Genotype oldGenotype = variant.getGenotype(sample); + final Pair event = variantEvents.get(sample); + + if (event != null) { + final String widerVariantId = event.getLeft(); + final String widerSvType = event.getRight(); + final int currentRdCn = currentCopyNumbers.get(variantId).getOrDefault(sample, 0); + final int widerRdCn = currentCopyNumbers.getOrDefault(widerVariantId, new HashMap<>()).getOrDefault(sample, 0); + + int newVal = -1; + if (widerSvType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DUP) && currentRdCn == 2 && widerRdCn == 3) { + newVal = 1; + } else if (widerSvType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL) && currentRdCn == 2 && widerRdCn == 1) { + newVal = 3; + } + + if (newVal != -1) { + final GenotypeBuilder gb = new GenotypeBuilder(oldGenotype); + gb.alleles(Arrays.asList(variant.getReference(), variant.getAlternateAllele(0))); + gb.GQ(Integer.parseInt((String) oldGenotype.getExtendedAttribute(GATKSVVCFConstants.RD_GQ))); + newGenotypes.add(gb.make()); + } else { + newGenotypes.add(oldGenotype); + } + } else { + newGenotypes.add(oldGenotype); + } + } + builder.genotypes(newGenotypes); + } + + private void processCnvs(final VariantContextBuilder builder, final VariantContext variant) { + final boolean isDel = variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, "").equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL); + for (String sample : variant.getSampleNamesOrderedByName()) { + final Genotype genotype = variant.getGenotype(sample); + final String rdCnString = (String) genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN); + final int rdCn = Integer.parseInt(rdCnString); + if ((isDel && rdCn > 3) || (!isDel && (rdCn < 1 || rdCn > 4))) { + builder.attribute(GATKSVVCFConstants.MULTI_CNV, true); + break; + } + } + } + 
+ private Map> getSupportForVariant(final VariantContext variant) { + Map> supportMap = new HashMap<>(); + for (String sample : variant.getSampleNames()) { + Genotype genotype = variant.getGenotype(sample); + String supportStr = genotype.hasExtendedAttribute(GATKSVVCFConstants.EV) ? genotype.getExtendedAttribute(GATKSVVCFConstants.EV).toString() : ""; + Set supportSet = new HashSet<>(); + if (!supportStr.isEmpty()) { + supportSet.addAll(Arrays.asList(supportStr.split(","))); + } + supportMap.put(sample, supportSet); + } + return supportMap; + } + + private boolean isDelDup(final VariantContext variant) { + String svType = variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); + return svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL) || svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DUP); + } + + private boolean isLarge(final VariantContext variant, final int minSize) { + int variantLength = Math.abs(variant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)); + return variantLength >= minSize; + } + + private boolean overlaps(final VariantContext v1, final VariantContext v2) { + return v1.getContig().equals(v2.getContig()) + && v1.getStart() <= (v2.getStart() + v2.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)) + && v2.getStart() <= (v1.getStart() + v1.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)); + } + + private Set getNonReferenceSamples(final VariantContext variant) { + Set samples = new HashSet<>(); + for (String sampleName : variant.getSampleNames()) { + Genotype genotype = variant.getGenotype(sampleName); + if (genotype.isCalled() && !genotype.isHomRef()) { + samples.add(sampleName); + } + } + return samples; + } + + private double getCoverage(final VariantContext wider, final VariantContext narrower) { + int nStart = narrower.getStart(); + int nStop = narrower.getEnd(); + int wStart = wider.getStart(); + int wStop = wider.getEnd(); + + if (wStart <= nStop && nStart <= wStop) { + int intersectionSize = Math.min(nStop, wStop) - Math.max(nStart, 
wStart) + 1; + return (double) intersectionSize / (nStop - nStart + 1); + } + return 0.0; + } +} \ No newline at end of file From 726144d5c04b47370f9b6e1260c7d0c87f40a50e Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Wed, 11 Dec 2024 12:22:43 -0500 Subject: [PATCH 49/58] Updated to include necessary imports only --- .../tools/walkers/sv/SVReviseAbnormalAllosomes.java | 6 +++++- .../tools/walkers/sv/SVReviseLargeCnvs.java | 4 ++-- .../tools/walkers/sv/SVReviseMultiallelicCnvs.java | 4 +++- .../tools/walkers/sv/SVReviseOverlappingCnvCns.java | 12 +++++++++--- .../tools/walkers/sv/SVReviseOverlappingCnvGts.java | 1 - 5 files changed, 19 insertions(+), 8 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseAbnormalAllosomes.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseAbnormalAllosomes.java index 30dcf803201..38f536eee63 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseAbnormalAllosomes.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseAbnormalAllosomes.java @@ -1,7 +1,11 @@ package org.broadinstitute.hellbender.tools.walkers.sv; -import htsjdk.variant.variantcontext.*; +import htsjdk.variant.variantcontext.Genotype; +import htsjdk.variant.variantcontext.GenotypeBuilder; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.variantcontext.VariantContextBuilder; import htsjdk.variant.variantcontext.writer.VariantContextWriter; + import org.broadinstitute.barclay.argparser.Argument; import org.broadinstitute.barclay.argparser.BetaFeature; import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseLargeCnvs.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseLargeCnvs.java index dd1637adfdf..ed8c892a10a 100644 --- 
a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseLargeCnvs.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseLargeCnvs.java @@ -1,7 +1,6 @@ package org.broadinstitute.hellbender.tools.walkers.sv; import htsjdk.variant.variantcontext.Allele; - import htsjdk.variant.variantcontext.Genotype; import htsjdk.variant.variantcontext.GenotypeBuilder; import htsjdk.variant.variantcontext.VariantContext; @@ -10,8 +9,9 @@ import htsjdk.variant.vcf.VCFHeader; import htsjdk.variant.vcf.VCFHeaderLineType; import htsjdk.variant.vcf.VCFInfoHeaderLine; +import htsjdk.variant.vcf.VCFFormatHeaderLine; +import htsjdk.variant.vcf.VCFFilterHeaderLine; -import htsjdk.variant.vcf.*; import org.broadinstitute.barclay.argparser.Argument; import org.broadinstitute.barclay.argparser.BetaFeature; import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseMultiallelicCnvs.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseMultiallelicCnvs.java index 3690e28470d..e2587da32cd 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseMultiallelicCnvs.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseMultiallelicCnvs.java @@ -3,8 +3,10 @@ import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.variantcontext.VariantContextBuilder; import htsjdk.variant.variantcontext.writer.VariantContextWriter; +import htsjdk.variant.vcf.VCFHeader; +import htsjdk.variant.vcf.VCFHeaderLineType; +import htsjdk.variant.vcf.VCFFormatHeaderLine; -import htsjdk.variant.vcf.*; import org.broadinstitute.barclay.argparser.Argument; import org.broadinstitute.barclay.argparser.BetaFeature; import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseOverlappingCnvCns.java 
b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseOverlappingCnvCns.java index 219647e5467..829df188582 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseOverlappingCnvCns.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseOverlappingCnvCns.java @@ -8,8 +8,7 @@ import htsjdk.variant.vcf.VCFHeader; import htsjdk.variant.vcf.VCFHeaderLineType; import htsjdk.variant.vcf.VCFInfoHeaderLine; -import org.apache.commons.lang3.tuple.ImmutablePair; -import org.apache.commons.lang3.tuple.Pair; + import org.broadinstitute.barclay.argparser.Argument; import org.broadinstitute.barclay.argparser.BetaFeature; import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; @@ -19,7 +18,14 @@ import org.broadinstitute.hellbender.engine.*; import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants; -import java.util.*; +import java.util.Arrays; +import java.util.List; +import java.util.ArrayList; +import java.util.Set; +import java.util.Map; +import java.util.HashSet; +import java.util.HashMap; +import java.util.Collections; /** * Completes a series of cleaning steps for a VCF produced by the GATK-SV pipeline. 
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseOverlappingCnvGts.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseOverlappingCnvGts.java index a2a392a1a8b..30f6ea4dc5e 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseOverlappingCnvGts.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseOverlappingCnvGts.java @@ -26,7 +26,6 @@ import java.util.Map; import java.util.HashSet; import java.util.HashMap; -import java.util.Collections; import org.apache.commons.lang3.tuple.Pair; /** From 0ab56ebd439244f0b310fb7d0800771600767981 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Fri, 13 Dec 2024 10:45:08 -0500 Subject: [PATCH 50/58] Updated header writing --- .../tools/walkers/sv/SVReviseMultiallelicCnvs.java | 4 +--- .../walkers/sv/SVReviseOverlappingCnvCns.java | 4 +--- .../walkers/sv/SVReviseOverlappingCnvGts.java | 14 -------------- 3 files changed, 2 insertions(+), 20 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseMultiallelicCnvs.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseMultiallelicCnvs.java index e2587da32cd..2bcdfefc1cb 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseMultiallelicCnvs.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseMultiallelicCnvs.java @@ -81,10 +81,8 @@ protected void afterNthPass(int n) {} @Override public void onTraversalStart() { - final VCFHeader header = getHeaderForVariants(); - header.addMetaDataLine(new VCFFormatHeaderLine(GATKSVVCFConstants.EV, 1, VCFHeaderLineType.String, "Classes of evidence supporting final genotype")); vcfWriter = createVCFWriter(outputVcf); - vcfWriter.writeHeader(header); + vcfWriter.writeHeader(getHeaderForVariants()); } @Override diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseOverlappingCnvCns.java 
b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseOverlappingCnvCns.java index 829df188582..fc6f2219b14 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseOverlappingCnvCns.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseOverlappingCnvCns.java @@ -85,9 +85,7 @@ public class SVReviseOverlappingCnvCns extends MultiplePassVariantWalker { @Override public void onTraversalStart() { vcfWriter = createVCFWriter(outputVcf); - final VCFHeader header = getHeaderForVariants(); - header.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.MULTI_CNV, 0, VCFHeaderLineType.Flag, "Variant is a multiallelic CNV")); - vcfWriter.writeHeader(header); + vcfWriter.writeHeader(getHeaderForVariants()); } @Override diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseOverlappingCnvGts.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseOverlappingCnvGts.java index 30f6ea4dc5e..7ed17a2918e 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseOverlappingCnvGts.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseOverlappingCnvGts.java @@ -282,20 +282,6 @@ private void processCnvs(final VariantContextBuilder builder, final VariantConte } } - private Map> getSupportForVariant(final VariantContext variant) { - Map> supportMap = new HashMap<>(); - for (String sample : variant.getSampleNames()) { - Genotype genotype = variant.getGenotype(sample); - String supportStr = genotype.hasExtendedAttribute(GATKSVVCFConstants.EV) ? 
genotype.getExtendedAttribute(GATKSVVCFConstants.EV).toString() : ""; - Set supportSet = new HashSet<>(); - if (!supportStr.isEmpty()) { - supportSet.addAll(Arrays.asList(supportStr.split(","))); - } - supportMap.put(sample, supportSet); - } - return supportMap; - } - private boolean isDelDup(final VariantContext variant) { String svType = variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); return svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL) || svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DUP); From 8fbd27e8bf2ced361c9125acde74d2f0936c0cd7 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Mon, 13 Jan 2025 10:55:00 -0500 Subject: [PATCH 51/58] Added sex revisions for male GT --- .../spark/sv/utils/GATKSVVCFConstants.java | 21 ++-- .../walkers/sv/SVReviseAbnormalAllosomes.java | 13 ++- .../tools/walkers/sv/SVReviseLargeCnvs.java | 41 ++++--- .../walkers/sv/SVReviseMultiallelicCnvs.java | 22 ++-- .../walkers/sv/SVReviseOverlappingCnvCns.java | 107 +++++++++--------- .../walkers/sv/SVReviseOverlappingCnvGts.java | 58 +++++----- 6 files changed, 130 insertions(+), 132 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java index 3b06d240e19..5e2920f8204 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java @@ -165,35 +165,30 @@ public enum ComplexVariantSubtype { public static final String LOW_QS_SCORE_FILTER_KEY = "LOW_QS"; public static final String FREQUENCY_FILTER_KEY = "FREQ"; - // SVCleanPt1a - public static final String EV = "EV"; - public static final List EV_VALUES = Arrays.asList( - null, "RD", "PE", "RD,PE", "SR", "RD,SR", "PE,SR", "RD,PE,SR" - ); + // CleanVcf public static final String ME = "ME"; public static final String VAR_GQ = "varGQ"; 
public static final String MULTIALLELIC = "MULTIALLELIC"; public static final String UNRESOLVED = "UNRESOLVED"; public static final String HIGH_SR_BACKGROUND = "HIGH_SR_BACKGROUND"; public static final String BOTHSIDES_SUPPORT = "BOTHSIDES_SUPPORT"; + public static final String PESR_GT_OVERDISPERSION = "PESR_GT_OVERDISPERSION"; public static final String REVISED_EVENT = "REVISED_EVENT"; - public static final String RD_CN = "RD_CN"; - - // SVCleanPt1b - public static final String RD_GQ = "RD_GQ"; public static final String MULTI_CNV = "MULTI_CNV"; - // SVCleanPt4 - public static final String PESR_GT_OVERDISPERSION = "PESR_GT_OVERDISPERSION"; + public static final String RD_CN = "RD_CN"; + public static final String RD_GQ = "RD_GQ"; public static final String PE_GT = "PE_GT"; public static final String SR_GT = "SR_GT"; public static final String PE_GQ = "PE_GQ"; public static final String SR_GQ = "SR_GQ"; public static final String CNV = "CNV"; - - // SVCleanPt5 public static final String UNR = "UNR"; public static final String EVENT = "EVENT"; + public static final String EV = "EV"; + public static final List EV_VALUES = Arrays.asList( + null, "RD", "PE", "RD,PE", "SR", "RD,SR", "PE,SR", "RD,PE,SR" + ); public static final Set FILTER_VCF_LINES = new HashSet<>(Arrays.asList( "CIPOS", "CIEND", "RMSSTD", "source", "bcftools", "GATKCommandLine", "#CHROM" )); diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseAbnormalAllosomes.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseAbnormalAllosomes.java index 38f536eee63..fecb8860445 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseAbnormalAllosomes.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseAbnormalAllosomes.java @@ -81,18 +81,19 @@ public void closeTool() { } @Override - public void apply(final VariantContext variant, final ReadsContext readsContext, final ReferenceContext referenceContext, final 
FeatureContext featureContext) { - VariantContextBuilder builder = new VariantContextBuilder(variant); - if (!variant.getAttributeAsBoolean(GATKSVVCFConstants.REVISED_EVENT, false)) { + public void apply(final VariantContext variant, final ReadsContext readsContext, + final ReferenceContext referenceContext, final FeatureContext featureContext) { + final VariantContextBuilder builder = new VariantContextBuilder(variant); + if (variant.getAttributeAsBoolean(GATKSVVCFConstants.REVISED_EVENT, false)) { processRevisedSex(variant, builder); } vcfWriter.add(builder.make()); } private void processRevisedSex(final VariantContext variant, final VariantContextBuilder builder) { - List genotypes = variant.getGenotypes(); - List updatedGenotypes = new ArrayList<>(genotypes.size()); - for (Genotype genotype : genotypes) { + final List genotypes = variant.getGenotypes(); + final List updatedGenotypes = new ArrayList<>(genotypes.size()); + for (final Genotype genotype : genotypes) { if (Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN, 0).toString()) > 0) { int newRdCn = Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN).toString()) - 1; GenotypeBuilder gb = new GenotypeBuilder(genotype); diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseLargeCnvs.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseLargeCnvs.java index ed8c892a10a..508ea310eb4 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseLargeCnvs.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseLargeCnvs.java @@ -134,7 +134,7 @@ public void closeTool() { @Override public void apply(final VariantContext variant, final ReadsContext readsContext, final ReferenceContext referenceContext, final FeatureContext featureContext) { // Initialize data structures - VariantContextBuilder builder = new VariantContextBuilder(variant); + final VariantContextBuilder builder = new 
VariantContextBuilder(variant); List genotypes = variant.getGenotypes(); // Process variants @@ -152,9 +152,9 @@ public void apply(final VariantContext variant, final ReadsContext readsContext, private void processMultiallelic(final VariantContextBuilder builder, final List genotypes) { int numGtOver2 = 0; for (Genotype genotype : genotypes) { - Integer peGt = genotype.hasExtendedAttribute(GATKSVVCFConstants.PE_GT) ? + final Integer peGt = genotype.hasExtendedAttribute(GATKSVVCFConstants.PE_GT) ? Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.PE_GT).toString()) : null; - Integer srGt = genotype.hasExtendedAttribute(GATKSVVCFConstants.SR_GT) ? + final Integer srGt = genotype.hasExtendedAttribute(GATKSVVCFConstants.SR_GT) ? Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.SR_GT).toString()) : null; int gt; if (peGt == null) { @@ -166,9 +166,9 @@ private void processMultiallelic(final VariantContextBuilder builder, final List } else if (peGt == 0) { gt = srGt; } else { - Integer peGq = genotype.hasExtendedAttribute(GATKSVVCFConstants.PE_GQ) ? + final Integer peGq = genotype.hasExtendedAttribute(GATKSVVCFConstants.PE_GQ) ? Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.PE_GQ).toString()) : null; - Integer srGq = genotype.hasExtendedAttribute(GATKSVVCFConstants.SR_GQ) ? + final Integer srGq = genotype.hasExtendedAttribute(GATKSVVCFConstants.SR_GQ) ? 
Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.SR_GQ).toString()) : null; if (peGq != null && srGq != null && peGq >= srGq) { gt = peGt; @@ -177,7 +177,7 @@ private void processMultiallelic(final VariantContextBuilder builder, final List } } if (gt > 2) { - numGtOver2++; + numGtOver2 += 1; } } if (numGtOver2 > maxVF) { @@ -204,7 +204,7 @@ private List processLargeDeletions(final VariantContext variant, final } boolean gt5kbFilter = false; - List allowedAlleleIndices = Arrays.asList(-1, 0, 1); + final List allowedAlleleIndices = Arrays.asList(-1, 0, 1); if (genotypes.stream().anyMatch(g -> g.getAlleles().stream().anyMatch(a -> !allowedAlleleIndices.contains(variant.getAlleleIndex(a))))) { gt5kbFilter = true; } else if (variant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0) >= MIN_MULTIALLELIC_EVENT_SIZE && !multiallelicFilter) { @@ -213,8 +213,8 @@ private List processLargeDeletions(final VariantContext variant, final List updatedGenotypes = new ArrayList<>(genotypes.size()); if (gt5kbFilter) { - for (Genotype genotype : genotypes) { - GenotypeBuilder gb = new GenotypeBuilder(genotype); + for (final Genotype genotype : genotypes) { + final GenotypeBuilder gb = new GenotypeBuilder(genotype); if (!genotype.isNoCall()) { if (genotype.hasGQ() && Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN, 0).toString()) >= 2) { gb.alleles(Arrays.asList(variant.getReference(), variant.getReference())); @@ -231,7 +231,7 @@ private List processLargeDeletions(final VariantContext variant, final updatedGenotypes = new ArrayList<>(genotypes.size()); if (multiallelicFilter) { - for (Genotype genotype : genotypes) { + for (final Genotype genotype : genotypes) { GenotypeBuilder gb = new GenotypeBuilder(genotype); gb.noGQ(); gb.alleles(Arrays.asList(Allele.NO_CALL)); @@ -257,7 +257,7 @@ private List processLargeDuplications(final VariantContext variant, fi boolean multiallelicFilter = false; if (variant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0) 
>= MIN_LARGE_EVENT_SIZE) { Map sampleRdCn = new HashMap<>(); - for (Genotype genotype : genotypes) { + for (final Genotype genotype : genotypes) { if (!outlierSamples.contains(genotype.getSampleName()) && genotype.hasExtendedAttribute(GATKSVVCFConstants.RD_CN)) { sampleRdCn.put(genotype.getSampleName(), Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN).toString())); } @@ -273,7 +273,7 @@ private List processLargeDuplications(final VariantContext variant, fi } boolean gt5kbFilter = false; - List allowedAlleleIndices = Arrays.asList(-1, 0, 1); + final List allowedAlleleIndices = Arrays.asList(-1, 0, 1); if (genotypes.stream().anyMatch(g -> g.getAlleles().stream().anyMatch(a -> !allowedAlleleIndices.contains(variant.getAlleleIndex(a))))) { gt5kbFilter = true; } else if (variant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0) >= MIN_MULTIALLELIC_EVENT_SIZE && !multiallelicFilter) { @@ -282,8 +282,8 @@ private List processLargeDuplications(final VariantContext variant, fi List updatedGenotypes = new ArrayList<>(genotypes.size()); if (gt5kbFilter) { - for (Genotype genotype : genotypes) { - GenotypeBuilder gb = new GenotypeBuilder(genotype); + for (final Genotype genotype : genotypes) { + final GenotypeBuilder gb = new GenotypeBuilder(genotype); if (!genotype.isNoCall()) { if (genotype.hasGQ() && Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN, 3).toString()) <= 2) { gb.alleles(Arrays.asList(variant.getReference(), variant.getReference())); @@ -300,8 +300,8 @@ private List processLargeDuplications(final VariantContext variant, fi updatedGenotypes = new ArrayList<>(genotypes.size()); if (multiallelicFilter) { - for (Genotype genotype : genotypes) { - GenotypeBuilder gb = new GenotypeBuilder(genotype); + for (final Genotype genotype : genotypes) { + final GenotypeBuilder gb = new GenotypeBuilder(genotype); gb.noGQ(); gb.alleles(Arrays.asList(Allele.NO_CALL)); gb.attribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT, 
genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN)); @@ -319,16 +319,15 @@ private List processLargeDuplications(final VariantContext variant, fi } public boolean isCalled(final VariantContextBuilder builder, final List genotypes) { - for (Genotype genotype : genotypes) { + for (final Genotype genotype : genotypes) { if (!isNoCallGt(genotype.getAlleles())) { return true; } } if (builder.getAttributes().getOrDefault(GATKSVVCFConstants.SVTYPE, "").equals(GATKSVVCFConstants.CNV)) { - for (Genotype genotype : genotypes) { - final int cn = Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT, 2).toString()); - if (cn != 2) { + for (final Genotype genotype : genotypes) { + if (Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT, 2).toString()) != 2) { return true; } } @@ -337,7 +336,7 @@ public boolean isCalled(final VariantContextBuilder builder, final List alleles) { + private boolean isNoCallGt(final List alleles) { if (alleles.size() == 1 && alleles.get(0).isReference()) return true; else if (alleles.size() == 2 && alleles.get(0).isReference() && alleles.get(1).isReference()) return true; else if (alleles.size() == 1 && alleles.get(0).isNoCall()) return true; diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseMultiallelicCnvs.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseMultiallelicCnvs.java index 2bcdfefc1cb..8da4fd855fa 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseMultiallelicCnvs.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseMultiallelicCnvs.java @@ -3,9 +3,6 @@ import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.variantcontext.VariantContextBuilder; import htsjdk.variant.variantcontext.writer.VariantContextWriter; -import htsjdk.variant.vcf.VCFHeader; -import htsjdk.variant.vcf.VCFHeaderLineType; -import 
htsjdk.variant.vcf.VCFFormatHeaderLine; import org.broadinstitute.barclay.argparser.Argument; import org.broadinstitute.barclay.argparser.BetaFeature; @@ -93,7 +90,8 @@ public void closeTool() { } @Override - protected void nthPassApply(final VariantContext variant, final ReadsContext readsContext, final ReferenceContext referenceContext, final FeatureContext featureContext, int n) { + protected void nthPassApply(final VariantContext variant, final ReadsContext readsContext, + final ReferenceContext referenceContext, final FeatureContext featureContext, int n) { switch (n) { case 0: firstPassApply(variant); @@ -111,7 +109,7 @@ public void firstPassApply(final VariantContext variant) { overlappingVariantsBuffer.removeIf(vc -> !vc.getContig().equals(variant.getContig()) || (vc.getStart() + vc.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)) < variant.getStart()); - for (VariantContext bufferedVariant : overlappingVariantsBuffer) { + for (final VariantContext bufferedVariant : overlappingVariantsBuffer) { if (overlaps(bufferedVariant, variant)) { processVariantPair(bufferedVariant, variant); } @@ -124,16 +122,16 @@ public void secondPassApply(final VariantContext variant) { return; } - VariantContextBuilder builder = new VariantContextBuilder(variant); + final VariantContextBuilder builder = new VariantContextBuilder(variant); vcfWriter.add(builder.make()); } - private void processVariantPair(VariantContext v1, VariantContext v2) { + private void processVariantPair(final VariantContext v1, final VariantContext v2) { // Determine larger variant VariantContext largerVariant = v1; VariantContext smallerVariant = v2; - int length1 = v1.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0); - int length2 = v2.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0); + final int length1 = v1.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0); + final int length2 = v2.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0); int smallerLength = length2; // Swap variants if necessary @@ -148,14 +146,14 @@ 
private void processVariantPair(VariantContext v1, VariantContext v2) { largerVariant.getStart() + largerVariant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0), smallerVariant.getStart() + smallerVariant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0) ); - int maxStart = Math.max(largerVariant.getStart(), smallerVariant.getStart()); - int overlapLength = minEnd - maxStart + 1; + final int maxStart = Math.max(largerVariant.getStart(), smallerVariant.getStart()); + final int overlapLength = minEnd - maxStart + 1; if (overlapLength <= 0) { return; } // Filter variant based on conditions - double coverage = (double) overlapLength / smallerLength; + final double coverage = (double) overlapLength / smallerLength; if (coverage > 0.5 && !filteredVariantIds.contains(largerVariant.getID())) { filteredVariantIds.add(smallerVariant.getID()); } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseOverlappingCnvCns.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseOverlappingCnvCns.java index fc6f2219b14..f7e607a17a5 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseOverlappingCnvCns.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseOverlappingCnvCns.java @@ -5,9 +5,6 @@ import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.variantcontext.VariantContextBuilder; import htsjdk.variant.variantcontext.writer.VariantContextWriter; -import htsjdk.variant.vcf.VCFHeader; -import htsjdk.variant.vcf.VCFHeaderLineType; -import htsjdk.variant.vcf.VCFInfoHeaderLine; import org.broadinstitute.barclay.argparser.Argument; import org.broadinstitute.barclay.argparser.BetaFeature; @@ -102,7 +99,8 @@ public void closeTool() { protected void afterNthPass(int n) {} @Override - protected void nthPassApply(VariantContext variant, ReadsContext readsContext, ReferenceContext referenceContext, FeatureContext featureContext, int n) { + protected void nthPassApply(final 
VariantContext variant, final ReadsContext readsContext, + final ReferenceContext referenceContext, final FeatureContext featureContext, int n) { switch (n) { case 0: firstPassApply(variant); @@ -120,11 +118,12 @@ public void firstPassApply(final VariantContext variant) { } // Flag sample as having an abnormal copy number if it passes certain conditions - for (String sample : variant.getSampleNames()) { - Genotype genotype = variant.getGenotype(sample); - int rdCn = Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN).toString()); + for (final String sample : variant.getSampleNames()) { + final Genotype genotype = variant.getGenotype(sample); + if (!genotype.hasExtendedAttribute(GATKSVVCFConstants.RD_CN)) continue; - String svType = variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); + final int rdCn = Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN).toString()); + final String svType = variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); if ((svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL) && rdCn < 2) || (svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DUP) && rdCn > 2)) { abnormalRdCn.computeIfAbsent(variant.getID(), k -> new HashSet<>()).add(sample); } @@ -133,7 +132,7 @@ public void firstPassApply(final VariantContext variant) { // Process overlaps with variants in the buffer overlappingVariantsBuffer.removeIf(vc -> !vc.getContig().equals(variant.getContig()) || (vc.getStart() + vc.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)) < variant.getStart()); - for (VariantContext bufferedVariant : overlappingVariantsBuffer) { + for (final VariantContext bufferedVariant : overlappingVariantsBuffer) { if (overlaps(variant, bufferedVariant)) { adjustCopyNumber(variant, bufferedVariant); } @@ -151,10 +150,10 @@ public void secondPassApply(final VariantContext variant) { private void adjustCopyNumber(final VariantContext v1, final VariantContext v2) { // Determine larger variant + final int length1 = 
v1.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0); + final int length2 = v2.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0); VariantContext largerVariant = v1; VariantContext smallerVariant = v2; - int length1 = v1.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0); - int length2 = v2.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0); int largerLength = length1; int smallerLength = length2; @@ -167,43 +166,43 @@ private void adjustCopyNumber(final VariantContext v1, final VariantContext v2) } // Get variant attributes - String variantId1 = largerVariant.getID(); - String variantId2 = smallerVariant.getID(); - Map variantRdCn1 = getRdCnForVariant(largerVariant); - Map variantRdCn2 = getRdCnForVariant(smallerVariant); - Map> variantSupport1 = getSupportForVariant(largerVariant); - Map> variantSupport2 = getSupportForVariant(smallerVariant); - String svType1 = largerVariant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); - String svType2 = smallerVariant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); + final String variantId1 = largerVariant.getID(); + final String variantId2 = smallerVariant.getID(); + final Map variantRdCn1 = getRdCnForVariant(largerVariant); + final Map variantRdCn2 = getRdCnForVariant(smallerVariant); + final Map> variantSupport1 = getSupportForVariant(largerVariant); + final Map> variantSupport2 = getSupportForVariant(smallerVariant); + final String svType1 = largerVariant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); + final String svType2 = smallerVariant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); // Calculate overlap - int minEnd = Math.min( + final int minEnd = Math.min( largerVariant.getStart() + largerVariant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0), smallerVariant.getStart() + smallerVariant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0) ); - int maxStart = Math.max(largerVariant.getStart(), smallerVariant.getStart()); - int lengthOverlap = minEnd - maxStart + 1; - double overlap1 = (double) lengthOverlap / 
(double) largerLength; - double overlap2 = (double) lengthOverlap / (double) smallerLength; + final int maxStart = Math.max(largerVariant.getStart(), smallerVariant.getStart()); + final int lengthOverlap = minEnd - maxStart + 1; + final double overlap1 = (double) lengthOverlap / (double) largerLength; + final double overlap2 = (double) lengthOverlap / (double) smallerLength; // Get samples with abnormal CN across both variants - Set samples = new HashSet<>(abnormalRdCn.getOrDefault(variantId1, Collections.emptySet())); + final Set samples = new HashSet<>(abnormalRdCn.getOrDefault(variantId1, Collections.emptySet())); samples.retainAll(abnormalRdCn.getOrDefault(variantId2, Collections.emptySet())); // Iterate through samples to test against conditions for (String sample : samples) { - String id1 = variantId1 + "@" + sample; - String id2 = variantId2 + "@" + sample; + final String id1 = variantId1 + "@" + sample; + final String id2 = variantId2 + "@" + sample; if (revisedComplete.contains(id1)) { continue; } // Initialize variables for evaluation - int rdCn1 = revisedCopyNumbers.getOrDefault(variantId1, Collections.emptyMap()).getOrDefault(sample, variantRdCn1.get(sample)); - int rdCn2 = revisedCopyNumbers.getOrDefault(variantId2, Collections.emptyMap()).getOrDefault(sample, variantRdCn2.get(sample)); - Set support1 = variantSupport1.get(sample); - Set support2 = variantSupport2.get(sample); - Genotype genotype2 = smallerVariant.getGenotype(sample); + final int rdCn1 = revisedCopyNumbers.getOrDefault(variantId1, Collections.emptyMap()).getOrDefault(sample, variantRdCn1.get(sample)); + final int rdCn2 = revisedCopyNumbers.getOrDefault(variantId2, Collections.emptyMap()).getOrDefault(sample, variantRdCn2.get(sample)); + final Set support1 = variantSupport1.get(sample); + final Set support2 = variantSupport2.get(sample); + final Genotype genotype2 = smallerVariant.getGenotype(sample); // Condition 1: Smaller depth call is being driven by larger call if 
(support1.contains(GATKSVVCFConstants.EV_VALUES.get(1)) && support1.size() > 1 @@ -267,17 +266,27 @@ else if (support1.contains(GATKSVVCFConstants.EV_VALUES.get(1)) } } + private void makeRevision(final String id, final int val) { + final String[] tokens = id.split("@"); + final String variantId = tokens[0]; + final String sample = tokens[1]; + revisedCopyNumbers.computeIfAbsent(variantId, k -> new HashMap<>()).put(sample, val); + if (val == 2) { + revisedComplete.add(id); + } + } + private void processRevisedCn(final VariantContextBuilder builder, final VariantContext variant) { // Initialize data structures final String variantID = variant.getID(); - List genotypes = builder.getGenotypes(); - List updatedGenotypes = new ArrayList<>(genotypes.size()); + final List genotypes = builder.getGenotypes(); + final List updatedGenotypes = new ArrayList<>(genotypes.size()); // Replace revised alleles and copy numbers - for (Genotype genotype : genotypes) { - String sampleName = genotype.getSampleName(); + for (final Genotype genotype : genotypes) { + final String sampleName = genotype.getSampleName(); if (revisedCopyNumbers.get(variantID).containsKey(sampleName)) { - GenotypeBuilder gb = new GenotypeBuilder(genotype); + final GenotypeBuilder gb = new GenotypeBuilder(genotype); gb.alleles(Arrays.asList(variant.getReference(), variant.getAlternateAllele(0))); gb.attribute(GATKSVVCFConstants.RD_CN, revisedCopyNumbers.get(variantID).get(sampleName)); updatedGenotypes.add(gb.make()); @@ -288,22 +297,12 @@ private void processRevisedCn(final VariantContextBuilder builder, final Variant builder.genotypes(updatedGenotypes); } - private void makeRevision(final String id, final int val) { - String[] tokens = id.split("@"); - String variantId = tokens[0]; - String sample = tokens[1]; - revisedCopyNumbers.computeIfAbsent(variantId, k -> new HashMap<>()).put(sample, val); - if (val == 2) { - revisedComplete.add(id); - } - } - private Map> getSupportForVariant(final VariantContext 
variant) { Map> supportMap = new HashMap<>(); for (String sample : variant.getSampleNames()) { - Genotype genotype = variant.getGenotype(sample); - String supportStr = genotype.hasExtendedAttribute(GATKSVVCFConstants.EV) ? genotype.getExtendedAttribute(GATKSVVCFConstants.EV).toString() : ""; - Set supportSet = new HashSet<>(); + final Genotype genotype = variant.getGenotype(sample); + final String supportStr = genotype.hasExtendedAttribute(GATKSVVCFConstants.EV) ? genotype.getExtendedAttribute(GATKSVVCFConstants.EV).toString() : ""; + final Set supportSet = new HashSet<>(); if (!supportStr.isEmpty()) { supportSet.addAll(Arrays.asList(supportStr.split(","))); } @@ -313,9 +312,9 @@ private Map> getSupportForVariant(final VariantContext varia } private Map getRdCnForVariant(final VariantContext variant) { - Map rdCnMap = new HashMap<>(); + final Map rdCnMap = new HashMap<>(); for (String sample : variant.getSampleNames()) { - Genotype genotype = variant.getGenotype(sample); + final Genotype genotype = variant.getGenotype(sample); if (genotype.hasExtendedAttribute(GATKSVVCFConstants.RD_CN)) { rdCnMap.put(sample, Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN).toString())); } @@ -324,12 +323,12 @@ private Map getRdCnForVariant(final VariantContext variant) { } private boolean isDelDup(final VariantContext variant) { - String svType = variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); + final String svType = variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); return svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL) || svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DUP); } private boolean isLarge(final VariantContext variant, final int minSize) { - int variantLength = Math.abs(variant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)); + final int variantLength = Math.abs(variant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)); return variantLength >= minSize; } diff --git 
a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseOverlappingCnvGts.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseOverlappingCnvGts.java index 7ed17a2918e..59798aa3444 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseOverlappingCnvGts.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseOverlappingCnvGts.java @@ -132,7 +132,7 @@ public void firstPassApply(final VariantContext variant) { // Process overlaps with variants in the buffer overlappingVariantsBuffer.removeIf(vc -> !vc.getContig().equals(variant.getContig()) || (vc.getStart() + vc.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)) < variant.getStart()); - for (VariantContext bufferedVariant : overlappingVariantsBuffer) { + for (final VariantContext bufferedVariant : overlappingVariantsBuffer) { if (overlaps(bufferedVariant, variant)) { processOverlap(bufferedVariant, variant); } @@ -153,14 +153,16 @@ public void secondPassApply(final VariantContext variant) { // Initialize revisedRdCn value for each variant for (final String sampleName : samples) { final Genotype genotype = variant.getGenotype(sampleName); - final String rdCn = (String) genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN); - variantRdCn.put(sampleName, Integer.parseInt(rdCn)); + if (!genotype.hasExtendedAttribute(GATKSVVCFConstants.RD_CN)) continue; + + final int rdCn = Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN).toString()); + variantRdCn.put(sampleName, rdCn); } currentCopyNumbers.put(variantId, variantRdCn); } public void thirdPassApply(final VariantContext variant) { - VariantContextBuilder builder = new VariantContextBuilder(variant); + final VariantContextBuilder builder = new VariantContextBuilder(variant); if (revisedEventsAll.containsKey(variant.getID())) { processRevisedEvent(builder, variant); } @@ -201,29 +203,29 @@ private void processOverlap(final VariantContext v1, final VariantContext 
v2) { } else { return; } - String widerID = wider.getID(); - String narrowerID = narrower.getID(); + final String widerID = wider.getID(); + final String narrowerID = narrower.getID(); // Skip processing if same variant ID, SV type or samples - String widerSvType = wider.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); - String narrowerSvType = narrower.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); - Set widerSamples = getNonReferenceSamples(wider); - Set narrowerSamples = getNonReferenceSamples(narrower); + final String widerSvType = wider.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); + final String narrowerSvType = narrower.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); + final Set widerSamples = getNonReferenceSamples(wider); + final Set narrowerSamples = getNonReferenceSamples(narrower); if (widerID.equals(narrowerID) || widerSvType.equals(narrowerSvType) || widerSamples.equals(narrowerSamples)) { return; } // Get samples present in wider but not in narrower - Set nonCommonSamples = new HashSet<>(widerSamples); + final Set nonCommonSamples = new HashSet<>(widerSamples); nonCommonSamples.removeAll(narrowerSamples); if (nonCommonSamples.isEmpty()) { return; } // Revise variant if coverage exceeds threshold - double coverage = getCoverage(wider, narrower); + final double coverage = getCoverage(wider, narrower); if (coverage >= 0.5) { - for (String sample : nonCommonSamples) { + for (final String sample : nonCommonSamples) { revisedEventsAll.computeIfAbsent(narrowerID, k -> new HashMap<>()) .put(sample, new ImmutablePair<>(widerID, widerSvType)); } @@ -257,7 +259,10 @@ private void processRevisedEvent(final VariantContextBuilder builder, final Vari if (newVal != -1) { final GenotypeBuilder gb = new GenotypeBuilder(oldGenotype); gb.alleles(Arrays.asList(variant.getReference(), variant.getAlternateAllele(0))); - gb.GQ(Integer.parseInt((String) oldGenotype.getExtendedAttribute(GATKSVVCFConstants.RD_GQ))); +// if 
(!oldGenotype.hasExtendedAttribute(GATKSVVCFConstants.RD_CN)) continue; + + final int rdCn = Integer.parseInt(oldGenotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN).toString()); + gb.GQ(rdCn); newGenotypes.add(gb.make()); } else { newGenotypes.add(oldGenotype); @@ -273,8 +278,9 @@ private void processCnvs(final VariantContextBuilder builder, final VariantConte final boolean isDel = variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, "").equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL); for (String sample : variant.getSampleNamesOrderedByName()) { final Genotype genotype = variant.getGenotype(sample); - final String rdCnString = (String) genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN); - final int rdCn = Integer.parseInt(rdCnString); + if (!genotype.hasExtendedAttribute(GATKSVVCFConstants.RD_CN)) continue; + + final int rdCn = Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN).toString()); if ((isDel && rdCn > 3) || (!isDel && (rdCn < 1 || rdCn > 4))) { builder.attribute(GATKSVVCFConstants.MULTI_CNV, true); break; @@ -283,12 +289,12 @@ private void processCnvs(final VariantContextBuilder builder, final VariantConte } private boolean isDelDup(final VariantContext variant) { - String svType = variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); + final String svType = variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); return svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL) || svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DUP); } private boolean isLarge(final VariantContext variant, final int minSize) { - int variantLength = Math.abs(variant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)); + final int variantLength = Math.abs(variant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)); return variantLength >= minSize; } @@ -299,9 +305,9 @@ private boolean overlaps(final VariantContext v1, final VariantContext v2) { } private Set getNonReferenceSamples(final VariantContext variant) { - Set samples = new 
HashSet<>(); - for (String sampleName : variant.getSampleNames()) { - Genotype genotype = variant.getGenotype(sampleName); + final Set samples = new HashSet<>(); + for (final String sampleName : variant.getSampleNames()) { + final Genotype genotype = variant.getGenotype(sampleName); if (genotype.isCalled() && !genotype.isHomRef()) { samples.add(sampleName); } @@ -310,13 +316,13 @@ private Set getNonReferenceSamples(final VariantContext variant) { } private double getCoverage(final VariantContext wider, final VariantContext narrower) { - int nStart = narrower.getStart(); - int nStop = narrower.getEnd(); - int wStart = wider.getStart(); - int wStop = wider.getEnd(); + final int nStart = narrower.getStart(); + final int nStop = narrower.getEnd(); + final int wStart = wider.getStart(); + final int wStop = wider.getEnd(); if (wStart <= nStop && nStart <= wStop) { - int intersectionSize = Math.min(nStop, wStop) - Math.max(nStart, wStart) + 1; + final int intersectionSize = Math.min(nStop, wStop) - Math.max(nStart, wStart) + 1; return (double) intersectionSize / (nStop - nStart + 1); } return 0.0; From a40b193b470b067ec3a22a21f393c5d3258e8a14 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Fri, 17 Jan 2025 17:00:14 -0500 Subject: [PATCH 52/58] Concatenated overlapping cnv tasks into one --- .../walkers/sv/SVReviseOverlappingCnvGts.java | 2 +- .../walkers/sv/SVReviseOverlappingCnvs.java | 522 ++++++++++++++++++ 2 files changed, 523 insertions(+), 1 deletion(-) create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseOverlappingCnvs.java diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseOverlappingCnvGts.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseOverlappingCnvGts.java index 59798aa3444..6f8f4eb7dbf 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseOverlappingCnvGts.java +++ 
b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseOverlappingCnvGts.java @@ -259,7 +259,7 @@ private void processRevisedEvent(final VariantContextBuilder builder, final Vari if (newVal != -1) { final GenotypeBuilder gb = new GenotypeBuilder(oldGenotype); gb.alleles(Arrays.asList(variant.getReference(), variant.getAlternateAllele(0))); -// if (!oldGenotype.hasExtendedAttribute(GATKSVVCFConstants.RD_CN)) continue; + if (!oldGenotype.hasExtendedAttribute(GATKSVVCFConstants.RD_CN)) continue; final int rdCn = Integer.parseInt(oldGenotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN).toString()); gb.GQ(rdCn); diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseOverlappingCnvs.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseOverlappingCnvs.java new file mode 100644 index 00000000000..854785626ff --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseOverlappingCnvs.java @@ -0,0 +1,522 @@ +package org.broadinstitute.hellbender.tools.walkers.sv; + +import htsjdk.variant.variantcontext.Genotype; +import htsjdk.variant.variantcontext.GenotypeBuilder; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.variantcontext.VariantContextBuilder; +import htsjdk.variant.variantcontext.writer.VariantContextWriter; +import htsjdk.variant.vcf.VCFHeader; +import htsjdk.variant.vcf.VCFHeaderLineType; +import htsjdk.variant.vcf.VCFInfoHeaderLine; +import org.apache.commons.lang3.tuple.ImmutablePair; +import org.apache.commons.lang3.tuple.Pair; +import org.broadinstitute.barclay.argparser.Argument; +import org.broadinstitute.barclay.argparser.BetaFeature; +import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; +import org.broadinstitute.barclay.help.DocumentedFeature; +import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; +import 
org.broadinstitute.hellbender.cmdline.programgroups.StructuralVariantDiscoveryProgramGroup; +import org.broadinstitute.hellbender.engine.*; +import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants; + +import java.util.*; + +/** + * Completes a series of cleaning steps for a VCF produced by the GATK-SV pipeline. + * + *

+ * <h3>Inputs</h3>
+ * <ul>
+ *     <li>
+ *         TODO
+ *     </li>
+ * </ul>
+ *
+ * <h3>Output</h3>
+ * <ul>
+ *     <li>
+ *         TODO
+ *     </li>
+ * </ul>
+ *
+ * <h3>Usage Example</h3>
+ * <pre>
+ *     TODO
+ * </pre>
+ *
+ * <h3>Processing Steps</h3>
+ * <ol>
+ *     <li>
+ *         TODO
+ *     </li>
+ * </ol>
          + */ +@CommandLineProgramProperties( + summary = "Clean and format SV VCF", + oneLineSummary = "Clean and format SV VCF", + programGroup = StructuralVariantDiscoveryProgramGroup.class +) +@BetaFeature +@DocumentedFeature +public class SVReviseOverlappingCnvs extends MultiplePassVariantWalker { + @Argument( + fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, + shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, + doc = "Output VCF name" + ) + private GATKPath outputVcf; + + private VariantContextWriter vcfWriter; + + // Data structures to hold accumulated data across variants + private static final List overlappingVariantsBuffer = new ArrayList<>(); + + private static final Map>> revisedEventsAll = new HashMap<>(); + private static final Map> revisedEventsFiltered = new HashMap<>(); + private static final Map> currentCopyNumbers = new HashMap<>(); + + private static final Map> abnormalRdCn = new HashMap<>(); + private static final Map> revisedCopyNumbers = new HashMap<>(); + private static final Set revisedComplete = new HashSet<>(); + + private static final Set multiCnvs = new HashSet<>(); + + private static final int MIN_VARIANT_SIZE = 5000; + + @Override + protected int numberOfPasses() { return 3; } + + @Override + protected void afterNthPass(final int n) { + if (n == 0) { + processCollectedVariants(); + } + } + + @Override + public void onTraversalStart() { + vcfWriter = createVCFWriter(outputVcf); + final VCFHeader header = getHeaderForVariants(); + header.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.MULTI_CNV, 0, VCFHeaderLineType.Flag, "Variant is a multiallelic CNV")); + vcfWriter.writeHeader(header); + } + + @Override + public void closeTool() { + if (vcfWriter != null) { + vcfWriter.close(); + } + } + + @Override + protected void nthPassApply(final VariantContext variant, final ReadsContext readsContext, + final ReferenceContext referenceContext, final FeatureContext featureContext, final int n) { + switch (n) { + case 
0: + firstPassApply(variant); + break; + case 1: + secondPassApply(variant); + break; + case 2: + thirdPassApply(variant); + break; + } + } + + public void firstPassApply(final VariantContext variant) { + if (!isDelDup(variant)) { + return; + } + + // Flag variant as being a multiallelic CNV if it passes certain conditions + final boolean isDel = variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, "").equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL); + for (String sample : variant.getSampleNamesOrderedByName()) { + final Genotype genotype = variant.getGenotype(sample); + if (!genotype.hasExtendedAttribute(GATKSVVCFConstants.RD_CN)) continue; + + final int rdCn = Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN).toString()); + if ((isDel && rdCn > 3) || (!isDel && (rdCn < 1 || rdCn > 4))) { + multiCnvs.add(variant.getID()); + break; + } + } + + if (!isLarge(variant, MIN_VARIANT_SIZE)) { + return; + } + + // Flag sample as having an abnormal copy number if it passes certain conditions + for (final String sample : variant.getSampleNames()) { + final Genotype genotype = variant.getGenotype(sample); + if (!genotype.hasExtendedAttribute(GATKSVVCFConstants.RD_CN)) continue; + + final int rdCn = Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN).toString()); + final String svType = variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); + if ((svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL) && rdCn < 2) || (svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DUP) && rdCn > 2)) { + abnormalRdCn.computeIfAbsent(variant.getID(), k -> new HashSet<>()).add(sample); + } + } + + // Process overlaps with variants in the buffer + overlappingVariantsBuffer.removeIf(vc -> !vc.getContig().equals(variant.getContig()) + || (vc.getStart() + vc.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)) < variant.getStart()); + for (final VariantContext bufferedVariant : overlappingVariantsBuffer) { + if (overlaps(bufferedVariant, variant)) { + 
processOverlap(bufferedVariant, variant); + adjustCopyNumber(bufferedVariant, variant); + } + } + overlappingVariantsBuffer.add(variant); + } + + public void secondPassApply(final VariantContext variant) { + if (!revisedEventsFiltered.containsKey(variant.getID())) { + return; + } + + // Initialize data structures + final String variantId = variant.getID(); + final Set samples = revisedEventsFiltered.get(variantId); + final Map variantRdCn = new HashMap<>(); + + // Initialize revisedRdCn value for each variant + for (final String sampleName : samples) { + final Genotype genotype = variant.getGenotype(sampleName); + if (!genotype.hasExtendedAttribute(GATKSVVCFConstants.RD_CN)) continue; + + final int rdCn = Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN).toString()); + variantRdCn.put(sampleName, rdCn); + } + currentCopyNumbers.put(variantId, variantRdCn); + } + + public void thirdPassApply(final VariantContext variant) { + final VariantContextBuilder builder = new VariantContextBuilder(variant); + if (revisedEventsAll.containsKey(variant.getID())) { + processRevisedEvent(builder, variant); + } + if (revisedCopyNumbers.containsKey(variant.getID())) { + processRevisedCn(builder, variant); + } + if (multiCnvs.contains((variant.getID()))) { + builder.attribute(GATKSVVCFConstants.MULTI_CNV, true); + } + vcfWriter.add(builder.make()); + } + + private void processCollectedVariants() { + // Prune variant-sample pairs we need RD_CN values for + for (final Map.Entry>> entry : revisedEventsAll.entrySet()) { + for (final Map.Entry> innerEntry : entry.getValue().entrySet()) { + final String sampleName = innerEntry.getKey(); + final String variantId = entry.getKey(); + final String widerVariantId = innerEntry.getValue().getLeft(); + final String svType = innerEntry.getValue().getRight(); + if (svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DUP) || svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL)) { + revisedEventsFiltered.computeIfAbsent(variantId, 
k -> new HashSet<>()).add(sampleName); + revisedEventsFiltered.computeIfAbsent(widerVariantId, k -> new HashSet<>()).add(sampleName); + } + } + } + } + + private void processOverlap(final VariantContext v1, final VariantContext v2) { + // Get overlap data + VariantContext wider; + VariantContext narrower; + final int length1 = v1.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0); + final int length2 = v2.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0); + if (length1 > length2) { + wider = v1; + narrower = v2; + } else if (length2 > length1) { + wider = v2; + narrower = v1; + } else { + return; + } + final String widerID = wider.getID(); + final String narrowerID = narrower.getID(); + + // Skip processing if same variant ID, SV type or samples + final String widerSvType = wider.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); + final String narrowerSvType = narrower.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); + final Set widerSamples = getNonReferenceSamples(wider); + final Set narrowerSamples = getNonReferenceSamples(narrower); + if (widerID.equals(narrowerID) || widerSvType.equals(narrowerSvType) || widerSamples.equals(narrowerSamples)) { + return; + } + + // Get samples present in wider but not in narrower + final Set nonCommonSamples = new HashSet<>(widerSamples); + nonCommonSamples.removeAll(narrowerSamples); + if (nonCommonSamples.isEmpty()) { + return; + } + + // Revise variant if coverage exceeds threshold + final double coverage = getCoverage(wider, narrower); + if (coverage >= 0.5) { + for (final String sample : nonCommonSamples) { + revisedEventsAll.computeIfAbsent(narrowerID, k -> new HashMap<>()) + .put(sample, new ImmutablePair<>(widerID, widerSvType)); + } + } + } + + private void processRevisedEvent(final VariantContextBuilder builder, final VariantContext variant) { + // Initialize data structures + final String variantId = variant.getID(); + final Map> variantEvents = revisedEventsAll.get(variantId); + final List newGenotypes = new 
ArrayList<>(); + + // Create updated genotypes + for (String sample : variant.getSampleNamesOrderedByName()) { + final Genotype oldGenotype = variant.getGenotype(sample); + final Pair event = variantEvents.get(sample); + + if (event != null) { + final String widerVariantId = event.getLeft(); + final String widerSvType = event.getRight(); + final int currentRdCn = currentCopyNumbers.get(variantId).getOrDefault(sample, 0); + final int widerRdCn = currentCopyNumbers.getOrDefault(widerVariantId, new HashMap<>()).getOrDefault(sample, 0); + + int newVal = -1; + if (widerSvType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DUP) && currentRdCn == 2 && widerRdCn == 3) { + newVal = 1; + } else if (widerSvType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL) && currentRdCn == 2 && widerRdCn == 1) { + newVal = 3; + } + + if (newVal != -1) { + final GenotypeBuilder gb = new GenotypeBuilder(oldGenotype); + gb.alleles(Arrays.asList(variant.getReference(), variant.getAlternateAllele(0))); + if (!oldGenotype.hasExtendedAttribute(GATKSVVCFConstants.RD_CN)) continue; + + final int rdCn = Integer.parseInt(oldGenotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN).toString()); + gb.GQ(rdCn); + newGenotypes.add(gb.make()); + } else { + newGenotypes.add(oldGenotype); + } + } else { + newGenotypes.add(oldGenotype); + } + } + builder.genotypes(newGenotypes); + } + + private void adjustCopyNumber(final VariantContext v1, final VariantContext v2) { + // Determine larger variant + final int length1 = v1.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0); + final int length2 = v2.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0); + VariantContext largerVariant = v1; + VariantContext smallerVariant = v2; + int largerLength = length1; + int smallerLength = length2; + + // Swap variants if necessary + if (length2 > length1) { + largerVariant = v2; + smallerVariant = v1; + largerLength = length2; + smallerLength = length1; + } + + // Get variant attributes + final String variantId1 = largerVariant.getID(); + 
final String variantId2 = smallerVariant.getID(); + final Map variantRdCn1 = getRdCn(largerVariant); + final Map variantRdCn2 = getRdCn(smallerVariant); + final Map> variantSupport1 = getSupport(largerVariant); + final Map> variantSupport2 = getSupport(smallerVariant); + final String svType1 = largerVariant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); + final String svType2 = smallerVariant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); + + // Calculate overlap + final int minEnd = Math.min( + largerVariant.getStart() + largerVariant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0), + smallerVariant.getStart() + smallerVariant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0) + ); + final int maxStart = Math.max(largerVariant.getStart(), smallerVariant.getStart()); + final int lengthOverlap = minEnd - maxStart + 1; + final double overlap1 = (double) lengthOverlap / (double) largerLength; + final double overlap2 = (double) lengthOverlap / (double) smallerLength; + + // Get samples with abnormal CN across both variants + final Set samples = new HashSet<>(abnormalRdCn.getOrDefault(variantId1, Collections.emptySet())); + samples.retainAll(abnormalRdCn.getOrDefault(variantId2, Collections.emptySet())); + + // Iterate through samples to test against conditions + for (String sample : samples) { + final String id1 = variantId1 + "@" + sample; + final String id2 = variantId2 + "@" + sample; + if (revisedComplete.contains(id1)) { + continue; + } + + // Initialize variables for evaluation + final int rdCn1 = revisedCopyNumbers.getOrDefault(variantId1, Collections.emptyMap()).getOrDefault(sample, variantRdCn1.get(sample)); + final int rdCn2 = revisedCopyNumbers.getOrDefault(variantId2, Collections.emptyMap()).getOrDefault(sample, variantRdCn2.get(sample)); + final Set support1 = variantSupport1.get(sample); + final Set support2 = variantSupport2.get(sample); + final Genotype genotype2 = smallerVariant.getGenotype(sample); + + // Condition 1: Smaller depth call is 
being driven by larger call + if (support1.contains(GATKSVVCFConstants.EV_VALUES.get(1)) && support1.size() > 1 + && support2.equals(Collections.singleton(GATKSVVCFConstants.EV_VALUES.get(1))) + && overlap2 > 0.5 && !largerVariant.hasAttribute(GATKSVVCFConstants.MULTI_CNV)) { + if (rdCn1 == 0) { + makeRevision(id2, rdCn2 + 2); + } else if (rdCn1 == 1) { + makeRevision(id2, rdCn2 + rdCn1); + } else if (rdCn1 > 1) { + int newCN = rdCn2 - rdCn1 + 2; + newCN = Math.max(newCN, 0); + makeRevision(id2, newCN); + } + } + + // Condition 2: Smaller CNV is driven by larger CNV genotype + else if (support1.equals(Collections.singleton(GATKSVVCFConstants.EV_VALUES.get(1))) + && support2.contains(GATKSVVCFConstants.EV_VALUES.get(1)) && support2.size() > 1 + && overlap1 > 0.5 && overlap2 > 0.5 && !smallerVariant.hasAttribute(GATKSVVCFConstants.MULTI_CNV) + && !genotype2.isHomRef()) { + if (rdCn2 == 0) { + makeRevision(id1, rdCn1 + 2); + } else if (rdCn2 == 1) { + makeRevision(id1, rdCn1 + rdCn2); + } else if (rdCn2 > 1) { + int newCN = rdCn1 - rdCn2 + 2; + newCN = Math.max(newCN, 0); + makeRevision(id1, newCN); + } + } + + // Condition 3: Depth-only calls where smaller call is driven by larger call + else if (support1.equals(Collections.singleton(GATKSVVCFConstants.EV_VALUES.get(1))) + && support2.equals(Collections.singleton(GATKSVVCFConstants.EV_VALUES.get(1))) + && overlap2 > 0.5 && !largerVariant.hasAttribute(GATKSVVCFConstants.MULTI_CNV) && svType1.equals(svType2)) { + if (rdCn1 == 0 && rdCn1 != rdCn2) { + makeRevision(id2, rdCn2 + 2); + } else if (rdCn1 == 1 && rdCn1 > rdCn2) { + makeRevision(id2, 1); + } else if (rdCn1 > 1 && rdCn1 < rdCn2) { + makeRevision(id2, Math.max(rdCn2 - rdCn1 + 2, 0)); + } else { + makeRevision(id2, 2); + } + } + + // Condition 4: Any other time a larger call drives a smaller call + else if (support1.contains(GATKSVVCFConstants.EV_VALUES.get(1)) + && overlap2 > 0.5 && !largerVariant.hasAttribute(GATKSVVCFConstants.MULTI_CNV) && largerLength > 
MIN_VARIANT_SIZE) { + if (rdCn1 == 0) { + makeRevision(id2, rdCn2 + 2); + } else if (rdCn1 == 1) { + makeRevision(id2, rdCn2 + rdCn1); + } else if (rdCn1 > 1) { + int newCN = rdCn2 - rdCn1 + 2; + newCN = Math.max(newCN, 0); + makeRevision(id2, newCN); + } + } + } + } + + private void makeRevision(final String id, final int val) { + final String[] tokens = id.split("@"); + final String variantId = tokens[0]; + final String sample = tokens[1]; + revisedCopyNumbers.computeIfAbsent(variantId, k -> new HashMap<>()).put(sample, val); + if (val == 2) { + revisedComplete.add(id); + } + } + + private void processRevisedCn(final VariantContextBuilder builder, final VariantContext variant) { + // Initialize data structures + final String variantID = variant.getID(); + final List genotypes = builder.getGenotypes(); + final List updatedGenotypes = new ArrayList<>(genotypes.size()); + + // Replace revised alleles and copy numbers + for (final Genotype genotype : genotypes) { + final String sampleName = genotype.getSampleName(); + if (revisedCopyNumbers.get(variantID).containsKey(sampleName)) { + final GenotypeBuilder gb = new GenotypeBuilder(genotype); + gb.alleles(Arrays.asList(variant.getReference(), variant.getAlternateAllele(0))); + gb.attribute(GATKSVVCFConstants.RD_CN, revisedCopyNumbers.get(variantID).get(sampleName)); + updatedGenotypes.add(gb.make()); + } else { + updatedGenotypes.add(genotype); + } + } + builder.genotypes(updatedGenotypes); + } + + private boolean isDelDup(final VariantContext variant) { + final String svType = variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); + return svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL) || svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DUP); + } + + private boolean isLarge(final VariantContext variant, final int minSize) { + final int variantLength = Math.abs(variant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)); + return variantLength >= minSize; + } + + private boolean overlaps(final VariantContext 
v1, final VariantContext v2) { + return v1.getContig().equals(v2.getContig()) + && v1.getStart() <= (v2.getStart() + v2.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)) + && v2.getStart() <= (v1.getStart() + v1.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)); + } + + private Set getNonReferenceSamples(final VariantContext variant) { + final Set samples = new HashSet<>(); + for (final String sampleName : variant.getSampleNames()) { + final Genotype genotype = variant.getGenotype(sampleName); + if (genotype.isCalled() && !genotype.isHomRef()) { + samples.add(sampleName); + } + } + return samples; + } + + private double getCoverage(final VariantContext wider, final VariantContext narrower) { + final int nStart = narrower.getStart(); + final int nStop = narrower.getEnd(); + final int wStart = wider.getStart(); + final int wStop = wider.getEnd(); + + if (wStart <= nStop && nStart <= wStop) { + final int intersectionSize = Math.min(nStop, wStop) - Math.max(nStart, wStart) + 1; + return (double) intersectionSize / (nStop - nStart + 1); + } + return 0.0; + } + + private Map> getSupport(final VariantContext variant) { + Map> supportMap = new HashMap<>(); + for (String sample : variant.getSampleNames()) { + final Genotype genotype = variant.getGenotype(sample); + final String supportStr = genotype.hasExtendedAttribute(GATKSVVCFConstants.EV) ? 
genotype.getExtendedAttribute(GATKSVVCFConstants.EV).toString() : ""; + final Set supportSet = new HashSet<>(); + if (!supportStr.isEmpty()) { + supportSet.addAll(Arrays.asList(supportStr.split(","))); + } + supportMap.put(sample, supportSet); + } + return supportMap; + } + + private Map getRdCn(final VariantContext variant) { + final Map rdCnMap = new HashMap<>(); + for (String sample : variant.getSampleNames()) { + final Genotype genotype = variant.getGenotype(sample); + if (genotype.hasExtendedAttribute(GATKSVVCFConstants.RD_CN)) { + rdCnMap.put(sample, Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN).toString())); + } + } + return rdCnMap; + } +} \ No newline at end of file From e60fc9cce17731fa90bc13c2655a5bf688fdd869 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Tue, 21 Jan 2025 12:12:27 -0500 Subject: [PATCH 53/58] Bug-fixed to match results from 5-pass version --- .../walkers/sv/SVReviseOverlappingCnvs.java | 112 +++++++++--------- 1 file changed, 58 insertions(+), 54 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseOverlappingCnvs.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseOverlappingCnvs.java index 854785626ff..77dcba57b46 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseOverlappingCnvs.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseOverlappingCnvs.java @@ -163,7 +163,10 @@ public void firstPassApply(final VariantContext variant) { for (final VariantContext bufferedVariant : overlappingVariantsBuffer) { if (overlaps(bufferedVariant, variant)) { processOverlap(bufferedVariant, variant); - adjustCopyNumber(bufferedVariant, variant); + } + + if (overlaps(variant, bufferedVariant)) { + adjustCopyNumber(variant, bufferedVariant); } } overlappingVariantsBuffer.add(variant); @@ -264,48 +267,6 @@ private void processOverlap(final VariantContext v1, final VariantContext v2) { } } - private 
void processRevisedEvent(final VariantContextBuilder builder, final VariantContext variant) { - // Initialize data structures - final String variantId = variant.getID(); - final Map> variantEvents = revisedEventsAll.get(variantId); - final List newGenotypes = new ArrayList<>(); - - // Create updated genotypes - for (String sample : variant.getSampleNamesOrderedByName()) { - final Genotype oldGenotype = variant.getGenotype(sample); - final Pair event = variantEvents.get(sample); - - if (event != null) { - final String widerVariantId = event.getLeft(); - final String widerSvType = event.getRight(); - final int currentRdCn = currentCopyNumbers.get(variantId).getOrDefault(sample, 0); - final int widerRdCn = currentCopyNumbers.getOrDefault(widerVariantId, new HashMap<>()).getOrDefault(sample, 0); - - int newVal = -1; - if (widerSvType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DUP) && currentRdCn == 2 && widerRdCn == 3) { - newVal = 1; - } else if (widerSvType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL) && currentRdCn == 2 && widerRdCn == 1) { - newVal = 3; - } - - if (newVal != -1) { - final GenotypeBuilder gb = new GenotypeBuilder(oldGenotype); - gb.alleles(Arrays.asList(variant.getReference(), variant.getAlternateAllele(0))); - if (!oldGenotype.hasExtendedAttribute(GATKSVVCFConstants.RD_CN)) continue; - - final int rdCn = Integer.parseInt(oldGenotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN).toString()); - gb.GQ(rdCn); - newGenotypes.add(gb.make()); - } else { - newGenotypes.add(oldGenotype); - } - } else { - newGenotypes.add(oldGenotype); - } - } - builder.genotypes(newGenotypes); - } - private void adjustCopyNumber(final VariantContext v1, final VariantContext v2) { // Determine larger variant final int length1 = v1.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0); @@ -365,7 +326,7 @@ private void adjustCopyNumber(final VariantContext v1, final VariantContext v2) // Condition 1: Smaller depth call is being driven by larger call if 
(support1.contains(GATKSVVCFConstants.EV_VALUES.get(1)) && support1.size() > 1 && support2.equals(Collections.singleton(GATKSVVCFConstants.EV_VALUES.get(1))) - && overlap2 > 0.5 && !largerVariant.hasAttribute(GATKSVVCFConstants.MULTI_CNV)) { + && overlap2 > 0.5 && !multiCnvs.contains(variantId1)) { if (rdCn1 == 0) { makeRevision(id2, rdCn2 + 2); } else if (rdCn1 == 1) { @@ -380,7 +341,7 @@ private void adjustCopyNumber(final VariantContext v1, final VariantContext v2) // Condition 2: Smaller CNV is driven by larger CNV genotype else if (support1.equals(Collections.singleton(GATKSVVCFConstants.EV_VALUES.get(1))) && support2.contains(GATKSVVCFConstants.EV_VALUES.get(1)) && support2.size() > 1 - && overlap1 > 0.5 && overlap2 > 0.5 && !smallerVariant.hasAttribute(GATKSVVCFConstants.MULTI_CNV) + && overlap1 > 0.5 && overlap2 > 0.5 && !multiCnvs.contains(variantId2) && !genotype2.isHomRef()) { if (rdCn2 == 0) { makeRevision(id1, rdCn1 + 2); @@ -396,7 +357,7 @@ else if (support1.equals(Collections.singleton(GATKSVVCFConstants.EV_VALUES.get( // Condition 3: Depth-only calls where smaller call is driven by larger call else if (support1.equals(Collections.singleton(GATKSVVCFConstants.EV_VALUES.get(1))) && support2.equals(Collections.singleton(GATKSVVCFConstants.EV_VALUES.get(1))) - && overlap2 > 0.5 && !largerVariant.hasAttribute(GATKSVVCFConstants.MULTI_CNV) && svType1.equals(svType2)) { + && overlap2 > 0.5 && !multiCnvs.contains(variantId1) && svType1.equals(svType2)) { if (rdCn1 == 0 && rdCn1 != rdCn2) { makeRevision(id2, rdCn2 + 2); } else if (rdCn1 == 1 && rdCn1 > rdCn2) { @@ -410,7 +371,7 @@ else if (support1.equals(Collections.singleton(GATKSVVCFConstants.EV_VALUES.get( // Condition 4: Any other time a larger call drives a smaller call else if (support1.contains(GATKSVVCFConstants.EV_VALUES.get(1)) - && overlap2 > 0.5 && !largerVariant.hasAttribute(GATKSVVCFConstants.MULTI_CNV) && largerLength > MIN_VARIANT_SIZE) { + && overlap2 > 0.5 && 
!multiCnvs.contains(variantId1) && largerLength > MIN_VARIANT_SIZE) { if (rdCn1 == 0) { makeRevision(id2, rdCn2 + 2); } else if (rdCn1 == 1) { @@ -424,14 +385,47 @@ else if (support1.contains(GATKSVVCFConstants.EV_VALUES.get(1)) } } - private void makeRevision(final String id, final int val) { - final String[] tokens = id.split("@"); - final String variantId = tokens[0]; - final String sample = tokens[1]; - revisedCopyNumbers.computeIfAbsent(variantId, k -> new HashMap<>()).put(sample, val); - if (val == 2) { - revisedComplete.add(id); + private void processRevisedEvent(final VariantContextBuilder builder, final VariantContext variant) { + // Initialize data structures + final String variantId = variant.getID(); + final Map> variantEvents = revisedEventsAll.get(variantId); + final List genotypes = builder.getGenotypes(); + final List newGenotypes = new ArrayList<>(); + + // Create updated genotypes + for (final Genotype genotype : genotypes) { + final String sampleName = genotype.getSampleName(); + final Pair event = variantEvents.get(sampleName); + + if (event != null) { + final String widerVariantId = event.getLeft(); + final String widerSvType = event.getRight(); + final int currentRdCn = currentCopyNumbers.get(variantId).getOrDefault(sampleName, 0); + final int widerRdCn = currentCopyNumbers.getOrDefault(widerVariantId, new HashMap<>()).getOrDefault(sampleName, 0); + + int newVal = -1; + if (widerSvType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DUP) && currentRdCn == 2 && widerRdCn == 3) { + newVal = 1; + } else if (widerSvType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL) && currentRdCn == 2 && widerRdCn == 1) { + newVal = 3; + } + + if (newVal != -1) { + final GenotypeBuilder gb = new GenotypeBuilder(genotype); + gb.alleles(Arrays.asList(variant.getReference(), variant.getAlternateAllele(0))); + if (!genotype.hasExtendedAttribute(GATKSVVCFConstants.RD_CN)) continue; + + final int rdCn = 
Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN).toString()); + gb.GQ(rdCn); + newGenotypes.add(gb.make()); + } else { + newGenotypes.add(genotype); + } + } else { + newGenotypes.add(genotype); + } } + builder.genotypes(newGenotypes); } private void processRevisedCn(final VariantContextBuilder builder, final VariantContext variant) { @@ -455,6 +449,16 @@ private void processRevisedCn(final VariantContextBuilder builder, final Variant builder.genotypes(updatedGenotypes); } + private void makeRevision(final String id, final int val) { + final String[] tokens = id.split("@"); + final String variantId = tokens[0]; + final String sample = tokens[1]; + revisedCopyNumbers.computeIfAbsent(variantId, k -> new HashMap<>()).put(sample, val); + if (val == 2) { + revisedComplete.add(id); + } + } + private boolean isDelDup(final VariantContext variant) { final String svType = variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); return svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL) || svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DUP); From a4db400bf550ed16e7149df20e31d6e0e6805a71 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Tue, 21 Jan 2025 12:39:42 -0500 Subject: [PATCH 54/58] Standardized overlap logic across OverlappingCnv task methods --- .../walkers/sv/SVReviseOverlappingCnvs.java | 87 ++++++++++--------- 1 file changed, 44 insertions(+), 43 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseOverlappingCnvs.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseOverlappingCnvs.java index 77dcba57b46..fa83194bf9d 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseOverlappingCnvs.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseOverlappingCnvs.java @@ -67,19 +67,22 @@ public class SVReviseOverlappingCnvs extends MultiplePassVariantWalker { private VariantContextWriter vcfWriter; - // Data structures to 
hold accumulated data across variants + // Data structure for overlap detection private static final List overlappingVariantsBuffer = new ArrayList<>(); + // Data structure for multiallelic CNV detection + private static final Set multiCnvs = new HashSet<>(); + + // Data structures for revising genotypes private static final Map>> revisedEventsAll = new HashMap<>(); private static final Map> revisedEventsFiltered = new HashMap<>(); private static final Map> currentCopyNumbers = new HashMap<>(); + // Data structures for revising copy numbers private static final Map> abnormalRdCn = new HashMap<>(); private static final Map> revisedCopyNumbers = new HashMap<>(); private static final Set revisedComplete = new HashSet<>(); - private static final Set multiCnvs = new HashSet<>(); - private static final int MIN_VARIANT_SIZE = 5000; @Override @@ -124,11 +127,12 @@ protected void nthPassApply(final VariantContext variant, final ReadsContext rea } public void firstPassApply(final VariantContext variant) { + // Skip processing if not CNV if (!isDelDup(variant)) { return; } - // Flag variant as being a multiallelic CNV if it passes certain conditions + // Flag variant as being a multiallelic CNV final boolean isDel = variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, "").equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL); for (String sample : variant.getSampleNamesOrderedByName()) { final Genotype genotype = variant.getGenotype(sample); @@ -141,11 +145,12 @@ public void firstPassApply(final VariantContext variant) { } } + // Skip processing if below size threshold if (!isLarge(variant, MIN_VARIANT_SIZE)) { return; } - // Flag sample as having an abnormal copy number if it passes certain conditions + // Flag sample as having an abnormal copy number for (final String sample : variant.getSampleNames()) { final Genotype genotype = variant.getGenotype(sample); if (!genotype.hasExtendedAttribute(GATKSVVCFConstants.RD_CN)) continue; @@ -162,17 +167,15 @@ public void
firstPassApply(final VariantContext variant) { || (vc.getStart() + vc.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)) < variant.getStart()); for (final VariantContext bufferedVariant : overlappingVariantsBuffer) { if (overlaps(bufferedVariant, variant)) { - processOverlap(bufferedVariant, variant); - } - - if (overlaps(variant, bufferedVariant)) { - adjustCopyNumber(variant, bufferedVariant); + processGt(bufferedVariant, variant); + processCn(bufferedVariant, variant); } } overlappingVariantsBuffer.add(variant); } public void secondPassApply(final VariantContext variant) { + // Skip processing if not in revised events map if (!revisedEventsFiltered.containsKey(variant.getID())) { return; } @@ -182,7 +185,7 @@ public void secondPassApply(final VariantContext variant) { final Set samples = revisedEventsFiltered.get(variantId); final Map variantRdCn = new HashMap<>(); - // Initialize revisedRdCn value for each variant + // Initialize revisedRdCn values for each variant for (final String sampleName : samples) { final Genotype genotype = variant.getGenotype(sampleName); if (!genotype.hasExtendedAttribute(GATKSVVCFConstants.RD_CN)) continue; @@ -195,15 +198,22 @@ public void secondPassApply(final VariantContext variant) { public void thirdPassApply(final VariantContext variant) { final VariantContextBuilder builder = new VariantContextBuilder(variant); + + // Revise genotypes if (revisedEventsAll.containsKey(variant.getID())) { - processRevisedEvent(builder, variant); + processRevisedGt(builder, variant); } + + // Revise copy numbers if (revisedCopyNumbers.containsKey(variant.getID())) { processRevisedCn(builder, variant); } + + // Tag multiallelic CNVs if (multiCnvs.contains((variant.getID()))) { builder.attribute(GATKSVVCFConstants.MULTI_CNV, true); } + vcfWriter.add(builder.make()); } @@ -223,30 +233,25 @@ private void processCollectedVariants() { } } - private void processOverlap(final VariantContext v1, final VariantContext v2) { - // Get overlap data - 
VariantContext wider; - VariantContext narrower; + private void processGt(final VariantContext v1, final VariantContext v2) { + // Determine larger variant, swapping if necessary final int length1 = v1.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0); final int length2 = v2.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0); - if (length1 > length2) { - wider = v1; - narrower = v2; - } else if (length2 > length1) { - wider = v2; - narrower = v1; - } else { - return; + VariantContext largerVariant = v1; + VariantContext smallerVariant = v2; + if (length2 > length1) { + largerVariant = v2; + smallerVariant = v1; } - final String widerID = wider.getID(); - final String narrowerID = narrower.getID(); + final String largerID = largerVariant.getID(); + final String smallerID = smallerVariant.getID(); // Skip processing if same variant ID, SV type or samples - final String widerSvType = wider.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); - final String narrowerSvType = narrower.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); - final Set widerSamples = getNonReferenceSamples(wider); - final Set narrowerSamples = getNonReferenceSamples(narrower); - if (widerID.equals(narrowerID) || widerSvType.equals(narrowerSvType) || widerSamples.equals(narrowerSamples)) { + final String widerSvType = largerVariant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); + final String narrowerSvType = smallerVariant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); + final Set widerSamples = getNonReferenceSamples(largerVariant); + final Set narrowerSamples = getNonReferenceSamples(smallerVariant); + if (largerID.equals(smallerID) || widerSvType.equals(narrowerSvType) || widerSamples.equals(narrowerSamples)) { return; } @@ -258,31 +263,27 @@ private void processOverlap(final VariantContext v1, final VariantContext v2) { } // Revise variant if coverage exceeds threshold - final double coverage = getCoverage(wider, narrower); + final double coverage = getCoverage(largerVariant, 
smallerVariant); if (coverage >= 0.5) { for (final String sample : nonCommonSamples) { - revisedEventsAll.computeIfAbsent(narrowerID, k -> new HashMap<>()) - .put(sample, new ImmutablePair<>(widerID, widerSvType)); + revisedEventsAll.computeIfAbsent(smallerID, k -> new HashMap<>()) + .put(sample, new ImmutablePair<>(largerID, widerSvType)); } } } - private void adjustCopyNumber(final VariantContext v1, final VariantContext v2) { - // Determine larger variant + private void processCn(final VariantContext v1, final VariantContext v2) { + // Determine larger variant, swapping if necessary final int length1 = v1.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0); final int length2 = v2.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0); VariantContext largerVariant = v1; VariantContext smallerVariant = v2; - int largerLength = length1; - int smallerLength = length2; - - // Swap variants if necessary if (length2 > length1) { largerVariant = v2; smallerVariant = v1; - largerLength = length2; - smallerLength = length1; } + final int largerLength = largerVariant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0); + final int smallerLength = smallerVariant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0); // Get variant attributes final String variantId1 = largerVariant.getID(); @@ -385,7 +386,7 @@ else if (support1.contains(GATKSVVCFConstants.EV_VALUES.get(1)) } } - private void processRevisedEvent(final VariantContextBuilder builder, final VariantContext variant) { + private void processRevisedGt(final VariantContextBuilder builder, final VariantContext variant) { // Initialize data structures final String variantId = variant.getID(); final Map> variantEvents = revisedEventsAll.get(variantId); From 7f9b22dae5c0a45214432d61a6c882766d8ea453 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Wed, 29 Jan 2025 16:41:38 -0500 Subject: [PATCH 55/58] Used caching to improve runtime --- .../tools/walkers/sv/SVReviseLargeCnvs.java | 14 +- .../walkers/sv/SVReviseMultiallelicCnvs.java | 15 +- 
.../walkers/sv/SVReviseOverlappingCnvs.java | 260 +++++++++++------- 3 files changed, 178 insertions(+), 111 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseLargeCnvs.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseLargeCnvs.java index 508ea310eb4..4e4c83652e7 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseLargeCnvs.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseLargeCnvs.java @@ -151,7 +151,7 @@ public void apply(final VariantContext variant, final ReadsContext readsContext, private void processMultiallelic(final VariantContextBuilder builder, final List genotypes) { int numGtOver2 = 0; - for (Genotype genotype : genotypes) { + for (final Genotype genotype : genotypes) { final Integer peGt = genotype.hasExtendedAttribute(GATKSVVCFConstants.PE_GT) ? Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.PE_GT).toString()) : null; final Integer srGt = genotype.hasExtendedAttribute(GATKSVVCFConstants.SR_GT) ? 
@@ -193,9 +193,10 @@ private List processLargeDeletions(final VariantContext variant, final boolean multiallelicFilter = false; if (variant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0) >= MIN_LARGE_EVENT_SIZE) { Map sampleRdCn = new HashMap<>(); - for (Genotype genotype : genotypes) { - if (!outlierSamples.contains(genotype.getSampleName()) && genotype.hasExtendedAttribute(GATKSVVCFConstants.RD_CN)) { - sampleRdCn.put(genotype.getSampleName(), Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN).toString())); + for (final Genotype genotype : genotypes) { + final String sample = genotype.getSampleName(); + if (!outlierSamples.contains(sample) && genotype.hasExtendedAttribute(GATKSVVCFConstants.RD_CN)) { + sampleRdCn.put(sample, Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN).toString())); } } if (sampleRdCn.values().stream().filter(value -> value > 3).count() > maxVF) { @@ -258,8 +259,9 @@ private List processLargeDuplications(final VariantContext variant, fi if (variant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0) >= MIN_LARGE_EVENT_SIZE) { Map sampleRdCn = new HashMap<>(); for (final Genotype genotype : genotypes) { - if (!outlierSamples.contains(genotype.getSampleName()) && genotype.hasExtendedAttribute(GATKSVVCFConstants.RD_CN)) { - sampleRdCn.put(genotype.getSampleName(), Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN).toString())); + final String sample = genotype.getSampleName(); + if (!outlierSamples.contains(sample) && genotype.hasExtendedAttribute(GATKSVVCFConstants.RD_CN)) { + sampleRdCn.put(sample, Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN).toString())); } } if (sampleRdCn.values().stream().filter(value -> value > 4).count() > maxVF) { diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseMultiallelicCnvs.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseMultiallelicCnvs.java index 
8da4fd855fa..6deb3d5b008 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseMultiallelicCnvs.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseMultiallelicCnvs.java @@ -127,19 +127,16 @@ public void secondPassApply(final VariantContext variant) { } private void processVariantPair(final VariantContext v1, final VariantContext v2) { - // Determine larger variant + // Determine larger variant, swapping if necessary VariantContext largerVariant = v1; VariantContext smallerVariant = v2; final int length1 = v1.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0); final int length2 = v2.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0); - int smallerLength = length2; - - // Swap variants if necessary if (length2 > length1) { largerVariant = v2; smallerVariant = v1; - smallerLength = length1; } + final int smallerLength = smallerVariant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0); // Calculate overlap int minEnd = Math.min( @@ -148,13 +145,15 @@ private void processVariantPair(final VariantContext v1, final VariantContext v2 ); final int maxStart = Math.max(largerVariant.getStart(), smallerVariant.getStart()); final int overlapLength = minEnd - maxStart + 1; - if (overlapLength <= 0) { + + // Skip if insufficient overlap length or coverage + final double coverage = (double) overlapLength / smallerLength; + if (overlapLength <= 0 || coverage <= 0.5) { return; } // Filter variant based on conditions - final double coverage = (double) overlapLength / smallerLength; - if (coverage > 0.5 && !filteredVariantIds.contains(largerVariant.getID())) { + if (!filteredVariantIds.contains(largerVariant.getID())) { filteredVariantIds.add(smallerVariant.getID()); } } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseOverlappingCnvs.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseOverlappingCnvs.java index fa83194bf9d..c747cade3bf 100644 --- 
a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseOverlappingCnvs.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseOverlappingCnvs.java @@ -83,6 +83,11 @@ public class SVReviseOverlappingCnvs extends MultiplePassVariantWalker { private static final Map> revisedCopyNumbers = new HashMap<>(); private static final Set revisedComplete = new HashSet<>(); + // Data structures for cached data + private final Map> nonRefSamplesCache = new HashMap<>(); + private final Map>> supportCache = new HashMap<>(); + private final Map> rdCnCache = new HashMap<>(); + private static final int MIN_VARIANT_SIZE = 5000; @Override @@ -92,6 +97,7 @@ public class SVReviseOverlappingCnvs extends MultiplePassVariantWalker { protected void afterNthPass(final int n) { if (n == 0) { processCollectedVariants(); + clearAllCaches(); } } @@ -134,8 +140,7 @@ public void firstPassApply(final VariantContext variant) { // Flag variant as being a multiallelic CNV final boolean isDel = variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, "").equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL); - for (String sample : variant.getSampleNamesOrderedByName()) { - final Genotype genotype = variant.getGenotype(sample); + for (final Genotype genotype : variant.getGenotypes()) { if (!genotype.hasExtendedAttribute(GATKSVVCFConstants.RD_CN)) continue; final int rdCn = Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN).toString()); @@ -151,20 +156,25 @@ public void firstPassApply(final VariantContext variant) { } // Flag sample as having an abnormal copy number - for (final String sample : variant.getSampleNames()) { - final Genotype genotype = variant.getGenotype(sample); + for (final Genotype genotype : variant.getGenotypes()) { if (!genotype.hasExtendedAttribute(GATKSVVCFConstants.RD_CN)) continue; final int rdCn = Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN).toString()); final String svType = 
variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); if ((svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL) && rdCn < 2) || (svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DUP) && rdCn > 2)) { - abnormalRdCn.computeIfAbsent(variant.getID(), k -> new HashSet<>()).add(sample); + abnormalRdCn.computeIfAbsent(variant.getID(), k -> new HashSet<>()).add(genotype.getSampleName()); } } + // Remove variants not in current context window + overlappingVariantsBuffer.removeIf(overlappingVariant -> { + boolean shouldRemove = !overlappingVariant.getContig().equals(variant.getContig()) + || (overlappingVariant.getStart() + overlappingVariant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)) < variant.getStart(); + if (shouldRemove) removeVariantFromCaches(overlappingVariant.getID()); + return shouldRemove; + }); + // Process overlaps with variants in the buffer - overlappingVariantsBuffer.removeIf(vc -> !vc.getContig().equals(variant.getContig()) - || (vc.getStart() + vc.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)) < variant.getStart()); for (final VariantContext bufferedVariant : overlappingVariantsBuffer) { if (overlaps(bufferedVariant, variant)) { processGt(bufferedVariant, variant); @@ -217,28 +227,12 @@ public void thirdPassApply(final VariantContext variant) { vcfWriter.add(builder.make()); } - private void processCollectedVariants() { - // Prune variant-sample pairs we need RD_CN values for - for (final Map.Entry>> entry : revisedEventsAll.entrySet()) { - for (final Map.Entry> innerEntry : entry.getValue().entrySet()) { - final String sampleName = innerEntry.getKey(); - final String variantId = entry.getKey(); - final String widerVariantId = innerEntry.getValue().getLeft(); - final String svType = innerEntry.getValue().getRight(); - if (svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DUP) || svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL)) { - revisedEventsFiltered.computeIfAbsent(variantId, k -> new HashSet<>()).add(sampleName); - 
revisedEventsFiltered.computeIfAbsent(widerVariantId, k -> new HashSet<>()).add(sampleName); - } - } - } - } - private void processGt(final VariantContext v1, final VariantContext v2) { // Determine larger variant, swapping if necessary - final int length1 = v1.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0); - final int length2 = v2.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0); VariantContext largerVariant = v1; VariantContext smallerVariant = v2; + final int length1 = v1.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0); + final int length2 = v2.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0); if (length2 > length1) { largerVariant = v2; smallerVariant = v1; @@ -246,7 +240,13 @@ private void processGt(final VariantContext v1, final VariantContext v2) { final String largerID = largerVariant.getID(); final String smallerID = smallerVariant.getID(); - // Skip processing if same variant ID, SV type or samples + // Skip if coverage below expected + final double coverage = getCoverage(largerVariant, smallerVariant); + if (coverage < 0.5) { + return; + } + + // Skip if same variant ID, SV type or samples final String widerSvType = largerVariant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); final String narrowerSvType = smallerVariant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); final Set widerSamples = getNonReferenceSamples(largerVariant); @@ -255,7 +255,7 @@ private void processGt(final VariantContext v1, final VariantContext v2) { return; } - // Get samples present in wider but not in narrower + // Skip if no non-overlapping samples final Set nonCommonSamples = new HashSet<>(widerSamples); nonCommonSamples.removeAll(narrowerSamples); if (nonCommonSamples.isEmpty()) { @@ -263,21 +263,18 @@ private void processGt(final VariantContext v1, final VariantContext v2) { } // Revise variant if coverage exceeds threshold - final double coverage = getCoverage(largerVariant, smallerVariant); - if (coverage >= 0.5) { - for (final String sample : nonCommonSamples) { - 
revisedEventsAll.computeIfAbsent(smallerID, k -> new HashMap<>()) - .put(sample, new ImmutablePair<>(largerID, widerSvType)); - } + for (final String sample : nonCommonSamples) { + revisedEventsAll.computeIfAbsent(smallerID, k -> new HashMap<>()) + .put(sample, new ImmutablePair<>(largerID, widerSvType)); } } private void processCn(final VariantContext v1, final VariantContext v2) { // Determine larger variant, swapping if necessary - final int length1 = v1.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0); - final int length2 = v2.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0); VariantContext largerVariant = v1; VariantContext smallerVariant = v2; + final int length1 = v1.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0); + final int length2 = v2.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0); if (length2 > length1) { largerVariant = v2; smallerVariant = v1; @@ -305,12 +302,29 @@ private void processCn(final VariantContext v1, final VariantContext v2) { final double overlap1 = (double) lengthOverlap / (double) largerLength; final double overlap2 = (double) lengthOverlap / (double) smallerLength; + // Skip if overlap conditions not met + if (overlap2 <= 0.5) { + return; + } + // Get samples with abnormal CN across both variants final Set samples = new HashSet<>(abnormalRdCn.getOrDefault(variantId1, Collections.emptySet())); samples.retainAll(abnormalRdCn.getOrDefault(variantId2, Collections.emptySet())); + // Skip if sample overlap conditions not met + if (samples.isEmpty()) { + return; + } + + // Cached boolean fields + final boolean isMultiCnv1 = multiCnvs.contains(variantId1); + final boolean isMultiCnv2 = multiCnvs.contains(variantId2); + final boolean isMatchingSvType = svType1.equals(svType2); + final boolean isOverlapping = (overlap1 > 0.5); + final boolean isLargerThanMin = largerLength > MIN_VARIANT_SIZE; + // Iterate through samples to test against conditions - for (String sample : samples) { + for (final String sample : samples) { final String id1 = variantId1 
+ "@" + sample; final String id2 = variantId2 + "@" + sample; if (revisedComplete.contains(id1)) { @@ -326,8 +340,7 @@ private void processCn(final VariantContext v1, final VariantContext v2) { // Condition 1: Smaller depth call is being driven by larger call if (support1.contains(GATKSVVCFConstants.EV_VALUES.get(1)) && support1.size() > 1 - && support2.equals(Collections.singleton(GATKSVVCFConstants.EV_VALUES.get(1))) - && overlap2 > 0.5 && !multiCnvs.contains(variantId1)) { + && support2.equals(Collections.singleton(GATKSVVCFConstants.EV_VALUES.get(1))) && !isMultiCnv1) { if (rdCn1 == 0) { makeRevision(id2, rdCn2 + 2); } else if (rdCn1 == 1) { @@ -340,10 +353,9 @@ private void processCn(final VariantContext v1, final VariantContext v2) { } // Condition 2: Smaller CNV is driven by larger CNV genotype - else if (support1.equals(Collections.singleton(GATKSVVCFConstants.EV_VALUES.get(1))) - && support2.contains(GATKSVVCFConstants.EV_VALUES.get(1)) && support2.size() > 1 - && overlap1 > 0.5 && overlap2 > 0.5 && !multiCnvs.contains(variantId2) - && !genotype2.isHomRef()) { + else if (support2.contains(GATKSVVCFConstants.EV_VALUES.get(1)) && support2.size() > 1 + && support1.equals(Collections.singleton(GATKSVVCFConstants.EV_VALUES.get(1))) + && !genotype2.isHomRef() && !isMultiCnv2 && isOverlapping) { if (rdCn2 == 0) { makeRevision(id1, rdCn1 + 2); } else if (rdCn2 == 1) { @@ -358,7 +370,7 @@ else if (support1.equals(Collections.singleton(GATKSVVCFConstants.EV_VALUES.get( // Condition 3: Depth-only calls where smaller call is driven by larger call else if (support1.equals(Collections.singleton(GATKSVVCFConstants.EV_VALUES.get(1))) && support2.equals(Collections.singleton(GATKSVVCFConstants.EV_VALUES.get(1))) - && overlap2 > 0.5 && !multiCnvs.contains(variantId1) && svType1.equals(svType2)) { + && !isMultiCnv1 && isMatchingSvType) { if (rdCn1 == 0 && rdCn1 != rdCn2) { makeRevision(id2, rdCn2 + 2); } else if (rdCn1 == 1 && rdCn1 > rdCn2) { @@ -371,8 +383,7 @@ else if 
(support1.equals(Collections.singleton(GATKSVVCFConstants.EV_VALUES.get( } // Condition 4: Any other time a larger call drives a smaller call - else if (support1.contains(GATKSVVCFConstants.EV_VALUES.get(1)) - && overlap2 > 0.5 && !multiCnvs.contains(variantId1) && largerLength > MIN_VARIANT_SIZE) { + else if (support1.contains(GATKSVVCFConstants.EV_VALUES.get(1)) && !isMultiCnv1 && isLargerThanMin) { if (rdCn1 == 0) { makeRevision(id2, rdCn2 + 2); } else if (rdCn1 == 1) { @@ -390,58 +401,64 @@ private void processRevisedGt(final VariantContextBuilder builder, final Variant // Initialize data structures final String variantId = variant.getID(); final Map> variantEvents = revisedEventsAll.get(variantId); - final List genotypes = builder.getGenotypes(); - final List newGenotypes = new ArrayList<>(); + final Map revisedGenotypes = new HashMap<>(); + final List oldGenotypes = variant.getGenotypes(); + final List newGenotypes = new ArrayList<>(oldGenotypes.size()); + + // Populate genotypes that need revising + for (final Map.Entry> entry : variantEvents.entrySet()) { + final Pair event = entry.getValue(); + if (event == null) { + continue; + } - // Create updated genotypes - for (final Genotype genotype : genotypes) { - final String sampleName = genotype.getSampleName(); - final Pair event = variantEvents.get(sampleName); - - if (event != null) { - final String widerVariantId = event.getLeft(); - final String widerSvType = event.getRight(); - final int currentRdCn = currentCopyNumbers.get(variantId).getOrDefault(sampleName, 0); - final int widerRdCn = currentCopyNumbers.getOrDefault(widerVariantId, new HashMap<>()).getOrDefault(sampleName, 0); - - int newVal = -1; - if (widerSvType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DUP) && currentRdCn == 2 && widerRdCn == 3) { - newVal = 1; - } else if (widerSvType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL) && currentRdCn == 2 && widerRdCn == 1) { - newVal = 3; - } + final String sampleName = entry.getKey(); + final 
Genotype genotype = variant.getGenotype(sampleName); + final String widerVariantId = event.getLeft(); + final String widerSvType = event.getRight(); + final int currentRdCn = currentCopyNumbers.get(variantId).getOrDefault(sampleName, 0); + final int widerRdCn = currentCopyNumbers.getOrDefault(widerVariantId, new HashMap<>()).getOrDefault(sampleName, 0); + + int newVal = -1; + if (widerSvType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DUP) && currentRdCn == 2 && widerRdCn == 3) { + newVal = 1; + } else if (widerSvType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL) && currentRdCn == 2 && widerRdCn == 1) { + newVal = 3; + } - if (newVal != -1) { - final GenotypeBuilder gb = new GenotypeBuilder(genotype); - gb.alleles(Arrays.asList(variant.getReference(), variant.getAlternateAllele(0))); - if (!genotype.hasExtendedAttribute(GATKSVVCFConstants.RD_CN)) continue; + if (newVal != -1) { + final GenotypeBuilder gb = new GenotypeBuilder(genotype); + gb.alleles(Arrays.asList(variant.getReference(), variant.getAlternateAllele(0))); + if (!genotype.hasExtendedAttribute(GATKSVVCFConstants.RD_CN)) continue; - final int rdCn = Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN).toString()); - gb.GQ(rdCn); - newGenotypes.add(gb.make()); - } else { - newGenotypes.add(genotype); - } - } else { - newGenotypes.add(genotype); + final int rdCn = Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN).toString()); + gb.GQ(rdCn); + revisedGenotypes.put(sampleName, gb.make()); } } + + // Populate genotypes that don't need revising + for (final Genotype genotype : oldGenotypes) { + final String sampleName = genotype.getSampleName(); + newGenotypes.add(revisedGenotypes.getOrDefault(sampleName, genotype)); + } + builder.genotypes(newGenotypes); } private void processRevisedCn(final VariantContextBuilder builder, final VariantContext variant) { // Initialize data structures - final String variantID = variant.getID(); + final String variantId = 
variant.getID(); final List genotypes = builder.getGenotypes(); final List updatedGenotypes = new ArrayList<>(genotypes.size()); // Replace revised alleles and copy numbers for (final Genotype genotype : genotypes) { final String sampleName = genotype.getSampleName(); - if (revisedCopyNumbers.get(variantID).containsKey(sampleName)) { + if (revisedCopyNumbers.get(variantId).containsKey(sampleName)) { final GenotypeBuilder gb = new GenotypeBuilder(genotype); gb.alleles(Arrays.asList(variant.getReference(), variant.getAlternateAllele(0))); - gb.attribute(GATKSVVCFConstants.RD_CN, revisedCopyNumbers.get(variantID).get(sampleName)); + gb.attribute(GATKSVVCFConstants.RD_CN, revisedCopyNumbers.get(variantId).get(sampleName)); updatedGenotypes.add(gb.make()); } else { updatedGenotypes.add(genotype); @@ -450,6 +467,35 @@ private void processRevisedCn(final VariantContextBuilder builder, final Variant builder.genotypes(updatedGenotypes); } + private void processCollectedVariants() { + // Prune variant-sample pairs we need RD_CN values for + for (final Map.Entry>> entry : revisedEventsAll.entrySet()) { + for (final Map.Entry> innerEntry : entry.getValue().entrySet()) { + final String sampleName = innerEntry.getKey(); + final String variantId = entry.getKey(); + final String widerVariantId = innerEntry.getValue().getLeft(); + final String svType = innerEntry.getValue().getRight(); + if (svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DUP) || svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL)) { + revisedEventsFiltered.computeIfAbsent(variantId, k -> new HashSet<>()).add(sampleName); + revisedEventsFiltered.computeIfAbsent(widerVariantId, k -> new HashSet<>()).add(sampleName); + } + } + } + } + + private void clearAllCaches() { + // Clear caches to free up memory + nonRefSamplesCache.clear(); + supportCache.clear(); + rdCnCache.clear(); + } + + private void removeVariantFromCaches(final String variantID) { + nonRefSamplesCache.remove(variantID); + 
supportCache.remove(variantID); + rdCnCache.remove(variantID); + } + private void makeRevision(final String id, final int val) { final String[] tokens = id.split("@"); final String variantId = tokens[0]; @@ -476,17 +522,6 @@ private boolean overlaps(final VariantContext v1, final VariantContext v2) { && v2.getStart() <= (v1.getStart() + v1.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)); } - private Set getNonReferenceSamples(final VariantContext variant) { - final Set samples = new HashSet<>(); - for (final String sampleName : variant.getSampleNames()) { - final Genotype genotype = variant.getGenotype(sampleName); - if (genotype.isCalled() && !genotype.isHomRef()) { - samples.add(sampleName); - } - } - return samples; - } - private double getCoverage(final VariantContext wider, final VariantContext narrower) { final int nStart = narrower.getStart(); final int nStop = narrower.getEnd(); @@ -500,28 +535,59 @@ private double getCoverage(final VariantContext wider, final VariantContext narr return 0.0; } + private Set getNonReferenceSamples(final VariantContext variant) { + final String variantId = variant.getID(); + if (nonRefSamplesCache.containsKey(variantId)) { + return nonRefSamplesCache.get(variantId); + } + + final Set samples = new HashSet<>(); + for (final Genotype genotype : variant.getGenotypes()) { + if (genotype.isCalled() && !genotype.isHomRef()) { + samples.add(genotype.getSampleName()); + } + } + + nonRefSamplesCache.put(variantId, samples); + return samples; + } + private Map> getSupport(final VariantContext variant) { + final String variantId = variant.getID(); + if (supportCache.containsKey(variantId)) { + return supportCache.get(variantId); + } + Map> supportMap = new HashMap<>(); - for (String sample : variant.getSampleNames()) { - final Genotype genotype = variant.getGenotype(sample); - final String supportStr = genotype.hasExtendedAttribute(GATKSVVCFConstants.EV) ? 
genotype.getExtendedAttribute(GATKSVVCFConstants.EV).toString() : ""; + for (final Genotype genotype : variant.getGenotypes()) { + final String supportStr = genotype.hasExtendedAttribute(GATKSVVCFConstants.EV) + ? genotype.getExtendedAttribute(GATKSVVCFConstants.EV).toString() + : ""; final Set supportSet = new HashSet<>(); if (!supportStr.isEmpty()) { supportSet.addAll(Arrays.asList(supportStr.split(","))); } - supportMap.put(sample, supportSet); + supportMap.put(genotype.getSampleName(), supportSet); } + + supportCache.put(variantId, supportMap); return supportMap; } private Map getRdCn(final VariantContext variant) { + final String variantId = variant.getID(); + if (rdCnCache.containsKey(variantId)) { + return rdCnCache.get(variantId); + } + final Map rdCnMap = new HashMap<>(); - for (String sample : variant.getSampleNames()) { - final Genotype genotype = variant.getGenotype(sample); + for (final Genotype genotype : variant.getGenotypes()) { if (genotype.hasExtendedAttribute(GATKSVVCFConstants.RD_CN)) { - rdCnMap.put(sample, Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN).toString())); + rdCnMap.put(genotype.getSampleName(), Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN).toString())); } } + + rdCnCache.put(variantId, rdCnMap); return rdCnMap; } } \ No newline at end of file From f46e501bf46078e9f0b80efe4841239a33685f67 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Wed, 5 Feb 2025 21:45:40 -0500 Subject: [PATCH 56/58] Minor structural changes to overlap code --- .../tools/walkers/sv/SVCleanPt1b.java | 6 + .../tools/walkers/sv/SVCleanPt2.java | 2 + .../walkers/sv/SVReviseMultiallelicCnvs.java | 30 +- .../walkers/sv/SVReviseOverlappingCnvs.java | 302 +++++++++--------- 4 files changed, 174 insertions(+), 166 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1b.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1b.java index 
872f840c9e5..46ec2a56a73 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1b.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt1b.java @@ -160,6 +160,8 @@ public void secondPassApply(final VariantContext variant) { // Initialize revisedRdCn value for each variant for (final String sampleName : samples) { final Genotype genotype = variant.getGenotype(sampleName); + if (!genotype.hasExtendedAttribute(GATKSVVCFConstants.RD_CN)) continue; + final String rdCn = (String) genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN); variantRdCn.put(sampleName, Integer.parseInt(rdCn)); } @@ -262,6 +264,8 @@ private void processVariant(final VariantContextBuilder builder, final VariantCo if (newVal != -1) { final GenotypeBuilder gb = new GenotypeBuilder(oldGenotype); gb.alleles(Arrays.asList(variant.getReference(), variant.getAlternateAllele(0))); + if (!oldGenotype.hasExtendedAttribute(GATKSVVCFConstants.RD_GQ)) continue; + gb.GQ(Integer.parseInt((String) oldGenotype.getExtendedAttribute(GATKSVVCFConstants.RD_GQ))); newGenotypes.add(gb.make()); } else { @@ -278,6 +282,8 @@ private void processCnvs(final VariantContextBuilder builder, final VariantConte final boolean isDel = variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, "").equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL); for (String sample : variant.getSampleNamesOrderedByName()) { final Genotype genotype = variant.getGenotype(sample); + if (!genotype.hasExtendedAttribute(GATKSVVCFConstants.RD_CN)) continue; + final String rdCnString = (String) genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN); final int rdCn = Integer.parseInt(rdCnString); if ((isDel && rdCn > 3) || (!isDel && (rdCn < 1 || rdCn > 4))) { diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt2.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt2.java index ba32a181a01..6b852d44e79 100644 --- 
a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt2.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVCleanPt2.java @@ -141,6 +141,8 @@ public void apply(final VariantContext variant, final ReadsContext readsContext, // Flag sample as having an abnormal copy number if it passes certain conditions for (String sample : variant.getSampleNames()) { Genotype genotype = variant.getGenotype(sample); + if (!genotype.hasExtendedAttribute(GATKSVVCFConstants.RD_CN)) continue; + int rdCn = Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN).toString()); if (!sampleWhitelist.contains(sample) || !genotype.isCalled() || rdCn == 2) { continue; diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseMultiallelicCnvs.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseMultiallelicCnvs.java index 6deb3d5b008..25066ab0ec6 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseMultiallelicCnvs.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseMultiallelicCnvs.java @@ -136,19 +136,10 @@ private void processVariantPair(final VariantContext v1, final VariantContext v2 largerVariant = v2; smallerVariant = v1; } - final int smallerLength = smallerVariant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0); - - // Calculate overlap - int minEnd = Math.min( - largerVariant.getStart() + largerVariant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0), - smallerVariant.getStart() + smallerVariant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0) - ); - final int maxStart = Math.max(largerVariant.getStart(), smallerVariant.getStart()); - final int overlapLength = minEnd - maxStart + 1; - - // Skip if insufficient overlap length or coverage - final double coverage = (double) overlapLength / smallerLength; - if (overlapLength <= 0 || coverage <= 0.5) { + + // Skip if coverage below expected + final double coverage = 
getCoverage(largerVariant, smallerVariant); + if (coverage < 0.5) { return; } @@ -163,4 +154,17 @@ private boolean overlaps(final VariantContext v1, final VariantContext v2) { && v1.getStart() <= (v2.getStart() + v2.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)) && v2.getStart() <= (v1.getStart() + v1.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)); } + + private double getCoverage(final VariantContext larger, final VariantContext smaller) { + final int largerStart = larger.getStart(); + final int smallerStart = smaller.getStart(); + final int largerStop = larger.getEnd(); + final int smallerStop = smaller.getEnd(); + + if (largerStart <= smallerStop && smallerStart <= largerStop) { + final int intersectionSize = Math.min(smallerStop, largerStop) - Math.max(smallerStart, largerStart) + 1; + return (double) intersectionSize / (smallerStop - smallerStart + 1); + } + return 0.0; + } } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseOverlappingCnvs.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseOverlappingCnvs.java index c747cade3bf..39fa7ae19aa 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseOverlappingCnvs.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseOverlappingCnvs.java @@ -237,8 +237,6 @@ private void processGt(final VariantContext v1, final VariantContext v2) { largerVariant = v2; smallerVariant = v1; } - final String largerID = largerVariant.getID(); - final String smallerID = smallerVariant.getID(); // Skip if coverage below expected final double coverage = getCoverage(largerVariant, smallerVariant); @@ -246,26 +244,28 @@ private void processGt(final VariantContext v1, final VariantContext v2) { return; } - // Skip if same variant ID, SV type or samples - final String widerSvType = largerVariant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); - final String narrowerSvType = 
smallerVariant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); - final Set widerSamples = getNonReferenceSamples(largerVariant); - final Set narrowerSamples = getNonReferenceSamples(smallerVariant); - if (largerID.equals(smallerID) || widerSvType.equals(narrowerSvType) || widerSamples.equals(narrowerSamples)) { + // Skip if same variant ID, SV type or sample sets + final String largerId = largerVariant.getID(); + final String smallerId = smallerVariant.getID(); + final String largerSvType = largerVariant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); + final String smallerSvType = smallerVariant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); + final Set largerSamples = getNonReferenceSamples(largerVariant); + final Set smallerSamples = getNonReferenceSamples(smallerVariant); + if (largerId.equals(smallerId) || largerSvType.equals(smallerSvType) || largerSamples.equals(smallerSamples)) { return; } // Skip if no non-overlapping samples - final Set nonCommonSamples = new HashSet<>(widerSamples); - nonCommonSamples.removeAll(narrowerSamples); + final Set nonCommonSamples = new HashSet<>(largerSamples); + nonCommonSamples.removeAll(smallerSamples); if (nonCommonSamples.isEmpty()) { return; } - // Revise variant if coverage exceeds threshold + // Add variant pair to data structure for (final String sample : nonCommonSamples) { - revisedEventsAll.computeIfAbsent(smallerID, k -> new HashMap<>()) - .put(sample, new ImmutablePair<>(largerID, widerSvType)); + revisedEventsAll.computeIfAbsent(smallerId, k -> new HashMap<>()) + .put(sample, new ImmutablePair<>(largerId, largerSvType)); } } @@ -273,125 +273,122 @@ private void processCn(final VariantContext v1, final VariantContext v2) { // Determine larger variant, swapping if necessary VariantContext largerVariant = v1; VariantContext smallerVariant = v2; - final int length1 = v1.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0); - final int length2 = v2.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0); + int 
length1 = v1.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0); + int length2 = v2.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0); if (length2 > length1) { largerVariant = v2; smallerVariant = v1; + length1 = length2; } - final int largerLength = largerVariant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0); - final int smallerLength = smallerVariant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0); - - // Get variant attributes - final String variantId1 = largerVariant.getID(); - final String variantId2 = smallerVariant.getID(); - final Map variantRdCn1 = getRdCn(largerVariant); - final Map variantRdCn2 = getRdCn(smallerVariant); - final Map> variantSupport1 = getSupport(largerVariant); - final Map> variantSupport2 = getSupport(smallerVariant); - final String svType1 = largerVariant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); - final String svType2 = smallerVariant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); // Calculate overlap - final int minEnd = Math.min( - largerVariant.getStart() + largerVariant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0), - smallerVariant.getStart() + smallerVariant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0) - ); - final int maxStart = Math.max(largerVariant.getStart(), smallerVariant.getStart()); - final int lengthOverlap = minEnd - maxStart + 1; - final double overlap1 = (double) lengthOverlap / (double) largerLength; - final double overlap2 = (double) lengthOverlap / (double) smallerLength; - - // Skip if overlap conditions not met - if (overlap2 <= 0.5) { + final double coverage = getCoverage(largerVariant, smallerVariant); + if (coverage < 0.5) { return; } - // Get samples with abnormal CN across both variants - final Set samples = new HashSet<>(abnormalRdCn.getOrDefault(variantId1, Collections.emptySet())); - samples.retainAll(abnormalRdCn.getOrDefault(variantId2, Collections.emptySet())); - - // Skip if sample overlap conditions not met + // Skip if no common abnormal samples + final Set samples = new 
HashSet<>(abnormalRdCn.getOrDefault(largerVariant.getID(), Collections.emptySet())); + samples.retainAll(abnormalRdCn.getOrDefault(smallerVariant.getID(), Collections.emptySet())); if (samples.isEmpty()) { return; } + // Cached non-boolean fields + final String largerId = largerVariant.getID(); + final String smallerId = smallerVariant.getID(); + final Map largerRdCn = getRdCn(largerVariant); + final Map smallerRdCn = getRdCn(smallerVariant); + final Map> largerSupport = getSupport(largerVariant); + final Map> smallerSupport = getSupport(smallerVariant); + final String largerSvType = largerVariant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); + final String smallerSvType = smallerVariant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); + + // Cached length fields + final int minEnd = Math.min( + largerVariant.getStart() + largerVariant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0), + smallerVariant.getStart() + smallerVariant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0) + ); + final int maxStart = Math.max(largerVariant.getStart(), smallerVariant.getStart()); + final int lengthOverlap = minEnd - maxStart + 1; + final double largerOverlap = (double) lengthOverlap / (double) length1; + // Cached boolean fields - final boolean isMultiCnv1 = multiCnvs.contains(variantId1); - final boolean isMultiCnv2 = multiCnvs.contains(variantId2); - final boolean isMatchingSvType = svType1.equals(svType2); - final boolean isOverlapping = (overlap1 > 0.5); - final boolean isLargerThanMin = largerLength > MIN_VARIANT_SIZE; + final boolean largerIsMultiCnv = multiCnvs.contains(largerId); + final boolean smallerIsMultiCnv = multiCnvs.contains(smallerId); + final boolean isMatchingSvType = largerSvType.equals(smallerSvType); + final boolean isOverlapping = (largerOverlap > 0.5); + final boolean isLargerThanMin = length1 > MIN_VARIANT_SIZE; // Iterate through samples to test against conditions for (final String sample : samples) { - final String id1 = variantId1 + "@" + 
sample; - final String id2 = variantId2 + "@" + sample; - if (revisedComplete.contains(id1)) { + final String largerFullId = largerId + "@" + sample; + final String smallerFullId = smallerId + "@" + sample; + if (revisedComplete.contains(largerFullId)) { continue; } // Initialize variables for evaluation - final int rdCn1 = revisedCopyNumbers.getOrDefault(variantId1, Collections.emptyMap()).getOrDefault(sample, variantRdCn1.get(sample)); - final int rdCn2 = revisedCopyNumbers.getOrDefault(variantId2, Collections.emptyMap()).getOrDefault(sample, variantRdCn2.get(sample)); - final Set support1 = variantSupport1.get(sample); - final Set support2 = variantSupport2.get(sample); + final int largerSampleRdCn = revisedCopyNumbers.getOrDefault(largerId, Collections.emptyMap()).getOrDefault(sample, largerRdCn.get(sample)); + final int smallerSampleRdCn = revisedCopyNumbers.getOrDefault(smallerId, Collections.emptyMap()).getOrDefault(sample, smallerRdCn.get(sample)); + final Set largerSampleSupport = largerSupport.get(sample); + final Set smallerSampleSupport = smallerSupport.get(sample); final Genotype genotype2 = smallerVariant.getGenotype(sample); // Condition 1: Smaller depth call is being driven by larger call - if (support1.contains(GATKSVVCFConstants.EV_VALUES.get(1)) && support1.size() > 1 - && support2.equals(Collections.singleton(GATKSVVCFConstants.EV_VALUES.get(1))) && !isMultiCnv1) { - if (rdCn1 == 0) { - makeRevision(id2, rdCn2 + 2); - } else if (rdCn1 == 1) { - makeRevision(id2, rdCn2 + rdCn1); - } else if (rdCn1 > 1) { - int newCN = rdCn2 - rdCn1 + 2; + if (largerSampleSupport.contains(GATKSVVCFConstants.EV_VALUES.get(1)) && largerSampleSupport.size() > 1 + && smallerSampleSupport.equals(Collections.singleton(GATKSVVCFConstants.EV_VALUES.get(1))) && !largerIsMultiCnv) { + if (largerSampleRdCn == 0) { + makeRevision(smallerFullId, smallerSampleRdCn + 2); + } else if (largerSampleRdCn == 1) { + makeRevision(smallerFullId, smallerSampleRdCn + largerSampleRdCn); + 
} else if (largerSampleRdCn > 1) { + int newCN = smallerSampleRdCn - largerSampleRdCn + 2; newCN = Math.max(newCN, 0); - makeRevision(id2, newCN); + makeRevision(smallerFullId, newCN); } } // Condition 2: Smaller CNV is driven by larger CNV genotype - else if (support2.contains(GATKSVVCFConstants.EV_VALUES.get(1)) && support2.size() > 1 - && support1.equals(Collections.singleton(GATKSVVCFConstants.EV_VALUES.get(1))) - && !genotype2.isHomRef() && !isMultiCnv2 && isOverlapping) { - if (rdCn2 == 0) { - makeRevision(id1, rdCn1 + 2); - } else if (rdCn2 == 1) { - makeRevision(id1, rdCn1 + rdCn2); - } else if (rdCn2 > 1) { - int newCN = rdCn1 - rdCn2 + 2; + else if (smallerSampleSupport.contains(GATKSVVCFConstants.EV_VALUES.get(1)) && smallerSampleSupport.size() > 1 + && largerSampleSupport.equals(Collections.singleton(GATKSVVCFConstants.EV_VALUES.get(1))) + && !genotype2.isHomRef() && !smallerIsMultiCnv && isOverlapping) { + if (smallerSampleRdCn == 0) { + makeRevision(largerFullId, largerSampleRdCn + 2); + } else if (smallerSampleRdCn == 1) { + makeRevision(largerFullId, largerSampleRdCn + smallerSampleRdCn); + } else if (smallerSampleRdCn > 1) { + int newCN = largerSampleRdCn - smallerSampleRdCn + 2; newCN = Math.max(newCN, 0); - makeRevision(id1, newCN); + makeRevision(largerFullId, newCN); } } // Condition 3: Depth-only calls where smaller call is driven by larger call - else if (support1.equals(Collections.singleton(GATKSVVCFConstants.EV_VALUES.get(1))) - && support2.equals(Collections.singleton(GATKSVVCFConstants.EV_VALUES.get(1))) - && !isMultiCnv1 && isMatchingSvType) { - if (rdCn1 == 0 && rdCn1 != rdCn2) { - makeRevision(id2, rdCn2 + 2); - } else if (rdCn1 == 1 && rdCn1 > rdCn2) { - makeRevision(id2, 1); - } else if (rdCn1 > 1 && rdCn1 < rdCn2) { - makeRevision(id2, Math.max(rdCn2 - rdCn1 + 2, 0)); + else if (largerSampleSupport.equals(Collections.singleton(GATKSVVCFConstants.EV_VALUES.get(1))) + && 
smallerSampleSupport.equals(Collections.singleton(GATKSVVCFConstants.EV_VALUES.get(1))) + && !largerIsMultiCnv && isMatchingSvType) { + if (largerSampleRdCn == 0 && largerSampleRdCn != smallerSampleRdCn) { + makeRevision(smallerFullId, smallerSampleRdCn + 2); + } else if (largerSampleRdCn == 1 && largerSampleRdCn > smallerSampleRdCn) { + makeRevision(smallerFullId, 1); + } else if (largerSampleRdCn > 1 && largerSampleRdCn < smallerSampleRdCn) { + makeRevision(smallerFullId, Math.max(smallerSampleRdCn - largerSampleRdCn + 2, 0)); } else { - makeRevision(id2, 2); + makeRevision(smallerFullId, 2); } } // Condition 4: Any other time a larger call drives a smaller call - else if (support1.contains(GATKSVVCFConstants.EV_VALUES.get(1)) && !isMultiCnv1 && isLargerThanMin) { - if (rdCn1 == 0) { - makeRevision(id2, rdCn2 + 2); - } else if (rdCn1 == 1) { - makeRevision(id2, rdCn2 + rdCn1); - } else if (rdCn1 > 1) { - int newCN = rdCn2 - rdCn1 + 2; + else if (largerSampleSupport.contains(GATKSVVCFConstants.EV_VALUES.get(1)) && !largerIsMultiCnv && isLargerThanMin) { + if (largerSampleRdCn == 0) { + makeRevision(smallerFullId, smallerSampleRdCn + 2); + } else if (largerSampleRdCn == 1) { + makeRevision(smallerFullId, smallerSampleRdCn + largerSampleRdCn); + } else if (largerSampleRdCn > 1) { + int newCN = smallerSampleRdCn - largerSampleRdCn + 2; newCN = Math.max(newCN, 0); - makeRevision(id2, newCN); + makeRevision(smallerFullId, newCN); } } } @@ -414,15 +411,15 @@ private void processRevisedGt(final VariantContextBuilder builder, final Variant final String sampleName = entry.getKey(); final Genotype genotype = variant.getGenotype(sampleName); - final String widerVariantId = event.getLeft(); - final String widerSvType = event.getRight(); + final String largerId = event.getLeft(); + final String largerSvType = event.getRight(); final int currentRdCn = currentCopyNumbers.get(variantId).getOrDefault(sampleName, 0); - final int widerRdCn = 
currentCopyNumbers.getOrDefault(widerVariantId, new HashMap<>()).getOrDefault(sampleName, 0); + final int largerRdCn = currentCopyNumbers.getOrDefault(largerId, new HashMap<>()).getOrDefault(sampleName, 0); int newVal = -1; - if (widerSvType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DUP) && currentRdCn == 2 && widerRdCn == 3) { + if (largerSvType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DUP) && currentRdCn == 2 && largerRdCn == 3) { newVal = 1; - } else if (widerSvType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL) && currentRdCn == 2 && widerRdCn == 1) { + } else if (largerSvType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL) && currentRdCn == 2 && largerRdCn == 1) { newVal = 3; } @@ -473,68 +470,16 @@ private void processCollectedVariants() { for (final Map.Entry> innerEntry : entry.getValue().entrySet()) { final String sampleName = innerEntry.getKey(); final String variantId = entry.getKey(); - final String widerVariantId = innerEntry.getValue().getLeft(); - final String svType = innerEntry.getValue().getRight(); - if (svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DUP) || svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL)) { + final String largerVariantId = innerEntry.getValue().getLeft(); + final String largerSvType = innerEntry.getValue().getRight(); + if (largerSvType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DUP) || largerSvType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL)) { revisedEventsFiltered.computeIfAbsent(variantId, k -> new HashSet<>()).add(sampleName); - revisedEventsFiltered.computeIfAbsent(widerVariantId, k -> new HashSet<>()).add(sampleName); + revisedEventsFiltered.computeIfAbsent(largerVariantId, k -> new HashSet<>()).add(sampleName); } } } } - private void clearAllCaches() { - // Clear caches to free up memory - nonRefSamplesCache.clear(); - supportCache.clear(); - rdCnCache.clear(); - } - - private void removeVariantFromCaches(final String variantID) { - nonRefSamplesCache.remove(variantID); - supportCache.remove(variantID); - 
rdCnCache.remove(variantID); - } - - private void makeRevision(final String id, final int val) { - final String[] tokens = id.split("@"); - final String variantId = tokens[0]; - final String sample = tokens[1]; - revisedCopyNumbers.computeIfAbsent(variantId, k -> new HashMap<>()).put(sample, val); - if (val == 2) { - revisedComplete.add(id); - } - } - - private boolean isDelDup(final VariantContext variant) { - final String svType = variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); - return svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL) || svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DUP); - } - - private boolean isLarge(final VariantContext variant, final int minSize) { - final int variantLength = Math.abs(variant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)); - return variantLength >= minSize; - } - - private boolean overlaps(final VariantContext v1, final VariantContext v2) { - return v1.getContig().equals(v2.getContig()) - && v1.getStart() <= (v2.getStart() + v2.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)) - && v2.getStart() <= (v1.getStart() + v1.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)); - } - - private double getCoverage(final VariantContext wider, final VariantContext narrower) { - final int nStart = narrower.getStart(); - final int nStop = narrower.getEnd(); - final int wStart = wider.getStart(); - final int wStop = wider.getEnd(); - - if (wStart <= nStop && nStart <= wStop) { - final int intersectionSize = Math.min(nStop, wStop) - Math.max(nStart, wStart) + 1; - return (double) intersectionSize / (nStop - nStart + 1); - } - return 0.0; - } - private Set getNonReferenceSamples(final VariantContext variant) { final String variantId = variant.getID(); if (nonRefSamplesCache.containsKey(variantId)) { @@ -590,4 +535,55 @@ private Map getRdCn(final VariantContext variant) { rdCnCache.put(variantId, rdCnMap); return rdCnMap; } + + private void clearAllCaches() { + nonRefSamplesCache.clear(); + supportCache.clear(); + 
rdCnCache.clear(); + } + + private void removeVariantFromCaches(final String variantID) { + nonRefSamplesCache.remove(variantID); + supportCache.remove(variantID); + rdCnCache.remove(variantID); + } + + private void makeRevision(final String id, final int val) { + final String[] tokens = id.split("@"); + final String variantId = tokens[0]; + final String sample = tokens[1]; + revisedCopyNumbers.computeIfAbsent(variantId, k -> new HashMap<>()).put(sample, val); + if (val == 2) { + revisedComplete.add(id); + } + } + + private boolean isDelDup(final VariantContext variant) { + final String svType = variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); + return svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL) || svType.equals(GATKSVVCFConstants.SYMB_ALT_STRING_DUP); + } + + private boolean isLarge(final VariantContext variant, final int minSize) { + final int variantLength = Math.abs(variant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)); + return variantLength >= minSize; + } + + private boolean overlaps(final VariantContext v1, final VariantContext v2) { + return v1.getContig().equals(v2.getContig()) + && v1.getStart() <= (v2.getStart() + v2.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)) + && v2.getStart() <= (v1.getStart() + v1.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)); + } + + private double getCoverage(final VariantContext larger, final VariantContext smaller) { + final int largerStart = larger.getStart(); + final int smallerStart = smaller.getStart(); + final int largerStop = larger.getEnd(); + final int smallerStop = smaller.getEnd(); + + if (largerStart <= smallerStop && smallerStart <= largerStop) { + final int intersectionSize = Math.min(smallerStop, largerStop) - Math.max(smallerStart, largerStart) + 1; + return (double) intersectionSize / (smallerStop - smallerStart + 1); + } + return 0.0; + } } \ No newline at end of file From 029bfc9aaf43df2b7b1febf87ca2a5a7f0091bdb Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Thu, 20 Mar 2025 
11:10:24 -0400 Subject: [PATCH 57/58] Moving temp changes --- .../hellbender/engine/MultiVariantWalker.java | 2 +- .../tools/walkers/sv/SVReviseLargeCnvs.java | 347 ------------------ .../walkers/sv/SVReviseMultiallelicCnvs.java | 301 +++++++++++---- .../walkers/sv/SVReviseOverlappingCnvs.java | 4 +- .../sv/SVReviseOverlappingMultiallelics.java | 170 +++++++++ 5 files changed, 412 insertions(+), 412 deletions(-) delete mode 100644 src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseLargeCnvs.java create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseOverlappingMultiallelics.java diff --git a/src/main/java/org/broadinstitute/hellbender/engine/MultiVariantWalker.java b/src/main/java/org/broadinstitute/hellbender/engine/MultiVariantWalker.java index 7f12ff641e2..8c13e147641 100644 --- a/src/main/java/org/broadinstitute/hellbender/engine/MultiVariantWalker.java +++ b/src/main/java/org/broadinstitute/hellbender/engine/MultiVariantWalker.java @@ -86,7 +86,7 @@ protected void initializeDrivingVariants() { features.addToFeatureSources(0, featureInput, VariantContext.class, cloudPrefetchBuffer, cloudIndexPrefetchBuffer, referenceArguments.getReferencePath()); } - ); + ); final boolean skipDictionaryValidation = !seqValidationArguments.performSequenceDictionaryValidation() || !doDictionaryCrossValidation(); if (skipDictionaryValidation && !hasReference() && getMasterSequenceDictionary() == null) { diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseLargeCnvs.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseLargeCnvs.java deleted file mode 100644 index 4e4c83652e7..00000000000 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseLargeCnvs.java +++ /dev/null @@ -1,347 +0,0 @@ -package org.broadinstitute.hellbender.tools.walkers.sv; - -import htsjdk.variant.variantcontext.Allele; -import htsjdk.variant.variantcontext.Genotype; -import 
htsjdk.variant.variantcontext.GenotypeBuilder; -import htsjdk.variant.variantcontext.VariantContext; -import htsjdk.variant.variantcontext.VariantContextBuilder; -import htsjdk.variant.variantcontext.writer.VariantContextWriter; -import htsjdk.variant.vcf.VCFHeader; -import htsjdk.variant.vcf.VCFHeaderLineType; -import htsjdk.variant.vcf.VCFInfoHeaderLine; -import htsjdk.variant.vcf.VCFFormatHeaderLine; -import htsjdk.variant.vcf.VCFFilterHeaderLine; - -import org.broadinstitute.barclay.argparser.Argument; -import org.broadinstitute.barclay.argparser.BetaFeature; -import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; -import org.broadinstitute.barclay.help.DocumentedFeature; -import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; -import org.broadinstitute.hellbender.cmdline.programgroups.StructuralVariantDiscoveryProgramGroup; -import org.broadinstitute.hellbender.engine.*; -import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants; - -import java.io.IOException; -import java.nio.file.Files; - -import java.util.Arrays; -import java.util.List; -import java.util.ArrayList; -import java.util.Set; -import java.util.Map; -import java.util.HashSet; -import java.util.HashMap; - -/** - * Completes an initial series of cleaning steps for a VCF produced by the GATK-SV pipeline. - * - *

          Inputs

          - *
            - *
          • - * VCF containing structural variant (SV) records from the GATK-SV pipeline. - *
          • - *
          • - * TODO - *
          • - *
          - * - *

          Output

          - *
            - *
          • - * Cleansed VCF. - *
          • - *
          - * - *

          Usage Example

          - *
          - *     TODO
          - * 
          - * - *

          Processing Steps

          - *
            - *
          1. - * TODO - *
          2. - *
          - */ -@CommandLineProgramProperties( - summary = "Clean and format SV VCF", - oneLineSummary = "Clean and format SV VCF", - programGroup = StructuralVariantDiscoveryProgramGroup.class -) -@BetaFeature -@DocumentedFeature -public class SVReviseLargeCnvs extends VariantWalker { - public static final String OUTLIERS_LIST_LONG_NAME = "outliers-list"; - - @Argument( - fullName = OUTLIERS_LIST_LONG_NAME, - doc = "File with outlier samples", - optional = true - ) - private GATKPath outliersListPath; - - @Argument( - fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, - shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, - doc = "Output VCF name" - ) - private GATKPath outputVcf; - - private VariantContextWriter vcfWriter; - - private Set outlierSamples; - - private double maxVF; - - private static final int MIN_LARGE_EVENT_SIZE = 1000; - private static final int MIN_MULTIALLELIC_EVENT_SIZE = 5000; - - @Override - public void onTraversalStart() { - // Read and parse input files - try { - outlierSamples = new HashSet<>(); - if (outliersListPath != null) { - outlierSamples = new HashSet<>(Files.readAllLines(outliersListPath.toPath())); - } - } catch (IOException e) { - throw new RuntimeException("Error reading input file", e); - } - - // Populate maxVf based on sample information - maxVF = Math.max((getHeaderForVariants().getGenotypeSamples().size() - outlierSamples.size()) * 0.01, 2); - - // Filter specific header lines - final VCFHeader header = getHeaderForVariants(); - header.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.PESR_GT_OVERDISPERSION, 0, VCFHeaderLineType.Flag, "High PESR dispersion count")); - header.addMetaDataLine(new VCFFormatHeaderLine(GATKSVVCFConstants.COPY_NUMBER_FORMAT, 1, VCFHeaderLineType.Integer, "Predicted copy state")); - header.addMetaDataLine(new VCFFormatHeaderLine(GATKSVVCFConstants.COPY_NUMBER_QUALITY_FORMAT, 1, VCFHeaderLineType.Integer, "Read-depth genotype quality")); - header.addMetaDataLine(new 
VCFFilterHeaderLine(GATKSVVCFConstants.MULTIALLELIC, "Multiallelic site")); - - // Write header - vcfWriter = createVCFWriter(outputVcf); - vcfWriter.writeHeader(header); - } - - @Override - public void closeTool() { - if (vcfWriter != null) { - vcfWriter.close(); - } - } - - @Override - public void apply(final VariantContext variant, final ReadsContext readsContext, final ReferenceContext referenceContext, final FeatureContext featureContext) { - // Initialize data structures - final VariantContextBuilder builder = new VariantContextBuilder(variant); - List genotypes = variant.getGenotypes(); - - // Process variants - processMultiallelic(builder, genotypes); - genotypes = processLargeDeletions(variant, builder, genotypes); - genotypes = processLargeDuplications(variant, builder, genotypes); - - // Build genotypes - if (isCalled(builder, genotypes)) { - builder.genotypes(genotypes); - vcfWriter.add(builder.make()); - } - } - - private void processMultiallelic(final VariantContextBuilder builder, final List genotypes) { - int numGtOver2 = 0; - for (final Genotype genotype : genotypes) { - final Integer peGt = genotype.hasExtendedAttribute(GATKSVVCFConstants.PE_GT) ? - Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.PE_GT).toString()) : null; - final Integer srGt = genotype.hasExtendedAttribute(GATKSVVCFConstants.SR_GT) ? - Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.SR_GT).toString()) : null; - int gt; - if (peGt == null) { - continue; - } else if (srGt == null) { - gt = peGt; - } else if (peGt > 0 && srGt == 0) { - gt = peGt; - } else if (peGt == 0) { - gt = srGt; - } else { - final Integer peGq = genotype.hasExtendedAttribute(GATKSVVCFConstants.PE_GQ) ? - Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.PE_GQ).toString()) : null; - final Integer srGq = genotype.hasExtendedAttribute(GATKSVVCFConstants.SR_GQ) ? 
- Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.SR_GQ).toString()) : null; - if (peGq != null && srGq != null && peGq >= srGq) { - gt = peGt; - } else { - gt = srGt; - } - } - if (gt > 2) { - numGtOver2 += 1; - } - } - if (numGtOver2 > maxVF) { - builder.attribute(GATKSVVCFConstants.PESR_GT_OVERDISPERSION, true); - } - } - - private List processLargeDeletions(final VariantContext variant, final VariantContextBuilder builder, List genotypes) { - if (!variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, "").equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL)) { - return genotypes; - } - - boolean multiallelicFilter = false; - if (variant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0) >= MIN_LARGE_EVENT_SIZE) { - Map sampleRdCn = new HashMap<>(); - for (final Genotype genotype : genotypes) { - final String sample = genotype.getSampleName(); - if (!outlierSamples.contains(sample) && genotype.hasExtendedAttribute(GATKSVVCFConstants.RD_CN)) { - sampleRdCn.put(sample, Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN).toString())); - } - } - if (sampleRdCn.values().stream().filter(value -> value > 3).count() > maxVF) { - multiallelicFilter = true; - } - } - - boolean gt5kbFilter = false; - final List allowedAlleleIndices = Arrays.asList(-1, 0, 1); - if (genotypes.stream().anyMatch(g -> g.getAlleles().stream().anyMatch(a -> !allowedAlleleIndices.contains(variant.getAlleleIndex(a))))) { - gt5kbFilter = true; - } else if (variant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0) >= MIN_MULTIALLELIC_EVENT_SIZE && !multiallelicFilter) { - gt5kbFilter = true; - } - - List updatedGenotypes = new ArrayList<>(genotypes.size()); - if (gt5kbFilter) { - for (final Genotype genotype : genotypes) { - final GenotypeBuilder gb = new GenotypeBuilder(genotype); - if (!genotype.isNoCall()) { - if (genotype.hasGQ() && Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN, 0).toString()) >= 2) { - 
gb.alleles(Arrays.asList(variant.getReference(), variant.getReference())); - } else if (genotype.hasGQ() && Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN, 0).toString()) == 1) { - gb.alleles(Arrays.asList(variant.getReference(), variant.getAlternateAllele(0))); - } else if (genotype.hasGQ()) { - gb.alleles(Arrays.asList(variant.getAlternateAllele(0), variant.getAlternateAllele(0))); - } - } - updatedGenotypes.add(gb.make()); - } - genotypes = updatedGenotypes; - } - - updatedGenotypes = new ArrayList<>(genotypes.size()); - if (multiallelicFilter) { - for (final Genotype genotype : genotypes) { - GenotypeBuilder gb = new GenotypeBuilder(genotype); - gb.noGQ(); - gb.alleles(Arrays.asList(Allele.NO_CALL)); - gb.attribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT, genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN)); - gb.attribute(GATKSVVCFConstants.COPY_NUMBER_QUALITY_FORMAT, genotype.getExtendedAttribute(GATKSVVCFConstants.RD_GQ)); - updatedGenotypes.add(gb.make()); - } - genotypes = updatedGenotypes; - - builder.filter(GATKSVVCFConstants.MULTIALLELIC); - builder.attribute(GATKSVVCFConstants.SVTYPE, GATKSVVCFConstants.CNV); - builder.alleles(Arrays.asList(variant.getReference(), Allele.create("<" + GATKSVVCFConstants.CNV + ">", false))); - } - - return genotypes; - } - - private List processLargeDuplications(final VariantContext variant, final VariantContextBuilder builder, List genotypes) { - if (!variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, "").equals(GATKSVVCFConstants.SYMB_ALT_STRING_DUP)) { - return genotypes; - } - - boolean multiallelicFilter = false; - if (variant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0) >= MIN_LARGE_EVENT_SIZE) { - Map sampleRdCn = new HashMap<>(); - for (final Genotype genotype : genotypes) { - final String sample = genotype.getSampleName(); - if (!outlierSamples.contains(sample) && genotype.hasExtendedAttribute(GATKSVVCFConstants.RD_CN)) { - sampleRdCn.put(sample, 
Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN).toString())); - } - } - if (sampleRdCn.values().stream().filter(value -> value > 4).count() > maxVF) { - multiallelicFilter = true; - } - if (sampleRdCn.values().stream().filter(value -> (value < 1 || value > 4)).count() > 4) { - if (sampleRdCn.values().stream().filter(value -> (value < 1 || value > 4)).distinct().count() > maxVF) { - multiallelicFilter = true; - } - } - } - - boolean gt5kbFilter = false; - final List allowedAlleleIndices = Arrays.asList(-1, 0, 1); - if (genotypes.stream().anyMatch(g -> g.getAlleles().stream().anyMatch(a -> !allowedAlleleIndices.contains(variant.getAlleleIndex(a))))) { - gt5kbFilter = true; - } else if (variant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0) >= MIN_MULTIALLELIC_EVENT_SIZE && !multiallelicFilter) { - gt5kbFilter = true; - } - - List updatedGenotypes = new ArrayList<>(genotypes.size()); - if (gt5kbFilter) { - for (final Genotype genotype : genotypes) { - final GenotypeBuilder gb = new GenotypeBuilder(genotype); - if (!genotype.isNoCall()) { - if (genotype.hasGQ() && Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN, 3).toString()) <= 2) { - gb.alleles(Arrays.asList(variant.getReference(), variant.getReference())); - } else if (genotype.hasGQ() && Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN, 0).toString()) == 3) { - gb.alleles(Arrays.asList(variant.getReference(), variant.getAlternateAllele(0))); - } else if (genotype.hasGQ()) { - gb.alleles(Arrays.asList(variant.getAlternateAllele(0), variant.getAlternateAllele(0))); - } - } - updatedGenotypes.add(gb.make()); - } - genotypes = updatedGenotypes; - } - - updatedGenotypes = new ArrayList<>(genotypes.size()); - if (multiallelicFilter) { - for (final Genotype genotype : genotypes) { - final GenotypeBuilder gb = new GenotypeBuilder(genotype); - gb.noGQ(); - gb.alleles(Arrays.asList(Allele.NO_CALL)); - gb.attribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT, 
genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN)); - gb.attribute(GATKSVVCFConstants.COPY_NUMBER_QUALITY_FORMAT, genotype.getExtendedAttribute(GATKSVVCFConstants.RD_GQ)); - updatedGenotypes.add(gb.make()); - } - genotypes = updatedGenotypes; - - builder.filter(GATKSVVCFConstants.MULTIALLELIC); - builder.attribute(GATKSVVCFConstants.SVTYPE, GATKSVVCFConstants.CNV); - builder.alleles(Arrays.asList(variant.getReference(), Allele.create("<" + GATKSVVCFConstants.CNV + ">", false))); - } - - return genotypes; - } - - public boolean isCalled(final VariantContextBuilder builder, final List genotypes) { - for (final Genotype genotype : genotypes) { - if (!isNoCallGt(genotype.getAlleles())) { - return true; - } - } - - if (builder.getAttributes().getOrDefault(GATKSVVCFConstants.SVTYPE, "").equals(GATKSVVCFConstants.CNV)) { - for (final Genotype genotype : genotypes) { - if (Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT, 2).toString()) != 2) { - return true; - } - } - } - - return false; - } - - private boolean isNoCallGt(final List alleles) { - if (alleles.size() == 1 && alleles.get(0).isReference()) return true; - else if (alleles.size() == 2 && alleles.get(0).isReference() && alleles.get(1).isReference()) return true; - else if (alleles.size() == 1 && alleles.get(0).isNoCall()) return true; - return false; - } -} diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseMultiallelicCnvs.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseMultiallelicCnvs.java index 25066ab0ec6..374fda6697c 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseMultiallelicCnvs.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseMultiallelicCnvs.java @@ -1,8 +1,16 @@ package org.broadinstitute.hellbender.tools.walkers.sv; +import htsjdk.variant.variantcontext.Allele; +import htsjdk.variant.variantcontext.Genotype; +import 
htsjdk.variant.variantcontext.GenotypeBuilder; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.variantcontext.VariantContextBuilder; import htsjdk.variant.variantcontext.writer.VariantContextWriter; +import htsjdk.variant.vcf.VCFHeader; +import htsjdk.variant.vcf.VCFHeaderLineType; +import htsjdk.variant.vcf.VCFInfoHeaderLine; +import htsjdk.variant.vcf.VCFFormatHeaderLine; +import htsjdk.variant.vcf.VCFFilterHeaderLine; import org.broadinstitute.barclay.argparser.Argument; import org.broadinstitute.barclay.argparser.BetaFeature; @@ -13,10 +21,16 @@ import org.broadinstitute.hellbender.engine.*; import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants; +import java.io.IOException; +import java.nio.file.Files; + +import java.util.Arrays; import java.util.List; import java.util.ArrayList; import java.util.Set; +import java.util.Map; import java.util.HashSet; +import java.util.HashMap; /** * Completes an initial series of cleaning steps for a VCF produced by the GATK-SV pipeline. 
@@ -57,7 +71,16 @@ ) @BetaFeature @DocumentedFeature -public class SVReviseMultiallelicCnvs extends MultiplePassVariantWalker { +public class SVReviseMultiallelicCnvs extends VariantWalker { + public static final String OUTLIERS_LIST_LONG_NAME = "outliers-list"; + + @Argument( + fullName = OUTLIERS_LIST_LONG_NAME, + doc = "File with outlier samples", + optional = true + ) + private GATKPath outliersListPath; + @Argument( fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, @@ -67,19 +90,38 @@ public class SVReviseMultiallelicCnvs extends MultiplePassVariantWalker { private VariantContextWriter vcfWriter; - private final List overlappingVariantsBuffer = new ArrayList<>(); - private final Set filteredVariantIds = new HashSet<>(); + private Set outlierSamples; - @Override - protected int numberOfPasses() { return 2; } + private double maxVF; - @Override - protected void afterNthPass(int n) {} + private static final int MIN_LARGE_EVENT_SIZE = 1000; + private static final int MIN_MULTIALLELIC_EVENT_SIZE = 5000; @Override public void onTraversalStart() { + // Read and parse input files + try { + outlierSamples = new HashSet<>(); + if (outliersListPath != null) { + outlierSamples = new HashSet<>(Files.readAllLines(outliersListPath.toPath())); + } + } catch (IOException e) { + throw new RuntimeException("Error reading input file", e); + } + + // Populate maxVf based on sample information + maxVF = Math.max((getHeaderForVariants().getGenotypeSamples().size() - outlierSamples.size()) * 0.01, 2); + + // Filter specific header lines + final VCFHeader header = getHeaderForVariants(); + header.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.PESR_GT_OVERDISPERSION, 0, VCFHeaderLineType.Flag, "High PESR dispersion count")); + header.addMetaDataLine(new VCFFormatHeaderLine(GATKSVVCFConstants.COPY_NUMBER_FORMAT, 1, VCFHeaderLineType.Integer, "Predicted copy state")); + header.addMetaDataLine(new 
VCFFormatHeaderLine(GATKSVVCFConstants.COPY_NUMBER_QUALITY_FORMAT, 1, VCFHeaderLineType.Integer, "Read-depth genotype quality")); + header.addMetaDataLine(new VCFFilterHeaderLine(GATKSVVCFConstants.MULTIALLELIC, "Multiallelic site")); + + // Write header vcfWriter = createVCFWriter(outputVcf); - vcfWriter.writeHeader(getHeaderForVariants()); + vcfWriter.writeHeader(header); } @Override @@ -90,81 +132,216 @@ public void closeTool() { } @Override - protected void nthPassApply(final VariantContext variant, final ReadsContext readsContext, - final ReferenceContext referenceContext, final FeatureContext featureContext, int n) { - switch (n) { - case 0: - firstPassApply(variant); - break; - case 1: - secondPassApply(variant); - break; + public void apply(final VariantContext variant, final ReadsContext readsContext, final ReferenceContext referenceContext, final FeatureContext featureContext) { + // Initialize data structures + final VariantContextBuilder builder = new VariantContextBuilder(variant); + List genotypes = variant.getGenotypes(); + + // Process variants + processMultiallelic(builder, genotypes); + genotypes = processLargeDeletions(variant, builder, genotypes); + genotypes = processLargeDuplications(variant, builder, genotypes); + + // Build genotypes + if (isCalled(builder, genotypes)) { + builder.genotypes(genotypes); + vcfWriter.add(builder.make()); + } + } + + private void processMultiallelic(final VariantContextBuilder builder, final List genotypes) { + int numGtOver2 = 0; + for (final Genotype genotype : genotypes) { + final Integer peGt = genotype.hasExtendedAttribute(GATKSVVCFConstants.PE_GT) ? + Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.PE_GT).toString()) : null; + final Integer srGt = genotype.hasExtendedAttribute(GATKSVVCFConstants.SR_GT) ? 
+ Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.SR_GT).toString()) : null; + int gt; + if (peGt == null) { + continue; + } else if (srGt == null) { + gt = peGt; + } else if (peGt > 0 && srGt == 0) { + gt = peGt; + } else if (peGt == 0) { + gt = srGt; + } else { + final Integer peGq = genotype.hasExtendedAttribute(GATKSVVCFConstants.PE_GQ) ? + Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.PE_GQ).toString()) : null; + final Integer srGq = genotype.hasExtendedAttribute(GATKSVVCFConstants.SR_GQ) ? + Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.SR_GQ).toString()) : null; + if (peGq != null && srGq != null && peGq >= srGq) { + gt = peGt; + } else { + gt = srGt; + } + } + if (gt > 2) { + numGtOver2 += 1; + } + } + if (numGtOver2 > maxVF) { + builder.attribute(GATKSVVCFConstants.PESR_GT_OVERDISPERSION, true); } } - public void firstPassApply(final VariantContext variant) { - if (!variant.getFilters().contains(GATKSVVCFConstants.MULTIALLELIC)) { - return; + private List processLargeDeletions(final VariantContext variant, final VariantContextBuilder builder, List genotypes) { + if (!variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, "").equals(GATKSVVCFConstants.SYMB_ALT_STRING_DEL)) { + return genotypes; } - overlappingVariantsBuffer.removeIf(vc -> !vc.getContig().equals(variant.getContig()) - || (vc.getStart() + vc.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)) < variant.getStart()); - for (final VariantContext bufferedVariant : overlappingVariantsBuffer) { - if (overlaps(bufferedVariant, variant)) { - processVariantPair(bufferedVariant, variant); + boolean multiallelicFilter = false; + if (variant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0) >= MIN_LARGE_EVENT_SIZE) { + Map sampleRdCn = new HashMap<>(); + for (final Genotype genotype : genotypes) { + final String sample = genotype.getSampleName(); + if (!outlierSamples.contains(sample) && genotype.hasExtendedAttribute(GATKSVVCFConstants.RD_CN)) { + 
sampleRdCn.put(sample, Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN).toString())); + } + } + if (sampleRdCn.values().stream().filter(value -> value > 3).count() > maxVF) { + multiallelicFilter = true; } } - overlappingVariantsBuffer.add(variant); - } - public void secondPassApply(final VariantContext variant) { - if (filteredVariantIds.contains(variant.getID())) { - return; + boolean gt5kbFilter = false; + final List allowedAlleleIndices = Arrays.asList(-1, 0, 1); + if (genotypes.stream().anyMatch(g -> g.getAlleles().stream().anyMatch(a -> !allowedAlleleIndices.contains(variant.getAlleleIndex(a))))) { + gt5kbFilter = true; + } else if (variant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0) >= MIN_MULTIALLELIC_EVENT_SIZE && !multiallelicFilter) { + gt5kbFilter = true; } - final VariantContextBuilder builder = new VariantContextBuilder(variant); - vcfWriter.add(builder.make()); + List updatedGenotypes = new ArrayList<>(genotypes.size()); + if (gt5kbFilter) { + for (final Genotype genotype : genotypes) { + final GenotypeBuilder gb = new GenotypeBuilder(genotype); + if (!genotype.isNoCall()) { + if (genotype.hasGQ() && Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN, 0).toString()) >= 2) { + gb.alleles(Arrays.asList(variant.getReference(), variant.getReference())); + } else if (genotype.hasGQ() && Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN, 0).toString()) == 1) { + gb.alleles(Arrays.asList(variant.getReference(), variant.getAlternateAllele(0))); + } else if (genotype.hasGQ()) { + gb.alleles(Arrays.asList(variant.getAlternateAllele(0), variant.getAlternateAllele(0))); + } + } + updatedGenotypes.add(gb.make()); + } + genotypes = updatedGenotypes; + } + + updatedGenotypes = new ArrayList<>(genotypes.size()); + if (multiallelicFilter) { + for (final Genotype genotype : genotypes) { + GenotypeBuilder gb = new GenotypeBuilder(genotype); + gb.noGQ(); + gb.alleles(Arrays.asList(Allele.NO_CALL)); + 
gb.attribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT, genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN)); + gb.attribute(GATKSVVCFConstants.COPY_NUMBER_QUALITY_FORMAT, genotype.getExtendedAttribute(GATKSVVCFConstants.RD_GQ)); + updatedGenotypes.add(gb.make()); + } + genotypes = updatedGenotypes; + + builder.filter(GATKSVVCFConstants.MULTIALLELIC); + builder.attribute(GATKSVVCFConstants.SVTYPE, GATKSVVCFConstants.CNV); + builder.alleles(Arrays.asList(variant.getReference(), Allele.create("<" + GATKSVVCFConstants.CNV + ">", false))); + } + + return genotypes; } - private void processVariantPair(final VariantContext v1, final VariantContext v2) { - // Determine larger variant, swapping if necessary - VariantContext largerVariant = v1; - VariantContext smallerVariant = v2; - final int length1 = v1.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0); - final int length2 = v2.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0); - if (length2 > length1) { - largerVariant = v2; - smallerVariant = v1; + private List processLargeDuplications(final VariantContext variant, final VariantContextBuilder builder, List genotypes) { + if (!variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, "").equals(GATKSVVCFConstants.SYMB_ALT_STRING_DUP)) { + return genotypes; } - // Skip if coverage below expected - final double coverage = getCoverage(largerVariant, smallerVariant); - if (coverage < 0.5) { - return; + boolean multiallelicFilter = false; + if (variant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0) >= MIN_LARGE_EVENT_SIZE) { + Map sampleRdCn = new HashMap<>(); + for (final Genotype genotype : genotypes) { + final String sample = genotype.getSampleName(); + if (!outlierSamples.contains(sample) && genotype.hasExtendedAttribute(GATKSVVCFConstants.RD_CN)) { + sampleRdCn.put(sample, Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN).toString())); + } + } + if (sampleRdCn.values().stream().filter(value -> value > 4).count() > maxVF) { + multiallelicFilter = true; + } 
+ if (sampleRdCn.values().stream().filter(value -> (value < 1 || value > 4)).count() > 4) { + if (sampleRdCn.values().stream().filter(value -> (value < 1 || value > 4)).distinct().count() > maxVF) { + multiallelicFilter = true; + } + } } - // Filter variant based on conditions - if (!filteredVariantIds.contains(largerVariant.getID())) { - filteredVariantIds.add(smallerVariant.getID()); + boolean gt5kbFilter = false; + final List allowedAlleleIndices = Arrays.asList(-1, 0, 1); + if (genotypes.stream().anyMatch(g -> g.getAlleles().stream().anyMatch(a -> !allowedAlleleIndices.contains(variant.getAlleleIndex(a))))) { + gt5kbFilter = true; + } else if (variant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0) >= MIN_MULTIALLELIC_EVENT_SIZE && !multiallelicFilter) { + gt5kbFilter = true; + } + + List updatedGenotypes = new ArrayList<>(genotypes.size()); + if (gt5kbFilter) { + for (final Genotype genotype : genotypes) { + final GenotypeBuilder gb = new GenotypeBuilder(genotype); + if (!genotype.isNoCall()) { + if (genotype.hasGQ() && Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN, 3).toString()) <= 2) { + gb.alleles(Arrays.asList(variant.getReference(), variant.getReference())); + } else if (genotype.hasGQ() && Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN, 0).toString()) == 3) { + gb.alleles(Arrays.asList(variant.getReference(), variant.getAlternateAllele(0))); + } else if (genotype.hasGQ()) { + gb.alleles(Arrays.asList(variant.getAlternateAllele(0), variant.getAlternateAllele(0))); + } + } + updatedGenotypes.add(gb.make()); + } + genotypes = updatedGenotypes; + } + + updatedGenotypes = new ArrayList<>(genotypes.size()); + if (multiallelicFilter) { + for (final Genotype genotype : genotypes) { + final GenotypeBuilder gb = new GenotypeBuilder(genotype); + gb.noGQ(); + gb.alleles(Arrays.asList(Allele.NO_CALL)); + gb.attribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT, genotype.getExtendedAttribute(GATKSVVCFConstants.RD_CN)); + 
gb.attribute(GATKSVVCFConstants.COPY_NUMBER_QUALITY_FORMAT, genotype.getExtendedAttribute(GATKSVVCFConstants.RD_GQ)); + updatedGenotypes.add(gb.make()); + } + genotypes = updatedGenotypes; + + builder.filter(GATKSVVCFConstants.MULTIALLELIC); + builder.attribute(GATKSVVCFConstants.SVTYPE, GATKSVVCFConstants.CNV); + builder.alleles(Arrays.asList(variant.getReference(), Allele.create("<" + GATKSVVCFConstants.CNV + ">", false))); } - } - private boolean overlaps(final VariantContext v1, final VariantContext v2) { - return v1.getContig().equals(v2.getContig()) - && v1.getStart() <= (v2.getStart() + v2.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)) - && v2.getStart() <= (v1.getStart() + v1.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)); + return genotypes; } - private double getCoverage(final VariantContext larger, final VariantContext smaller) { - final int largerStart = larger.getStart(); - final int smallerStart = smaller.getStart(); - final int largerStop = larger.getEnd(); - final int smallerStop = smaller.getEnd(); + public boolean isCalled(final VariantContextBuilder builder, final List genotypes) { + for (final Genotype genotype : genotypes) { + if (!isNoCallGt(genotype.getAlleles())) { + return true; + } + } - if (largerStart <= smallerStop && smallerStart <= largerStop) { - final int intersectionSize = Math.min(smallerStop, largerStop) - Math.max(smallerStart, largerStart) + 1; - return (double) intersectionSize / (smallerStop - smallerStart + 1); + if (builder.getAttributes().getOrDefault(GATKSVVCFConstants.SVTYPE, "").equals(GATKSVVCFConstants.CNV)) { + for (final Genotype genotype : genotypes) { + if (Integer.parseInt(genotype.getExtendedAttribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT, 2).toString()) != 2) { + return true; + } + } } - return 0.0; + + return false; + } + + private boolean isNoCallGt(final List alleles) { + if (alleles.size() == 1 && alleles.get(0).isReference()) return true; + else if (alleles.size() == 2 && alleles.get(0).isReference() && 
alleles.get(1).isReference()) return true; + else if (alleles.size() == 1 && alleles.get(0).isNoCall()) return true; + return false; } } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseOverlappingCnvs.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseOverlappingCnvs.java index 39fa7ae19aa..c359f8881c1 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseOverlappingCnvs.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseOverlappingCnvs.java @@ -335,7 +335,7 @@ private void processCn(final VariantContext v1, final VariantContext v2) { final Set smallerSampleSupport = smallerSupport.get(sample); final Genotype genotype2 = smallerVariant.getGenotype(sample); - // Condition 1: Smaller depth call is being driven by larger call + // Condition 1: Smaller depth call is driven by larger call if (largerSampleSupport.contains(GATKSVVCFConstants.EV_VALUES.get(1)) && largerSampleSupport.size() > 1 && smallerSampleSupport.equals(Collections.singleton(GATKSVVCFConstants.EV_VALUES.get(1))) && !largerIsMultiCnv) { if (largerSampleRdCn == 0) { @@ -349,7 +349,7 @@ private void processCn(final VariantContext v1, final VariantContext v2) { } } - // Condition 2: Smaller CNV is driven by larger CNV genotype + // Condition 2: Smaller call is driven by larger depth call else if (smallerSampleSupport.contains(GATKSVVCFConstants.EV_VALUES.get(1)) && smallerSampleSupport.size() > 1 && largerSampleSupport.equals(Collections.singleton(GATKSVVCFConstants.EV_VALUES.get(1))) && !genotype2.isHomRef() && !smallerIsMultiCnv && isOverlapping) { diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseOverlappingMultiallelics.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseOverlappingMultiallelics.java new file mode 100644 index 00000000000..eaef9ff3a64 --- /dev/null +++ 
b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVReviseOverlappingMultiallelics.java @@ -0,0 +1,170 @@ +package org.broadinstitute.hellbender.tools.walkers.sv; + +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.variantcontext.VariantContextBuilder; +import htsjdk.variant.variantcontext.writer.VariantContextWriter; + +import org.broadinstitute.barclay.argparser.Argument; +import org.broadinstitute.barclay.argparser.BetaFeature; +import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; +import org.broadinstitute.barclay.help.DocumentedFeature; +import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; +import org.broadinstitute.hellbender.cmdline.programgroups.StructuralVariantDiscoveryProgramGroup; +import org.broadinstitute.hellbender.engine.*; +import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants; + +import java.util.List; +import java.util.ArrayList; +import java.util.Set; +import java.util.HashSet; + +/** + * Completes an initial series of cleaning steps for a VCF produced by the GATK-SV pipeline. + * + *

+ * <h3>Inputs</h3>
+ *
+ * <ul>
+ *     <li>
+ *         VCF containing structural variant (SV) records from the GATK-SV pipeline.
+ *     </li>
+ *     <li>
+ *         TODO
+ *     </li>
+ * </ul>
+ *
+ * <h3>Output</h3>
+ *
+ * <ul>
+ *     <li>
+ *         Cleansed VCF.
+ *     </li>
+ * </ul>
+ *
+ * <h3>Usage Example</h3>
+ *
+ * <pre>
+ *     TODO
+ * </pre>
+ *
+ * <h3>Processing Steps</h3>
+ *
+ * <ol>
+ *     <li>
+ *         TODO
+ *     </li>
+ * </ol>
          + */ +@CommandLineProgramProperties( + summary = "Clean and format SV VCF", + oneLineSummary = "Clean and format SV VCF", + programGroup = StructuralVariantDiscoveryProgramGroup.class +) +@BetaFeature +@DocumentedFeature +public class SVReviseOverlappingMultiallelics extends MultiplePassVariantWalker { + @Argument( + fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, + shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, + doc = "Output VCF name" + ) + private GATKPath outputVcf; + + private VariantContextWriter vcfWriter; + + private final List overlappingVariantsBuffer = new ArrayList<>(); + private final Set filteredVariantIds = new HashSet<>(); + + @Override + protected int numberOfPasses() { return 2; } + + @Override + protected void afterNthPass(int n) {} + + @Override + public void onTraversalStart() { + vcfWriter = createVCFWriter(outputVcf); + vcfWriter.writeHeader(getHeaderForVariants()); + } + + @Override + public void closeTool() { + if (vcfWriter != null) { + vcfWriter.close(); + } + } + + @Override + protected void nthPassApply(final VariantContext variant, final ReadsContext readsContext, + final ReferenceContext referenceContext, final FeatureContext featureContext, int n) { + switch (n) { + case 0: + firstPassApply(variant); + break; + case 1: + secondPassApply(variant); + break; + } + } + + public void firstPassApply(final VariantContext variant) { + if (!variant.getFilters().contains(GATKSVVCFConstants.MULTIALLELIC)) { + return; + } + + overlappingVariantsBuffer.removeIf(vc -> !vc.getContig().equals(variant.getContig()) + || (vc.getStart() + vc.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)) < variant.getStart()); + for (final VariantContext bufferedVariant : overlappingVariantsBuffer) { + if (overlaps(bufferedVariant, variant)) { + processVariantPair(bufferedVariant, variant); + } + } + overlappingVariantsBuffer.add(variant); + } + + public void secondPassApply(final VariantContext variant) { + if 
(filteredVariantIds.contains(variant.getID())) { + return; + } + + final VariantContextBuilder builder = new VariantContextBuilder(variant); + vcfWriter.add(builder.make()); + } + + private void processVariantPair(final VariantContext v1, final VariantContext v2) { + // Determine larger variant, swapping if necessary + VariantContext largerVariant = v1; + VariantContext smallerVariant = v2; + final int length1 = v1.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0); + final int length2 = v2.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0); + if (length2 > length1) { + largerVariant = v2; + smallerVariant = v1; + } + + // Skip if coverage below expected + final double coverage = getCoverage(largerVariant, smallerVariant); + if (coverage < 0.5) { + return; + } + + // Filter variant based on conditions + if (!filteredVariantIds.contains(largerVariant.getID())) { + filteredVariantIds.add(smallerVariant.getID()); + } + } + + private boolean overlaps(final VariantContext v1, final VariantContext v2) { + return v1.getContig().equals(v2.getContig()) + && v1.getStart() <= (v2.getStart() + v2.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)) + && v2.getStart() <= (v1.getStart() + v1.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)); + } + + private double getCoverage(final VariantContext larger, final VariantContext smaller) { + final int largerStart = larger.getStart(); + final int smallerStart = smaller.getStart(); + final int largerStop = larger.getEnd(); + final int smallerStop = smaller.getEnd(); + + if (largerStart <= smallerStop && smallerStart <= largerStop) { + final int intersectionSize = Math.min(smallerStop, largerStop) - Math.max(smallerStart, largerStart) + 1; + return (double) intersectionSize / (smallerStop - smallerStart + 1); + } + return 0.0; + } +} From 8c4e09172ad7d1ea056fdc362428a85259b787f9 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Thu, 20 Mar 2025 11:41:24 -0400 Subject: [PATCH 58/58] Syntax error in MultiVariantWalker --- 
.../broadinstitute/hellbender/engine/MultiVariantWalker.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/broadinstitute/hellbender/engine/MultiVariantWalker.java b/src/main/java/org/broadinstitute/hellbender/engine/MultiVariantWalker.java index 8c13e147641..7f12ff641e2 100644 --- a/src/main/java/org/broadinstitute/hellbender/engine/MultiVariantWalker.java +++ b/src/main/java/org/broadinstitute/hellbender/engine/MultiVariantWalker.java @@ -86,7 +86,7 @@ protected void initializeDrivingVariants() { features.addToFeatureSources(0, featureInput, VariantContext.class, cloudPrefetchBuffer, cloudIndexPrefetchBuffer, referenceArguments.getReferencePath()); } - ); 9434, + ); final boolean skipDictionaryValidation = !seqValidationArguments.performSequenceDictionaryValidation() || !doDictionaryCrossValidation(); if (skipDictionaryValidation && !hasReference() && getMasterSequenceDictionary() == null) {