Adding VQSR steps to filter SNPs

OpenOmics · Jun 18, 2024 · 07310a4 · 07310a4
1 parent f65fe3c
commit 07310a4
Show file tree

Hide file tree

Showing 4 changed files with 188 additions and 7 deletions.
diff --git a/config/cluster/slurm.json b/config/cluster/slurm.json
@@ -234,6 +234,26 @@
         "mem": "64G",
         "time": "1-12:00:00"
     },
+    "gatk_germline_build_vqsr_snps": {
+        "threads": "8",
+        "mem": "48G",
+        "time": "4-12:00:00"
+    },
+    "gatk_germline_apply_vqsr_snps": {
+        "threads": "8",
+        "mem": "48G",
+        "time": "2-00:00:00"
+    },
+    "gatk_germline_build_vqsr_indels": {
+        "threads": "8",
+        "mem": "48G",
+        "time": "4-12:00:00"
+    },
+    "gatk_germline_apply_vqsr_indels": {
+        "threads": "8",
+        "mem": "48G",
+        "time": "2-00:00:00"
+    },
     "hla": {
         "threads": "8",
         "partition": "norm",

diff --git a/config/cluster/uge.json b/config/cluster/uge.json
@@ -225,7 +225,27 @@
     "mem": "12G",
     "partition": "",
     "threads": "6"
-},
+  },
+  "gatk_germline_build_vqsr_snps": {
+    "threads": "8",
+    "partition": "",
+    "mem": "6G"
+  },
+  "gatk_germline_apply_vqsr_snps": {
+      "threads": "8",
+      "partition": "",
+      "mem": "6G"
+  },
+  "gatk_germline_build_vqsr_indels": {
+      "threads": "8",
+      "partition": "",
+      "mem": "6G"
+  },
+  "gatk_germline_apply_vqsr_indels": {
+      "threads": "8",
+      "partition": "",
+      "mem": "6G"
+  },
   "hla": {
     "mem": "12G",
     "partition": "",

diff --git a/config/genome.json b/config/genome.json
@@ -60,6 +60,7 @@
         "SLIVARGNOMAD3": "/data/OpenOmics/references/genome-seek/slivar/gnomad.hg38.genomes.v3.fix.zip",
         "SLIVARTOPMED": "/data/OpenOmics/references/genome-seek/slivar/topmed.hg38.dbsnp.151.zip",
         "GISTIC_REFGENE": "/data/OpenOmics/references/genome-seek/gistic/hg38.UCSC.add_miR.160920.refgene.mat",
+        "HAPMAP": "/data/OpenOmics/references/genome-seek/GATK_resource_bundle/hg38bundle/hapmap_3.3.hg38.vcf.gz",
         "HLA_LA_GRAPH": "/data/OpenOmics/references/genome-seek/HLA-LA/graphs/PRG_MHC_GRCh38_withIMGT",
         "HMFTOOLS_REF_VERSION": "38",
         "HMFTOOLS_AMBER_JAR": "/data/OpenOmics/references/genome-seek/hmftools/amber-3.5.jar",
@@ -97,6 +98,7 @@
         "OCTOPUS_SOMATIC_FOREST_MODEL": "/data/OpenOmics/references/genome-seek/Octopus/somatic.v0.7.4.forest",
         "OCTOPUS_GERMLINE_FOREST_MODEL": "/data/OpenOmics/references/genome-seek/Octopus/germline.v0.7.4.forest",
         "OCTOPUS_ERROR_MODEL": "PCR-FREE.NOVASEQ",
+        "OMNI": "/data/OpenOmics/references/genome-seek/GATK_resource_bundle/hg38bundle/1000G_omni2.5.hg38.vcf.gz",
         "PEDDY_FILTER": "chr22",
         "PON": "/data/OpenOmics/references/genome-seek/PON/hg38.noCOSMIC_ClinVar.pon.vcf.gz",
         "SEQUENZA_GC": "/data/OpenOmics/references/genome-seek/FREEC/hg38_gc50Base.txt.gz",

diff --git a/workflow/rules/gatk_germline.smk b/workflow/rules/gatk_germline.smk
@@ -47,8 +47,8 @@ rule gatk_germline_haplotypecaller:
         memory    = lambda _: int(
             int(allocated("mem", "gatk_germline_haplotypecaller", cluster).lower().rstrip('g')) * \
             int(allocated("threads", "gatk_germline_haplotypecaller", cluster)) 
-        )-1 if run_mode == "uge" \
-        else allocated("mem", "gatk_germline_haplotypecaller", cluster).lower().rstrip('g'),
+        )-2 if run_mode == "uge" \
+        else int(allocated("mem", "gatk_germline_haplotypecaller", cluster).lower().rstrip('g')) - 2,
     message: "Running GATK4 HaplotypeCaller on '{input.bam}' input file"
     threads: int(allocated("threads", "gatk_germline_haplotypecaller", cluster))
     container: config['images']['genome-seek']
@@ -105,8 +105,8 @@ rule gatk_germline_merge_gvcfs_across_chromosomes:
         memory  = lambda _: int(
             int(allocated("mem", "gatk_germline_merge_gvcfs_across_chromosomes", cluster).lower().rstrip('g')) * \
             int(allocated("threads", "gatk_germline_merge_gvcfs_across_chromosomes", cluster)) 
-        )-1 if run_mode == "uge" \
-        else allocated("mem", "gatk_germline_merge_gvcfs_across_chromosomes", cluster).lower().rstrip('g'),
+        )-2 if run_mode == "uge" \
+        else int(allocated("mem", "gatk_germline_merge_gvcfs_across_chromosomes", cluster).lower().rstrip('g')) - 2,
     threads: int(allocated("threads", "gatk_germline_merge_gvcfs_across_chromosomes", cluster))
     container: config['images']['genome-seek']
     envmodules: config['tools']['gatk4']
@@ -162,7 +162,7 @@ rule gatk_germline_list_gvcfs_across_samples:
             int(allocated("mem", "gatk_germline_list_gvcfs_across_samples", cluster).lower().rstrip('g')) * \
             int(allocated("threads", "gatk_germline_list_gvcfs_across_samples", cluster)) 
         )-1 if run_mode == "uge" \
-        else allocated("mem", "gatk_germline_list_gvcfs_across_samples", cluster).lower().rstrip('g'),
+        else int(allocated("mem", "gatk_germline_list_gvcfs_across_samples", cluster).lower().rstrip('g')) - 1,
     threads: int(allocated("threads", "gatk_germline_list_gvcfs_across_samples", cluster))
     container: config['images']['genome-seek']
     envmodules: config['tools']['gatk4']
@@ -328,7 +328,7 @@ rule gatk_germline_create_cohort_vcf_across_regions:
             int(allocated("mem", "gatk_germline_create_cohort_vcf_across_regions", cluster).lower().rstrip('g')) * \
             int(allocated("threads", "gatk_germline_create_cohort_vcf_across_regions", cluster)) 
         )-2 if run_mode == "uge" \
-        else allocated("mem", "gatk_germline_create_cohort_vcf_across_regions", cluster).lower().rstrip('g'),
+        else int(allocated("mem", "gatk_germline_create_cohort_vcf_across_regions", cluster).lower().rstrip('g')) - 2,
     threads: int(allocated("threads", "gatk_germline_create_cohort_vcf_across_regions", cluster))
     container: config['images']['genome-seek']
     envmodules: config['tools']['gatk4']
@@ -349,3 +349,142 @@ rule gatk_germline_create_cohort_vcf_across_regions:
         --OUTPUT {output.vcf} \\
         --INPUT {output.lsl}
     """
+
+
+rule gatk_germline_build_vqsr_snps:
+    """
+    Data-processing step to filter SNPs by Variant Quality Score 
+    Recalibration (VQSR). GATK's variant calling tools are designed to be 
+    very lenient in order to achieve a high degree of sensitivity. This is 
+    good because it minimizes the chance of missing real variants, but it 
+    does mean that we need to filter the raw callset they produce in order 
+    to reduce the amount of false positives, which can be quite large. The 
+    established way to filter the raw variant callset is to use variant 
+    quality score recalibration (VQSR), which uses machine learning to 
+    identify annotation profiles of variants that are likely to be real, 
+    and assigns a VQSLOD score to each variant that is much more reliable 
+    than the QUAL score calculated by the caller. This tool performs the 
+    first pass in VQSR. Specifically, it builds the model that will be 
+    used in the second step to actually filter variants.
+    @Input:
+        Joint genotyped VCF of the entire cohort across all regions. 
+        (singleton)
+    @Output:
+        Recalibration table file for ApplyVQSR.
+        Tranches file with various recalibration metrics.
+        Rscript file to visualize the recalibration plots.
+    """
+    input: 
+        vcf = join(workpath, "haplotypecaller", "VCFs", "raw_variants.vcf.gz"),
+    output:
+        recal   = join(workpath, "haplotypecaller", "VCFs", "SNP.output.AS.recal"),
+        tranch  = join(workpath, "haplotypecaller", "VCFs", "SNP.output.AS.tranches"),
+        rscript = join(workpath, "haplotypecaller", "VCFs", "SNP.output.AS.R"),
+    params:
+        rname  = "gatk_gl_bld_vqsr_snps",
+        # Reference files for VQSR
+        genome = config['references']['GENOME'],
+        dbsnp  = config['references']['DBSNP'],
+        onekg  = config['references']['1000GSNP'],
+        hapmap = config['references']['HAPMAP'],
+        omni   = config['references']['OMNI'], 
+        # For UGE/SGE clusters memory is allocated
+        # per cpu, so we must calculate total mem
+        # as the product of threads and memory
+        memory  = lambda _: int(
+            int(allocated("mem", "gatk_germline_build_vqsr_snps", cluster).lower().rstrip('g')) * \
+            int(allocated("threads", "gatk_germline_build_vqsr_snps", cluster)) 
+        )-2 if run_mode == "uge" \
+        else int(allocated("mem", "gatk_germline_build_vqsr_snps", cluster).lower().rstrip('g')) - 2,
+    threads: int(allocated("threads", "gatk_germline_build_vqsr_snps", cluster))
+    container: config['images']['genome-seek']
+    envmodules: config['tools']['gatk4']
+    shell: """
+    # Build a recalibration model to score 
+    # variant quality for filtering purposes.
+    # This is the first pass in a two-stage 
+    # process called Variant Quality Score 
+    # Recalibration (VQSR). 
+    gatk --java-options '-Xmx{params.memory}g' VariantRecalibrator \\
+        --trust-all-polymorphic \\
+        -tranche 100.0 -tranche 99.95 -tranche 99.9 -tranche 99.8 \\
+        -tranche 99.6 -tranche 99.5 -tranche 99.4 -tranche 99.3 \\
+        -tranche 99.0 -tranche 98.0 -tranche 97.0 -tranche 90.0 \\
+        --max-gaussians 6 \\
+        --reference {params.genome} \\
+        --use-jdk-inflater --use-jdk-deflater \\
+        -V {input.vcf} \\
+        --resource:hapmap,known=false,training=true,truth=true,prior=15.0 \\
+        {params.hapmap} \\
+        --resource:omni,known=false,training=true,truth=false,prior=12.0 \\
+        {params.omni} \\
+        --resource:1000G,known=false,training=true,truth=false,prior=10.0 \\
+        {params.onekg} \\
+        --resource:dbsnp,known=true,training=false,truth=false,prior=2.0 \\
+        {params.dbsnp} \\
+        -an QD -an DP -an MQ -an MQRankSum -an ReadPosRankSum -an FS -an SOR \\
+        -mode SNP \\
+        -O {output.recal} \\
+        --tranches-file {output.tranch} \\
+        --rscript-file {output.rscript}
+    """
+
+
+rule gatk_germline_apply_vqsr_snps:
+    """
+    Data-processing step to apply a score cutoff to filter variants based 
+    on a recalibration table. This tool performs the second pass in a two-
+    stage process called Variant Quality Score Recalibration (VQSR). 
+    Specifically, it applies filtering to the input variants based on the 
+    recalibration table produced in the first step by VariantRecalibrator 
+    and a target sensitivity value, which the tool matches internally to 
+    a VQSLOD score cutoff based on the model's estimated sensitivity to 
+    a set of true variants.
+    @Input:
+        Recalibration table file for ApplyVQSR.
+        Tranches file with various recalibration metrics.
+        Rscript file to visualize the recalibration plots.
+    @Output:
+        A recalibrated VCF file in which each variant of the requested 
+        type is annotated with its VQSLOD and marked as filtered if 
+        the score is below the desired quality level.
+    """
+    input: 
+        vcf    = join(workpath, "haplotypecaller", "VCFs", "raw_variants.vcf.gz"),
+        recal  = join(workpath, "haplotypecaller", "VCFs", "SNP.output.AS.recal"),
+        tranch = join(workpath, "haplotypecaller", "VCFs", "SNP.output.AS.tranches"),
+    output:
+        vcf = join(workpath, "haplotypecaller", "VCFs", "snp_recal_chunks", "snps_recal_variants_{region}.vcf.gz"),
+    params:
+        rname  = "gatk_gl_apl_vqsr_snps",
+        # Reference files for VQSR
+        genome = config['references']['GENOME'],
+        chunk  = "{region}",
+        # For UGE/SGE clusters memory is allocated
+        # per cpu, so we must calculate total mem
+        # as the product of threads and memory
+        memory  = lambda _: int(
+            int(allocated("mem", "gatk_germline_apply_vqsr_snps", cluster).lower().rstrip('g')) * \
+            int(allocated("threads", "gatk_germline_apply_vqsr_snps", cluster)) 
+        )-2 if run_mode == "uge" \
+        else int(allocated("mem", "gatk_germline_apply_vqsr_snps", cluster).lower().rstrip('g')) - 2,
+    threads: int(allocated("threads", "gatk_germline_apply_vqsr_snps", cluster))
+    container: config['images']['genome-seek']
+    envmodules: config['tools']['gatk4']
+    shell: """
+    # Apply a score cutoff to filter variants 
+    # based on a recalibration table. This is 
+    # the last step in VQSR to filter out low 
+    # quality variants.
+    gatk --java-options '-Xmx{params.memory}g' ApplyVQSR \\
+        --intervals {params.chunk} \\
+        --create-output-variant-index true \\
+        --reference {params.genome} \\
+        --use-jdk-inflater --use-jdk-deflater \\
+        -V {input.vcf} \\
+        -mode SNP \\
+        --recal-file {input.recal} \\
+        --tranches-file {input.tranch} \\
+        --truth-sensitivity-filter-level 99.7 \\
+        -O {output.vcf}
+    """