Skip to content

Commit

Permalink
Adding VQSR steps to filter SNPs
Browse files Browse the repository at this point in the history
  • Loading branch information
skchronicles committed Jun 18, 2024
1 parent f65fe3c commit 07310a4
Show file tree
Hide file tree
Showing 4 changed files with 188 additions and 7 deletions.
20 changes: 20 additions & 0 deletions config/cluster/slurm.json
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,26 @@
"mem": "64G",
"time": "1-12:00:00"
},
"gatk_germline_build_vqsr_snps": {
"threads": "8",
"mem": "48G",
"time": "4-12:00:00"
},
"gatk_germline_apply_vqsr_snps": {
"threads": "8",
"mem": "48G",
"time": "2-00:00:00"
},
"gatk_germline_build_vqsr_indels": {
"threads": "8",
"mem": "48G",
"time": "4-12:00:00"
},
"gatk_germline_apply_vqsr_indels": {
"threads": "8",
"mem": "48G",
"time": "2-00:00:00"
},
"hla": {
"threads": "8",
"partition": "norm",
Expand Down
22 changes: 21 additions & 1 deletion config/cluster/uge.json
Original file line number Diff line number Diff line change
Expand Up @@ -225,7 +225,27 @@
"mem": "12G",
"partition": "",
"threads": "6"
},
},
"gatk_germline_build_vqsr_snps": {
"threads": "8",
"partition": "",
"mem": "6G"
},
"gatk_germline_apply_vqsr_snps": {
"threads": "8",
"partition": "",
"mem": "6G"
},
"gatk_germline_build_vqsr_indels": {
"threads": "8",
"partition": "",
"mem": "6G"
},
"gatk_germline_apply_vqsr_indels": {
"threads": "8",
"partition": "",
"mem": "6G"
},
"hla": {
"mem": "12G",
"partition": "",
Expand Down
2 changes: 2 additions & 0 deletions config/genome.json
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@
"SLIVARGNOMAD3": "/data/OpenOmics/references/genome-seek/slivar/gnomad.hg38.genomes.v3.fix.zip",
"SLIVARTOPMED": "/data/OpenOmics/references/genome-seek/slivar/topmed.hg38.dbsnp.151.zip",
"GISTIC_REFGENE": "/data/OpenOmics/references/genome-seek/gistic/hg38.UCSC.add_miR.160920.refgene.mat",
"HAPMAP": "/data/OpenOmics/references/genome-seek/GATK_resource_bundle/hg38bundle/hapmap_3.3.hg38.vcf.gz",
"HLA_LA_GRAPH": "/data/OpenOmics/references/genome-seek/HLA-LA/graphs/PRG_MHC_GRCh38_withIMGT",
"HMFTOOLS_REF_VERSION": "38",
"HMFTOOLS_AMBER_JAR": "/data/OpenOmics/references/genome-seek/hmftools/amber-3.5.jar",
Expand Down Expand Up @@ -97,6 +98,7 @@
"OCTOPUS_SOMATIC_FOREST_MODEL": "/data/OpenOmics/references/genome-seek/Octopus/somatic.v0.7.4.forest",
"OCTOPUS_GERMLINE_FOREST_MODEL": "/data/OpenOmics/references/genome-seek/Octopus/germline.v0.7.4.forest",
"OCTOPUS_ERROR_MODEL": "PCR-FREE.NOVASEQ",
"OMNI": "/data/OpenOmics/references/genome-seek/GATK_resource_bundle/hg38bundle/1000G_omni2.5.hg38.vcf.gz",
"PEDDY_FILTER": "chr22",
"PON": "/data/OpenOmics/references/genome-seek/PON/hg38.noCOSMIC_ClinVar.pon.vcf.gz",
"SEQUENZA_GC": "/data/OpenOmics/references/genome-seek/FREEC/hg38_gc50Base.txt.gz",
Expand Down
151 changes: 145 additions & 6 deletions workflow/rules/gatk_germline.smk
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,8 @@ rule gatk_germline_haplotypecaller:
memory = lambda _: int(
int(allocated("mem", "gatk_germline_haplotypecaller", cluster).lower().rstrip('g')) * \
int(allocated("threads", "gatk_germline_haplotypecaller", cluster))
)-1 if run_mode == "uge" \
else allocated("mem", "gatk_germline_haplotypecaller", cluster).lower().rstrip('g'),
)-2 if run_mode == "uge" \
else int(allocated("mem", "gatk_germline_haplotypecaller", cluster).lower().rstrip('g')) - 2,
message: "Running GATK4 HaplotypeCaller on '{input.bam}' input file"
threads: int(allocated("threads", "gatk_germline_haplotypecaller", cluster))
container: config['images']['genome-seek']
Expand Down Expand Up @@ -105,8 +105,8 @@ rule gatk_germline_merge_gvcfs_across_chromosomes:
memory = lambda _: int(
int(allocated("mem", "gatk_germline_merge_gvcfs_across_chromosomes", cluster).lower().rstrip('g')) * \
int(allocated("threads", "gatk_germline_merge_gvcfs_across_chromosomes", cluster))
)-1 if run_mode == "uge" \
else allocated("mem", "gatk_germline_merge_gvcfs_across_chromosomes", cluster).lower().rstrip('g'),
)-2 if run_mode == "uge" \
else int(allocated("mem", "gatk_germline_merge_gvcfs_across_chromosomes", cluster).lower().rstrip('g')) - 2,
threads: int(allocated("threads", "gatk_germline_merge_gvcfs_across_chromosomes", cluster))
container: config['images']['genome-seek']
envmodules: config['tools']['gatk4']
Expand Down Expand Up @@ -162,7 +162,7 @@ rule gatk_germline_list_gvcfs_across_samples:
int(allocated("mem", "gatk_germline_list_gvcfs_across_samples", cluster).lower().rstrip('g')) * \
int(allocated("threads", "gatk_germline_list_gvcfs_across_samples", cluster))
)-1 if run_mode == "uge" \
else allocated("mem", "gatk_germline_list_gvcfs_across_samples", cluster).lower().rstrip('g'),
else int(allocated("mem", "gatk_germline_list_gvcfs_across_samples", cluster).lower().rstrip('g')) - 1,
threads: int(allocated("threads", "gatk_germline_list_gvcfs_across_samples", cluster))
container: config['images']['genome-seek']
envmodules: config['tools']['gatk4']
Expand Down Expand Up @@ -328,7 +328,7 @@ rule gatk_germline_create_cohort_vcf_across_regions:
int(allocated("mem", "gatk_germline_create_cohort_vcf_across_regions", cluster).lower().rstrip('g')) * \
int(allocated("threads", "gatk_germline_create_cohort_vcf_across_regions", cluster))
)-2 if run_mode == "uge" \
else allocated("mem", "gatk_germline_create_cohort_vcf_across_regions", cluster).lower().rstrip('g'),
else int(allocated("mem", "gatk_germline_create_cohort_vcf_across_regions", cluster).lower().rstrip('g')) - 2,
threads: int(allocated("threads", "gatk_germline_create_cohort_vcf_across_regions", cluster))
container: config['images']['genome-seek']
envmodules: config['tools']['gatk4']
Expand All @@ -349,3 +349,142 @@ rule gatk_germline_create_cohort_vcf_across_regions:
--OUTPUT {output.vcf} \\
--INPUT {output.lsl}
"""


rule gatk_germline_build_vqsr_snps:
"""
Data-processing step to filter SNPs by Variant Quality Score
Recalibration (VQSR). GATK's variant calling tools are designed to be
very lenient in order to achieve a high degree of sensitivity. This is
good because it minimizes the chance of missing real variants, but it
does mean that we need to filter the raw callset they produce in order
to reduce the amount of false positives, which can be quite large. The
established way to filter the raw variant callset is to use variant
quality score recalibration (VQSR), which uses machine learning to
identify annotation profiles of variants that are likely to be real,
and assigns a VQSLOD score to each variant that is much more reliable
than the QUAL score calculated by the caller. This tool performs the
first pass in VQSR. Specifically, it builds the model that will be
used in the second step to actually filter variants.
@Input:
Joint genotyped VCF of the entire cohort across all regions.
(singleton)
@Output:
Recalibration table file for ApplyVQSR.
Tranches file with various recalibration metrics.
Rscript file to visualize the recalibration plots.
"""
input:
vcf = join(workpath, "haplotypecaller", "VCFs", "raw_variants.vcf.gz"),
output:
recal = join(workpath, "haplotypecaller", "VCFs", "SNP.output.AS.recal"),
tranch = join(workpath, "haplotypecaller", "VCFs", "SNP.output.AS.tranches"),
rscript = join(workpath, "haplotypecaller", "VCFs", "SNP.output.AS.R"),
params:
rname = "gatk_gl_bld_vqsr_snps",
# Reference files for VQSR
genome = config['references']['GENOME'],
dbsnp = config['references']['DBSNP'],
onekg = config['references']['1000GSNP'],
hapmap = config['references']['HAPMAP'],
omni = config['references']['OMNI'],
# For UGE/SGE clusters memory is allocated
# per cpu, so we must calculate total mem
# as the product of threads and memory
memory = lambda _: int(
int(allocated("mem", "gatk_germline_build_vqsr_snps", cluster).lower().rstrip('g')) * \
int(allocated("threads", "gatk_germline_build_vqsr_snps", cluster))
)-2 if run_mode == "uge" \
else int(allocated("mem", "gatk_germline_build_vqsr_snps", cluster).lower().rstrip('g')) - 2,
threads: int(allocated("threads", "gatk_germline_build_vqsr_snps", cluster))
container: config['images']['genome-seek']
envmodules: config['tools']['gatk4']
shell: """
# Build a recalibration model to score
# variant quality for filtering purposes.
# This is the first pass in a two-stage
# process called Variant Quality Score
# Recalibration (VQSR).
gatk --java-options '-Xmx{params.memory}g' VariantRecalibrator \\
--trust-all-polymorphic \\
-tranche 100.0 -tranche 99.95 -tranche 99.9 -tranche 99.8 \\
-tranche 99.6 -tranche 99.5 -tranche 99.4 -tranche 99.3 \\
-tranche 99.0 -tranche 98.0 -tranche 97.0 -tranche 90.0 \\
--max-gaussians 6 \\
--reference {params.genome} \\
--use-jdk-inflater --use-jdk-deflater \\
-V {input.vcf} \\
--resource:hapmap,known=false,training=true,truth=true,prior=15.0 \\
{params.hapmap} \\
--resource:omni,known=false,training=true,truth=false,prior=12.0 \\
{params.omni} \\
--resource:1000G,known=false,training=true,truth=false,prior=10.0 \\
{params.onekg} \\
--resource:dbsnp,known=true,training=false,truth=false,prior=2.0 \\
{params.dbsnp} \\
-an QD -an DP -an MQ -an MQRankSum -an ReadPosRankSum -an FS -an SOR \\
-mode SNP \\
-O {output.recal} \\
--tranches-file {output.tranch} \\
--rscript-file {output.rscript}
"""


rule gatk_germline_apply_vqsr_snps:
"""
Data-processing step to apply a score cutoff to filter variants based
on a recalibration table. This tool performs the second pass in a two-
stage process called Variant Quality Score Recalibration (VQSR).
Specifically, it applies filtering to the input variants based on the
recalibration table produced in the first step by VariantRecalibrator
and a target sensitivity value, which the tool matches internally to
a VQSLOD score cutoff based on the model's estimated sensitivity to
a set of true variants.
@Input:
Recalibration table file for ApplyVQSR.
Tranches file with various recalibration metrics.
Rscript file to visualize the recalibration plots.
@Output:
A recalibrated VCF file in which each variant of the requested
type is annotated with its VQSLOD and marked as filtered if
the score is below the desired quality level.
"""
input:
vcf = join(workpath, "haplotypecaller", "VCFs", "raw_variants.vcf.gz"),
recal = join(workpath, "haplotypecaller", "VCFs", "SNP.output.AS.recal"),
tranch = join(workpath, "haplotypecaller", "VCFs", "SNP.output.AS.tranches"),
output:
vcf = join(workpath, "haplotypecaller", "VCFs", "snp_recal_chunks", "snps_recal_variants_{region}.vcf.gz"),
params:
rname = "gatk_gl_apl_vqsr_snps",
# Reference files for VQSR
genome = config['references']['GENOME'],
chunk = "{region}",
# For UGE/SGE clusters memory is allocated
# per cpu, so we must calculate total mem
# as the product of threads and memory
memory = lambda _: int(
int(allocated("mem", "gatk_germline_apply_vqsr_snps", cluster).lower().rstrip('g')) * \
int(allocated("threads", "gatk_germline_apply_vqsr_snps", cluster))
)-2 if run_mode == "uge" \
else int(allocated("mem", "gatk_germline_apply_vqsr_snps", cluster).lower().rstrip('g')) - 2,
threads: int(allocated("threads", "gatk_germline_apply_vqsr_snps", cluster))
container: config['images']['genome-seek']
envmodules: config['tools']['gatk4']
shell: """
# Apply a score cutoff to filter variants
# based on a recalibration table. This is
# the last step in VQSR to filter out low
# quality variants.
gatk --java-options '-Xmx{params.memory}g' ApplyVQSR \\
--intervals {params.chunk} \\
--create-output-variant-index true \\
--reference {params.genome} \\
--use-jdk-inflater --use-jdk-deflater \\
-V {input.vcf} \\
-mode SNP \\
--recal-file {input.recal} \\
--tranches-file {input.tranch} \\
--truth-sensitivity-filter-level 99.7 \\
-O {output.vcf}
"""

0 comments on commit 07310a4

Please sign in to comment.