Skip to content

Commit

Permalink
Adding genotype refinement rule.
Browse files Browse the repository at this point in the history
  • Loading branch information
skchronicles committed Jun 20, 2024
1 parent c21c80e commit 1173b0d
Show file tree
Hide file tree
Showing 4 changed files with 86 additions and 0 deletions.
5 changes: 5 additions & 0 deletions config/cluster/slurm.json
Original file line number Diff line number Diff line change
Expand Up @@ -254,6 +254,11 @@
"mem": "48G",
"time": "2-00:00:00"
},
"gatk_germline_scatter_genotype_refinement": {
"threads": "2",
"mem": "48G",
"time": "2-00:00:00"
},
"hla": {
"threads": "8",
"partition": "norm",
Expand Down
5 changes: 5 additions & 0 deletions config/cluster/uge.json
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,11 @@
"partition": "",
"mem": "6G"
},
"gatk_germline_scatter_genotype_refinement": {
"mem": "24G",
"partition": "",
"threads": "2"
},
"hla": {
"mem": "12G",
"partition": "",
Expand Down
2 changes: 2 additions & 0 deletions config/genome.json
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@
"chrX:120000001-150000001", "chrX:150000001-156040895", "chrY:1-30000001",
"chrY:30000001-57227415", "chrM:1-16569"
],
"EXAC": "/data/OpenOmics/references/genome-seek/GATK_resource_bundle/hg38bundle/exacv1_grch38_release1_ExAC.r1.sites.liftover.GRCh38.vcf.gz",
"GENOME": "/data/OpenOmics/references/genome-seek/Homo_sapiens_assembly38.fasta",
"GNOMAD": "/data/OpenOmics/references/genome-seek/GNOMAD/somatic-hg38-af-only-gnomad.hg38.vcf.gz",
"SLIVARGNOMAD": "/data/OpenOmics/references/genome-seek/slivar/gnomad.hg38.v2.zip",
Expand Down Expand Up @@ -116,6 +117,7 @@
"SPECIES": "homo_sapiens"
},
"1000GSNP": "/data/OpenOmics/references/genome-seek/GATK_resource_bundle/hg38bundle/1000G_phase1.snps.high_confidence.hg38.vcf.gz",
"1000G": "/data/OpenOmics/references/genome-seek/1000G/ALL.GRCh38_sites.nuclear.20170504.vcf.gz",
"VEP_BUILD": "GRCh38",
"VEP_DATA": "/data/OpenOmics/references/genome-seek/VEP/106/cache",
"VEP_SPECIES": "homo_sapiens",
Expand Down
74 changes: 74 additions & 0 deletions workflow/rules/gatk_germline.smk
Original file line number Diff line number Diff line change
Expand Up @@ -628,3 +628,77 @@ rule gatk_germline_apply_vqsr_indels:
--truth-sensitivity-filter-level 99.7 \\
-O {output.vcf}
"""


rule gatk_germline_scatter_genotype_refinement:
"""
Data-processing step to calculate genotype posterior probabilities
given known population genotypes. The tool calculates the posterior
genotype probability for each sample genotype in a given VCF format
callset. This tool uses use extra information like allele frequencies
in relevant populations to further refine the genotype assignments.
@Input:
A recalibrated VCF file in which each variant of the requested
type is annotated with its VQSLOD and marked as filtered if
the score is below the desired quality level containing both
SNPs and INDELs for a given region.
(gather-per-cohort-scatter-across-regions)
@Output:
Genotype refined VCF with the following information: Genotype
posteriors added to the FORMAT fields ("PP"), genotypes and GQ
assigned according to these posteriors, per-site genotype priors
added to the INFO field ("PG").
"""
input:
vcf = join(workpath, "haplotypecaller", "VCFs", "snp_indel_recal_chunks", "snps_and_indels_recal_variants_{region}.vcf.gz"),
output:
tmp = temp(
join(workpath, "haplotypecaller", "VCFs", "gtype_temp_chunks", "snps_and_indels_recal_refinement_variants_{region}.vcf.gz")
),
vcf = temp(
join(workpath, "haplotypecaller", "VCFs", "gtype_fixed_chunks", "snps_and_indels_recal_refinement_variants_{region}.GTfix.vcf.gz")
),
params:
rname = "gatk_gl_scatter_gtype_refine",
# Reference files for GType Refinement
genome = config['references']['GENOME'],
onekg = config['references']['1000G'],
exac = config['references']['EXAC'],
chunk = "{region}",
# For UGE/SGE clusters memory is allocated
# per cpu, so we must calculate total mem
# as the product of threads and memory
memory = lambda _: int(
int(allocated("mem", "gatk_germline_scatter_genotype_refinement", cluster).lower().rstrip('g')) * \
int(allocated("threads", "gatk_germline_scatter_genotype_refinement", cluster))
)-2 if run_mode == "uge" \
else int(allocated("mem", "gatk_germline_scatter_genotype_refinement", cluster).lower().rstrip('g')) - 2,
threads: int(allocated("threads", "gatk_germline_scatter_genotype_refinement", cluster))
container: config['images']['genome-seek']
envmodules:
config['tools']['gatk4'],
config['tools']['bcftools']
shell: """
# Calculate genotype posterior probabilities
# in relevant populations to further refine
# the genotype assignments.
gatk --java-options '-Xmx{params.memory}g' CalculateGenotypePosteriors \\
--reference {params.genome} \\
--use-jdk-inflater --use-jdk-deflater \\
-V {input.vcf} \\
-supporting {params.onekg} \\
-supporting {params.exac} \\
-O {output.tmp} \\
-L {params.chunk}
# Unphase all genotypes and sort by allele
# frequency, example: (1|0 becomes 0/1)
bcftools +setGT \\
{output.tmp} \\
-O z \\
-o {output.vcf} \\
-- -t a -n u
# Create an tabix index for the VCF file
tabix -p vcf {output.vcf}
"""

0 comments on commit 1173b0d

Please sign in to comment.