Skip to content

Commit

Permalink
Adding gather steps for merging calls across chromosomes
Browse files Browse the repository at this point in the history
  • Loading branch information
skchronicles committed Jun 13, 2024
1 parent 1f0343b commit b8ce9f7
Show file tree
Hide file tree
Showing 2 changed files with 86 additions and 8 deletions.
21 changes: 21 additions & 0 deletions workflow/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,12 @@ wes_bed_file = config['references']['WES_BED'] # default: gencode_v44_protein
# Run WES pipeline, default: False
run_wes = (wes_bed_provided != "None" or str_bool(config['options']['wes_mode']))

# Call variants with GATK4's set of
# best practices for germline variants
# only runs if provided --gatk-germline,
# default: False
call_gatk_germline = str_bool(config['options']['gatk_germline'])

# List of somatic callers
# TODO: add as cli option,
# make sure at least one
Expand All @@ -97,6 +103,7 @@ purple_callers = list(set(purple_callers) & set(somatic_callers))
with open(join('config', 'cluster', '{0}.json'.format(run_mode))) as fh:
cluster = json.load(fh)


# Final ouput files of the pipeline
rule all:
input:
Expand Down Expand Up @@ -517,6 +524,19 @@ rule all:
),
call_somatic
),
# Call germline variants with GATK4 HaplotypeCaller,
# scattered/merged on chroms per-sample, only runs
# if provided --gatk-germline and NOT WES mode,
# @imported from rules/germline.smk
# expand(
# join(workpath, "haplotypecaller", "gVCFs", "chunks", "{name}", "{chrom}.g.vcf.gz"),
# name=provided(samples, call_gatk_germline and not run_wes),
# chrom=provided(chunks, call_gatk_germline and not run_wes)
# ),
# expand(
# join(workpath, "haplotypecaller", "gVCFs", "{name}.g.vcf.gz"),
# name=provided(samples, call_gatk_germline and not run_wes)
# ),
# MultiQC, QC report and sample statistics
# @imported from rules/qc.smk
join(workpath, "QC", "MultiQC_Report.html"),
Expand All @@ -533,4 +553,5 @@ include: join("rules", "open_cravat.smk")
include: join("rules", "gatk_recal.smk")
include: join("rules", "somatic.smk")
include: join("rules", "wes.smk")
include: join("rules", "gatk_germline.smk")
include: join("rules", "hooks.smk")
73 changes: 65 additions & 8 deletions workflow/rules/gatk_germline.smk
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ from scripts.common import (
)


rule gatk_haplotypecaller:
rule gatk_germline_haplotypecaller:
"""
Call germline SNPs and indels via local re-assembly of haplotypes.
The HaplotypeCaller is capable of calling SNPs and indels simultaneously
Expand All @@ -28,13 +28,13 @@ rule gatk_haplotypecaller:
@Input:
Realigned, recalibrated BAM file (scatter-per-sample-per-chrom)
@Output:
Single-sample GVCF file with raw, unfiltered SNP and indel calls.
Single-sample per-chromosome GVCF file with raw, unfiltered SNP and indel calls.
"""
input:
bam = join(workpath, "BAM", "{name}.recal.bam"),
output:
gvcf = temp(join(workpath, "haplotypecaller", "gVCFs", "{name}.{chrom}.g.vcf.gz")),
idx = temp(join(workpath, "haplotypecaller", "gVCFs", "{name}.{chrom}.g.vcf.gz.tbi")),
gvcf = temp(join(workpath, "haplotypecaller", "gVCFs", "chunks", "{name}", "{chrom}.g.vcf.gz")),
idx = temp(join(workpath, "haplotypecaller", "gVCFs", "chunks", "{name}", "{chrom}.g.vcf.gz.tbi")),
params:
rname = "haplotype",
genome = config['references']['GENOME'],
Expand All @@ -45,12 +45,12 @@ rule gatk_haplotypecaller:
# per cpu, so we must calculate total mem
# as the product of threads and memory
memory = lambda _: int(
int(allocated("mem", "gatk_haplotypecaller", cluster).lower().rstrip('g')) * \
int(allocated("threads", "gatk_haplotypecaller", cluster))
int(allocated("mem", "gatk_germline_haplotypecaller", cluster).lower().rstrip('g')) * \
int(allocated("threads", "gatk_germline_haplotypecaller", cluster))
)-1 if run_mode == "uge" \
else allocated("mem", "gatk_haplotypecaller", cluster).lower().rstrip('g'),
else allocated("mem", "gatk_germline_haplotypecaller", cluster).lower().rstrip('g'),
message: "Running GATK4 HaplotypeCaller on '{input.bam}' input file"
threads: int(allocated("threads", "gatk_haplotypecaller", cluster))
threads: int(allocated("threads", "gatk_germline_haplotypecaller", cluster))
container: config['images']['genome-seek']
envmodules: config['tools']['gatk4']
shell: """
Expand All @@ -69,3 +69,60 @@ rule gatk_haplotypecaller:
--max-alternate-alleles 3 \\
--intervals {params.chrom}
"""


rule gatk_germline_merge_gcvfs_across_chromosomes:
"""
Data-processing step to merge gVCFs for each chromosome into a single gVCF.
This rule is intended to be used in conjunction with the HaplotypeCaller to
merge the scattered germline calling to speed up the germline calling process.
@Input:
Single-sample per-chromosome GVCF file with raw, unfiltered SNP and indel calls.
(gather-per-sample-per-chrom).
@Output:
Single-sample GVCF file with raw, unfiltered SNP and indel calls.
"""
input:
gvcfs = expand(
join(workpath, "haplotypecaller", "gVCFs", "chunks", "{{name}}", "{chrom}.g.vcf.gz"),
chrom=chunks
),
idxs = expand(
join(workpath, "haplotypecaller", "gVCFs", "chunks", "{{name}}", "{chrom}.g.vcf.gz.tbi"),
chrom=chunks
),
output:
gvcf = join(workpath, "haplotypecaller", "gVCFs", "{name}.g.vcf.gz"),
idx = join(workpath, "haplotypecaller", "gVCFs", "{name}.g.vcf.gz.tbi"),
lsl = join(workpath, "haplotypecaller", "gVCFs", "{name}.gvcfs.list"),
params:
rname = "gatk_gl_merge_chroms",
genome = config['references']['GENOME'],
sample = "{name}",
gvcfdir = join(workpath, "haplotypecaller", "gVCFs", "chunks", "{name}"),
# For UGE/SGE clusters memory is allocated
# per cpu, so we must calculate total mem
# as the product of threads and memory
memory = lambda _: int(
int(allocated("mem", "gatk_germline_merge_gcvfs_across_chromosomes", cluster).lower().rstrip('g')) * \
int(allocated("threads", "gatk_germline_merge_gcvfs_across_chromosomes", cluster))
)-1 if run_mode == "uge" \
else allocated("mem", "gatk_germline_merge_gcvfs_across_chromosomes", cluster).lower().rstrip('g'),
threads: int(allocated("threads", "gatk_germline_merge_gcvfs_across_chromosomes", cluster))
container: config['images']['genome-seek']
envmodules: config['tools']['gatk4']
shell: """
# Get a list of per-chromosome gVCFs
# to merge into a single gVCF. Avoids
# ARG_MAX issue which will limit max
# length of a command.
find {params.gvcfdir}/ -type f \\
-iname '*.g.vcf.gz' \\
> {output.lsl}
# Merge GATK Germline gVCFs for each
# chromosome into a single gVCF file.
gatk --java-options '-Xmx{params.memory}g' MergeVcfs \\
--OUTPUT {output.gvcf} \\
--INPUT {output.lsl}
"""

0 comments on commit b8ce9f7

Please sign in to comment.