From 2c72e6dc2d2d6537df13c011e75753721e0db754 Mon Sep 17 00:00:00 2001
From: fcaretti
Date: Fri, 21 Jun 2024 16:17:15 +0200
Subject: [PATCH] feat(rule): add VEP annotation (no wrapper)

Replace the wrapper-based VEP rules (annotate_variants, get_vep_cache,
download_vep_plugins) with three explicit rules in workflow/rules/vep.smk:

- download_vep_cache: fetch the cache archive from config["vep"]["url"]
  with curl and store it as <cache_dir>/<zip_name>.
- unzip_vep_cache: extract that archive; its declared output is the
  directory <cache_dir>/<species>.
- vep_annotation: run `vep --offline --cache` inside the container image
  given by config["vep"]["image"] on results/calls/filtered_calls.vcf and
  write results/calls/annotated_calls.vcf.

`rule all` now requests the annotated VCF, and both config files gain the
`vep` keys the new rules read (cache_dir, zip_name, url, image, species).
---
Review notes (changes relative to the submitted patch; hunk sizes are
unchanged):

- workflow/envs/curl.yml: the environment was named "wget" although it
  installs curl; renamed to "curl".
- download_vep_cache: curl now runs with --fail, so an HTTP error makes the
  rule fail instead of saving the error page as the archive.
- unzip_vep_cache: tar now extracts next to the archive (-C) instead of into
  the working directory, so the declared output <cache_dir>/<species> is
  produced (assuming the archive's top-level directory is the species name,
  as in the Ensembl indexed cache referenced by the config).
- vep_annotation: --species now uses params.species, which was declared but
  ignored in favour of a hard-coded "homo_sapiens".
- Line breaks and indentation were reconstructed from a whitespace-collapsed
  copy; the blob hashes on the "index" lines are those of the original
  submission.

Open points, not changed here:

- workflow/envs/unzip.yml installs htslib, but the rule only calls tar.
- cache_dir: "" makes "<cache_dir>/<zip_name>" resolve to the filesystem
  root and leaves `mkdir -p` without an operand.

 .test/config/config.yml | 11 +++++-
 config/config.yml       | 12 +++---
 workflow/Snakefile      |  3 +-
 workflow/envs/curl.yml  |  5 +++
 workflow/envs/unzip.yml |  7 ++++
 workflow/rules/vep.smk  | 83 +++++++++++++++++++++++++----------------
 6 files changed, 81 insertions(+), 40 deletions(-)
 create mode 100644 workflow/envs/curl.yml
 create mode 100644 workflow/envs/unzip.yml

diff --git a/.test/config/config.yml b/.test/config/config.yml
index a3538fa..0509166 100644
--- a/.test/config/config.yml
+++ b/.test/config/config.yml
@@ -10,4 +10,13 @@ known_sites:
   filename: "placeholder.vcf"
 
 filtering:
-  params: ""
\ No newline at end of file
+  params: ""
+
+vep:
+  cache_dir: ""
+  zip_name: "homo_sapiens_vep_112_GRCh38.tar.gz"
+  url: "https://ftp.ensembl.org/pub/release-112/variation/indexed_vep_cache/homo_sapiens_vep_112_GRCh38.tar.gz"
+  image: "docker://ensemblorg/ensembl-vep:release_112.0"
+  filters: "--filter "
+  impact_levels: [ "MODERATE", "HIGH"]
+  species: "homo_sapiens"
\ No newline at end of file
diff --git a/config/config.yml b/config/config.yml
index 6c2e1d0..cb51e12 100644
--- a/config/config.yml
+++ b/config/config.yml
@@ -13,8 +13,10 @@ filtering:
   params: "ex: -i 'QUAL > 30 && INFO/DP > 10'"
 
 vep:
-  cache_dir: "cache_folder"
-  plugins_dir: "plugins_folder"
-  species: ""
-  build: ""
-  release: ""
\ No newline at end of file
+  cache_dir: ""
+  zip_name: "ex: homo_sapiens_vep_112_GRCh38.tar.gz"
+  url: "ex: https://ftp.ensembl.org/pub/release-112/variation/indexed_vep_cache/homo_sapiens_vep_112_GRCh38.tar.gz"
+  image: "ex: docker://ensemblorg/ensembl-vep:release_112.0"
+  filters: "--filter "
+  impact_levels: [ "MODERATE", "HIGH"]
+  species: "ex: homo_sapiens"
\ No newline at end of file
diff --git a/workflow/Snakefile b/workflow/Snakefile
index c970c9c..6588379 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -39,7 +39,7 @@ haplo_calls = "results/calls/calls_gatk.vcf"
 
 rule all:
     input:
-        "results/calls/filtered_calls.vcf",
+        "results/calls/annotated_calls.vcf",
         first_summaries,
         second_summaries,
 
@@ -52,3 +52,4 @@ include: "rules/recalibration.smk"
 include: "rules/alignment_summary.smk"
 include: "rules/gatk_haplocaller.smk"
 include: "rules/filter.smk"
+include: "rules/vep.smk"
diff --git a/workflow/envs/curl.yml b/workflow/envs/curl.yml
new file mode 100644
index 0000000..9e3ed03
--- /dev/null
+++ b/workflow/envs/curl.yml
@@ -0,0 +1,5 @@
+name: curl
+channels:
+  - conda-forge
+dependencies:
+  - curl=8.8.0
\ No newline at end of file
diff --git a/workflow/envs/unzip.yml b/workflow/envs/unzip.yml
new file mode 100644
index 0000000..46d22db
--- /dev/null
+++ b/workflow/envs/unzip.yml
@@ -0,0 +1,7 @@
+name: unzip
+channels:
+  - bioconda
+  - conda-forge
+  - defaults
+dependencies:
+  - htslib=1.19.1
\ No newline at end of file
diff --git a/workflow/rules/vep.smk b/workflow/rules/vep.smk
index 8133fde..0dc5e02 100644
--- a/workflow/rules/vep.smk
+++ b/workflow/rules/vep.smk
@@ -1,43 +1,60 @@
-rule annotate_variants:
-    input:
-        calls="results/calls/calls_gatk.vcf",  # .vcf, .vcf.gz or .bcf
-        cache=config["vep"]["cache_dir"],  # can be omitted if fasta and gff are specified
-        plugins=config["vep"]["plugins_dir"],
-        fasta=reference,
-        fai=reference_idx,
+rule download_vep_cache:
     output:
-        calls="results/calls/annotated_calls.vcf",  # .vcf, .vcf.gz or .bcf
-        stats="results/calls/variants.html",
-    params:
-        # Pass a list of plugins to use, see https://www.ensembl.org/info/docs/tools/vep/script/vep_plugins.html
-        # Plugin args can be added as well, e.g. via an entry "MyPlugin,1,FOO", see docs.
-        plugins=["LoFtool"],
-        extra="--everything",  # optional: extra arguments
+        expand(
+            "{dir}/{zip_name}",
+            dir=config["vep"]["cache_dir"],
+            zip_name=config["vep"]["zip_name"],
+        ),
     log:
-        "logs/vep/annotate.log",
-    threads: 4
-    wrapper:
-        "v3.12.1/bio/vep/annotate"
+        log_file="logs/vep/download_vep_cache.log",
+    params:
+        cache_url=lambda wc: config["vep"]["url"],
+        directory=config["vep"]["cache_dir"],
+    conda:
+        "../envs/curl.yml"  # Updated to use a conda environment with curl
+    shell:
+        """
+        mkdir -p {params.directory}
+        curl -L --fail -o {output} {params.cache_url} >> {log.log_file} 2>&1
+        """
 
 
-rule get_vep_cache:
+rule unzip_vep_cache:
+    input:
+        tar_file=expand(
+            "{dir}/{zip_name}",
+            dir=config["vep"]["cache_dir"],
+            zip_name=config["vep"]["zip_name"],
+        ),
     output:
-        directory(config["vep"]["cache_dir"]),
-    params:
-        species=config["vep"]["species"],
-        build=config["vep"]["build"],
-        release=config["vep"]["release"],
+        species_dir=directory("{cache_dir}/{species}".format(**config["vep"])),
     log:
-        "logs/vep/cache.log",
-    cache: "omit-software"  # save space and time with between workflow caching (see docs)
-    wrapper:
-        "v3.12.1/bio/vep/cache"
+        log_file="logs/vep/unzip_vep_cache.log",
+    conda:
+        "../envs/unzip.yml"
+    shell:
+        """
+        tar -xzvf {input.tar_file} -C "$(dirname {input.tar_file})" >> {log.log_file} 2>&1
+        """
 
 
-rule download_vep_plugins:
+rule vep_annotation:
+    input:
+        vcf="results/calls/filtered_calls.vcf",
+        dir="{cache_dir}/{species}".format(**config["vep"]),
     output:
-        temp(directory(config["vep"]["plugins_dir"])),
+        annotated_vcf="results/calls/annotated_calls.vcf",
     params:
-        release=config["vep"]["release"],
-    wrapper:
-        "v3.12.1/bio/vep/plugins"
+        cache_dir=lambda wc: config["vep"]["cache_dir"],
+        species=lambda wc: config["vep"]["species"],
+    container:
+        config["vep"]["image"]
+    resources:
+        cores=4,
+    log:
+        log_file="logs/vep/vep_annotation.log",
+    shell:
+        """
+        vep --input_file {input.vcf} --output_file {output.annotated_vcf} --offline --vcf --species {params.species} \
+        --cache --dir_cache {params.cache_dir} --force_overwrite --fork {resources.cores} > {log.log_file} 2>&1
+        """