From 074661e3f7f4fa462a1f77c6a3f3d75346a1a6c3 Mon Sep 17 00:00:00 2001 From: David Laehnemann Date: Mon, 27 Nov 2023 10:37:19 +0100 Subject: [PATCH 1/2] feat: improve and run sra download ci test case (#276) * perf: update to latest versions of varlociraptor and vega-lite-cli * feat: include check (and `ValueError`) for uniqueness of sample-unit pairs * feat: change SRA download test case to use meaningful yeast data that should run through in a reasonable amount of time with this configuration * fix: actually run SRA download test case * snakefmt * improve error message and fix sample-unit uniqueness check * reduce scatter-gather of calling to 4 chunks in SRA test case --- .github/workflows/main.yml | 2 +- .test/config-sra/README.md | 22 ++++++++++++++++++++++ .test/config-sra/config.yaml | 28 +++++++++++++++------------- .test/config-sra/samples.tsv | 6 ++++-- .test/config-sra/scenario.yaml | 10 +++++++--- .test/config-sra/units.tsv | 7 ++++--- workflow/envs/varlociraptor.yaml | 4 ++-- workflow/envs/vega.yaml | 2 +- workflow/rules/common.smk | 11 +++++++++++ 9 files changed, 67 insertions(+), 25 deletions(-) create mode 100644 .test/config-sra/README.md diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 0ea4cbbfa..042660ebc 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -131,7 +131,7 @@ jobs: with: directory: .test snakefile: workflow/Snakefile - args: "--configfile .test/config-sra/config.yaml --show-failed-logs --use-conda -n -j 10 --conda-cleanup-pkgs cache" + args: "--configfile .test/config-sra/config.yaml --show-failed-logs --use-conda --cores all --set-scatter calling=4 --conda-cleanup-pkgs cache" stagein: | rm -rf .test/results show-disk-usage-on-error: true diff --git a/.test/config-sra/README.md b/.test/config-sra/README.md new file mode 100644 index 000000000..041ba3ab0 --- /dev/null +++ b/.test/config-sra/README.md @@ -0,0 +1,22 @@ +The files used in this test come from the following project: + +https://www.ncbi.nlm.nih.gov/bioproject/PRJNA755173 + +They are basically different (but closely related) strains of yeast that were isolated at the same geolocation. They are described in this manuscript: + +> Lee TJ et al., "Extensive sampling of Saccharomyces cerevisiae in Taiwan reveals ecology and evolution of predomesticated lineages.", Genome Res, 2022 May;32(5):864-877, https://doi.org/10.1101/gr.276286.121 + +A detailed description of samples can be found in Supplementary Table S6 in this document: +https://genome.cshlp.org/content/suppl/2022/05/02/gr.276286.121.DC1/Supplemental_Tables_.xlsx + +I basically chose samples with a comparably low coverage via looking through the list at SRA Run Selector, by restricting the `Assay Type` to `wxs` and sorting by the `Bytes` column: +https://www.ncbi.nlm.nih.gov/Traces/study/?acc=SRP334167 + +Further, I made sure that they: +* had all been collected at the same site (same `Latitude` and `Longitude` values in Supplementary Table S6) +* had all annotations in common, apart from the two variables in Supplementary Table S6 that define the two different groups: + * `Substrate isolated` defines the group `soil` made up of `PD09A` and `PD12A` + *`Medium used in isolation` defines the group `medium_L` made up of `PD13B` and `PD12A` +* all belong to the same clonal group, so that they will only differ in very few mutations + +The setup reusing `PD12A` in both groups is meant to mimic a tumor-normal setup, where only a panel of normals is available and is reused in every group. \ No newline at end of file diff --git a/.test/config-sra/config.yaml b/.test/config-sra/config.yaml index 89c8594cf..ed558bfa9 100644 --- a/.test/config-sra/config.yaml +++ b/.test/config-sra/config.yaml @@ -5,11 +5,11 @@ units: config-sra/units.tsv ref: # Number of chromosomes to consider for calling. # The first n entries of the FASTA will be considered. - n_chromosomes: 25 + n_chromosomes: 17 # Ensembl species name species: saccharomyces_cerevisiae # Ensembl release - release: 100 + release: 110 # Genome build build: R64-1-1 @@ -22,11 +22,9 @@ primers: # Estimation of tumor mutational burden. mutational_burden: - activate: false + activate: true events: - - SOMATIC_TUMOR_LOW - - SOMATIC_TUMOR_MEDIUM - - SOMATIC_TUMOR_HIGH + - changed_only calling: delly: @@ -37,13 +35,17 @@ calling: scenario: config-sra/scenario.yaml # See http://snpeff.sourceforge.net/SnpSift.html#filter filter: + candidates: "ANN['IMPACT'] in ['MODERATE', 'HIGH']" moderate: "ANN['IMPACT'] == 'MODERATE'" fdr-control: threshold: 0.05 + local: true events: present: varlociraptor: - - "PRESENT" + - base_only + - changed_only + - both filter: - moderate desc: Variants with moderate impact @@ -52,16 +54,16 @@ annotations: dbnsfp: activate: false vcfs: - activate: true + activate: false known: resources/variation.vcf.gz dgidb: activate: false vep: candidate_calls: - params: "" + params: "--fields IMPACT" plugins: [] final_calls: - params: --everything + params: "--fields IMPACT" plugins: [] params: @@ -80,12 +82,12 @@ params: call: "" # Add extra arguments for varlociraptor preprocess. By default, we limit the depth to 200. # Increase this value for panel sequencing! - preprocess: "--max-depth 200" + preprocess: "--max-depth 50" freebayes: - min_alternate_fraction: 0.05 # Reduce for calling variants with lower VAFs + min_alternate_fraction: 0.4 # Reduce for calling variants with lower VAFs gene_coverage: - min_avg_coverage: 5 + min_avg_coverage: 8 report: activate: false diff --git a/.test/config-sra/samples.tsv b/.test/config-sra/samples.tsv index 9d39093cf..924cbca05 100644 --- a/.test/config-sra/samples.tsv +++ b/.test/config-sra/samples.tsv @@ -1,3 +1,5 @@ sample_name group alias platform -a a ILLUMINA -b b ILLUMINA +PD12A medium_L base ILLUMINA +PD13B medium_L changed ILLUMINA +PD09A soil changed ILLUMINA +PD12A soil base ILLUMINA diff --git a/.test/config-sra/scenario.yaml b/.test/config-sra/scenario.yaml index 7949da67f..6b7f5e7f1 100644 --- a/.test/config-sra/scenario.yaml +++ b/.test/config-sra/scenario.yaml @@ -4,10 +4,14 @@ __definitions__: samples: - ?sample: + base: + resolution: 0.01 + universe: "[0.0,1.0]" + changed: resolution: 0.01 universe: "[0.0,1.0]" events: - present: ?f"{sample}:]0.05,1.0]" - ffpe_artifact: ?f"{sample}:]0.0,0.05]" + changed_only: "changed:]0.0,1.0] & base:0.0" + base_only: "base:]0.0,1.0] & changed:0.0" + both: "base:]0.0,1.0] & changed:]0.0,1.0]" diff --git a/.test/config-sra/units.tsv b/.test/config-sra/units.tsv index 3c27e9038..4b979be4c 100644 --- a/.test/config-sra/units.tsv +++ b/.test/config-sra/units.tsv @@ -1,3 +1,4 @@ -sample_name unit_name group fq1 fq2 adapters sra -a lane1 "-a ACGGCTAGCTA -A AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC" ERR267986 -b lane1 "-a ACGGCTAGCTA -A AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC" ERR267986 +sample_name unit_name fq1 fq2 adapters sra +PD12A u1 -a CTGTCTCTTATACACATCT -A CTGTCTCTTATACACATCT SRR15616337 +PD13B u1 -a CTGTCTCTTATACACATCT -A CTGTCTCTTATACACATCT SRR15616336 +PD09A u1 -a CTGTCTCTTATACACATCT -A CTGTCTCTTATACACATCT SRR15616341 \ No newline at end of file diff --git a/workflow/envs/varlociraptor.yaml b/workflow/envs/varlociraptor.yaml index bdf3ea740..b02570d2a 100644 --- a/workflow/envs/varlociraptor.yaml +++ b/workflow/envs/varlociraptor.yaml @@ -3,5 +3,5 @@ channels: - bioconda - nodefaults dependencies: - - varlociraptor =8.3 - - vega-lite-cli =3.4 + - varlociraptor =8.4 + - vega-lite-cli =5.16 diff --git a/workflow/envs/vega.yaml b/workflow/envs/vega.yaml index 043f2c870..72a75f46f 100644 --- a/workflow/envs/vega.yaml +++ b/workflow/envs/vega.yaml @@ -1,4 +1,4 @@ channels: - conda-forge dependencies: - - vega-lite-cli =3.4 + - vega-lite-cli =5.16 diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 4c98ac2c2..caf07760e 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -227,11 +227,22 @@ def get_sra_reads(sample, unit, fq): def get_raw_reads(sample, unit, fq): pattern = units.loc[sample].loc[unit, fq] + if pd.isna(pattern): assert fq.startswith("fq") fq = fq[len("fq") :] return get_sra_reads(sample, unit, fq) + if type(pattern) is not str and len(pattern) > 1: + raise ValueError( + f"Multiple units.tsv entries found for sample '{sample}' and " + f"unit '{unit}'.\n" + "The units.tsv should contain only one entry for each combination " + "of sample and unit.\n" + "Found:\n" + f"{pattern}" + ) + if "*" in pattern: files = sorted(glob.glob(units.loc[sample].loc[unit, fq])) if not files: From 12f22e9f6c1127546ed9e20add76bdebf8bbec36 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20M=C3=B6lder?= Date: Wed, 29 Nov 2023 16:35:52 +0100 Subject: [PATCH 2/2] feat: map primers by bwa and update datavzrd template (#275) * feat: replace bowtie * set min score * fmt * cleanup * cleanup * set primer params * fmt --- workflow/envs/bowtie.yaml | 6 --- workflow/envs/bwa.yaml | 5 -- .../variant-calls-template.datavzrd.yaml | 50 ++++++++++++++----- workflow/rules/common.smk | 34 +++++++++---- workflow/rules/primers.smk | 42 +++++----------- 5 files changed, 72 insertions(+), 65 deletions(-) delete mode 100644 workflow/envs/bowtie.yaml delete mode 100644 workflow/envs/bwa.yaml diff --git a/workflow/envs/bowtie.yaml b/workflow/envs/bowtie.yaml deleted file mode 100644 index 84c3d3d69..000000000 --- a/workflow/envs/bowtie.yaml +++ /dev/null @@ -1,6 +0,0 @@ -channels: - - conda-forge - - bioconda -dependencies: - - bowtie =1.3 - - samtools =1.12 \ No newline at end of file diff --git a/workflow/envs/bwa.yaml b/workflow/envs/bwa.yaml deleted file mode 100644 index 9e71f2a5a..000000000 --- a/workflow/envs/bwa.yaml +++ /dev/null @@ -1,5 +0,0 @@ -channels: - - conda-forge - - bioconda -dependencies: - - bwa =0.7.17 diff --git a/workflow/resources/datavzrd/variant-calls-template.datavzrd.yaml b/workflow/resources/datavzrd/variant-calls-template.datavzrd.yaml index a526cf894..e31446c37 100644 --- a/workflow/resources/datavzrd/variant-calls-template.datavzrd.yaml +++ b/workflow/resources/datavzrd/variant-calls-template.datavzrd.yaml @@ -224,7 +224,7 @@ views: scale: ordinal revel: optional: true - plot: + plot: heatmap: scale: "linear" domain: [0.0, 1.0] @@ -232,14 +232,16 @@ views: - white - "#ff5555" "regex('.+: allele frequency')": - plot: - ticks: + plot: + heatmap: scale: "linear" - domain: [0.0, 1.0] - aux-domain-columns: - - "regex('.+: allele frequency')" + domain: [0.0, 0.05, 1.0] + range: + - white + - "#d3e9f8" + - "#1f77b4" "regex('.+: read depth')": - plot: + plot: ticks: scale: "linear" aux-domain-columns: @@ -252,6 +254,16 @@ views: range: - white - "#1f77b4" + "gnomad genome af": + optional: true + plot: + heatmap: + scale: "linear" + domain: [0.0, 0.01, 1.0] + range: + - white + - "#adebad" + - "#29a329" gene: display-mode: hidden feature: @@ -328,14 +340,16 @@ views: - "#ec9b00" - "#ecca00" "regex('.+: allele frequency')": - plot: - ticks: + plot: + heatmap: scale: "linear" - domain: [0.0, 1.0] - aux-domain-columns: - - "regex('.+: allele frequency')" + domain: [0.0, 0.05, 1.0] + range: + - white + - "#d3e9f8" + - "#1f77b4" "regex('.+: read depth')": - plot: + plot: ticks: scale: "linear" aux-domain-columns: @@ -348,6 +362,16 @@ views: range: - white - "#1f77b4" + "gnomad genome af": + optional: true + plot: + heatmap: + scale: "linear" + domain: [0.0, 0.01, 1.0] + range: + - white + - "#adebad" + - "#29a329" id: optional: true display-mode: hidden diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index caf07760e..13c713ae5 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -927,15 +927,9 @@ def get_dgidb_datasources(): return "" -def get_bowtie_insertsize(): - if config["primers"]["trimming"].get("library_length", 0) != 0: - return "-X {}".format(config["primers"]["trimming"].get("library_length")) - return "" - - def get_filter_params(wc): if isinstance(get_panel_primer_input(wc.panel), list): - return "-b -f 2" + return "-b -F 12" return "-b -F 4" @@ -945,10 +939,28 @@ def get_single_primer_flag(wc): return "" -def format_bowtie_primers(wc, primers): - if isinstance(primers, list): - return "-1 {r1} -2 {r2}".format(r1=primers[0], r2=primers[1]) - return primers +def get_shortest_primer_length(primers): + primers = primers if isinstance(primers, list) else [primers] + # set to 32 to match bwa-mem default value considering offset of 2 + min_length = 32 + for primer_file in primers: + with open(primer_file, "r") as p: + min_primer = min( + [len(p.strip()) for i, p in enumerate(p.readlines()) if i % 2 == 1] + ) + min_length = min(min_length, min_primer) + return min_length + + +def get_primer_extra(wc, input): + extra = rf"-R '@RG\tID:{wc.panel}\tSM:{wc.panel}' -L 100" + min_primer_len = get_shortest_primer_length(input.reads) + # Check if shortest primer is below default values + if min_primer_len < 32: + extra += f" -T {min_primer_len - 2}" + if min_primer_len < 19: + extra += f" -k {min_primer_len}" + return extra def get_datavzrd_data(impact="coding"): diff --git a/workflow/rules/primers.smk b/workflow/rules/primers.smk index 1929ef077..56a599672 100644 --- a/workflow/rules/primers.smk +++ b/workflow/rules/primers.smk @@ -44,40 +44,22 @@ rule trim_primers: "fgbio TrimPrimers -H -i {input.bam} -p {input.primers} -s {params.sort_order} {params.single_primer} -o {output.trimmed} &> {log}" -rule bowtie_build: +rule map_primers: input: - genome, - output: - directory("resources/bowtie_build/"), - params: - prefix=f"resources/bowtie_build/{genome_name}", - log: - "logs/bowtie/build.log", - conda: - "../envs/bowtie.yaml" - cache: True - shell: - "mkdir {output} & " - "bowtie-build {input} {params.prefix} &> {log}" - - -rule bowtie_map: - input: - primers=lambda w: get_panel_primer_input(w.panel), - idx="resources/bowtie_build", + reads=lambda wc: get_panel_primer_input(wc.panel), + idx=rules.bwa_index.output, output: "results/primers/{panel}_primers.bam", - params: - primers=lambda wc, input: format_bowtie_primers(wc, input.primers), - prefix=f"resources/bowtie_build/{genome_name}", - insertsize=get_bowtie_insertsize(), - primer_format=lambda wc, input: "-f" if input_is_fasta(input.primers) else "", log: - "logs/bowtie/{panel}_map.log", - conda: - "../envs/bowtie.yaml" - shell: - "bowtie {params.primers} -x {params.prefix} {params.insertsize} {params.primer_format} -S | samtools view -b - > {output} 2> {log}" + "logs/bwa_mem/{panel}.log", + params: + extra=lambda wc, input: get_primer_extra(wc, input), + sorting="none", # Can be 'none', 'samtools' or 'picard'. + sort_order="queryname", # Can be 'queryname' or 'coordinate'. + sort_extra="", # Extra args for samtools/picard. + threads: 8 + wrapper: + "v2.13.0/bio/bwa/mem" rule filter_unmapped_primers: