From 074661e3f7f4fa462a1f77c6a3f3d75346a1a6c3 Mon Sep 17 00:00:00 2001
From: David Laehnemann <david.laehnemann@hhu.de>
Date: Mon, 27 Nov 2023 10:37:19 +0100
Subject: [PATCH 1/2] feat: improve and run sra download ci test case (#276)

* perf: update to latest versions of varlociraptor and vega-lite-cli

* feat: include check (and `ValueError`) for uniqueness of sample-unit pairs

* feat: change SRA download test case to use meaningful yeast data that should run through in a reasonable amount of time with this configuration

* fix: actually run SRA download test case

* snakefmt

* improve error message and fix sample-unit uniqueness check

* reduce scatter-gather of calling to 4 chunks in SRA test case
---
 .github/workflows/main.yml       |  2 +-
 .test/config-sra/README.md       | 22 ++++++++++++++++++++++
 .test/config-sra/config.yaml     | 28 +++++++++++++++-------------
 .test/config-sra/samples.tsv     |  6 ++++--
 .test/config-sra/scenario.yaml   | 10 +++++++---
 .test/config-sra/units.tsv       |  7 ++++---
 workflow/envs/varlociraptor.yaml |  4 ++--
 workflow/envs/vega.yaml          |  2 +-
 workflow/rules/common.smk        | 11 +++++++++++
 9 files changed, 67 insertions(+), 25 deletions(-)
 create mode 100644 .test/config-sra/README.md

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 0ea4cbbfa..042660ebc 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -131,7 +131,7 @@ jobs:
       with:
         directory: .test
         snakefile: workflow/Snakefile
-        args: "--configfile .test/config-sra/config.yaml --show-failed-logs --use-conda -n -j 10 --conda-cleanup-pkgs cache"
+        args: "--configfile .test/config-sra/config.yaml --show-failed-logs --use-conda --cores all --set-scatter calling=4 --conda-cleanup-pkgs cache"
         stagein: |
           rm -rf .test/results
         show-disk-usage-on-error: true
diff --git a/.test/config-sra/README.md b/.test/config-sra/README.md
new file mode 100644
index 000000000..041ba3ab0
--- /dev/null
+++ b/.test/config-sra/README.md
@@ -0,0 +1,22 @@
+The files used in this test come from the following project:
+
+https://www.ncbi.nlm.nih.gov/bioproject/PRJNA755173
+
+They are basically different (but closely related) strains of yeast that were isolated at the same geolocation. They are described in this manuscript:
+
+> Lee TJ et al., "Extensive sampling of Saccharomyces cerevisiae in Taiwan reveals ecology and evolution of predomesticated lineages.", Genome Res, 2022 May;32(5):864-877, https://doi.org/10.1101/gr.276286.121
+
+A detailed description of samples can be found in Supplementary Table S6 in this document:
+https://genome.cshlp.org/content/suppl/2022/05/02/gr.276286.121.DC1/Supplemental_Tables_.xlsx
+
+I basically chose samples with a comparably low coverage via looking through the list at SRA Run Selector, by restricting the `Assay Type` to `wxs` and sorting by the `Bytes` column:
+https://www.ncbi.nlm.nih.gov/Traces/study/?acc=SRP334167
+
+Further, I made sure that they:
+* had all been collected at the same site (same `Latitude` and `Longitude` values in Supplementary Table S6)
+* had all annotations in common, apart from the two variables in Supplementary Table S6 that define the two different groups:
+  * `Substrate isolated` defines the group `soil` made up of `PD09A` and `PD12A`
+  *`Medium used in isolation` defines the group `medium_L` made up of `PD13B` and `PD12A`
+* all belong to the same clonal group, so that they will only differ in very few mutations
+
+The setup reusing `PD12A` in both groups is meant to mimic a tumor-normal setup, where only a panel of normals is available and is reused in every group.
\ No newline at end of file
diff --git a/.test/config-sra/config.yaml b/.test/config-sra/config.yaml
index 89c8594cf..ed558bfa9 100644
--- a/.test/config-sra/config.yaml
+++ b/.test/config-sra/config.yaml
@@ -5,11 +5,11 @@ units: config-sra/units.tsv
 ref:
   # Number of chromosomes to consider for calling.
   # The first n entries of the FASTA will be considered.
-  n_chromosomes: 25
+  n_chromosomes: 17
   # Ensembl species name
   species: saccharomyces_cerevisiae
   # Ensembl release
-  release: 100
+  release: 110
   # Genome build
   build: R64-1-1
 
@@ -22,11 +22,9 @@ primers:
 
 # Estimation of tumor mutational burden.
 mutational_burden:
-  activate: false
+  activate: true
   events:
-    - SOMATIC_TUMOR_LOW
-    - SOMATIC_TUMOR_MEDIUM
-    - SOMATIC_TUMOR_HIGH
+    - changed_only
 
 calling:
   delly:
@@ -37,13 +35,17 @@ calling:
   scenario: config-sra/scenario.yaml
   # See http://snpeff.sourceforge.net/SnpSift.html#filter
   filter:
+    candidates: "ANN['IMPACT'] in ['MODERATE', 'HIGH']"
     moderate: "ANN['IMPACT'] == 'MODERATE'"
   fdr-control:
     threshold: 0.05
+    local: true
     events: 
       present:
         varlociraptor: 
-          - "PRESENT"
+          - base_only
+          - changed_only
+          - both
         filter:
           - moderate
         desc: Variants with moderate impact
@@ -52,16 +54,16 @@ annotations:
   dbnsfp:
     activate: false
   vcfs:
-    activate: true
+    activate: false
     known: resources/variation.vcf.gz
   dgidb:
     activate: false
   vep:
     candidate_calls:
-      params: ""
+      params: "--fields IMPACT"
       plugins: []
     final_calls:
-      params: --everything
+      params: "--fields IMPACT"
       plugins: []
 
 params:
@@ -80,12 +82,12 @@ params:
     call: ""
     # Add extra arguments for varlociraptor preprocess. By default, we limit the depth to 200.
     # Increase this value for panel sequencing!
-    preprocess: "--max-depth 200"
+    preprocess: "--max-depth 50"
   freebayes:
-    min_alternate_fraction: 0.05 # Reduce for calling variants with lower VAFs
+    min_alternate_fraction: 0.4 # Reduce for calling variants with lower VAFs
 
 gene_coverage:
-  min_avg_coverage: 5
+  min_avg_coverage: 8
 
 report:
   activate: false
diff --git a/.test/config-sra/samples.tsv b/.test/config-sra/samples.tsv
index 9d39093cf..924cbca05 100644
--- a/.test/config-sra/samples.tsv
+++ b/.test/config-sra/samples.tsv
@@ -1,3 +1,5 @@
 sample_name	group	alias	platform
-a		a	ILLUMINA
-b		b	ILLUMINA
+PD12A	medium_L	base	ILLUMINA
+PD13B	medium_L	changed	ILLUMINA
+PD09A	soil	changed	ILLUMINA
+PD12A	soil	base	ILLUMINA
diff --git a/.test/config-sra/scenario.yaml b/.test/config-sra/scenario.yaml
index 7949da67f..6b7f5e7f1 100644
--- a/.test/config-sra/scenario.yaml
+++ b/.test/config-sra/scenario.yaml
@@ -4,10 +4,14 @@ __definitions__:
 
 
 samples:
-  ?sample:
+  base:
+    resolution: 0.01
+    universe: "[0.0,1.0]"
+  changed:
     resolution: 0.01
     universe: "[0.0,1.0]"
 
 events:
-  present: ?f"{sample}:]0.05,1.0]"
-  ffpe_artifact: ?f"{sample}:]0.0,0.05]"
+  changed_only: "changed:]0.0,1.0] & base:0.0"
+  base_only: "base:]0.0,1.0] & changed:0.0"
+  both: "base:]0.0,1.0] & changed:]0.0,1.0]"
diff --git a/.test/config-sra/units.tsv b/.test/config-sra/units.tsv
index 3c27e9038..4b979be4c 100644
--- a/.test/config-sra/units.tsv
+++ b/.test/config-sra/units.tsv
@@ -1,3 +1,4 @@
-sample_name	unit_name	group	fq1	fq2	adapters	sra
-a	lane1				"-a ACGGCTAGCTA -A AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC"	ERR267986
-b	lane1				"-a ACGGCTAGCTA -A AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC"	ERR267986
+sample_name	unit_name	fq1	fq2	adapters	sra
+PD12A	u1			-a CTGTCTCTTATACACATCT -A CTGTCTCTTATACACATCT	SRR15616337
+PD13B	u1			-a CTGTCTCTTATACACATCT -A CTGTCTCTTATACACATCT	SRR15616336
+PD09A	u1			-a CTGTCTCTTATACACATCT -A CTGTCTCTTATACACATCT	SRR15616341
\ No newline at end of file
diff --git a/workflow/envs/varlociraptor.yaml b/workflow/envs/varlociraptor.yaml
index bdf3ea740..b02570d2a 100644
--- a/workflow/envs/varlociraptor.yaml
+++ b/workflow/envs/varlociraptor.yaml
@@ -3,5 +3,5 @@ channels:
   - bioconda
   - nodefaults
 dependencies:
-  - varlociraptor =8.3
-  - vega-lite-cli =3.4
+  - varlociraptor =8.4
+  - vega-lite-cli =5.16
diff --git a/workflow/envs/vega.yaml b/workflow/envs/vega.yaml
index 043f2c870..72a75f46f 100644
--- a/workflow/envs/vega.yaml
+++ b/workflow/envs/vega.yaml
@@ -1,4 +1,4 @@
 channels:
   - conda-forge
 dependencies:
-  - vega-lite-cli =3.4
+  - vega-lite-cli =5.16
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index 4c98ac2c2..caf07760e 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -227,11 +227,22 @@ def get_sra_reads(sample, unit, fq):
 
 def get_raw_reads(sample, unit, fq):
     pattern = units.loc[sample].loc[unit, fq]
+
     if pd.isna(pattern):
         assert fq.startswith("fq")
         fq = fq[len("fq") :]
         return get_sra_reads(sample, unit, fq)
 
+    if type(pattern) is not str and len(pattern) > 1:
+        raise ValueError(
+            f"Multiple units.tsv entries found for sample '{sample}' and "
+            f"unit '{unit}'.\n"
+            "The units.tsv should contain only one entry for each combination "
+            "of sample and unit.\n"
+            "Found:\n"
+            f"{pattern}"
+        )
+
     if "*" in pattern:
         files = sorted(glob.glob(units.loc[sample].loc[unit, fq]))
         if not files:

From 12f22e9f6c1127546ed9e20add76bdebf8bbec36 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Felix=20M=C3=B6lder?= <felix.moelder@uk-essen.de>
Date: Wed, 29 Nov 2023 16:35:52 +0100
Subject: [PATCH 2/2] feat: map primers by bwa and update datavzrd template
 (#275)

* feat: replace bowtie

* set min score

* fmt

* cleanup

* cleanup

* set primer params

* fmt
---
 workflow/envs/bowtie.yaml                     |  6 ---
 workflow/envs/bwa.yaml                        |  5 --
 .../variant-calls-template.datavzrd.yaml      | 50 ++++++++++++++-----
 workflow/rules/common.smk                     | 34 +++++++++----
 workflow/rules/primers.smk                    | 42 +++++-----------
 5 files changed, 72 insertions(+), 65 deletions(-)
 delete mode 100644 workflow/envs/bowtie.yaml
 delete mode 100644 workflow/envs/bwa.yaml

diff --git a/workflow/envs/bowtie.yaml b/workflow/envs/bowtie.yaml
deleted file mode 100644
index 84c3d3d69..000000000
--- a/workflow/envs/bowtie.yaml
+++ /dev/null
@@ -1,6 +0,0 @@
-channels:
-  - conda-forge
-  - bioconda
-dependencies:
-  - bowtie =1.3
-  - samtools =1.12
\ No newline at end of file
diff --git a/workflow/envs/bwa.yaml b/workflow/envs/bwa.yaml
deleted file mode 100644
index 9e71f2a5a..000000000
--- a/workflow/envs/bwa.yaml
+++ /dev/null
@@ -1,5 +0,0 @@
-channels:
-  - conda-forge
-  - bioconda
-dependencies:
-    - bwa =0.7.17
diff --git a/workflow/resources/datavzrd/variant-calls-template.datavzrd.yaml b/workflow/resources/datavzrd/variant-calls-template.datavzrd.yaml
index a526cf894..e31446c37 100644
--- a/workflow/resources/datavzrd/variant-calls-template.datavzrd.yaml
+++ b/workflow/resources/datavzrd/variant-calls-template.datavzrd.yaml
@@ -224,7 +224,7 @@ views:
                 scale: ordinal
           revel:
             optional: true
-            plot: 
+            plot:
               heatmap:
                 scale: "linear"
                 domain: [0.0, 1.0]
@@ -232,14 +232,16 @@ views:
                   - white
                   - "#ff5555"
           "regex('.+: allele frequency')":
-            plot: 
-              ticks:
+            plot:
+              heatmap:
                 scale: "linear"
-                domain: [0.0, 1.0]
-                aux-domain-columns:
-                  - "regex('.+: allele frequency')"
+                domain: [0.0, 0.05, 1.0]
+                range:
+                  - white
+                  - "#d3e9f8"
+                  - "#1f77b4"
           "regex('.+: read depth')":
-            plot: 
+            plot:
               ticks:
                 scale: "linear"
                 aux-domain-columns:
@@ -252,6 +254,16 @@ views:
                 range:
                   - white
                   - "#1f77b4"
+          "gnomad genome af":
+            optional: true
+            plot:
+              heatmap:
+                scale: "linear"
+                domain: [0.0, 0.01, 1.0]
+                range:
+                  - white
+                  - "#adebad"
+                  - "#29a329"
           gene:
             display-mode: hidden
           feature:
@@ -328,14 +340,16 @@ views:
                   - "#ec9b00"
                   - "#ecca00"
           "regex('.+: allele frequency')":
-            plot: 
-              ticks:
+            plot:
+              heatmap:
                 scale: "linear"
-                domain: [0.0, 1.0]
-                aux-domain-columns:
-                  - "regex('.+: allele frequency')"
+                domain: [0.0, 0.05, 1.0]
+                range:
+                  - white
+                  - "#d3e9f8"
+                  - "#1f77b4"
           "regex('.+: read depth')":
-            plot: 
+            plot:
               ticks:
                 scale: "linear"
                 aux-domain-columns:
@@ -348,6 +362,16 @@ views:
                 range:
                   - white
                   - "#1f77b4"
+          "gnomad genome af":
+            optional: true
+            plot:
+              heatmap:
+                scale: "linear"
+                domain: [0.0, 0.01, 1.0]
+                range:
+                  - white
+                  - "#adebad"
+                  - "#29a329"
           id:
             optional: true
             display-mode: hidden
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
index caf07760e..13c713ae5 100644
--- a/workflow/rules/common.smk
+++ b/workflow/rules/common.smk
@@ -927,15 +927,9 @@ def get_dgidb_datasources():
     return ""
 
 
-def get_bowtie_insertsize():
-    if config["primers"]["trimming"].get("library_length", 0) != 0:
-        return "-X {}".format(config["primers"]["trimming"].get("library_length"))
-    return ""
-
-
 def get_filter_params(wc):
     if isinstance(get_panel_primer_input(wc.panel), list):
-        return "-b -f 2"
+        return "-b -F 12"
     return "-b -F 4"
 
 
@@ -945,10 +939,28 @@ def get_single_primer_flag(wc):
     return ""
 
 
-def format_bowtie_primers(wc, primers):
-    if isinstance(primers, list):
-        return "-1 {r1} -2 {r2}".format(r1=primers[0], r2=primers[1])
-    return primers
+def get_shortest_primer_length(primers):
+    primers = primers if isinstance(primers, list) else [primers]
+    # set to 32 to match bwa-mem default value considering offset of 2
+    min_length = 32
+    for primer_file in primers:
+        with open(primer_file, "r") as p:
+            min_primer = min(
+                [len(p.strip()) for i, p in enumerate(p.readlines()) if i % 2 == 1]
+            )
+            min_length = min(min_length, min_primer)
+    return min_length
+
+
+def get_primer_extra(wc, input):
+    extra = rf"-R '@RG\tID:{wc.panel}\tSM:{wc.panel}' -L 100"
+    min_primer_len = get_shortest_primer_length(input.reads)
+    # Check if shortest primer is below default values
+    if min_primer_len < 32:
+        extra += f" -T {min_primer_len - 2}"
+    if min_primer_len < 19:
+        extra += f" -k {min_primer_len}"
+    return extra
 
 
 def get_datavzrd_data(impact="coding"):
diff --git a/workflow/rules/primers.smk b/workflow/rules/primers.smk
index 1929ef077..56a599672 100644
--- a/workflow/rules/primers.smk
+++ b/workflow/rules/primers.smk
@@ -44,40 +44,22 @@ rule trim_primers:
         "fgbio TrimPrimers -H -i {input.bam} -p {input.primers} -s {params.sort_order} {params.single_primer} -o {output.trimmed} &> {log}"
 
 
-rule bowtie_build:
+rule map_primers:
     input:
-        genome,
-    output:
-        directory("resources/bowtie_build/"),
-    params:
-        prefix=f"resources/bowtie_build/{genome_name}",
-    log:
-        "logs/bowtie/build.log",
-    conda:
-        "../envs/bowtie.yaml"
-    cache: True
-    shell:
-        "mkdir {output} & "
-        "bowtie-build {input} {params.prefix} &> {log}"
-
-
-rule bowtie_map:
-    input:
-        primers=lambda w: get_panel_primer_input(w.panel),
-        idx="resources/bowtie_build",
+        reads=lambda wc: get_panel_primer_input(wc.panel),
+        idx=rules.bwa_index.output,
     output:
         "results/primers/{panel}_primers.bam",
-    params:
-        primers=lambda wc, input: format_bowtie_primers(wc, input.primers),
-        prefix=f"resources/bowtie_build/{genome_name}",
-        insertsize=get_bowtie_insertsize(),
-        primer_format=lambda wc, input: "-f" if input_is_fasta(input.primers) else "",
     log:
-        "logs/bowtie/{panel}_map.log",
-    conda:
-        "../envs/bowtie.yaml"
-    shell:
-        "bowtie {params.primers} -x {params.prefix} {params.insertsize} {params.primer_format} -S | samtools view -b - > {output} 2> {log}"
+        "logs/bwa_mem/{panel}.log",
+    params:
+        extra=lambda wc, input: get_primer_extra(wc, input),
+        sorting="none",  # Can be 'none', 'samtools' or 'picard'.
+        sort_order="queryname",  # Can be 'queryname' or 'coordinate'.
+        sort_extra="",  # Extra args for samtools/picard.
+    threads: 8
+    wrapper:
+        "v2.13.0/bio/bwa/mem"
 
 
 rule filter_unmapped_primers: