Merge branch 'master' into fusion_calling

snakemake-workflows · Nov 29, 2023 · aa8d20a · aa8d20a
2 parents b968321 + 12f22e9
commit aa8d20a
Show file tree

Hide file tree

Showing 13 changed files with 139 additions and 90 deletions.
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -131,7 +131,7 @@ jobs:
       with:
         directory: .test
         snakefile: workflow/Snakefile
-        args: "--configfile .test/config-sra/config.yaml --show-failed-logs --use-conda -n -j 10 --conda-cleanup-pkgs cache"
+        args: "--configfile .test/config-sra/config.yaml --show-failed-logs --use-conda --cores all --set-scatter calling=4 --conda-cleanup-pkgs cache"
         stagein: |
           rm -rf .test/results
         show-disk-usage-on-error: true
diff --git a/.test/config-sra/README.md b/.test/config-sra/README.md
@@ -0,0 +1,22 @@
+The files used in this test come from the following project:
+
+https://www.ncbi.nlm.nih.gov/bioproject/PRJNA755173
+
+They are basically different (but closely related) strains of yeast that were isolated at the same geolocation. They are described in this manuscript:
+
+> Lee TJ et al., "Extensive sampling of Saccharomyces cerevisiae in Taiwan reveals ecology and evolution of predomesticated lineages.", Genome Res, 2022 May;32(5):864-877, https://doi.org/10.1101/gr.276286.121
+
+A detailed description of samples can be found in Supplementary Table S6 in this document:
+https://genome.cshlp.org/content/suppl/2022/05/02/gr.276286.121.DC1/Supplemental_Tables_.xlsx
+
+I basically chose samples with a comparably low coverage via looking through the list at SRA Run Selector, by restricting the `Assay Type` to `wxs` and sorting by the `Bytes` column:
+https://www.ncbi.nlm.nih.gov/Traces/study/?acc=SRP334167
+
+Further, I made sure that they:
+* had all been collected at the same site (same `Latitude` and `Longitude` values in Supplementary Table S6)
+* had all annotations in common, apart from the two variables in Supplementary Table S6 that define the two different groups:
+  * `Substrate isolated` defines the group `soil` made up of `PD09A` and `PD12A`
+  *`Medium used in isolation` defines the group `medium_L` made up of `PD13B` and `PD12A`
+* all belong to the same clonal group, so that they will only differ in very few mutations
+
+The setup reusing `PD12A` in both groups is meant to mimic a tumor-normal setup, where only a panel of normals is available and is reused in every group.
diff --git a/.test/config-sra/config.yaml b/.test/config-sra/config.yaml
@@ -5,11 +5,11 @@ units: config-sra/units.tsv
 ref:
   # Number of chromosomes to consider for calling.
   # The first n entries of the FASTA will be considered.
-  n_chromosomes: 25
+  n_chromosomes: 17
   # Ensembl species name
   species: saccharomyces_cerevisiae
   # Ensembl release
-  release: 100
+  release: 110
   # Genome build
   build: R64-1-1
 
@@ -22,11 +22,9 @@ primers:
 
 # Estimation of tumor mutational burden.
 mutational_burden:
-  activate: false
+  activate: true
   events:
-    - SOMATIC_TUMOR_LOW
-    - SOMATIC_TUMOR_MEDIUM
-    - SOMATIC_TUMOR_HIGH
+    - changed_only
 
 calling:
   delly:
@@ -37,13 +35,17 @@ calling:
   scenario: config-sra/scenario.yaml
   # See http://snpeff.sourceforge.net/SnpSift.html#filter
   filter:
+    candidates: "ANN['IMPACT'] in ['MODERATE', 'HIGH']"
     moderate: "ANN['IMPACT'] == 'MODERATE'"
   fdr-control:
     threshold: 0.05
+    local: true
     events: 
       present:
         varlociraptor: 
-          - "PRESENT"
+          - base_only
+          - changed_only
+          - both
         filter:
           - moderate
         desc: Variants with moderate impact
@@ -52,16 +54,16 @@ annotations:
   dbnsfp:
     activate: false
   vcfs:
-    activate: true
+    activate: false
     known: resources/variation.vcf.gz
   dgidb:
     activate: false
   vep:
     candidate_calls:
-      params: ""
+      params: "--fields IMPACT"
       plugins: []
     final_calls:
-      params: --everything
+      params: "--fields IMPACT"
       plugins: []
 
 params:
@@ -80,12 +82,12 @@ params:
     call: ""
     # Add extra arguments for varlociraptor preprocess. By default, we limit the depth to 200.
     # Increase this value for panel sequencing!
-    preprocess: "--max-depth 200"
+    preprocess: "--max-depth 50"
   freebayes:
-    min_alternate_fraction: 0.05 # Reduce for calling variants with lower VAFs
+    min_alternate_fraction: 0.4 # Reduce for calling variants with lower VAFs
 
 gene_coverage:
-  min_avg_coverage: 5
+  min_avg_coverage: 8
 
 report:
   activate: false

diff --git a/.test/config-sra/samples.tsv b/.test/config-sra/samples.tsv
@@ -1,3 +1,5 @@
 sample_name	group	alias	platform	datatype
-a		a	ILLUMINA	variants
-b		b	ILLUMINA	variants
+PD12A	medium_L	base	ILLUMINA	variants
+PD13B	medium_L	changed	ILLUMINA	variants
+PD09A	soil	changed	ILLUMINA	variants
+PD12A	soil	base	ILLUMINA	variants
diff --git a/.test/config-sra/scenario.yaml b/.test/config-sra/scenario.yaml
@@ -4,10 +4,14 @@ __definitions__:
 
 
 samples:
-  ?sample:
+  base:
+    resolution: 0.01
+    universe: "[0.0,1.0]"
+  changed:
     resolution: 0.01
     universe: "[0.0,1.0]"
 
 events:
-  present: ?f"{sample}:]0.05,1.0]"
-  ffpe_artifact: ?f"{sample}:]0.0,0.05]"
+  changed_only: "changed:]0.0,1.0] & base:0.0"
+  base_only: "base:]0.0,1.0] & changed:0.0"
+  both: "base:]0.0,1.0] & changed:]0.0,1.0]"
diff --git a/.test/config-sra/units.tsv b/.test/config-sra/units.tsv
@@ -1,3 +1,4 @@
-sample_name	unit_name	group	fq1	fq2	adapters	sra
-a	lane1				"-a ACGGCTAGCTA -A AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC"	ERR267986
-b	lane1				"-a ACGGCTAGCTA -A AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC"	ERR267986
+sample_name	unit_name	fq1	fq2	adapters	sra
+PD12A	u1			-a CTGTCTCTTATACACATCT -A CTGTCTCTTATACACATCT	SRR15616337
+PD13B	u1			-a CTGTCTCTTATACACATCT -A CTGTCTCTTATACACATCT	SRR15616336
+PD09A	u1			-a CTGTCTCTTATACACATCT -A CTGTCTCTTATACACATCT	SRR15616341
diff --git a/workflow/envs/bowtie.yaml b/workflow/envs/bowtie.yaml
diff --git a/workflow/envs/bwa.yaml b/workflow/envs/bwa.yaml
diff --git a/workflow/envs/varlociraptor.yaml b/workflow/envs/varlociraptor.yaml
@@ -3,5 +3,5 @@ channels:
   - bioconda
   - nodefaults
 dependencies:
-  - varlociraptor =8.3
-  - vega-lite-cli =3.4
+  - varlociraptor =8.4
+  - vega-lite-cli =5.16
diff --git a/workflow/envs/vega.yaml b/workflow/envs/vega.yaml
@@ -1,4 +1,4 @@
 channels:
   - conda-forge
 dependencies:
-  - vega-lite-cli =3.4
+  - vega-lite-cli =5.16
diff --git a/workflow/resources/datavzrd/variant-calls-template.datavzrd.yaml b/workflow/resources/datavzrd/variant-calls-template.datavzrd.yaml
@@ -224,22 +224,24 @@ views:
                 scale: ordinal
           revel:
             optional: true
-            plot: 
+            plot:
               heatmap:
                 scale: "linear"
                 domain: [0.0, 1.0]
                 range:
                   - white
                   - "#ff5555"
           "regex('.+: allele frequency')":
-            plot: 
-              ticks:
+            plot:
+              heatmap:
                 scale: "linear"
-                domain: [0.0, 1.0]
-                aux-domain-columns:
-                  - "regex('.+: allele frequency')"
+                domain: [0.0, 0.05, 1.0]
+                range:
+                  - white
+                  - "#d3e9f8"
+                  - "#1f77b4"
           "regex('.+: read depth')":
-            plot: 
+            plot:
               ticks:
                 scale: "linear"
                 aux-domain-columns:
@@ -252,6 +254,16 @@ views:
                 range:
                   - white
                   - "#1f77b4"
+          "gnomad genome af":
+            optional: true
+            plot:
+              heatmap:
+                scale: "linear"
+                domain: [0.0, 0.01, 1.0]
+                range:
+                  - white
+                  - "#adebad"
+                  - "#29a329"
           gene:
             display-mode: hidden
           feature:
@@ -328,14 +340,16 @@ views:
                   - "#ec9b00"
                   - "#ecca00"
           "regex('.+: allele frequency')":
-            plot: 
-              ticks:
+            plot:
+              heatmap:
                 scale: "linear"
-                domain: [0.0, 1.0]
-                aux-domain-columns:
-                  - "regex('.+: allele frequency')"
+                domain: [0.0, 0.05, 1.0]
+                range:
+                  - white
+                  - "#d3e9f8"
+                  - "#1f77b4"
           "regex('.+: read depth')":
-            plot: 
+            plot:
               ticks:
                 scale: "linear"
                 aux-domain-columns:
@@ -348,6 +362,16 @@ views:
                 range:
                   - white
                   - "#1f77b4"
+          "gnomad genome af":
+            optional: true
+            plot:
+              heatmap:
+                scale: "linear"
+                domain: [0.0, 0.01, 1.0]
+                range:
+                  - white
+                  - "#adebad"
+                  - "#29a329"
           id:
             optional: true
             display-mode: hidden

diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
@@ -278,11 +278,22 @@ def get_sra_reads(sample, unit, fq):
 
 def get_raw_reads(sample, unit, fq):
     pattern = units.loc[sample].loc[unit, fq]
+
     if pd.isna(pattern):
         assert fq.startswith("fq")
         fq = fq[len("fq") :]
         return get_sra_reads(sample, unit, fq)
 
+    if type(pattern) is not str and len(pattern) > 1:
+        raise ValueError(
+            f"Multiple units.tsv entries found for sample '{sample}' and "
+            f"unit '{unit}'.\n"
+            "The units.tsv should contain only one entry for each combination "
+            "of sample and unit.\n"
+            "Found:\n"
+            f"{pattern}"
+        )
+
     if "*" in pattern:
         files = sorted(glob.glob(units.loc[sample].loc[unit, fq]))
         if not files:
@@ -1004,15 +1015,9 @@ def get_dgidb_datasources():
     return ""
 
 
-def get_bowtie_insertsize():
-    if config["primers"]["trimming"].get("library_length", 0) != 0:
-        return "-X {}".format(config["primers"]["trimming"].get("library_length"))
-    return ""
-
-
 def get_filter_params(wc):
     if isinstance(get_panel_primer_input(wc.panel), list):
-        return "-b -f 2"
+        return "-b -F 12"
     return "-b -F 4"
 
 
@@ -1022,10 +1027,28 @@ def get_single_primer_flag(wc):
     return ""
 
 
-def format_bowtie_primers(wc, primers):
-    if isinstance(primers, list):
-        return "-1 {r1} -2 {r2}".format(r1=primers[0], r2=primers[1])
-    return primers
+def get_shortest_primer_length(primers):
+    primers = primers if isinstance(primers, list) else [primers]
+    # set to 32 to match bwa-mem default value considering offset of 2
+    min_length = 32
+    for primer_file in primers:
+        with open(primer_file, "r") as p:
+            min_primer = min(
+                [len(p.strip()) for i, p in enumerate(p.readlines()) if i % 2 == 1]
+            )
+            min_length = min(min_length, min_primer)
+    return min_length
+
+
+def get_primer_extra(wc, input):
+    extra = rf"-R '@RG\tID:{wc.panel}\tSM:{wc.panel}' -L 100"
+    min_primer_len = get_shortest_primer_length(input.reads)
+    # Check if shortest primer is below default values
+    if min_primer_len < 32:
+        extra += f" -T {min_primer_len - 2}"
+    if min_primer_len < 19:
+        extra += f" -k {min_primer_len}"
+    return extra
 
 
 def get_datavzrd_data(impact="coding"):

diff --git a/workflow/rules/primers.smk b/workflow/rules/primers.smk
@@ -44,40 +44,22 @@ rule trim_primers:
         "fgbio TrimPrimers -H -i {input.bam} -p {input.primers} -s {params.sort_order} {params.single_primer} -o {output.trimmed} &> {log}"
 
 
-rule bowtie_build:
+rule map_primers:
     input:
-        genome,
-    output:
-        directory("resources/bowtie_build/"),
-    params:
-        prefix=f"resources/bowtie_build/{genome_name}",
-    log:
-        "logs/bowtie/build.log",
-    conda:
-        "../envs/bowtie.yaml"
-    cache: True
-    shell:
-        "mkdir {output} & "
-        "bowtie-build {input} {params.prefix} &> {log}"
-
-
-rule bowtie_map:
-    input:
-        primers=lambda w: get_panel_primer_input(w.panel),
-        idx="resources/bowtie_build",
+        reads=lambda wc: get_panel_primer_input(wc.panel),
+        idx=rules.bwa_index.output,
     output:
         "results/primers/{panel}_primers.bam",
-    params:
-        primers=lambda wc, input: format_bowtie_primers(wc, input.primers),
-        prefix=f"resources/bowtie_build/{genome_name}",
-        insertsize=get_bowtie_insertsize(),
-        primer_format=lambda wc, input: "-f" if input_is_fasta(input.primers) else "",
     log:
-        "logs/bowtie/{panel}_map.log",
-    conda:
-        "../envs/bowtie.yaml"
-    shell:
-        "bowtie {params.primers} -x {params.prefix} {params.insertsize} {params.primer_format} -S | samtools view -b - > {output} 2> {log}"
+        "logs/bwa_mem/{panel}.log",
+    params:
+        extra=lambda wc, input: get_primer_extra(wc, input),
+        sorting="none",  # Can be 'none', 'samtools' or 'picard'.
+        sort_order="queryname",  # Can be 'queryname' or 'coordinate'.
+        sort_extra="",  # Extra args for samtools/picard.
+    threads: 8
+    wrapper:
+        "v2.13.0/bio/bwa/mem"
 
 
 rule filter_unmapped_primers: