feat: remove circle_score < 50, improve color scale (#20)

* feat: remove circle_score < 50, improve color scale
* perf: update to latest datavzrd wrapper: v.3.8.0
* feat: include length column, improve filtering: min 1 discordant and split read, min circle_score 200 remove uncovered
* feat: adapt template to new filtering, add ensembl linkout
* fix: do inplace things separately
* fix: try again with circle_score >= 50
* feat: make circle filtering configurable
* fix: change tabs to spaces, fix syntax
* fix: whitespace to underscores (syntax)
* snakefmt
* fix: example config typo
* fix: harmonize config.yaml files (incl. .test/config/)
snakemake-workflows · Apr 23, 2024 · 5f45fa6 · 5f45fa6
1 parent ff9c7c3
commit 5f45fa6
Show file tree

Hide file tree

Showing 7 changed files with 116 additions and 8 deletions.
diff --git a/.test/config/config.yaml b/.test/config/config.yaml
@@ -15,4 +15,32 @@ ref:
   # Optionally, instead of downloading the whole reference from Ensembl via the 
   # parameters above, specify a specific chromosome below and uncomment the line.
   # This is usually only relevant for testing.
-  chromosome: 7
+  chromosome: 7
+
+# These filters mostly correspond to the output columns of Circle-Map:
+# https://github.com/iprada/Circle-Map/wiki/Circle-Map-Realign-output-files
+# In addition, you can filter on the length of the circle.
+circle_filtering:
+  min_circle_score: 100  # compared against the circle_score column
+  min_split_reads: 0  # compared against the split_reads column
+  min_discordant_read_pairs: 0  # compared against the discordant_reads column
+  max_uncovered_fraction: 0.8  # compared against the uncovered_fraction column
+  min_mean_coverage: 2.5  # compared against the mean_coverage column
+  min_circle_length: 500  # length is computed as end - start + 1
+  max_circle_length: 80000000  # length is computed as end - start + 1
+
+# You can pass extra command line arguments to the following tools. However, it is
+# very unlikely that you will need this functionality. Usually, all tools should be
+# configured correctly for the purposes of this analysis out of the box.
+# 
+# Also note that this section is NOT for specifying resources that a rule might
+# want to reserve. Please directly annotate the rules via the `resources:` directive
+# if you have to specify those (for example for a cluster / scheduler that needs them).
+# For details on resource specifications, see:
+# https://snakemake.readthedocs.io/en/latest/snakefiles/rules.html#resources
+params:
+  cutadapt: ""
+  gatk:
+    BaseRecalibrator: ""
+    applyBQSR: ""
+
diff --git a/config/config.yaml b/config/config.yaml
@@ -17,6 +17,18 @@ ref:
   # This is usually only relevant for testing.
   # chromosome: 21
 
+# These filters mostly correspond to the output columns of Circle-Map:
+# https://github.com/iprada/Circle-Map/wiki/Circle-Map-Realign-output-files
+# In addition, you can filter on the length of the circle.
+circle_filtering:
+  min_circle_score: 100  # compared against the circle_score column
+  min_split_reads: 0  # compared against the split_reads column
+  min_discordant_read_pairs: 0  # compared against the discordant_reads column
+  max_uncovered_fraction: 0.8  # compared against the uncovered_fraction column
+  min_mean_coverage: 2.5  # compared against the mean_coverage column
+  min_circle_length: 500  # length is computed as end - start + 1
+  max_circle_length: 80000000  # length is computed as end - start + 1
+
 # You can pass extra command line arguments to the following tools. However, it is
 # very unlikely that you will need this functionality. Usually, all tools should be
 # configured correctly for the purposes of this analysis out of the box.
@@ -30,4 +42,5 @@ params:
   cutadapt: ""
   gatk:
     BaseRecalibrator: ""
-    applyBQSR: ""
+    applyBQSR: ""
+
diff --git a/workflow/resources/circles.datavzrd.yaml b/workflow/resources/circles.datavzrd.yaml
@@ -21,6 +21,11 @@ views:
       columns:
         region:
           display-mode: normal
+          link-to-url:
+            ensembl:
+              url: "https://www.ensembl.org/Homo_sapiens/Location/View?r={region}"
+        length:
+          display-mode: normal
         discordant_reads:
           precision: 0
           plot:
@@ -36,20 +41,18 @@ views:
             heatmap:
               scale: linear
               range:
-                - "#af8dc3"
-                - "#af8dc3"
+                - "#e7d4e8"
                 - "#e7d4e8"
                 - "white"
                 - "#d9f0d3"
                 - "#7fbf7b"
                 - "#7fbf7b"
               domain:
                 - 0
-                - 10
-                - 40
                 - 50
-                - 60
                 - 200
+                - 300
+                - 800
                 - 10000000
         mean_coverage:
           plot:

diff --git a/workflow/rules/circle_map.smk b/workflow/rules/circle_map.smk
@@ -64,5 +64,15 @@ rule clean_circle_map_realign_output:
         "logs/circle-map/{sample}.circles.cleaned.log",
     conda:
         "../envs/pandas.yaml"
+    params:
+        min_circle_score=config["circle_filtering"]["min_circle_score"],
+        min_split_reads=config["circle_filtering"]["min_split_reads"],
+        min_discordant_read_pairs=config["circle_filtering"][
+            "min_discordant_read_pairs"
+        ],
+        max_uncovered_fraction=config["circle_filtering"]["max_uncovered_fraction"],
+        min_mean_coverage=config["circle_filtering"]["min_mean_coverage"],
+        min_circle_length=config["circle_filtering"]["min_circle_length"],
+        max_circle_length=config["circle_filtering"]["max_circle_length"],
     script:
         "../scripts/clean_circle_map_realign_output.py"
diff --git a/workflow/rules/datavzrd.smk b/workflow/rules/datavzrd.smk
@@ -25,4 +25,4 @@ rule datavzrd:
     log:
         "logs/datavzrd/circles/{sample}.log",
     wrapper:
-        "v1.21.2/utils/datavzrd"
+        "v3.8.0/utils/datavzrd"
diff --git a/workflow/schemas/config.schema.yaml b/workflow/schemas/config.schema.yaml
@@ -28,8 +28,34 @@ properties:
       - release
       - build
       - n_chromosomes
+  circle_filtering:  # thresholds read as params by rule clean_circle_map_realign_output
+    type: object
+    properties:
+      min_circle_score:
+        type: number
+      min_split_reads:
+        type: integer
+      min_discordant_read_pairs:
+        type: integer
+      max_uncovered_fraction:
+        type: number
+      min_mean_coverage:
+        type: number
+      min_circle_length:
+        type: integer
+      max_circle_length:
+        type: integer
+    required:  # the rule indexes every key directly, without a fallback value
+      - min_circle_score
+      - min_split_reads
+      - min_discordant_read_pairs
+      - max_uncovered_fraction
+      - min_mean_coverage
+      - min_circle_length
+      - max_circle_length
 
 required:
   - samples
   - units
   - ref
+  - circle_filtering
diff --git a/workflow/scripts/clean_circle_map_realign_output.py b/workflow/scripts/clean_circle_map_realign_output.py
@@ -35,11 +35,39 @@
 # turn int cols into int
 circles.loc[:, int_cols] = circles.loc[:, int_cols].round(0).applymap(lambda v: int(v) if not pd.isna(v) else pd.NA)
 
+# Filter out low-quality circles. Every min_*/max_* threshold is an inclusive bound, so
+# e.g. min_split_reads: 1 keeps circles with exactly one split read. Column descriptions:
+# https://github.com/iprada/Circle-Map/wiki/Circle-Map-Realign-output-files
+circles = circles.loc[
+    ( circles["circle_score"] >= snakemake.params["min_circle_score"] ) &
+    ( circles["discordant_reads"] >= snakemake.params["min_discordant_read_pairs"] ) &
+    ( circles["split_reads"] >= snakemake.params["min_split_reads"] ) &
+    ( circles["uncovered_fraction"] <= snakemake.params["max_uncovered_fraction"] ) &
+    ( circles["mean_coverage"] >= snakemake.params["min_mean_coverage"] )
+]
+
+
 circles["region"] = circles.agg(
     lambda row: f"{row['chromosome']}:{row['start']}-{row['end']}",
     axis='columns',
 )
 
+# Circle length, computed column-wise rather than with a per-row Python lambda.
+# Both start and end positions are 0-based (hence the +1):
+# https://github.com/iprada/Circle-Map/wiki/Circle-Map-Realign-output-files
+circles["length"] = circles["end"] - circles["start"] + 1
+
+# Keep only circles whose length lies within the configured limits (both inclusive).
+min_length = snakemake.params["min_circle_length"]
+max_length = snakemake.params["max_circle_length"]
+circles = circles.loc[(circles["length"] >= min_length) & (circles["length"] <= max_length)]
+
+# Sort by chromosome, then start, then end.
+circles = circles.sort_values(by=["chromosome", "start", "end"])
+
 circles.drop(
     labels=[
         "chromosome",