broadinstitute · ekiernan · Oct 30, 2024 · Oct 22, 2024 · Oct 22, 2024 · Oct 22, 2024
diff --git a/pipeline_versions.txt b/pipeline_versions.txt
@@ -30,11 +30,11 @@ ExomeReprocessing	 3.3.1	2024-09-17
 BuildIndices	 3.0.0	2023-12-06 
 scATAC	 1.3.2	2023-08-03 
 snm3C	 4.0.4	2024-08-06 
-Multiome	 5.7.1	2024-10-18 
-PairedTag	 1.7.1	2024-10-18 
+Multiome	 5.8.0	2024-10-23 
+PairedTag	 1.8.0	2024-10-23 
 MultiSampleSmartSeq2	 2.2.22	2024-09-11 
-MultiSampleSmartSeq2SingleNucleus	 2.0.1	2024-09-24 
-Optimus	 7.7.0	2024-09-24 
-atac	 2.3.2	2024-10-18 
+MultiSampleSmartSeq2SingleNucleus	 2.0.2	2024-10-23 
+Optimus	 7.8.0	2024-10-23 
+atac	 2.4.0	2024-10-23 
 SmartSeq2SingleSample	 5.1.21	2024-09-11 
-SlideSeq	 3.4.2	2024-09-24 
+SlideSeq	 3.4.3	2024-10-24 
diff --git a/pipelines/skylab/atac/atac.changelog.md b/pipelines/skylab/atac/atac.changelog.md
@@ -1,3 +1,11 @@
+# 2.4.0
+2024-10-23 (Date of Last Commit)
+
+* Added a new input parameter for atac_expected_cells, which describes the number of cells used for the library preparation
+* Updated the ATAC library CSV to be consistent in file naming convention and to have similar case for metric names to the Optimus workflow library CSV
+* Added a new metric to the ATAC library CSV to calculate percent_target, which is the number of cells estimated by SnapATAC2 divided by the atac_expected_cells input and multiplied by 100
+
+
 # 2.3.2
 2024-10-18 (Date of Last Commit)
 

diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl
@@ -23,6 +23,9 @@ workflow ATAC {
     # Additional library aliquot ID
     String? atac_nhash_id
 
+    #Expected cells from library preparation
+    Int atac_expected_cells = 3000
+
     # Option for running files with preindex
     Boolean preindex = false
 
@@ -46,7 +49,7 @@ workflow ATAC {
     String adapter_seq_read3 = "TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG"
   }
 
-  String pipeline_version = "2.3.2"
+  String pipeline_version = "2.4.0"
 
   # Determine docker prefix based on cloud provider
   String gcr_docker_prefix = "us.gcr.io/broad-gotc-prod/"
@@ -139,7 +142,9 @@ workflow ATAC {
         annotations_gtf = annotations_gtf,
         preindex = preindex,
         docker_path = docker_prefix + snap_atac_docker,
-        atac_nhash_id = atac_nhash_id
+        atac_nhash_id = atac_nhash_id,
+        atac_expected_cells = atac_expected_cells,
+        input_id = input_id
     }
   }
   if (!preindex) {
@@ -150,7 +155,9 @@ workflow ATAC {
         annotations_gtf = annotations_gtf,
         preindex = preindex,
         docker_path = docker_prefix + snap_atac_docker,
-        atac_nhash_id = atac_nhash_id
+        atac_nhash_id = atac_nhash_id,
+        atac_expected_cells = atac_expected_cells,
+        input_id = input_id
 
     }
   }
@@ -512,10 +519,10 @@ task CreateFragmentFile {
     String cpuPlatform = "Intel Cascade Lake"
     String docker_path
     String atac_nhash_id = ""
+    String input_id
+    Int atac_expected_cells = 3000
   }
 
-  String bam_base_name = basename(bam, ".bam")
-
   parameter_meta {
     bam: "Aligned bam with CB in CB tag. This is the output of the BWAPairedEndAlignment task."
     chrom_sizes: "Text file containing chrom_sizes for genome build (i.e. hg38)."
@@ -532,11 +539,12 @@ task CreateFragmentFile {
 
     # set parameters
     bam = "~{bam}"
-    bam_base_name = "~{bam_base_name}"
+    input_id = "~{input_id}"
     chrom_sizes = "~{chrom_sizes}"
     atac_gtf = "~{annotations_gtf}"
     preindex = "~{preindex}"
     atac_nhash_id = "~{atac_nhash_id}"
+    expected_cells = ~{atac_expected_cells}
 
     # calculate chrom size dictionary based on text file
     chrom_size_dict={}
@@ -554,12 +562,22 @@ task CreateFragmentFile {
 
     # extract CB or BB (if preindex is true) tag from bam file to create fragment file
     if preindex == "true":
-      data = pp.recipe_10x_metrics("~{bam}", "~{bam_base_name}.fragments.tsv", "temp_metrics.h5ad", is_paired=True, barcode_tag="BB", chrom_sizes=chrom_size_dict, gene_anno=atac_gtf, peaks=None)
+      data = pp.recipe_10x_metrics("~{bam}", "~{input_id}.fragments.tsv", "temp_metrics.h5ad", is_paired=True, barcode_tag="BB", chrom_sizes=chrom_size_dict, gene_anno=atac_gtf, peaks=None)
     elif preindex == "false":
-      data = pp.recipe_10x_metrics("~{bam}", "~{bam_base_name}.fragments.tsv", "temp_metrics.h5ad", is_paired=True, barcode_tag="CB", chrom_sizes=chrom_size_dict, gene_anno=atac_gtf, peaks=None)
-    
+      data = pp.recipe_10x_metrics("~{bam}", "~{input_id}.fragments.tsv", "temp_metrics.h5ad", is_paired=True, barcode_tag="CB", chrom_sizes=chrom_size_dict, gene_anno=atac_gtf, peaks=None)
+
     # Add NHashID to metrics 
     data = OrderedDict({'NHashID': atac_nhash_id, **data})
+
+    # Calculate atac percent target
+    print("Calculating percent target")
+    number_of_cells = data['Cells']['Number_of_cells']
+    print("Print number of cells", number_of_cells)
+    atac_percent_target = number_of_cells / expected_cells*100
+    print("Setting percent target in nested dictionary")
+    data['Cells']['percent_target'] = atac_percent_target
+
+
     # Flatten the dictionary
     flattened_data = []
     for category, metrics in data.items():
@@ -569,8 +587,11 @@ task CreateFragmentFile {
         else:
             flattened_data.append((category, metrics))
 
+    # Convert the flattened keys to lowercase (except for 'NHashID')
+    flattened_data = [(metric if metric == 'NHashID' else str(metric).lower(), value) for metric, value in flattened_data]
+
     # Write to CSV
-    csv_file_path = "~{bam_base_name}_~{atac_nhash_id}.atac_metrics.csv"
+    csv_file_path = "~{input_id}_~{atac_nhash_id}_library_metrics.csv"
     with open(csv_file_path, mode='w', newline='') as file:
         writer = csv.writer(file)
         writer.writerows(flattened_data)  # Write data
@@ -583,7 +604,7 @@ task CreateFragmentFile {
     # calculate tsse metrics
     snap.metrics.tsse(atac_data, atac_gtf)
     # Write new atac file
-    atac_data.write_h5ad("~{bam_base_name}.metrics.h5ad")
+    atac_data.write_h5ad("~{input_id}.metrics.h5ad")
 
     CODE
   >>>
@@ -597,8 +618,8 @@ task CreateFragmentFile {
   }
 
   output {
-    File fragment_file = "~{bam_base_name}.fragments.tsv"
-    File Snap_metrics = "~{bam_base_name}.metrics.h5ad"
-    File atac_library_metrics = "~{bam_base_name}_~{atac_nhash_id}.atac_metrics.csv"
+    File fragment_file = "~{input_id}.fragments.tsv"
+    File Snap_metrics = "~{input_id}.metrics.h5ad"
+    File atac_library_metrics = "~{input_id}_~{atac_nhash_id}_library_metrics.csv"
   }
 }
diff --git a/pipelines/skylab/multiome/Multiome.changelog.md b/pipelines/skylab/multiome/Multiome.changelog.md
@@ -1,3 +1,10 @@
+# 5.8.0
+2024-10-23 (Date of Last Commit)
+
+* Updated the workflow to include a new expected_cells input parameter describing the number of cells used as input to the library preparation; this is passed to both the ATAC workflows and Optimus workflows and the default is set to 3000 cells
+* Updated the ATAC library CSV and the Gene Expression library CSV to be consistent in file naming convention and to have similar case for metric names
+* Added a new metric to the ATAC library CSV to calculate percent_target, which is the number of cells estimated by SnapATAC2 divided by the expected_cells input and multiplied by 100
+
 # 5.7.1
 2024-10-18 (Date of Last Commit)
 

diff --git a/pipelines/skylab/multiome/Multiome.wdl b/pipelines/skylab/multiome/Multiome.wdl
@@ -9,7 +9,7 @@ import "../../../tasks/broad/Utilities.wdl" as utils
 
 workflow Multiome {
 
-    String pipeline_version = "5.7.1"
+    String pipeline_version = "5.8.0"
 
 
     input {
@@ -18,6 +18,7 @@ workflow Multiome {
         # Additional library aliquot ID
         String? gex_nhash_id
         String? atac_nhash_id
+        Int expected_cells = 3000
 
         # Optimus Inputs
         String counting_mode = "sn_rna"
@@ -102,7 +103,8 @@ workflow Multiome {
             star_strand_mode = star_strand_mode,
             count_exons = count_exons,
             soloMultiMappers = soloMultiMappers,
-            cloud_provider = cloud_provider
+            cloud_provider = cloud_provider,
+            gex_expected_cells = expected_cells
     }
 
     # Call the ATAC workflow
@@ -120,7 +122,8 @@ workflow Multiome {
             vm_size = vm_size,
             annotations_gtf = annotations_gtf,
             atac_nhash_id = atac_nhash_id,
-            adapter_seq_read3 = adapter_seq_read3
+            adapter_seq_read3 = adapter_seq_read3,
+            atac_expected_cells = expected_cells
     }
     call H5adUtils.JoinMultiomeBarcodes as JoinBarcodes {
         input:

diff --git a/pipelines/skylab/optimus/Optimus.changelog.md b/pipelines/skylab/optimus/Optimus.changelog.md
@@ -1,3 +1,10 @@
+# 7.8.0
+2024-10-23 (Date of Last Commit)
+
+* Renamed the input expected_cells to gex_expected_cells
+* Updated gex_expected_cells from an optional input to an input with a default value of 3000
+* Reformatted the library CSV output filename to remove an extra gex
+
 # 7.7.0
 2024-09-24 (Date of Last Commit)
 
@@ -6,6 +13,7 @@
 
 # 7.6.1
 2024-09-11 (Date of Last Commit)
+
 * Updated warp-tools docker which added create_h5ad_snss2.py to the docker image. This change does not affect the Optimus pipeline
 
 # 7.6.0

diff --git a/pipelines/skylab/optimus/Optimus.wdl b/pipelines/skylab/optimus/Optimus.wdl
@@ -36,7 +36,7 @@ workflow Optimus {
     File annotations_gtf
     File? mt_genes
     String? soloMultiMappers = "Uniform"
-    Int? expected_cells
+    Int gex_expected_cells = 3000
 
     # Chemistry options include: 2 or 3
     Int tenx_chemistry_version
@@ -71,7 +71,7 @@ workflow Optimus {
   # version of this pipeline
 
 
-  String pipeline_version = "7.7.0"
+  String pipeline_version = "7.8.0"
 
 
   # this is used to scatter matched [r1_fastq, r2_fastq, i1_fastq] arrays
@@ -223,7 +223,7 @@ workflow Optimus {
       input_id = input_id,
       counting_mode = counting_mode,
       star_merge_docker_path = docker_prefix + star_merge_docker,
-      expected_cells = expected_cells,
+      expected_cells = gex_expected_cells,
       gex_nhash_id = gex_nhash_id
   }
   if (counting_mode == "sc_rna"){
@@ -242,7 +242,7 @@ workflow Optimus {
       input:
         input_id = input_id,
         gex_nhash_id = gex_nhash_id,
-        expected_cells = expected_cells,
+        expected_cells = gex_expected_cells,
         input_name = input_name,
         input_id_metadata_field = input_id_metadata_field,
         input_name_metadata_field = input_name_metadata_field,
@@ -279,7 +279,7 @@ workflow Optimus {
       input:
         input_id = input_id,
         gex_nhash_id = gex_nhash_id,
-        expected_cells = expected_cells,
+        expected_cells = gex_expected_cells,
         input_name = input_name,
         counting_mode = counting_mode,
         input_id_metadata_field = input_id_metadata_field,

diff --git a/pipelines/skylab/paired_tag/PairedTag.changelog.md b/pipelines/skylab/paired_tag/PairedTag.changelog.md
@@ -1,15 +1,24 @@
+# 1.8.0
+2024-10-23 (Date of Last Commit)
+
+* Updated the workflow to include a new expected_cells input parameter describing the number of cells used as input to the library preparation; this is passed to both the ATAC workflows and Optimus workflows and the default is set to 3000 cells
+* Updated the ATAC library CSV and the Gene Expression library CSV to be consistent in file naming convention and to have similar case for metric names
+* Added a new metric to the ATAC library CSV to calculate percent_target, which is the number of cells estimated by SnapATAC2 divided by the expected_cells input and multiplied by 100
+
 # 1.7.1
 2024-10-18 (Date of Last Commit)
 
 * Removed the underscore of the NHashID in the ATAC library metrics CSV
 
 # 1.7.0
 2024-09-24 (Date of Last Commit)
+
 * Added a python implementation of DoubletFinder to calculate doublet scores in gene expression data; percent doublets are now available as a library-level metric and individual doublet scores for cell barcodes are in the h5ad
 * Updated gene_names in the final h5ad to be unique
 
 # 1.6.1
 2024-09-11 (Date of Last Commit)
+
 * Updated warp-tools docker which added create_h5ad_snss2.py to the docker image. This change does not affect the PairedTag pipeline
 
 # 1.6.0
@@ -21,6 +30,7 @@
 2024-08-06 (Date of Last Commit)
 
 * Updated the warp-tools docker to calculate mitochondrial reads from unique reads in cell and gene metrics; these metrics are in the cell and gene metrics CSV as well as h5ad
+
 # 1.4.1
 2024-08-02 (Date of Last Commit)
 
@@ -71,7 +81,6 @@
 
 * Updated the demultiplex task so that some intermediate input names have been renamed. There is no change to the outputs.
 
-
 # 0.6.0
 2024-05-10 (Date)
 

diff --git a/pipelines/skylab/paired_tag/PairedTag.wdl b/pipelines/skylab/paired_tag/PairedTag.wdl
@@ -8,7 +8,7 @@ import "../../../tasks/broad/Utilities.wdl" as utils
 
 workflow PairedTag {
 
-    String pipeline_version = "1.7.1"
+    String pipeline_version = "1.8.0"
 
 
     input {
@@ -109,7 +109,7 @@ workflow PairedTag {
               read1_fastq = atac_r1_fastq[idx],
               read3_fastq = atac_r3_fastq[idx],
               barcodes_fastq = atac_r2_fastq[idx],
-              input_id = input_id,
+              input_id = input_id + "_atac",
               whitelist = atac_whitelist,
               preindex = preindex,
               docker_path = docker_prefix + upstools_docker

diff --git a/pipelines/skylab/slideseq/SlideSeq.changelog.md b/pipelines/skylab/slideseq/SlideSeq.changelog.md
@@ -1,3 +1,8 @@
+# 3.4.3
+2024-10-24 (Date of Last Commit)
+
+* Updated the h5adUtils WDL to rename the gene expression library CSV filename; this does not impact slideseq
+
 # 3.4.2
 2024-09-24 (Date of Last Commit)
 

diff --git a/pipelines/skylab/slideseq/SlideSeq.wdl b/pipelines/skylab/slideseq/SlideSeq.wdl
@@ -25,7 +25,7 @@ import "../../../tasks/broad/Utilities.wdl" as utils
 
 workflow SlideSeq {
 
-    String pipeline_version = "3.4.2"
+    String pipeline_version = "3.4.3"
 
     input {
         Array[File] r1_fastq

diff --git a/...tseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.changelog.md b/...tseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.changelog.md
@@ -1,5 +1,11 @@
+# 2.0.2
+2024-10-23 (Date of Last Commit)
+
+* Updated the h5adUtils WDL to rename the gene expression library CSV filename; this does not impact the snSS2 workflow
+
 # 2.0.1
 2024-09-24 (Date of Last Commit)
+
 * Added a python implementation of DoubletFinder to calculate doublet scores in gene expression data; this does not affect the snSS2 workflow
 
 # 2.0.0

diff --git a/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.wdl b/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.wdl
@@ -57,7 +57,7 @@ workflow MultiSampleSmartSeq2SingleNucleus {
   }
 
   # Version of this pipeline
-  String pipeline_version = "2.0.1"
+  String pipeline_version = "2.0.2"
 
   if (false) {
      String? none = "None"