Merge pull request #1364 from broadinstitute/lk-PD-2738-Snapatac2metrics

adding 10x wrapper function
broadinstitute · Sep 6, 2024 · 7a03227 · 7a03227
2 parents c350acb + 2551d39
commit 7a03227
Show file tree

Hide file tree

Showing 11 changed files with 114 additions and 36 deletions.
diff --git a/pipeline_versions.txt b/pipeline_versions.txt
@@ -1,42 +1,42 @@
 Pipeline Name	Version	Date of Last Commit
+MultiSampleSmartSeq2SingleNucleus	 1.4.2	2024-08-25-02 
+MultiSampleSmartSeq2	 2.2.21	2023-04-19 
+PairedTag	 1.6.0	2024-08-02 
 Optimus	 7.6.0	2024-08-06 
-Multiome	 5.5.0	2024-08-06 
-PairedTag	 1.5.0	2024-08-06 
-atac	 2.2.3	2024-08-02 
-SlideSeq	 3.4.0	2024-08-06 
+atac	 2.3.0	2024-08-29 
 snm3C	 4.0.4	2024-08-06 
-MultiSampleSmartSeq2SingleNucleus	 1.4.2	2024-08-25-02 
-scATAC	 1.3.2	2023-08-03 
 SmartSeq2SingleSample	 5.1.20	2023-04-19 
+Multiome	 5.6.0	2024-08-02 
+scATAC	 1.3.2	2023-08-03 
 BuildIndices	 3.0.0	2023-12-06 
-MultiSampleSmartSeq2	 2.2.21	2023-04-19 
-CEMBA	 1.1.6	2023-12-18 
+SlideSeq	 3.4.0	2024-08-06 
 BuildCembaReferences	 1.0.0	2020-11-15 
-UltimaGenomicsWholeGenomeCramOnly	 1.0.20	2024-08-02 
+CEMBA	 1.1.6	2023-12-18 
 GDCWholeGenomeSomaticSingleSample	 1.3.2	2024-08-02  
-ExomeGermlineSingleSample	 3.1.22	2024-06-12 
-UltimaGenomicsWholeGenomeGermline	 1.0.20	2024-08-02 
-WholeGenomeGermlineSingleSample	 3.2.1	2024-06-12 
-VariantCalling	 2.2.1	2024-06-12 
+UltimaGenomicsWholeGenomeCramOnly	 1.0.20	2024-08-02 
+JointGenotypingByChromosomePartOne	 1.4.12	2023-12-18 
+JointGenotypingByChromosomePartTwo	 1.4.11	2023-12-18 
 UltimaGenomicsJointGenotyping	 1.1.7	2023-12-18 
 JointGenotyping	 1.6.10	2023-12-18 
 ReblockGVCF	 2.2.1	2024-06-12 
-JointGenotypingByChromosomePartTwo	 1.4.11	2023-12-18 
-JointGenotypingByChromosomePartOne	 1.4.12	2023-12-18 
-ExternalExomeReprocessing	 3.2.2	2024-08-02 
-ExternalWholeGenomeReprocessing	 2.2.2	2024-08-02 
-ExomeReprocessing	 3.2.2	2024-08-02 
-CramToUnmappedBams	 1.1.3	2024-08-02 
-WholeGenomeReprocessing	 3.2.2	2024-08-02 
-IlluminaGenotypingArray	 1.12.21	2024-08-02 
-Arrays	 2.6.27	2024-08-02 
-MultiSampleArrays	 1.6.2	2024-08-02 
+VariantCalling	 2.2.1	2024-06-12 
+WholeGenomeGermlineSingleSample	 3.2.1	2024-06-12 
+UltimaGenomicsWholeGenomeGermline	 1.0.20	2024-08-02 
+ExomeGermlineSingleSample	 3.1.22	2024-06-12 
 ValidateChip	 1.16.5	2024-08-02 
+Arrays	 2.6.27	2024-08-02 
 Imputation	 1.1.13	2024-05-21 
-RNAWithUMIsPipeline	 1.0.16	2023-12-18 
+MultiSampleArrays	 1.6.2	2024-08-02 
 BroadInternalUltimaGenomics	 1.0.21	2024-08-02 
 BroadInternalArrays	 1.1.11	2024-08-02 
 BroadInternalImputation	 1.1.12	2024-08-02 
 BroadInternalRNAWithUMIs	 1.0.33	2024-08-02 
+CramToUnmappedBams	 1.1.3	2024-08-02 
+ExternalWholeGenomeReprocessing	 2.2.2	2024-08-02 
+ExternalExomeReprocessing	 3.2.2	2024-08-02 
+WholeGenomeReprocessing	 3.2.2	2024-08-02 
+ExomeReprocessing	 3.2.2	2024-08-02 
+IlluminaGenotypingArray	 1.12.21	2024-08-02 
 CheckFingerprint	 1.0.20	2024-08-02 
 AnnotationFiltration	 1.2.5	2023-12-18 
+RNAWithUMIsPipeline	 1.0.16	2023-12-18 
diff --git a/pipelines/skylab/atac/atac.changelog.md b/pipelines/skylab/atac/atac.changelog.md
@@ -1,3 +1,10 @@
+# 2.3.0
+2024-08-29 (Date of Last Commit)
+
+* Updated the SnapATAC2 docker to include v2.7.0; the pipeline will now produce a library-level summary metric CSV for the BAM. 
+
+* Updated the memory for the CreateFragmentFile task
+
 # 2.2.3
 2024-08-02 (Date of Last Commit)
 

diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl
@@ -46,7 +46,7 @@ workflow ATAC {
     String adapter_seq_read3 = "TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG"
   }
 
-  String pipeline_version = "2.2.3"
+  String pipeline_version = "2.3.0"
 
   # Determine docker prefix based on cloud provider
   String gcr_docker_prefix = "us.gcr.io/broad-gotc-prod/"
@@ -58,7 +58,7 @@ workflow ATAC {
   String cutadapt_docker = "cutadapt:1.0.0-4.4-1686752919"
   String samtools_docker = "samtools-dist-bwa:3.0.0"
   String upstools_docker = "upstools:1.0.0-2023.03.03-1704300311"
-  String snap_atac_docker = "snapatac2:1.0.9-2.6.3-1715865353"
+  String snap_atac_docker = "snapatac2:1.1.0"
 
   # Make sure either 'gcp' or 'azure' is supplied as cloud_provider input. If not, raise an error
   if ((cloud_provider != "gcp") && (cloud_provider != "azure")) {
@@ -158,11 +158,13 @@ workflow ATAC {
   File bam_aligned_output_atac = select_first([BBTag.bb_bam, BWAPairedEndAlignment.bam_aligned_output])
   File fragment_file_atac = select_first([BB_fragment.fragment_file, CreateFragmentFile.fragment_file])
   File snap_metrics_atac = select_first([BB_fragment.Snap_metrics,CreateFragmentFile.Snap_metrics])
+  File library_metrics = select_first([BB_fragment.atac_library_metrics, CreateFragmentFile.atac_library_metrics])
 
   output {
     File bam_aligned_output = bam_aligned_output_atac
     File fragment_file = fragment_file_atac
     File snap_metrics = snap_metrics_atac
+    File library_metrics_file = library_metrics
   }
 }
 
@@ -505,7 +507,7 @@ task CreateFragmentFile {
     File annotations_gtf
     Boolean preindex
     Int disk_size = 500
-    Int mem_size = 16
+    Int mem_size = 64
     Int nthreads = 4
     String cpuPlatform = "Intel Cascade Lake"
     String docker_path
@@ -547,17 +549,35 @@ task CreateFragmentFile {
     import snapatac2.preprocessing as pp
     import snapatac2 as snap
     import anndata as ad
+    from collections import OrderedDict
+    import csv
 
     # extract CB or BB (if preindex is true) tag from bam file to create fragment file
     if preindex == "true":
-      pp.make_fragment_file("~{bam}", "~{bam_base_name}.fragments.tsv", is_paired=True, barcode_tag="BB")
+      data = pp.recipe_10x_metrics("~{bam}", "~{bam_base_name}.fragments.tsv", "temp_metrics.h5ad", is_paired=True, barcode_tag="BB", chrom_sizes=chrom_size_dict, gene_anno=atac_gtf, peaks=None)
     elif preindex == "false":
-      pp.make_fragment_file("~{bam}", "~{bam_base_name}.fragments.tsv", is_paired=True, barcode_tag="CB")
-      
+      data = pp.recipe_10x_metrics("~{bam}", "~{bam_base_name}.fragments.tsv", "temp_metrics.h5ad", is_paired=True, barcode_tag="CB", chrom_sizes=chrom_size_dict, gene_anno=atac_gtf, peaks=None)
+    
+    # Add NHashID to metrics 
+    nhash_ID_value = "XXX"
+    data = OrderedDict({'NHash_ID': atac_nhash_id, **data})
+    # Flatten the dictionary
+    flattened_data = []
+    for category, metrics in data.items():
+        if isinstance(metrics, dict):
+            for metric, value in metrics.items():
+                flattened_data.append((metric, value))
+        else:
+            flattened_data.append((category, metrics))
+
+    # Write to CSV
+    csv_file_path = "~{bam_base_name}_~{atac_nhash_id}.atac_metrics.csv"
+    with open(csv_file_path, mode='w', newline='') as file:
+        writer = csv.writer(file)
+        writer.writerows(flattened_data)  # Write data
+
+    print(f"Dictionary successfully written to {csv_file_path}")
 
-    # calculate quality metrics; note min_num_fragments and min_tsse are set to 0 instead of default
-    # those settings allow us to retain all barcodes
-    pp.import_data("~{bam_base_name}.fragments.tsv", file="temp_metrics.h5ad", chrom_sizes=chrom_size_dict, min_num_fragments=0)
     atac_data = ad.read_h5ad("temp_metrics.h5ad")
     # Add nhash_id to h5ad file as unstructured metadata
     atac_data.uns['NHashID'] = atac_nhash_id
@@ -580,5 +600,6 @@ task CreateFragmentFile {
   output {
     File fragment_file = "~{bam_base_name}.fragments.tsv"
     File Snap_metrics = "~{bam_base_name}.metrics.h5ad"
+    File atac_library_metrics = "~{bam_base_name}_~{atac_nhash_id}.atac_metrics.csv"
   }
 }
diff --git a/pipelines/skylab/multiome/Multiome.changelog.md b/pipelines/skylab/multiome/Multiome.changelog.md
@@ -1,3 +1,8 @@
+# 5.6.0
+2024-08-02 (Date of Last Commit)
+
+* Updated the SnapATAC2 docker to include v2.7.0; the pipeline will now produce a library-level summary metric CSV for the BAM.
+
 # 5.5.0
 2024-08-06 (Date of Last Commit)
 

diff --git a/pipelines/skylab/multiome/Multiome.wdl b/pipelines/skylab/multiome/Multiome.wdl
@@ -9,7 +9,7 @@ import "../../../tasks/broad/Utilities.wdl" as utils
 
 workflow Multiome {
 
-    String pipeline_version = "5.5.0"
+    String pipeline_version = "5.6.0"
 
 
     input {
@@ -179,6 +179,7 @@ workflow Multiome {
         File fragment_file_atac = JoinBarcodes.atac_fragment_tsv
         File fragment_file_index = JoinBarcodes.atac_fragment_tsv_tbi
         File snap_metrics_atac = JoinBarcodes.atac_h5ad_file
+        File atac_library_metrics = Atac.library_metrics_file
 
         # optimus outputs
         File genomic_reference_version_gex = Optimus.genomic_reference_version

diff --git a/pipelines/skylab/paired_tag/PairedTag.changelog.md b/pipelines/skylab/paired_tag/PairedTag.changelog.md
@@ -1,3 +1,8 @@
+# 1.6.0
+2024-08-02 (Date of Last Commit)
+
+* Updated the SnapATAC2 docker to include v2.7.0; the pipeline will now produce a library-level summary metric CSV for the BAM.
+
 # 1.5.0
 2024-08-06 (Date of Last Commit)
 

diff --git a/pipelines/skylab/paired_tag/PairedTag.wdl b/pipelines/skylab/paired_tag/PairedTag.wdl
@@ -8,7 +8,7 @@ import "../../../tasks/broad/Utilities.wdl" as utils
 
 workflow PairedTag {
 
-    String pipeline_version = "1.5.0"
+    String pipeline_version = "1.6.0"
 
 
     input {
@@ -149,6 +149,7 @@ workflow PairedTag {
 
     File atac_fragment_out = select_first([ParseBarcodes.atac_fragment_tsv,Atac_preindex.fragment_file])
     File atac_h5ad_out = select_first([ParseBarcodes.atac_h5ad_file, Atac_preindex.snap_metrics])
+
     output {
 
         String pairedtag_pipeline_version_out = pipeline_version
@@ -157,6 +158,7 @@ workflow PairedTag {
         File bam_aligned_output_atac = Atac_preindex.bam_aligned_output
         File fragment_file_atac = atac_fragment_out
         File snap_metrics_atac = atac_h5ad_out
+        File atac_library_final = Atac_preindex.library_metrics_file
 
         # optimus outputs
         File genomic_reference_version_gex = Optimus.genomic_reference_version

diff --git a/pipelines/skylab/paired_tag/README.md b/pipelines/skylab/paired_tag/README.md
@@ -1,6 +1,6 @@
 ## Announcing a new site for WARP documentation!
 
-Paired-tag documentation has moved! Read more about the [Paired-Tag workflow](https://broadinstitute.github.io/warp/docs/Pipelines/PairedTag_Pipeline/README) on the new [WARP documentation site](https://broadinstitute.github.io/warp/)!
+Paired-tag documentation has moved! Read more about the [Paired-Tag workflow](https://broadinstitute.github.io/warp/docs/Pipelines/PairedTag_Pipeline/README) on the new [WARP documentation site](https://broadinstitute.github.io/warp/)! 
 
 ### Paired-Tag summary
 

diff --git a/website/docs/Pipelines/ATAC/README.md b/website/docs/Pipelines/ATAC/README.md
@@ -93,6 +93,7 @@ To see specific tool parameters, select the task WDL link in the table; then vie
 | bam_aligned_output | `<input_id>`.bam | BAM containing aligned reads from ATAC workflow. |
 | fragment_file | `<input_id>`.fragments.tsv | TSV containing fragment start and stop coordinates per barcode. In order, the columns are "Chromosome", "Start", "Stop", "ATAC Barcode", and "Number Reads". | 
 | snap_metrics | `<input_id`.metrics.h5ad | h5ad (Anndata) containing per barcode metrics from [SnapATAC2](https://github.com/kaizhang/SnapATAC2). A detailed list of these metrics is found in the [ATAC Count Matrix Overview](./count-matrix-overview.md). |
+ library_metrics | `<input_id>`_`<atac_nhash_id>`.atac_metrics.csv | CSV file containing library-level metrics. Read more in the [Library Metrics Overview](library-metrics.md)
 
 ## Versioning and testing
 

diff --git a/website/docs/Pipelines/ATAC/library-metrics.md b/website/docs/Pipelines/ATAC/library-metrics.md
@@ -0,0 +1,35 @@
+---
+sidebar_position: 2
+---
+
+# ATAC Library Metrics Overview
+
+The [ATAC pipeline](README.md) uses [SnapATAC2](https://github.com/kaizhang/SnapATAC2) to generate library-level metrics in CSV format.  
+
+
+| Metric | Description |
+| --- | --- |
+| NHash_ID | A unique identifier used to track and reference the specific sample or dataset. |
+| Sequenced_reads | The total number of reads generated from the sequencing process, which includes both reads that are mapped and unmapped. |
+| Sequenced_read_pairs | The total number of read pairs (two reads per pair) generated from the sequencing process. This is typically half of the total sequenced reads if all reads are paired. |
+| Fraction_valid_barcode | The fraction of reads that contain a valid barcode, indicating the proportion of reads that are correctly assigned to a specific cell or sample. |
+| Fraction_Q30_bases_in_read_1 | The proportion of bases in Read 1 that have a Phred quality score of 30 or higher, indicating high-confidence base calls. |
+| Fraction_Q30_bases_in_read_2 | The proportion of bases in Read 2 that have a Phred quality score of 30 or higher, indicating high-confidence base calls. |
+| Number_of_cells | The estimated number of cells captured and sequenced in the experiment, based on the barcodes identified. |
+| Mean_raw_read_pairs_per_cell | The average number of raw read pairs associated with each cell, providing an indication of the sequencing depth per cell. |
+| Median_high-quality_fragments_per_cell | The median number of high-quality (e.g., confidently mapped) fragments associated with each cell, representing typical fragment quality across cells. |
+| Fraction of high-quality fragments in cells | The fraction of high-quality fragments that are associated with identified cells, indicating the proportion of good-quality data that is cell-associated. |
+| Fraction_of_transposition_events_in_peaks_in_cells | The fraction of transposition events within identified cells that occur within peaks, which are regions of accessible chromatin. |
+| Fraction_duplicates | The fraction of sequenced fragments that are duplicates, which can result from PCR amplification or other factors, indicating the redundancy in the sequencing data. |
+| Fraction_confidently_mapped | The fraction of sequenced fragments that are confidently mapped to the reference genome, indicating the proportion of reads that align well to the genome. |
+| Fraction_unmapped | The fraction of sequenced fragments that could not be mapped to the reference genome, which can indicate sequencing errors, contamination, or regions not covered by the reference. |
+| Fraction_nonnuclear | The fraction of sequenced fragments that are mapped to non-nuclear (e.g., mitochondrial or other organellar) DNA, providing insight into contamination or organellar activity. |
+| Fraction_fragment_in_nucleosome_free_region | The fraction of sequenced fragments that map to nucleosome-free regions, which are indicative of accessible chromatin. |
+| Fraction_fragment_flanking_single_nucleosome | The fraction of sequenced fragments that map to regions flanking single nucleosomes, indicating regions with partial chromatin accessibility. |
+| TSS_enrichment_score | A measure of the enrichment of transposition events at transcription start sites (TSS), indicating the accessibility of promoters across the genome. |
+| Fraction_of_high-quality_fragments_overlapping_TSS | The fraction of high-quality fragments that overlap transcription start sites (TSS), providing insight into promoter accessibility. |
+| Number_of_peaks | The total number of peaks, or regions of accessible chromatin, identified in the dataset, representing potential regulatory elements. |
+| Fraction_of_genome_in_peaks | The fraction of the genome that is covered by identified peaks, indicating the extent of chromatin accessibility across the genome. |
+| Fraction_of_high-quality_fragments_overlapping_peaks | The fraction of high-quality fragments that overlap with identified peaks, providing an indication of the efficiency of the assay in capturing accessible regions. |
+
+
diff --git a/website/docs/Pipelines/Multiome_Pipeline/README.md b/website/docs/Pipelines/Multiome_Pipeline/README.md
@@ -107,6 +107,7 @@ The Multiome workflow calls two WARP subworkflows, one external subworkflow (opt
 | fragment_file_atac | `<input_id>_atac.fragments.sorted.tsv.gz` | Sorted and bgzipped TSV file containing fragment start and stop coordinates per barcode. The columns are "Chromosome", "Start", "Stop", "ATAC Barcode", "Number of reads", and "GEX Barcode". | 
 | fragment_file_index |  `<input_id>_atac.fragments.sorted.tsv.gz.tbi` | tabix index file for the fragment file. |
 | snap_metrics_atac | `<input_id>_atac.metrics.h5ad` | h5ad (Anndata) file containing per-barcode metrics from SnapATAC2. Also contains the equivalent gene expression barcode for each ATAC barcode in the `gex_barcodes` column of the `h5ad.obs` property. See the [ATAC Count Matrix Overview](../ATAC/count-matrix-overview.md) for more details. |
+| atac_library_metrics | `<input_id>_<nhash_id>.atac.metrics.csv` | CSV with library-level metrics produced by SnapATAC2. See the ATAC [Library Level Metrics Overview](../ATAC/library-metrics.md) for more details. |
 | genomic_reference_version_gex | `<reference_version>.txt` | File containing the Genome build, source and GTF annotation version. |
 | bam_gex | `<input_id>_gex.bam` | BAM file containing aligned reads from Optimus workflow. |
 | matrix_gex | `<input_id>_gex_sparse_counts.npz` | NPZ file containing raw gene by cell counts. |