From 1cd67fe457109f8fea1c6e0179c129d4dd2f55c3 Mon Sep 17 00:00:00 2001 From: ekiernan <55763654+ekiernan@users.noreply.github.com> Date: Wed, 30 Oct 2024 08:27:00 -0400 Subject: [PATCH 1/4] Lk pd 2786 add atac expectedcells (#1398) Added percent_target to ATAC library metrics and made library CSV formatting consistent between ATAC and Optimus --- pipeline_versions.txt | 12 ++--- pipelines/skylab/atac/atac.changelog.md | 8 +++ pipelines/skylab/atac/atac.wdl | 49 +++++++++++++------ .../skylab/multiome/Multiome.changelog.md | 7 +++ pipelines/skylab/multiome/Multiome.wdl | 9 ++-- pipelines/skylab/optimus/Optimus.changelog.md | 8 +++ pipelines/skylab/optimus/Optimus.wdl | 10 ++-- .../skylab/paired_tag/PairedTag.changelog.md | 11 ++++- pipelines/skylab/paired_tag/PairedTag.wdl | 4 +- .../skylab/slideseq/SlideSeq.changelog.md | 5 ++ pipelines/skylab/slideseq/SlideSeq.wdl | 2 +- ...iSampleSmartSeq2SingleNucleus.changelog.md | 6 +++ .../MultiSampleSmartSeq2SingleNucleus.wdl | 2 +- tasks/skylab/H5adUtils.wdl | 8 +-- website/docs/Pipelines/ATAC/README.md | 33 +++++++------ .../docs/Pipelines/ATAC/library-metrics.md | 41 ++++++++-------- .../Pipelines/Multiome_Pipeline/README.md | 11 +++-- .../docs/Pipelines/Optimus_Pipeline/README.md | 4 +- .../Pipelines/PairedTag_Pipeline/README.md | 13 ++--- 19 files changed, 157 insertions(+), 86 deletions(-) diff --git a/pipeline_versions.txt b/pipeline_versions.txt index 66a514f5a9..c176a31945 100644 --- a/pipeline_versions.txt +++ b/pipeline_versions.txt @@ -30,11 +30,11 @@ ExomeReprocessing 3.3.1 2024-09-17 BuildIndices 3.0.0 2023-12-06 scATAC 1.3.2 2023-08-03 snm3C 4.0.4 2024-08-06 -Multiome 5.7.1 2024-10-18 -PairedTag 1.7.1 2024-10-18 +Multiome 5.8.0 2024-10-23 +PairedTag 1.8.0 2024-10-23 MultiSampleSmartSeq2 2.2.22 2024-09-11 -MultiSampleSmartSeq2SingleNucleus 2.0.1 2024-09-24 -Optimus 7.7.0 2024-09-24 -atac 2.3.2 2024-10-18 +MultiSampleSmartSeq2SingleNucleus 2.0.2 2024-10-23 +Optimus 7.8.0 2024-10-23 +atac 2.4.0 2024-10-23 
SmartSeq2SingleSample 5.1.21 2024-09-11 -SlideSeq 3.4.2 2024-09-24 +SlideSeq 3.4.3 2024-10-24 diff --git a/pipelines/skylab/atac/atac.changelog.md b/pipelines/skylab/atac/atac.changelog.md index 34b5704e59..4b886f32b2 100644 --- a/pipelines/skylab/atac/atac.changelog.md +++ b/pipelines/skylab/atac/atac.changelog.md @@ -1,3 +1,11 @@ +# 2.4.0 +2024-10-23 (Date of Last Commit) + +* Added a new input parameter for atac_expected_cells, which describes the number of cells used for the library preparation +* Updated the ATAC library CSV to be consistent in file naming convention and to have similar case for metric names to the Optimus workflow library CSV +* Added a new metric to the ATAC library CSV to calculate percent_target, which is the number of estimated cells by SnapATAC2 divided by expected_cells input + + # 2.3.2 2024-10-18 (Date of Last Commit) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index 2acb133c2b..153f817c6f 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -23,6 +23,9 @@ workflow ATAC { # Additional library aliquot ID String? 
atac_nhash_id + #Expected cells from library preparation + Int atac_expected_cells = 3000 + # Option for running files with preindex Boolean preindex = false @@ -46,7 +49,7 @@ workflow ATAC { String adapter_seq_read3 = "TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG" } - String pipeline_version = "2.3.2" + String pipeline_version = "2.4.0" # Determine docker prefix based on cloud provider String gcr_docker_prefix = "us.gcr.io/broad-gotc-prod/" @@ -139,7 +142,9 @@ workflow ATAC { annotations_gtf = annotations_gtf, preindex = preindex, docker_path = docker_prefix + snap_atac_docker, - atac_nhash_id = atac_nhash_id + atac_nhash_id = atac_nhash_id, + atac_expected_cells = atac_expected_cells, + input_id = input_id } } if (!preindex) { @@ -150,7 +155,9 @@ workflow ATAC { annotations_gtf = annotations_gtf, preindex = preindex, docker_path = docker_prefix + snap_atac_docker, - atac_nhash_id = atac_nhash_id + atac_nhash_id = atac_nhash_id, + atac_expected_cells = atac_expected_cells, + input_id = input_id } } @@ -512,10 +519,10 @@ task CreateFragmentFile { String cpuPlatform = "Intel Cascade Lake" String docker_path String atac_nhash_id = "" + String input_id + Int atac_expected_cells = 3000 } - String bam_base_name = basename(bam, ".bam") - parameter_meta { bam: "Aligned bam with CB in CB tag. This is the output of the BWAPairedEndAlignment task." chrom_sizes: "Text file containing chrom_sizes for genome build (i.e. hg38)." 
@@ -532,11 +539,12 @@ task CreateFragmentFile { # set parameters bam = "~{bam}" - bam_base_name = "~{bam_base_name}" + input_id = "~{input_id}" chrom_sizes = "~{chrom_sizes}" atac_gtf = "~{annotations_gtf}" preindex = "~{preindex}" atac_nhash_id = "~{atac_nhash_id}" + expected_cells = ~{atac_expected_cells} # calculate chrom size dictionary based on text file chrom_size_dict={} @@ -554,12 +562,22 @@ task CreateFragmentFile { # extract CB or BB (if preindex is true) tag from bam file to create fragment file if preindex == "true": - data = pp.recipe_10x_metrics("~{bam}", "~{bam_base_name}.fragments.tsv", "temp_metrics.h5ad", is_paired=True, barcode_tag="BB", chrom_sizes=chrom_size_dict, gene_anno=atac_gtf, peaks=None) + data = pp.recipe_10x_metrics("~{bam}", "~{input_id}.fragments.tsv", "temp_metrics.h5ad", is_paired=True, barcode_tag="BB", chrom_sizes=chrom_size_dict, gene_anno=atac_gtf, peaks=None) elif preindex == "false": - data = pp.recipe_10x_metrics("~{bam}", "~{bam_base_name}.fragments.tsv", "temp_metrics.h5ad", is_paired=True, barcode_tag="CB", chrom_sizes=chrom_size_dict, gene_anno=atac_gtf, peaks=None) - + data = pp.recipe_10x_metrics("~{bam}", "~{input_id}.fragments.tsv", "temp_metrics.h5ad", is_paired=True, barcode_tag="CB", chrom_sizes=chrom_size_dict, gene_anno=atac_gtf, peaks=None) + # Add NHashID to metrics data = OrderedDict({'NHashID': atac_nhash_id, **data}) + + # Calculate atac percent target + print("Calculating percent target") + number_of_cells = data['Cells']['Number_of_cells'] + print("Print number of cells", number_of_cells) + atac_percent_target = number_of_cells / expected_cells*100 + print("Setting percent target in nested dictionary") + data['Cells']['percent_target'] = atac_percent_target + + # Flatten the dictionary flattened_data = [] for category, metrics in data.items(): @@ -569,8 +587,11 @@ task CreateFragmentFile { else: flattened_data.append((category, metrics)) + # Convert the flattened keys to lowercase (except for 'NHashID') 
+ flattened_data = [(metric if metric == 'NHashID' else str(metric).lower(), value) for metric, value in flattened_data] + # Write to CSV - csv_file_path = "~{bam_base_name}_~{atac_nhash_id}.atac_metrics.csv" + csv_file_path = "~{input_id}_~{atac_nhash_id}_library_metrics.csv" with open(csv_file_path, mode='w', newline='') as file: writer = csv.writer(file) writer.writerows(flattened_data) # Write data @@ -583,7 +604,7 @@ task CreateFragmentFile { # calculate tsse metrics snap.metrics.tsse(atac_data, atac_gtf) # Write new atac file - atac_data.write_h5ad("~{bam_base_name}.metrics.h5ad") + atac_data.write_h5ad("~{input_id}.metrics.h5ad") CODE >>> @@ -597,8 +618,8 @@ task CreateFragmentFile { } output { - File fragment_file = "~{bam_base_name}.fragments.tsv" - File Snap_metrics = "~{bam_base_name}.metrics.h5ad" - File atac_library_metrics = "~{bam_base_name}_~{atac_nhash_id}.atac_metrics.csv" + File fragment_file = "~{input_id}.fragments.tsv" + File Snap_metrics = "~{input_id}.metrics.h5ad" + File atac_library_metrics = "~{input_id}_~{atac_nhash_id}_library_metrics.csv" } } diff --git a/pipelines/skylab/multiome/Multiome.changelog.md b/pipelines/skylab/multiome/Multiome.changelog.md index 378678f9ba..1cdbef30a8 100644 --- a/pipelines/skylab/multiome/Multiome.changelog.md +++ b/pipelines/skylab/multiome/Multiome.changelog.md @@ -1,3 +1,10 @@ +# 5.8.0 +2024-10-23 (Date of Last Commit) + +* Updated the workflow to include a new expected_cells input parameter describing the number of cells used as input to the library preparation; this is passed to both the ATAC workflows and Optimus workflows and the default is set to 3000 cells +* Updated the ATAC library CSV and the Gene Expression library CSV to be consistent in file naming convention and to have similar case for metric names +* Added a new metric to the ATAC library CSV to calculate percent_target, which is the number of estimated cells by SnapATAC2 divided by expected_cells input + # 5.7.1 2024-10-18 (Date of Last 
Commit) diff --git a/pipelines/skylab/multiome/Multiome.wdl b/pipelines/skylab/multiome/Multiome.wdl index 821e5bead6..ca8b16ea3d 100644 --- a/pipelines/skylab/multiome/Multiome.wdl +++ b/pipelines/skylab/multiome/Multiome.wdl @@ -9,7 +9,7 @@ import "../../../tasks/broad/Utilities.wdl" as utils workflow Multiome { - String pipeline_version = "5.7.1" + String pipeline_version = "5.8.0" input { @@ -18,6 +18,7 @@ workflow Multiome { # Additional library aliquot ID String? gex_nhash_id String? atac_nhash_id + Int expected_cells = 3000 # Optimus Inputs String counting_mode = "sn_rna" @@ -102,7 +103,8 @@ workflow Multiome { star_strand_mode = star_strand_mode, count_exons = count_exons, soloMultiMappers = soloMultiMappers, - cloud_provider = cloud_provider + cloud_provider = cloud_provider, + gex_expected_cells = expected_cells } # Call the ATAC workflow @@ -120,7 +122,8 @@ workflow Multiome { vm_size = vm_size, annotations_gtf = annotations_gtf, atac_nhash_id = atac_nhash_id, - adapter_seq_read3 = adapter_seq_read3 + adapter_seq_read3 = adapter_seq_read3, + atac_expected_cells = expected_cells } call H5adUtils.JoinMultiomeBarcodes as JoinBarcodes { input: diff --git a/pipelines/skylab/optimus/Optimus.changelog.md b/pipelines/skylab/optimus/Optimus.changelog.md index f8418bce8d..8d82cdf07f 100644 --- a/pipelines/skylab/optimus/Optimus.changelog.md +++ b/pipelines/skylab/optimus/Optimus.changelog.md @@ -1,3 +1,10 @@ +# 7.8.0 +2024-10-23 (Date of Last Commit) + +* Renamed the input expected_cells to gex_expected_cells +* Updated gex_expected_cells from an optional input to a non-optional input with a default value of 3000 +* Reformatted the library CSV output filename to remove an extra gex + # 7.7.0 2024-09-24 (Date of Last Commit) @@ -6,6 +13,7 @@ # 7.6.1 2024-09-11 (Date of Last Commit) + * Updated warp-tools docker which added create_h5ad_snss2.py to the docker image. 
This change does not affect the Optimus pipeline # 7.6.0 diff --git a/pipelines/skylab/optimus/Optimus.wdl b/pipelines/skylab/optimus/Optimus.wdl index 70402c6ced..f8343388ab 100644 --- a/pipelines/skylab/optimus/Optimus.wdl +++ b/pipelines/skylab/optimus/Optimus.wdl @@ -36,7 +36,7 @@ workflow Optimus { File annotations_gtf File? mt_genes String? soloMultiMappers = "Uniform" - Int? expected_cells + Int gex_expected_cells = 3000 # Chemistry options include: 2 or 3 Int tenx_chemistry_version @@ -71,7 +71,7 @@ workflow Optimus { # version of this pipeline - String pipeline_version = "7.7.0" + String pipeline_version = "7.8.0" # this is used to scatter matched [r1_fastq, r2_fastq, i1_fastq] arrays @@ -223,7 +223,7 @@ workflow Optimus { input_id = input_id, counting_mode = counting_mode, star_merge_docker_path = docker_prefix + star_merge_docker, - expected_cells = expected_cells, + expected_cells = gex_expected_cells, gex_nhash_id = gex_nhash_id } if (counting_mode == "sc_rna"){ @@ -242,7 +242,7 @@ workflow Optimus { input: input_id = input_id, gex_nhash_id = gex_nhash_id, - expected_cells = expected_cells, + expected_cells = gex_expected_cells, input_name = input_name, input_id_metadata_field = input_id_metadata_field, input_name_metadata_field = input_name_metadata_field, @@ -279,7 +279,7 @@ workflow Optimus { input: input_id = input_id, gex_nhash_id = gex_nhash_id, - expected_cells = expected_cells, + expected_cells = gex_expected_cells, input_name = input_name, counting_mode = counting_mode, input_id_metadata_field = input_id_metadata_field, diff --git a/pipelines/skylab/paired_tag/PairedTag.changelog.md b/pipelines/skylab/paired_tag/PairedTag.changelog.md index f6ce64b4ca..a7071e222f 100644 --- a/pipelines/skylab/paired_tag/PairedTag.changelog.md +++ b/pipelines/skylab/paired_tag/PairedTag.changelog.md @@ -1,3 +1,10 @@ +# 1.8.0 +2024-10-23 (Date of Last Commit) + +* Updated the workflow to include a new expected_cells input parameter describing the number of cells 
used as input to the library preparation; this is passed to both the ATAC workflows and Optimus workflows and the default is set to 3000 cells +* Updated the ATAC library CSV and the Gene Expression library CSV to be consistent in file naming convention and to have similar case for metric names +* Added a new metric to the ATAC library CSV to calculate percent_target, which is the number of estimated cells by SnapATAC2 divided by expected_cells input + # 1.7.1 2024-10-18 (Date of Last Commit) @@ -5,11 +12,13 @@ # 1.7.0 2024-09-24 (Date of Last Commit) + * Added a python implementation of DoubletFinder to calculate doublet scores in gene expression data; percent doublets are now available as a library-level metric and individual doublet scores for cell barcodes are in the h5ad * Updated gene_names in the final h5ad to be unique # 1.6.1 2024-09-11 (Date of Last Commit) + * Updated warp-tools docker which added create_h5ad_snss2.py to the docker image. This change does not affect the PairedTag pipeline # 1.6.0 @@ -21,6 +30,7 @@ 2024-08-06 (Date of Last Commit) * Updated the warp-tools docker to calculate mitochondrial reads from unique reads in cell and gene metrics; these metrics are in the cell and gene metrics CSV as well as h5ad + # 1.4.1 2024-08-02 (Date of Last Commit) @@ -71,7 +81,6 @@ * Updated the demultiplex task so that some intermediate input names have been renamed. There is no change to the outputs. 
- # 0.6.0 2024-05-10 (Date) diff --git a/pipelines/skylab/paired_tag/PairedTag.wdl b/pipelines/skylab/paired_tag/PairedTag.wdl index 83b470ba47..2cef2bb297 100644 --- a/pipelines/skylab/paired_tag/PairedTag.wdl +++ b/pipelines/skylab/paired_tag/PairedTag.wdl @@ -8,7 +8,7 @@ import "../../../tasks/broad/Utilities.wdl" as utils workflow PairedTag { - String pipeline_version = "1.7.1" + String pipeline_version = "1.8.0" input { @@ -109,7 +109,7 @@ workflow PairedTag { read1_fastq = atac_r1_fastq[idx], read3_fastq = atac_r3_fastq[idx], barcodes_fastq = atac_r2_fastq[idx], - input_id = input_id, + input_id = input_id + "_atac", whitelist = atac_whitelist, preindex = preindex, docker_path = docker_prefix + upstools_docker diff --git a/pipelines/skylab/slideseq/SlideSeq.changelog.md b/pipelines/skylab/slideseq/SlideSeq.changelog.md index b9cb1f7a56..0835b105a7 100644 --- a/pipelines/skylab/slideseq/SlideSeq.changelog.md +++ b/pipelines/skylab/slideseq/SlideSeq.changelog.md @@ -1,3 +1,8 @@ +# 3.4.3 +2024-10-24 (Date of Last Commit) + +* Updated the h5adUtils WDL to rename the gene expression library CSV filename; this does not impact slideseq + # 3.4.2 2024-09-24 (Date of Last Commit) diff --git a/pipelines/skylab/slideseq/SlideSeq.wdl b/pipelines/skylab/slideseq/SlideSeq.wdl index 0cd1f29e4c..553760e49d 100644 --- a/pipelines/skylab/slideseq/SlideSeq.wdl +++ b/pipelines/skylab/slideseq/SlideSeq.wdl @@ -25,7 +25,7 @@ import "../../../tasks/broad/Utilities.wdl" as utils workflow SlideSeq { - String pipeline_version = "3.4.2" + String pipeline_version = "3.4.3" input { Array[File] r1_fastq diff --git a/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.changelog.md b/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.changelog.md index 16ed6cb5c8..90d6830c49 100644 --- a/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.changelog.md +++ 
b/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.changelog.md @@ -1,5 +1,11 @@ +# 2.0.2 +2024-10-23 (Date of Last Commit) + +* Updated the h5adUtils WDL to rename the gene expression library CSV filename; this does not affect the snSS2 workflow + # 2.0.1 2024-09-24 (Date of Last Commit) + * Added a python implementation of DoubletFinder to calculate doublet scores in gene expression data; this does not affect the snSS2 workflow # 2.0.0 diff --git a/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.wdl b/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.wdl index debce094b0..124820a4a5 100644 --- a/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.wdl +++ b/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.wdl @@ -57,7 +57,7 @@ workflow MultiSampleSmartSeq2SingleNucleus { } # Version of this pipeline - String pipeline_version = "2.0.1" + String pipeline_version = "2.0.2" if (false) { String? 
none = "None" diff --git a/tasks/skylab/H5adUtils.wdl b/tasks/skylab/H5adUtils.wdl index 0ac5a3dd66..f5fb796b49 100644 --- a/tasks/skylab/H5adUtils.wdl +++ b/tasks/skylab/H5adUtils.wdl @@ -101,7 +101,7 @@ task OptimusH5adGeneration { --counting_mode ~{counting_mode} \ --expected_cells ~{expected_cells} - mv library_metrics.csv ~{input_id}_~{gex_nhash_id}_gex_library_metrics.csv + mv library_metrics.csv ~{input_id}_~{gex_nhash_id}_library_metrics.csv >>> @@ -116,7 +116,7 @@ task OptimusH5adGeneration { output { File h5ad_output = "~{input_id}.h5ad" - File library_metrics = "~{input_id}_~{gex_nhash_id}_gex_library_metrics.csv" + File library_metrics = "~{input_id}_~{gex_nhash_id}_library_metrics.csv" } } @@ -207,7 +207,7 @@ task SingleNucleusOptimusH5adOutput { --expected_cells ~{expected_cells} - mv library_metrics.csv ~{input_id}_~{gex_nhash_id}_gex_library_metrics.csv + mv library_metrics.csv ~{input_id}_~{gex_nhash_id}_library_metrics.csv >>> runtime { @@ -221,7 +221,7 @@ task SingleNucleusOptimusH5adOutput { output { File h5ad_output = "~{input_id}.h5ad" - File library_metrics = "~{input_id}_~{gex_nhash_id}_gex_library_metrics.csv" + File library_metrics = "~{input_id}_~{gex_nhash_id}_library_metrics.csv" } } diff --git a/website/docs/Pipelines/ATAC/README.md b/website/docs/Pipelines/ATAC/README.md index 9f632d8497..86d4b55f11 100644 --- a/website/docs/Pipelines/ATAC/README.md +++ b/website/docs/Pipelines/ATAC/README.md @@ -8,7 +8,7 @@ slug: /Pipelines/ATAC/README | Pipeline Version | Date Updated | Documentation Author | Questions or Feedback | | :----: | :---: | :----: | :--------------: | -| [2.3.0](https://github.com/broadinstitute/warp/releases) | September, 2024 | Kaylee Mathews | Please [file an issue in WARP](https://github.com/broadinstitute/warp/issues). | +| [2.4.0](https://github.com/broadinstitute/warp/releases) | October, 2024 | WARP Pipelines | Please [file an issue in WARP](https://github.com/broadinstitute/warp/issues). 
| ## Introduction to the ATAC workflow @@ -47,22 +47,23 @@ The following describes the inputs of the ATAC workflow. For more details on how | Variable name | Description | | --- |--- | -| read1_fastq_gzipped | Fastq inputs (array of compressed read 1 FASTQ files). | -| read2_fastq_gzipped | Fastq inputs (array of compressed read 2 FASTQ files containing cellular barcodes). | -| read3_fastq_gzipped | Fastq inputs (array of compressed read 3 FASTQ files). | +| read1_fastq_gzipped | Fastq inputs (array of compressed read 1 FASTQ files). | +| read2_fastq_gzipped | Fastq inputs (array of compressed read 2 FASTQ files containing cellular barcodes). | +| read3_fastq_gzipped | Fastq inputs (array of compressed read 3 FASTQ files). | | input_id | Output prefix/base name for all intermediate files and pipeline outputs. | | cloud_provider | String describing the cloud provider that should be used to run the workflow; value should be "gcp" or "azure". | String | -| preindex | Boolean used for paired-tag data and not applicable to ATAC data types; default is set to false. | -| tar_bwa_reference | BWA reference (tar file containing reference fasta and corresponding files). | -| num_threads_bwa | Optional integer defining the number of CPUs per node for the BWA-mem alignment task (default: 128). | -| mem_size_bwa | Optional integer defining the memory size for the BWA-mem alignment task in GB (default: 512). | -| cpu_platform_bwa | Optional string defining the CPU platform for the BWA-mem alignment task (default: "Intel Ice Lake"). | -| annotations_gtf | CreateFragmentFile input variable: GTF file for SnapATAC2 to calculate TSS sites of fragment file. | -| chrom_sizes | CreateFragmentFile input variable: Text file containing chrom_sizes for genome build (i.e., hg38) | -| whitelist | Whitelist file for ATAC cellular barcodes. | -| adapter_seq_read1 | TrimAdapters input: Sequence adapter for read 1 fastq. 
| -| adapter_seq_read3 | TrimAdapters input: Sequence adapter for read 3 fastq. | -| vm_size | String defining the Azure virtual machine family for the workflow (default: "Standard_M128s"). | String | +| preindex | Boolean used for paired-tag data and not applicable to ATAC data types; default is set to false. | +| atac_expected_cells | Number of cells loaded to create the ATAC library; default is set to 3000. | +| tar_bwa_reference | BWA reference (tar file containing reference fasta and corresponding files). | +| num_threads_bwa | Optional integer defining the number of CPUs per node for the BWA-mem alignment task (default: 128). | +| mem_size_bwa | Optional integer defining the memory size for the BWA-mem alignment task in GB (default: 512). | +| cpu_platform_bwa | Optional string defining the CPU platform for the BWA-mem alignment task (default: "Intel Ice Lake"). | +| annotations_gtf | CreateFragmentFile input variable: GTF file for SnapATAC2 to calculate TSS sites of fragment file. | +| chrom_sizes | CreateFragmentFile input variable: Text file containing chrom_sizes for genome build (i.e., hg38) | +| whitelist | Whitelist file for ATAC cellular barcodes. | +| adapter_seq_read1 | TrimAdapters input: Sequence adapter for read 1 fastq. | +| adapter_seq_read3 | TrimAdapters input: Sequence adapter for read 3 fastq. | +| vm_size | String defining the Azure virtual machine family for the workflow (default: "Standard_M128s"). | atac_nhash_id | String that represents an optional library aliquot identifier. When used, it is echoed in the h5ad unstructured data. | ## ATAC tasks and tools @@ -94,7 +95,7 @@ To see specific tool parameters, select the task WDL link in the table; then vie | bam_aligned_output | ``.bam | BAM containing aligned reads from ATAC workflow. | | fragment_file | ``.fragments.tsv | TSV containing fragment start and stop coordinates per barcode. In order, the columns are "Chromosome", "Start", "Stop", "ATAC Barcode", and "Number Reads". 
| | snap_metrics | ``_``.atac_metrics.csv | CSV file containing library-level metrics. Read more in the [Library Metrics Overview](library-metrics.md) + library_metrics | ``_`_library_metrics.csv | CSV file containing library-level metrics. Read more in the [Library Metrics Overview](library-metrics.md) ## Versioning and testing diff --git a/website/docs/Pipelines/ATAC/library-metrics.md b/website/docs/Pipelines/ATAC/library-metrics.md index 184cfeb8eb..3e80bc85e4 100644 --- a/website/docs/Pipelines/ATAC/library-metrics.md +++ b/website/docs/Pipelines/ATAC/library-metrics.md @@ -10,26 +10,27 @@ The [ATAC pipeline](README.md) uses [SnapATAC2](https://github.com/kaizhang/Snap | Metric | Description | | --- | --- | | NHash_ID | A unique identifier used to track and reference the specific sample or dataset. | -| Sequenced_reads | The total number of reads generated from the sequencing process, which includes both reads that are mapped and unmapped. | -| Sequenced_read_pairs | The total number of read pairs (two reads per pair) generated from the sequencing process. This is typically half of the total sequenced reads if all reads are paired. | -| Fraction_valid_barcode | The fraction of reads that contain a valid barcode, indicating the proportion of reads that are correctly assigned to a specific cell or sample. | -| Fraction_Q30_bases_in_read_1 | The proportion of bases in Read 1 that have a Phred quality score of 30 or higher, indicating high-confidence base calls. | -| Fraction_Q30_bases_in_read_2 | The proportion of bases in Read 2 that have a Phred quality score of 30 or higher, indicating high-confidence base calls. | -| Number_of_cells | The estimated number of cells captured and sequenced in the experiment, based on the barcodes identified. | -| Mean_raw_read_pairs_per_cell | The average number of raw read pairs associated with each cell, providing an indication of the sequencing depth per cell. 
| -| Median_high-quality_fragments_per_cell | The median number of high-quality (e.g., confidently mapped) fragments associated with each cell, representing typical fragment quality across cells. | -| Fraction of high-quality fragments in cells | The fraction of high-quality fragments that are associated with identified cells, indicating the proportion of good-quality data that is cell-associated. | -| Fraction_of_transposition_events_in_peaks_in_cells | The fraction of transposition events within identified cells that occur within peaks, which are regions of accessible chromatin. | -| Fraction_duplicates | The fraction of sequenced fragments that are duplicates, which can result from PCR amplification or other factors, indicating the redundancy in the sequencing data. | -| Fraction_confidently_mapped | The fraction of sequenced fragments that are confidently mapped to the reference genome, indicating the proportion of reads that align well to the genome. | -| Fraction_unmapped | The fraction of sequenced fragments that could not be mapped to the reference genome, which can indicate sequencing errors, contamination, or regions not covered by the reference. | -| Fraction_nonnuclear | The fraction of sequenced fragments that are mapped to non-nuclear (e.g., mitochondrial or other organellar) DNA, providing insight into contamination or organellar activity. | -| Fraction_fragment_in_nucleosome_free_region | The fraction of sequenced fragments that map to nucleosome-free regions, which are indicative of accessible chromatin. | -| Fraction_fragment_flanking_single_nucleosome | The fraction of sequenced fragments that map to regions flanking single nucleosomes, indicating regions with partial chromatin accessibility. | -| TSS_enrichment_score | A measure of the enrichment of transposition events at transcription start sites (TSS), indicating the accessibility of promoters across the genome. 
| -| Fraction_of_high-quality_fragments_overlapping_TSS | The fraction of high-quality fragments that overlap transcription start sites (TSS), providing insight into promoter accessibility. | +| sequenced_reads | The total number of reads generated from the sequencing process, which includes both reads that are mapped and unmapped. | +| sequenced_read_pairs | The total number of read pairs (two reads per pair) generated from the sequencing process. This is typically half of the total sequenced reads if all reads are paired. | +| fraction_valid_barcode | The fraction of reads that contain a valid barcode, indicating the proportion of reads that are correctly assigned to a specific cell or sample. | +| fraction_Q30_bases_in_read_1 | The proportion of bases in Read 1 that have a Phred quality score of 30 or higher, indicating high-confidence base calls. | +| fraction_Q30_bases_in_read_2 | The proportion of bases in Read 2 that have a Phred quality score of 30 or higher, indicating high-confidence base calls. | +| number_of_cells | The estimated number of cells captured and sequenced in the experiment, based on the barcodes identified. | +| mean_raw_read_pairs_per_cell | The average number of raw read pairs associated with each cell, providing an indication of the sequencing depth per cell. | +| median_high-quality_fragments_per_cell | The median number of high-quality (e.g., confidently mapped) fragments associated with each cell, representing typical fragment quality across cells. | +| fraction of high-quality fragments in cells | The fraction of high-quality fragments that are associated with identified cells, indicating the proportion of good-quality data that is cell-associated. | +| fraction_of_transposition_events_in_peaks_in_cells | The fraction of transposition events within identified cells that occur within peaks, which are regions of accessible chromatin. 
| +| fraction_duplicates | The fraction of sequenced fragments that are duplicates, which can result from PCR amplification or other factors, indicating the redundancy in the sequencing data. | +| fraction_confidently_mapped | The fraction of sequenced fragments that are confidently mapped to the reference genome, indicating the proportion of reads that align well to the genome. | +| fraction_unmapped | The fraction of sequenced fragments that could not be mapped to the reference genome, which can indicate sequencing errors, contamination, or regions not covered by the reference. | +| fraction_nonnuclear | The fraction of sequenced fragments that are mapped to non-nuclear (e.g., mitochondrial or other organellar) DNA, providing insight into contamination or organellar activity. | +| fraction_fragment_in_nucleosome_free_region | The fraction of sequenced fragments that map to nucleosome-free regions, which are indicative of accessible chromatin. | +| fraction_fragment_flanking_single_nucleosome | The fraction of sequenced fragments that map to regions flanking single nucleosomes, indicating regions with partial chromatin accessibility. | +| tss_enrichment_score | A measure of the enrichment of transposition events at transcription start sites (TSS), indicating the accessibility of promoters across the genome. | +| fraction_of_high-quality_fragments_overlapping_TSS | The fraction of high-quality fragments that overlap transcription start sites (TSS), providing insight into promoter accessibility. | | Number_of_peaks | The total number of peaks, or regions of accessible chromatin, identified in the dataset, representing potential regulatory elements. | -| Fraction_of_genome_in_peaks | The fraction of the genome that is covered by identified peaks, indicating the extent of chromatin accessibility across the genome. 
| -| Fraction_of_high-quality_fragments_overlapping_peaks | The fraction of high-quality fragments that overlap with identified peaks, providing an indication of the efficiency of the assay in capturing accessible regions. | +| fraction_of_genome_in_peaks | The fraction of the genome that is covered by identified peaks, indicating the extent of chromatin accessibility across the genome. | +| fraction_of_high-quality_fragments_overlapping_peaks | The fraction of high-quality fragments that overlap with identified peaks, providing an indication of the efficiency of the assay in capturing accessible regions. | +| percent_target | Percent of cells recovered; value is calculated as (number_of_cells / expected_cells) * 100, where expected_cells is the atac_expected_cells input. | diff --git a/website/docs/Pipelines/Multiome_Pipeline/README.md b/website/docs/Pipelines/Multiome_Pipeline/README.md index 1062b121a4..625d3320d7 100644 --- a/website/docs/Pipelines/Multiome_Pipeline/README.md +++ b/website/docs/Pipelines/Multiome_Pipeline/README.md @@ -7,7 +7,7 @@ slug: /Pipelines/Multiome_Pipeline/README | Pipeline Version | Date Updated | Documentation Author | Questions or Feedback | | :----: | :---: | :----: | :--------------: | -| [Multiome v5.7.0](https://github.com/broadinstitute/warp/releases) | September, 2024 | Kaylee Mathews | Please [file an issue in WARP](https://github.com/broadinstitute/warp/issues). | +| [Multiome v5.8.0](https://github.com/broadinstitute/warp/releases) | October, 2024 | WARP Pipelines | Please [file an issue in WARP](https://github.com/broadinstitute/warp/issues). | ![Multiome_diagram](./multiome_diagram.png) @@ -59,6 +59,7 @@ Multiome can be deployed using [Cromwell](https://cromwell.readthedocs.io/en/sta | cloud_provider | String describing the cloud provider that should be used to run the workflow; value should be "gcp" or "azure". 
| String | | gex_nhash_id | Optional identifier for the library aliquot; when specified, the gene expression workflow will echo the ID in the gene expression output h5ads (in the adata.uns section) and in the library-level metrics CSV. | | atac_nhash_id | Optional identifier for the library aliquot; when specified, the workflow will echo the ID in the ATAC output h5ads (in the adata.uns section) and in the library-level metrics CSV. +| expected_cells | Number of cells loaded for library preparation; default is set to 3000. | Integer | | annotations_gtf | GTF file containing gene annotations used for GEX cell metric calculation and ATAC fragment metrics; must match the GTF used to build the STAR aligner. | File | | gex_r1_fastq | Array of read 1 FASTQ files representing a single GEX 10x library. | Array[File] | | gex_r2_fastq | Array of read 2 FASTQ files representing a single GEX 10x library.| Array[File] | @@ -109,7 +110,7 @@ The Multiome workflow calls two WARP subworkflows, one external subworkflow (opt | fragment_file_atac | `_atac.fragments.sorted.tsv.gz` | Sorted and bgzipped TSV file containing fragment start and stop coordinates per barcode. The columns are "Chromosome", "Start", "Stop", "ATAC Barcode", "Number of reads", and "GEX Barcode". | | fragment_file_index | `_atac.fragments.sorted.tsv.gz.tbi` | tabix index file for the fragment file. | | snap_metrics_atac | `_atac.metrics.h5ad` | h5ad (Anndata) file containing per-barcode metrics from SnapATAC2. Also contains the equivalent gene expression barcode for each ATAC barcode in the `gex_barcodes` column of the `h5ad.obs` property. See the [ATAC Count Matrix Overview](../ATAC/count-matrix-overview.md) for more details. | -| atac_library_metrics | `_.atac.metrics.csv` | CSV with library-level metrics produced by SnapATAC2. See the ATAC [Library Level Metrics Overview](../ATAC/library-metrics.md) for more details. 
| +| atac_library_metrics | `_atac_.metrics.csv` | CSV with library-level metrics produced by SnapATAC2. See the ATAC [Library Level Metrics Overview](../ATAC/library-metrics.md) for more details. | | genomic_reference_version_gex | `.txt` | File containing the Genome build, source and GTF annotation version. | | bam_gex | `_gex.bam` | BAM file containing aligned reads from Optimus workflow. | | matrix_gex | `_gex_sparse_counts.npz` | NPZ file containing raw gene by cell counts. | @@ -123,9 +124,9 @@ The Multiome workflow calls two WARP subworkflows, one external subworkflow (opt | multimappers_Uniform_matrix | `UniqueAndMult-Uniform.mtx` | Optional output produced when `soloMultiMappers` is "Uniform"; see STARsolo [documentation](https://github.com/alexdobin/STAR/blob/master/docs/STARsolo.md#multi-gene-reads) for more information.| | multimappers_Rescue_matrix | `UniqueAndMult-Rescue.mtx` | Optional output produced when `soloMultiMappers` is "Rescue"; see STARsolo [documentation](https://github.com/alexdobin/STAR/blob/master/docs/STARsolo.md#multi-gene-reads) for more information. | | multimappers_PropUnique_matrix | `UniqueAndMult-PropUnique.mtx` | Optional output produced when `soloMultiMappers` is "PropUnique"; see STARsolo [documentation](https://github.com/alexdobin/STAR/blob/master/docs/STARsolo.md#multi-gene-reads) for more information.| -| gex_aligner_metrics | `.star_metrics.tar` | Text file containing per barcode metrics (`CellReads.stats`) produced by the GEX pipeline STARsolo aligner. | -| library_metrics | `__library_metrics.csv` | Optional CSV file containing all library-level metrics calculated with STARsolo for gene expression data. | -| mtx_files | `.mtx_files.tar` | TAR file with STARsolo matrix market files (barcodes.tsv, features.tsv, and matrix.mtx) | TAR | +| gex_aligner_metrics | `_gex.star_metrics.tar` | Text file containing per barcode metrics (`CellReads.stats`) produced by the GEX pipeline STARsolo aligner. 
| +| library_metrics | `_gex__library_metrics.csv` | Optional CSV file containing all library-level metrics calculated with STARsolo for gene expression data. | +| mtx_files | `_gex.mtx_files.tar` | TAR file with STARsolo matrix market files (barcodes.tsv, features.tsv, and matrix.mtx) | TAR | | cell_barcodes_csv | `` | Optional output produced when `run_cellbender` is "true"; see CellBender [documentation](https://cellbender.readthedocs.io/en/latest/usage/index.html) and [GitHub repository](https://github.com/broadinstitute/CellBender/tree/master) for more information.| | checkpoint_file | `` | Optional output produced when `run_cellbender` is "true"; see CellBender [documentation](https://cellbender.readthedocs.io/en/latest/usage/index.html) and [GitHub repository](https://github.com/broadinstitute/CellBender/tree/master) for more information. | | h5_array | `` | Optional output produced when `run_cellbender` is "true"; see CellBender [documentation](https://cellbender.readthedocs.io/en/latest/usage/index.html) and [GitHub repository](https://github.com/broadinstitute/CellBender/tree/master) for more information. 
| diff --git a/website/docs/Pipelines/Optimus_Pipeline/README.md b/website/docs/Pipelines/Optimus_Pipeline/README.md index 9c1395ed4e..607c2b01a5 100644 --- a/website/docs/Pipelines/Optimus_Pipeline/README.md +++ b/website/docs/Pipelines/Optimus_Pipeline/README.md @@ -7,7 +7,7 @@ slug: /Pipelines/Optimus_Pipeline/README | Pipeline Version | Date Updated | Documentation Author | Questions or Feedback | | :----: | :---: | :----: | :--------------: | -| [optimus_v7.7.0](https://github.com/broadinstitute/warp/releases?q=optimus&expanded=true) | September, 2024 | Elizabeth Kiernan | Please [file an issue in WARP](https://github.com/broadinstitute/warp/issues) | +| [optimus_v7.8.0](https://github.com/broadinstitute/warp/releases?q=optimus&expanded=true) | October, 2024 | WARP Pipelines | Please [file an issue in WARP](https://github.com/broadinstitute/warp/issues) | ![Optimus_diagram](Optimus_diagram.png) @@ -107,7 +107,7 @@ The example configuration files also contain metadata for the reference files, d | ignore_r1_read_length | Boolean that overrides a check on the 10x chemistry. Default is set to false. If true, the workflow will not ensure that the 10x_chemistry_version input matches the chemistry in the read 1 FASTQ. | "true" or "false" (default) | | emptydrops_lower | UMI threshold for emptyDrops detection; default is 100. | N/A | | count_exons | Boolean indicating if the workflow should calculate exon counts **when in single-nucleus (sn_rna) mode**. If true, this option will output an additional layer for the h5ad file. By default, it is set to "false". If the parameter is true and used with sc_rnamode, the workflow will return an error. | "true" or "false" (default) | -| expected_cells | Optional integer input for the expected number of cells, which is used calculate library-level metrics. The default is set to 3,000 | +| gex_expected_cells | Optional integer input for the expected number of cells, which is used to calculate library-level metrics.
The default is set to 3,000. | N/A | #### Pseudogene handling The example Optimus reference files are downloaded directly from GENCODE (see Quickstart table) and are not modified to remove pseudogenes. This is in contrast to the [references created for Cell Ranger](https://support.10xgenomics.com/single-cell-multiome-atac-gex/software/release-notes/references#header) which remove pseudogenes and small RNAs. diff --git a/website/docs/Pipelines/PairedTag_Pipeline/README.md b/website/docs/Pipelines/PairedTag_Pipeline/README.md index 64d0b956f8..d1eeb23b11 100644 --- a/website/docs/Pipelines/PairedTag_Pipeline/README.md +++ b/website/docs/Pipelines/PairedTag_Pipeline/README.md @@ -7,7 +7,7 @@ slug: /Pipelines/PairedTag_Pipeline/README | Pipeline Version | Date Updated | Documentation Author | Questions or Feedback | |:---:| :---: | :---: | :---: | -| [PairedTag_v1.7.0](https://github.com/broadinstitute/warp/releases) | September, 2024 | Kaylee Mathews | Please [file an issue in WARP](https://github.com/broadinstitute/warp/issues). | +| [PairedTag_v1.8.0](https://github.com/broadinstitute/warp/releases) | October, 2024 | WARP Pipelines | Please [file an issue in WARP](https://github.com/broadinstitute/warp/issues). | ## Introduction to the Paired-Tag workflow @@ -104,9 +104,9 @@ The Paired-Tag workflow calls two WARP subworkflows and an additional task which | Output variable name | Filename, if applicable | Output format and description | |--- | --- | --- | -| pairedtag_pipeline_version_out | N.A. | String describing the version of the Paired-Tag pipeline used. | -| bam_aligned_output_atac | `_atac.bam` | BAM file containing aligned reads from ATAC workflow; contains sample and cell barcodes stored in the BB tag if `preindex` is “true”. | -| fragment_file_atac | `_atac.fragments.tsv` or if preindexing = true, `_atac.fragments.BB.tsv` | TSV file containing fragment start and stop coordinates per barcode. 
The columns are "Chromosome", "Start", "Stop", "Barcode", and "Number of reads". When preindexing is used, additional columns include "Sample Barcode", "Cell Barcode", and "Duplicates" (which indicates if a cell barcode matches more than one sample barcode). | +| pairedtag_pipeline_version_out | N/A | String describing the version of the Paired-Tag pipeline used. | +| bam_aligned_output_atac | `_atac.bam` or if `preindex` = true, `_atac.bam.BB.bam` | BAM file containing aligned reads from ATAC workflow; contains sample and cell barcodes stored in the BB tag if `preindex` is “true”. | +| fragment_file_atac | `_atac.fragments.tsv` or if `preindex` = true, `_atac.fragments.sorted.tsv.gz` | TSV file containing fragment start and stop coordinates per barcode. The columns are "Chromosome", "Start", "Stop", "Barcode", and "Number of reads". When `preindex` = true, the file is bgzipped and has additional columns, including "Sample Barcode", "Cell Barcode", and "Duplicates" (which indicates if a cell barcode matches more than one sample barcode). | | snap_metrics_atac | `_atac.metrics.h5ad` | h5ad (Anndata) file containing per-barcode metrics from SnapATAC2. See the [ATAC Count Matrix Overview](../ATAC/count-matrix-overview.md) for more details. If the preindex option is used, the h5ad.obs will contain 3 extra columns: preindex (the sample barcode), CB (cell barcodes), and duplicates (indicates with a 1 if the cell barcode matches more than preindex, otherwise it is 0).| | genomic_reference_version_gex | `.txt` | File containing the Genome build, source and GTF annotation version. | | bam_gex | `_gex.bam` | BAM file containing aligned reads from Optimus workflow. | @@ -117,8 +117,9 @@ The Paired-Tag workflow calls two WARP subworkflows and an additional task which | gene_metrics_gex | `_gex.gene_metrics.csv.gz` | CSV file containing the per-gene metrics.
| | cell_calls_gex | `_gex.emptyDrops` | TSV file containing the EmptyDrops results when the Optimus workflow is run in sc_rna mode. | | h5ad_output_file_gex | `_gex.h5ad` | h5ad (Anndata) file containing the raw cell-by-gene count matrix, gene metrics, cell metrics, and global attributes. See the [Optimus Count Matrix Overview](../Optimus_Pipeline/Loom_schema.md) for more details. | -| library_metrics | `__library_metrics.csv` | Optional CSV file containing all library-level metrics calculated with STARsolo for gene expression data. | -| cloud_provider | String describing the cloud provider that should be used to run the workflow; value should be "gcp" or "azure". | String | +| library_metrics | `_gex__library_metrics.csv` | Optional CSV file containing all library-level metrics calculated with STARsolo for gene expression data. | +| atac_library_final | `_atac__library_metrics` | CSV file containing all the library-level metrics calculated by SnapATAC2. | +| cloud_provider | N/A | String describing the cloud provider that should be used to run the workflow; value should be "gcp" or "azure". | | multimappers_EM_matrix | `UniqueAndMult-EM.mtx` | Optional output produced when `soloMultiMappers` is "EM"; see STARsolo [documentation](https://github.com/alexdobin/STAR/blob/master/docs/STARsolo.md#multi-gene-reads) for more information.| | multimappers_Uniform_matrix | `UniqueAndMult-Uniform.mtx` | Optional output produced when `soloMultiMappers` is "Uniform" (default); see STARsolo [documentation](https://github.com/alexdobin/STAR/blob/master/docs/STARsolo.md#multi-gene-reads) for more information.| | multimappers_Rescue_matrix | `UniqueAndMult-Rescue.mtx` | Optional output produced when `soloMultiMappers` is "Rescue"; see STARsolo [documentation](https://github.com/alexdobin/STAR/blob/master/docs/STARsolo.md#multi-gene-reads) for more information.
| From 672993e3f929b8041963f050e4cd178d6a269ab2 Mon Sep 17 00:00:00 2001 From: meganshand Date: Wed, 30 Oct 2024 10:20:30 -0400 Subject: [PATCH 2/4] Reducing memory required for ValidateVCF in Reblock pipeline (#1400) * Reducing memory required for ValidateVCF when the interval list to check is a GVCF * changelogs and pipeline versions * Updated pipeline_versions.txt with all pipeline version information --------- Co-authored-by: Nikelle Petrillo <38223776+nikellepetrillo@users.noreply.github.com> Co-authored-by: npetrill Co-authored-by: GitHub Action --- pipeline_versions.txt | 32 +++++++++---------- .../arrays/single_sample/Arrays.changelog.md | 5 +++ .../broad/arrays/single_sample/Arrays.wdl | 2 +- .../reblocking/ReblockGVCF.changelog.md | 5 +++ .../reblocking/ReblockGVCF.wdl | 2 +- .../ExomeGermlineSingleSample.changelog.md | 5 +++ .../exome/ExomeGermlineSingleSample.wdl | 2 +- ...maGenomicsWholeGenomeGermline.changelog.md | 5 +++ .../UltimaGenomicsWholeGenomeGermline.wdl | 2 +- ...oleGenomeGermlineSingleSample.changelog.md | 5 +++ .../wgs/WholeGenomeGermlineSingleSample.wdl | 2 +- .../VariantCalling.changelog.md | 5 +++ .../variant_calling/VariantCalling.wdl | 2 +- ...maGenomicsWholeGenomeCramOnly.changelog.md | 5 +++ .../UltimaGenomicsWholeGenomeCramOnly.wdl | 2 +- .../IlluminaGenotypingArray.changelog.md | 5 +++ .../illumina/IlluminaGenotypingArray.wdl | 2 +- .../BroadInternalArrays.changelog.md | 5 +++ .../single_sample/BroadInternalArrays.wdl | 2 +- .../BroadInternalUltimaGenomics.changelog.md | 5 +++ .../BroadInternalUltimaGenomics.wdl | 2 +- .../BroadInternalRNAWithUMIs.changelog.md | 5 +++ .../rna_seq/BroadInternalRNAWithUMIs.wdl | 2 +- .../broad/qc/CheckFingerprint.changelog.md | 5 +++ pipelines/broad/qc/CheckFingerprint.wdl | 2 +- .../exome/ExomeReprocessing.changelog.md | 5 +++ .../reprocessing/exome/ExomeReprocessing.wdl | 2 +- .../ExternalExomeReprocessing.changelog.md | 5 +++ .../exome/ExternalExomeReprocessing.wdl | 2 +- 
...ternalWholeGenomeReprocessing.changelog.md | 5 +++ .../wgs/ExternalWholeGenomeReprocessing.wdl | 2 +- .../wgs/WholeGenomeReprocessing.changelog.md | 5 +++ .../wgs/WholeGenomeReprocessing.wdl | 2 +- tasks/broad/Qc.wdl | 4 +-- 34 files changed, 114 insertions(+), 34 deletions(-) diff --git a/pipeline_versions.txt b/pipeline_versions.txt index c176a31945..523a909195 100644 --- a/pipeline_versions.txt +++ b/pipeline_versions.txt @@ -1,32 +1,32 @@ Pipeline Name Version Date of Last Commit -CheckFingerprint 1.0.21 2024-09-06 +CheckFingerprint 1.0.22 2024-10-28 RNAWithUMIsPipeline 1.0.17 2024-09-06 AnnotationFiltration 1.2.6 2024-09-06 -UltimaGenomicsWholeGenomeGermline 1.1.0 2024-09-06 -WholeGenomeGermlineSingleSample 3.3.1 2024-09-17 -ExomeGermlineSingleSample 3.2.1 2024-09-17 +UltimaGenomicsWholeGenomeGermline 1.1.1 2024-10-28 +WholeGenomeGermlineSingleSample 3.3.2 2024-10-28 +ExomeGermlineSingleSample 3.2.2 2024-10-28 JointGenotypingByChromosomePartTwo 1.5.1 2024-09-10 JointGenotypingByChromosomePartOne 1.5.1 2024-09-10 -ReblockGVCF 2.3.0 2024-09-06 +ReblockGVCF 2.3.1 2024-10-28 JointGenotyping 1.7.1 2024-09-10 UltimaGenomicsJointGenotyping 1.2.1 2024-09-10 -VariantCalling 2.2.2 2024-09-06 -UltimaGenomicsWholeGenomeCramOnly 1.0.21 2024-09-06 +VariantCalling 2.2.3 2024-10-28 +UltimaGenomicsWholeGenomeCramOnly 1.0.22 2024-10-28 GDCWholeGenomeSomaticSingleSample 1.3.3 2024-09-06 -BroadInternalRNAWithUMIs 1.0.34 2024-09-06 -BroadInternalUltimaGenomics 1.1.0 2024-09-06 -BroadInternalArrays 1.1.12 2024-09-06 +BroadInternalRNAWithUMIs 1.0.35 2024-09-06 +BroadInternalUltimaGenomics 1.1.1 2024-10-28 +BroadInternalArrays 1.1.13 2024-10-28 BroadInternalImputation 1.1.13 2024-09-06 -Arrays 2.6.28 2024-09-06 +Arrays 2.6.29 2024-10-28 ValidateChip 1.16.6 2024-09-06 MultiSampleArrays 1.6.2 2024-08-02 Imputation 1.1.14 2024-09-06 -IlluminaGenotypingArray 1.12.22 2024-09-06 -ExternalWholeGenomeReprocessing 2.3.1 2024-09-17 -ExternalExomeReprocessing 3.3.1 2024-09-17 
+IlluminaGenotypingArray 1.12.23 2024-10-28 +ExternalWholeGenomeReprocessing 2.3.2 2024-10-28 +ExternalExomeReprocessing 3.3.2 2024-10-28 CramToUnmappedBams 1.1.3 2024-08-02 -WholeGenomeReprocessing 3.3.1 2024-09-17 -ExomeReprocessing 3.3.1 2024-09-17 +WholeGenomeReprocessing 3.3.2 2024-10-28 +ExomeReprocessing 3.3.2 2024-10-28 BuildIndices 3.0.0 2023-12-06 scATAC 1.3.2 2023-08-03 snm3C 4.0.4 2024-08-06 diff --git a/pipelines/broad/arrays/single_sample/Arrays.changelog.md b/pipelines/broad/arrays/single_sample/Arrays.changelog.md index a45c451da1..004aaace5a 100644 --- a/pipelines/broad/arrays/single_sample/Arrays.changelog.md +++ b/pipelines/broad/arrays/single_sample/Arrays.changelog.md @@ -1,3 +1,8 @@ +# 2.6.29 +2024-10-28 (Date of Last Commit) + +* Updated the docker in the ValidateVCF task; this does not affect this pipeline + # 2.6.28 2024-09-06 (Date of Last Commit) diff --git a/pipelines/broad/arrays/single_sample/Arrays.wdl b/pipelines/broad/arrays/single_sample/Arrays.wdl index 8bffa5be5b..72004afa3d 100644 --- a/pipelines/broad/arrays/single_sample/Arrays.wdl +++ b/pipelines/broad/arrays/single_sample/Arrays.wdl @@ -23,7 +23,7 @@ import "../../../../tasks/broad/Utilities.wdl" as utils workflow Arrays { - String pipeline_version = "2.6.28" + String pipeline_version = "2.6.29" input { String chip_well_barcode diff --git a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/ReblockGVCF.changelog.md b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/ReblockGVCF.changelog.md index c5e8472119..27f3b07cfb 100644 --- a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/ReblockGVCF.changelog.md +++ b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/ReblockGVCF.changelog.md @@ -1,3 +1,8 @@ +# 2.3.1 +2024-10-28 (Date of Last Commit) + +* Updated GATK for Validate Variants, which reduces the memory requirements for the task when an interval list is not provided + # 2.3.0 2024-09-06 (Date of Last Commit) diff --git 
a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/ReblockGVCF.wdl b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/ReblockGVCF.wdl index 4012c2f35f..f16fe3031d 100644 --- a/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/ReblockGVCF.wdl +++ b/pipelines/broad/dna_seq/germline/joint_genotyping/reblocking/ReblockGVCF.wdl @@ -6,7 +6,7 @@ import "../../../../../../tasks/broad/Utilities.wdl" as utils workflow ReblockGVCF { - String pipeline_version = "2.3.0" + String pipeline_version = "2.3.1" input { diff --git a/pipelines/broad/dna_seq/germline/single_sample/exome/ExomeGermlineSingleSample.changelog.md b/pipelines/broad/dna_seq/germline/single_sample/exome/ExomeGermlineSingleSample.changelog.md index 08e18da583..7531a4a61d 100644 --- a/pipelines/broad/dna_seq/germline/single_sample/exome/ExomeGermlineSingleSample.changelog.md +++ b/pipelines/broad/dna_seq/germline/single_sample/exome/ExomeGermlineSingleSample.changelog.md @@ -1,3 +1,8 @@ +# 3.2.2 +2024-10-28 (Date of Last Commit) + +* Updated the docker in the ValidateVCF task; this does not affect this pipeline + # 3.2.1 2024-09-17 (Date of Last Commit) diff --git a/pipelines/broad/dna_seq/germline/single_sample/exome/ExomeGermlineSingleSample.wdl b/pipelines/broad/dna_seq/germline/single_sample/exome/ExomeGermlineSingleSample.wdl index 0162c6c617..7dece64d75 100644 --- a/pipelines/broad/dna_seq/germline/single_sample/exome/ExomeGermlineSingleSample.wdl +++ b/pipelines/broad/dna_seq/germline/single_sample/exome/ExomeGermlineSingleSample.wdl @@ -45,7 +45,7 @@ import "../../../../../../tasks/broad/Utilities.wdl" as utils # WORKFLOW DEFINITION workflow ExomeGermlineSingleSample { - String pipeline_version = "3.2.1" + String pipeline_version = "3.2.2" input { diff --git a/pipelines/broad/dna_seq/germline/single_sample/ugwgs/UltimaGenomicsWholeGenomeGermline.changelog.md b/pipelines/broad/dna_seq/germline/single_sample/ugwgs/UltimaGenomicsWholeGenomeGermline.changelog.md index 
7866e82efa..a28c1d6a90 100644 --- a/pipelines/broad/dna_seq/germline/single_sample/ugwgs/UltimaGenomicsWholeGenomeGermline.changelog.md +++ b/pipelines/broad/dna_seq/germline/single_sample/ugwgs/UltimaGenomicsWholeGenomeGermline.changelog.md @@ -1,3 +1,8 @@ +# 1.1.1 +2024-10-28 (Date of Last Commit) + +* Updated the docker in the ValidateVCF task; this does not affect this pipeline + # 1.1.0 2024-09-06 (Date of Last Commit) diff --git a/pipelines/broad/dna_seq/germline/single_sample/ugwgs/UltimaGenomicsWholeGenomeGermline.wdl b/pipelines/broad/dna_seq/germline/single_sample/ugwgs/UltimaGenomicsWholeGenomeGermline.wdl index b30e320f77..d86df9539e 100644 --- a/pipelines/broad/dna_seq/germline/single_sample/ugwgs/UltimaGenomicsWholeGenomeGermline.wdl +++ b/pipelines/broad/dna_seq/germline/single_sample/ugwgs/UltimaGenomicsWholeGenomeGermline.wdl @@ -50,7 +50,7 @@ workflow UltimaGenomicsWholeGenomeGermline { filtering_model_no_gt_name: "String describing the optional filtering model; default set to rf_model_ignore_gt_incl_hpol_runs" } - String pipeline_version = "1.1.0" + String pipeline_version = "1.1.1" References references = alignment_references.references diff --git a/pipelines/broad/dna_seq/germline/single_sample/wgs/WholeGenomeGermlineSingleSample.changelog.md b/pipelines/broad/dna_seq/germline/single_sample/wgs/WholeGenomeGermlineSingleSample.changelog.md index 8c3501292c..91bf0f0f7f 100644 --- a/pipelines/broad/dna_seq/germline/single_sample/wgs/WholeGenomeGermlineSingleSample.changelog.md +++ b/pipelines/broad/dna_seq/germline/single_sample/wgs/WholeGenomeGermlineSingleSample.changelog.md @@ -1,3 +1,8 @@ +# 3.3.2 +2024-10-28 (Date of Last Commit) + +* Updated the docker in the ValidateVCF task; this does not affect this pipeline + # 3.3.1 2024-09-17 (Date of Last Commit) diff --git a/pipelines/broad/dna_seq/germline/single_sample/wgs/WholeGenomeGermlineSingleSample.wdl b/pipelines/broad/dna_seq/germline/single_sample/wgs/WholeGenomeGermlineSingleSample.wdl 
index 552efef9ae..2635fca75c 100644 --- a/pipelines/broad/dna_seq/germline/single_sample/wgs/WholeGenomeGermlineSingleSample.wdl +++ b/pipelines/broad/dna_seq/germline/single_sample/wgs/WholeGenomeGermlineSingleSample.wdl @@ -40,7 +40,7 @@ import "../../../../../../structs/dna_seq/DNASeqStructs.wdl" workflow WholeGenomeGermlineSingleSample { - String pipeline_version = "3.3.1" + String pipeline_version = "3.3.2" input { diff --git a/pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.changelog.md b/pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.changelog.md index 54b9799bbd..3be5d2c9bb 100644 --- a/pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.changelog.md +++ b/pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.changelog.md @@ -1,3 +1,8 @@ +# 2.2.3 +2024-10-28 (Date of Last Commit) + +* Updated the docker in the ValidateVCF task; this does not affect this pipeline + # 2.2.2 2024-09-06 (Date of Last Commit) diff --git a/pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.wdl b/pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.wdl index 0e9fc75db1..3a1f0748a4 100644 --- a/pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.wdl +++ b/pipelines/broad/dna_seq/germline/variant_calling/VariantCalling.wdl @@ -9,7 +9,7 @@ import "../../../../../tasks/broad/DragenTasks.wdl" as DragenTasks workflow VariantCalling { - String pipeline_version = "2.2.2" + String pipeline_version = "2.2.3" input { diff --git a/pipelines/broad/dna_seq/somatic/single_sample/ugwgs/UltimaGenomicsWholeGenomeCramOnly.changelog.md b/pipelines/broad/dna_seq/somatic/single_sample/ugwgs/UltimaGenomicsWholeGenomeCramOnly.changelog.md index 6f659a20c5..bcb304a356 100644 --- a/pipelines/broad/dna_seq/somatic/single_sample/ugwgs/UltimaGenomicsWholeGenomeCramOnly.changelog.md +++ b/pipelines/broad/dna_seq/somatic/single_sample/ugwgs/UltimaGenomicsWholeGenomeCramOnly.changelog.md @@ -1,3 +1,8 @@ +# 1.0.22 
+2024-10-28 (Date of Last Commit) + +* Updated the docker in the ValidateVCF task; this does not affect this pipeline + # 1.0.21 2024-09-06 (Date of Last Commit) diff --git a/pipelines/broad/dna_seq/somatic/single_sample/ugwgs/UltimaGenomicsWholeGenomeCramOnly.wdl b/pipelines/broad/dna_seq/somatic/single_sample/ugwgs/UltimaGenomicsWholeGenomeCramOnly.wdl index c38469fed0..23c484698b 100644 --- a/pipelines/broad/dna_seq/somatic/single_sample/ugwgs/UltimaGenomicsWholeGenomeCramOnly.wdl +++ b/pipelines/broad/dna_seq/somatic/single_sample/ugwgs/UltimaGenomicsWholeGenomeCramOnly.wdl @@ -43,7 +43,7 @@ workflow UltimaGenomicsWholeGenomeCramOnly { save_bam_file: "If true, then save intermeidate ouputs used by germline pipeline (such as the output BAM) otherwise they won't be kept as outputs." } - String pipeline_version = "1.0.21" + String pipeline_version = "1.0.22" References references = alignment_references.references diff --git a/pipelines/broad/genotyping/illumina/IlluminaGenotypingArray.changelog.md b/pipelines/broad/genotyping/illumina/IlluminaGenotypingArray.changelog.md index ab87074163..9157f9faad 100644 --- a/pipelines/broad/genotyping/illumina/IlluminaGenotypingArray.changelog.md +++ b/pipelines/broad/genotyping/illumina/IlluminaGenotypingArray.changelog.md @@ -1,3 +1,8 @@ +# 1.12.23 +2024-10-28 (Date of Last Commit) + +* Updated the docker in the ValidateVCF task; this does not affect this pipeline + # 1.12.22 2024-09-06 (Date of Last Commit) diff --git a/pipelines/broad/genotyping/illumina/IlluminaGenotypingArray.wdl b/pipelines/broad/genotyping/illumina/IlluminaGenotypingArray.wdl index 2b865aa8a1..5dfe13b1a5 100644 --- a/pipelines/broad/genotyping/illumina/IlluminaGenotypingArray.wdl +++ b/pipelines/broad/genotyping/illumina/IlluminaGenotypingArray.wdl @@ -21,7 +21,7 @@ import "../../../../tasks/broad/Qc.wdl" as Qc workflow IlluminaGenotypingArray { - String pipeline_version = "1.12.22" + String pipeline_version = "1.12.23" input { String sample_alias diff 
--git a/pipelines/broad/internal/arrays/single_sample/BroadInternalArrays.changelog.md b/pipelines/broad/internal/arrays/single_sample/BroadInternalArrays.changelog.md index 66b6ae2d18..c7967f1427 100644 --- a/pipelines/broad/internal/arrays/single_sample/BroadInternalArrays.changelog.md +++ b/pipelines/broad/internal/arrays/single_sample/BroadInternalArrays.changelog.md @@ -1,3 +1,8 @@ +# 1.1.13 +2024-10-28 (Date of Last Commit) + +* Updated the docker in the ValidateVCF task; this does not affect this pipeline + # 1.1.12 2024-09-06 (Date of Last Commit) diff --git a/pipelines/broad/internal/arrays/single_sample/BroadInternalArrays.wdl b/pipelines/broad/internal/arrays/single_sample/BroadInternalArrays.wdl index f03e5bf5f4..a63ee8407c 100644 --- a/pipelines/broad/internal/arrays/single_sample/BroadInternalArrays.wdl +++ b/pipelines/broad/internal/arrays/single_sample/BroadInternalArrays.wdl @@ -9,7 +9,7 @@ workflow BroadInternalArrays { description: "Push outputs of Arrays.wdl to TDR dataset table ArraysOutputsTable." 
} - String pipeline_version = "1.1.12" + String pipeline_version = "1.1.13" input { # inputs to wrapper task diff --git a/pipelines/broad/internal/dna_seq/germline/single_sample/UltimaGenomics/BroadInternalUltimaGenomics.changelog.md b/pipelines/broad/internal/dna_seq/germline/single_sample/UltimaGenomics/BroadInternalUltimaGenomics.changelog.md index c0935d25e5..2365f13a21 100644 --- a/pipelines/broad/internal/dna_seq/germline/single_sample/UltimaGenomics/BroadInternalUltimaGenomics.changelog.md +++ b/pipelines/broad/internal/dna_seq/germline/single_sample/UltimaGenomics/BroadInternalUltimaGenomics.changelog.md @@ -1,3 +1,8 @@ +# 1.1.1 +2024-10-28 (Date of Last Commit) + +* Updated the docker in the ValidateVCF task; this does not affect this pipeline + # 1.1.0 2024-09-06 (Date of Last Commit) diff --git a/pipelines/broad/internal/dna_seq/germline/single_sample/UltimaGenomics/BroadInternalUltimaGenomics.wdl b/pipelines/broad/internal/dna_seq/germline/single_sample/UltimaGenomics/BroadInternalUltimaGenomics.wdl index a5d780f978..f9c9eaad7d 100644 --- a/pipelines/broad/internal/dna_seq/germline/single_sample/UltimaGenomics/BroadInternalUltimaGenomics.wdl +++ b/pipelines/broad/internal/dna_seq/germline/single_sample/UltimaGenomics/BroadInternalUltimaGenomics.wdl @@ -6,7 +6,7 @@ import "../../../../../../../pipelines/broad/qc/CheckFingerprint.wdl" as FP workflow BroadInternalUltimaGenomics { - String pipeline_version = "1.1.0" + String pipeline_version = "1.1.1" input { diff --git a/pipelines/broad/internal/rna_seq/BroadInternalRNAWithUMIs.changelog.md b/pipelines/broad/internal/rna_seq/BroadInternalRNAWithUMIs.changelog.md index de80b0c60f..34f0ff322f 100644 --- a/pipelines/broad/internal/rna_seq/BroadInternalRNAWithUMIs.changelog.md +++ b/pipelines/broad/internal/rna_seq/BroadInternalRNAWithUMIs.changelog.md @@ -1,3 +1,8 @@ +# 1.0.35 +2024-09-06 (Date of Last Commit) + +* Updated the docker in the ValidateVCF task; this does not affect this pipeline + # 1.0.34 
2024-09-06 (Date of Last Commit) diff --git a/pipelines/broad/internal/rna_seq/BroadInternalRNAWithUMIs.wdl b/pipelines/broad/internal/rna_seq/BroadInternalRNAWithUMIs.wdl index 4d447a0451..aa13d1364d 100644 --- a/pipelines/broad/internal/rna_seq/BroadInternalRNAWithUMIs.wdl +++ b/pipelines/broad/internal/rna_seq/BroadInternalRNAWithUMIs.wdl @@ -7,7 +7,7 @@ import "../../../../tasks/broad/Utilities.wdl" as utils workflow BroadInternalRNAWithUMIs { - String pipeline_version = "1.0.34" + String pipeline_version = "1.0.35" input { # input needs to be either "hg19" or "hg38" diff --git a/pipelines/broad/qc/CheckFingerprint.changelog.md b/pipelines/broad/qc/CheckFingerprint.changelog.md index a1e6e0b579..1d819d8734 100644 --- a/pipelines/broad/qc/CheckFingerprint.changelog.md +++ b/pipelines/broad/qc/CheckFingerprint.changelog.md @@ -1,3 +1,8 @@ +# 1.0.22 +2024-10-28 (Date of Last Commit) + +* Updated the docker in the ValidateVCF task; this does not affect this pipeline + # 1.0.21 2024-09-06 (Date of Last Commit) diff --git a/pipelines/broad/qc/CheckFingerprint.wdl b/pipelines/broad/qc/CheckFingerprint.wdl index 1dbaad70e0..eacf6b63a7 100644 --- a/pipelines/broad/qc/CheckFingerprint.wdl +++ b/pipelines/broad/qc/CheckFingerprint.wdl @@ -24,7 +24,7 @@ import "../../../tasks/broad/Qc.wdl" as Qc workflow CheckFingerprint { - String pipeline_version = "1.0.21" + String pipeline_version = "1.0.22" input { File? 
input_vcf diff --git a/pipelines/broad/reprocessing/exome/ExomeReprocessing.changelog.md b/pipelines/broad/reprocessing/exome/ExomeReprocessing.changelog.md index ddebb0a520..ae7def5e1f 100644 --- a/pipelines/broad/reprocessing/exome/ExomeReprocessing.changelog.md +++ b/pipelines/broad/reprocessing/exome/ExomeReprocessing.changelog.md @@ -1,3 +1,8 @@ +# 3.3.2 +2024-10-28 (Date of Last Commit) + +* Updated the docker in the ValidateVCF task; this does not affect this pipeline + # 3.3.1 2024-09-17 (Date of Last Commit) diff --git a/pipelines/broad/reprocessing/exome/ExomeReprocessing.wdl b/pipelines/broad/reprocessing/exome/ExomeReprocessing.wdl index 9b1a08a9bf..bc824f0471 100644 --- a/pipelines/broad/reprocessing/exome/ExomeReprocessing.wdl +++ b/pipelines/broad/reprocessing/exome/ExomeReprocessing.wdl @@ -7,7 +7,7 @@ import "../../../../structs/dna_seq/DNASeqStructs.wdl" workflow ExomeReprocessing { - String pipeline_version = "3.3.1" + String pipeline_version = "3.3.2" input { File? input_cram diff --git a/pipelines/broad/reprocessing/external/exome/ExternalExomeReprocessing.changelog.md b/pipelines/broad/reprocessing/external/exome/ExternalExomeReprocessing.changelog.md index 0bb41bf60a..c262d4f6f6 100644 --- a/pipelines/broad/reprocessing/external/exome/ExternalExomeReprocessing.changelog.md +++ b/pipelines/broad/reprocessing/external/exome/ExternalExomeReprocessing.changelog.md @@ -1,3 +1,8 @@ +# 3.3.2 +2024-10-28 (Date of Last Commit) + +* Updated the docker in the ValidateVCF task; this does not affect this pipeline + # 3.3.1 2024-09-17 (Date of Last Commit) diff --git a/pipelines/broad/reprocessing/external/exome/ExternalExomeReprocessing.wdl b/pipelines/broad/reprocessing/external/exome/ExternalExomeReprocessing.wdl index 2b0bde28cb..7ee18f74e9 100644 --- a/pipelines/broad/reprocessing/external/exome/ExternalExomeReprocessing.wdl +++ b/pipelines/broad/reprocessing/external/exome/ExternalExomeReprocessing.wdl @@ -5,7 +5,7 @@ import 
"../../../../../tasks/broad/CopyFilesFromCloudToCloud.wdl" as Copy workflow ExternalExomeReprocessing { - String pipeline_version = "3.3.1" + String pipeline_version = "3.3.2" input { diff --git a/pipelines/broad/reprocessing/external/wgs/ExternalWholeGenomeReprocessing.changelog.md b/pipelines/broad/reprocessing/external/wgs/ExternalWholeGenomeReprocessing.changelog.md index 74658d622d..a7a906a3a1 100644 --- a/pipelines/broad/reprocessing/external/wgs/ExternalWholeGenomeReprocessing.changelog.md +++ b/pipelines/broad/reprocessing/external/wgs/ExternalWholeGenomeReprocessing.changelog.md @@ -1,3 +1,8 @@ +# 2.3.2 +2024-10-28 (Date of Last Commit) + +* Updated the docker in the ValidateVCF task; this does not affect this pipeline + # 2.3.1 2024-09-17 (Date of Last Commit) diff --git a/pipelines/broad/reprocessing/external/wgs/ExternalWholeGenomeReprocessing.wdl b/pipelines/broad/reprocessing/external/wgs/ExternalWholeGenomeReprocessing.wdl index 3168d61add..7c709d915a 100644 --- a/pipelines/broad/reprocessing/external/wgs/ExternalWholeGenomeReprocessing.wdl +++ b/pipelines/broad/reprocessing/external/wgs/ExternalWholeGenomeReprocessing.wdl @@ -6,7 +6,7 @@ import "../../../../../tasks/broad/CopyFilesFromCloudToCloud.wdl" as Copy workflow ExternalWholeGenomeReprocessing { - String pipeline_version = "2.3.1" + String pipeline_version = "2.3.2" input { File? 
input_cram diff --git a/pipelines/broad/reprocessing/wgs/WholeGenomeReprocessing.changelog.md b/pipelines/broad/reprocessing/wgs/WholeGenomeReprocessing.changelog.md index 0621f0af5b..ef8ee3c4b9 100644 --- a/pipelines/broad/reprocessing/wgs/WholeGenomeReprocessing.changelog.md +++ b/pipelines/broad/reprocessing/wgs/WholeGenomeReprocessing.changelog.md @@ -1,3 +1,8 @@ +# 3.3.2 +2024-10-28 (Date of Last Commit) + +* Updated the docker in the ValidateVCF task; this does not affect this pipeline + # 3.3.1 2024-09-17 (Date of Last Commit) diff --git a/pipelines/broad/reprocessing/wgs/WholeGenomeReprocessing.wdl b/pipelines/broad/reprocessing/wgs/WholeGenomeReprocessing.wdl index cf2be102ad..ef9d496ce7 100644 --- a/pipelines/broad/reprocessing/wgs/WholeGenomeReprocessing.wdl +++ b/pipelines/broad/reprocessing/wgs/WholeGenomeReprocessing.wdl @@ -6,7 +6,7 @@ import "../../../../structs/dna_seq/DNASeqStructs.wdl" workflow WholeGenomeReprocessing { - String pipeline_version = "3.3.1" + String pipeline_version = "3.3.2" input { File? input_cram diff --git a/tasks/broad/Qc.wdl b/tasks/broad/Qc.wdl index ff0d1b3fcf..b1561407a5 100644 --- a/tasks/broad/Qc.wdl +++ b/tasks/broad/Qc.wdl @@ -623,7 +623,7 @@ task ValidateVCF { Boolean is_gvcf = true String? extra_args #Setting default docker value for workflows that haven't yet been azurized. 
- String docker_path = "us.gcr.io/broad-gatk/gatk:4.6.0.0" + String docker_path = "us.gcr.io/broad-gatk/gatk:4.6.1.0" Int machine_mem_mb = 7000 } @@ -642,7 +642,7 @@ task ValidateVCF { ln -s ~{calling_interval_list} ~{calling_interval_list_basename} ln -s ~{calling_interval_list_index} ~{calling_interval_list_index_basename} gatk --java-options "-Xms~{command_mem_mb}m -Xmx~{command_mem_mb}m" \ - VcfToIntervalList -I ~{calling_interval_list_basename} -O intervals_from_gvcf.interval_list + VcfToIntervalList -I ~{calling_interval_list_basename} -O intervals_from_gvcf.interval_list --VARIANT_ID_METHOD USE_FIRST INTERVALS="intervals_from_gvcf.interval_list" else INTERVALS="~{calling_interval_list}" From a2056e73a4b8dcd04052e48887192632757cc055 Mon Sep 17 00:00:00 2001 From: Elizabeth Kiernan <55763654+ekiernan@users.noreply.github.com> Date: Wed, 30 Oct 2024 12:55:50 -0400 Subject: [PATCH 3/4] Lk add bgzip (#1404) Added bgzipped outputs to ATAC and PairedTag workflows as default --- pipelines/skylab/atac/atac.changelog.md | 1 + pipelines/skylab/atac/atac.wdl | 10 +++++++++- pipelines/skylab/multiome/Multiome.changelog.md | 1 + pipelines/skylab/optimus/Optimus.changelog.md | 1 + pipelines/skylab/paired_tag/PairedTag.changelog.md | 1 + pipelines/skylab/slideseq/SlideSeq.changelog.md | 1 + .../MultiSampleSmartSeq2SingleNucleus.changelog.md | 1 + tasks/skylab/H5adUtils.wdl | 12 +++++++++--- tasks/skylab/PairedTagUtils.wdl | 11 ++++++++--- website/docs/Pipelines/ATAC/README.md | 2 +- website/docs/Pipelines/PairedTag_Pipeline/README.md | 2 +- 11 files changed, 34 insertions(+), 9 deletions(-) diff --git a/pipelines/skylab/atac/atac.changelog.md b/pipelines/skylab/atac/atac.changelog.md index 4b886f32b2..c35fc26a23 100644 --- a/pipelines/skylab/atac/atac.changelog.md +++ b/pipelines/skylab/atac/atac.changelog.md @@ -4,6 +4,7 @@ * Added a new input parameter for atac_expected_cells, which describes the numnber of cells used for the library preparation * Updated the ATAC library 
CSV to be consistent in file naming convention and to have similar case for metric names to the Optimus workflow library CSV * Added a new metric to the ATAC library CSV to calculate percent_target, which is the number of estimated cells by SnapATAC2 divided by expected_cells input +* Updated the ATAC workflow so that the output fragment file is bgzipped by default # 2.3.2 diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index 153f817c6f..8892f310c6 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -607,6 +607,14 @@ task CreateFragmentFile { atac_data.write_h5ad("~{input_id}.metrics.h5ad") CODE + + # sorting the file + echo "Sorting file" + sort -k1,1V -k2,2n "~{input_id}.fragments.tsv" > "~{input_id}.fragments.sorted.tsv" + echo "Starting bgzip" + bgzip "~{input_id}.fragments.sorted.tsv" + echo "Starting tabix" + tabix -s 1 -b 2 -e 3 "~{input_id}.fragments.sorted.tsv.gz" >>> runtime { @@ -618,7 +626,7 @@ task CreateFragmentFile { } output { - File fragment_file = "~{input_id}.fragments.tsv" + File fragment_file = "~{input_id}.fragments.sorted.tsv.gz" File Snap_metrics = "~{input_id}.metrics.h5ad" File atac_library_metrics = "~{input_id}_~{atac_nhash_id}_library_metrics.csv" } diff --git a/pipelines/skylab/multiome/Multiome.changelog.md b/pipelines/skylab/multiome/Multiome.changelog.md index 1cdbef30a8..ce69629951 100644 --- a/pipelines/skylab/multiome/Multiome.changelog.md +++ b/pipelines/skylab/multiome/Multiome.changelog.md @@ -4,6 +4,7 @@ * Updated the workflow to include a new expected_cells input parameter describing the number of cells used as input to the library preparation; this is passed to both the ATAC workflows and Optimus workflows and the default is set to 3000 cells * Updated the ATAC library CSV and the Gene Expression library CSV to be consistent in file naming convention and to have similar case for metric names * Added a new metric to the ATAC library CSV to calculate percent_target, 
which is the number of estimated cells by SnapATAC2 divided by expected_cells input +* Updated the ATAC workflow so that the output fragment file is bgzipped by default # 5.7.1 2024-10-18 (Date of Last Commit) diff --git a/pipelines/skylab/optimus/Optimus.changelog.md b/pipelines/skylab/optimus/Optimus.changelog.md index 8d82cdf07f..a201ddc395 100644 --- a/pipelines/skylab/optimus/Optimus.changelog.md +++ b/pipelines/skylab/optimus/Optimus.changelog.md @@ -4,6 +4,7 @@ * Renamed the input expected_cells to gex_expected_cells * Updated gex_expected_cells to a required output * Reformatted the library CSV output filename to remove an extra gex +* Updated the ATAC fragment file output so that it is bgzipped; this does not impact the Optimus workflow # 7.7.0 2024-09-24 (Date of Last Commit) diff --git a/pipelines/skylab/paired_tag/PairedTag.changelog.md b/pipelines/skylab/paired_tag/PairedTag.changelog.md index a7071e222f..930ed976fc 100644 --- a/pipelines/skylab/paired_tag/PairedTag.changelog.md +++ b/pipelines/skylab/paired_tag/PairedTag.changelog.md @@ -4,6 +4,7 @@ * Updated the workflow to include a new expected_cells input parameter describing the number of cells used as input to the library preparation; this is passed to both the ATAC workflows and Optimus workflows and the default is set to 3000 cells * Updated the ATAC library CSV and the Gene Expression library CSV to be consistent in file naming convention and to have similar case for metric names * Added a new metric to the ATAC library CSV to calculate percent_target, which is the number of estimated cells by SnapATAC2 divided by expected_cells input +* Updated the ATAC fragment file output so that it is bgzipped # 1.7.1 2024-10-18 (Date of Last Commit) diff --git a/pipelines/skylab/slideseq/SlideSeq.changelog.md b/pipelines/skylab/slideseq/SlideSeq.changelog.md index 0835b105a7..e67f2c83a8 100644 --- a/pipelines/skylab/slideseq/SlideSeq.changelog.md +++ b/pipelines/skylab/slideseq/SlideSeq.changelog.md @@ 
-2,6 +2,7 @@ 2024-10-24 (Date of Last Commit) * Updated the h5adUtils WDL to rename the gene expression library CSV filename; this does not impact slideseq +* Updated the ATAC fragment file output so that it is bgzipped; this does not impact the slideseq workflow # 3.4.2 2024-09-24 (Date of Last Commit) diff --git a/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.changelog.md b/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.changelog.md index 90d6830c49..9b2e810e79 100644 --- a/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.changelog.md +++ b/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.changelog.md @@ -2,6 +2,7 @@ 2024-10-23 (Date of Last Commit) * Updated the h5adUtils WDL to rename the gene expression library CSV filename; this does not impact slideseq +* Updated the ATAC fragment file output so that it is bgzipped; this does not impact the Multi-snSS2 workflow # 2.0.1 2024-09-24 (Date of Last Commit) diff --git a/tasks/skylab/H5adUtils.wdl b/tasks/skylab/H5adUtils.wdl index f5fb796b49..f877aa462b 100644 --- a/tasks/skylab/H5adUtils.wdl +++ b/tasks/skylab/H5adUtils.wdl @@ -241,7 +241,7 @@ task JoinMultiomeBarcodes { } String gex_base_name = basename(gex_h5ad, ".h5ad") String atac_base_name = basename(atac_h5ad, ".h5ad") - String atac_fragment_base = basename(atac_fragment, ".tsv") + String atac_fragment_base = basename(atac_fragment, ".sorted.tsv.gz") parameter_meta { atac_h5ad: "The resulting h5ad from the ATAC workflow." 
@@ -254,11 +254,17 @@ task JoinMultiomeBarcodes { command <<< set -e pipefail + # decompress the bgzipped fragment file to stdout (-c) so the redirect below captures it + echo "Decompressing fragment file" + bgzip -dc ~{atac_fragment} > "~{atac_fragment_base}.sorted.tsv" + echo "Done decompressing" + + python3 < "~{atac_fragment_base}.sorted.tsv" + echo "Done decompressing" + python3 <`.bam | BAM containing aligned reads from ATAC workflow. | -| fragment_file | ``.fragments.tsv | TSV containing fragment start and stop coordinates per barcode. In order, the columns are "Chromosome", "Start", "Stop", "ATAC Barcode", and "Number Reads". | +| fragment_file | ``.fragments.sorted.tsv.gz | Bgzipped TSV containing fragment start and stop coordinates per barcode. In order, the columns are "Chromosome", "Start", "Stop", "ATAC Barcode", and "Number Reads". | | snap_metrics | ``_`_library_metrics.csv | CSV file containing library-level metrics. Read more in the [Library Metrics Overview](library-metrics.md) diff --git a/website/docs/Pipelines/PairedTag_Pipeline/README.md b/website/docs/Pipelines/PairedTag_Pipeline/README.md index d1eeb23b11..323b3f33b9 100644 --- a/website/docs/Pipelines/PairedTag_Pipeline/README.md +++ b/website/docs/Pipelines/PairedTag_Pipeline/README.md @@ -106,7 +106,7 @@ The Paired-Tag workflow calls two WARP subworkflows and an additional task which |--- | --- | --- | | pairedtag_pipeline_version_out | N/A | String describing the version of the Paired-Tag pipeline used. | | bam_aligned_output_atac | `_atac.bam` or if `preindex` = true, `_atac.bam.BB.bam` | BAM file containing aligned reads from ATAC workflow; contains sample and cell barcodes stored in the BB tag if `preindex` is “true”. | -| fragment_file_atac | `_atac.fragments.tsv` or if `preindex` = true, `_atac.fragments.sorted.tsv.gz` | TSV file containing fragment start and stop coordinates per barcode. The columns are "Chromosome", "Start", "Stop", "Barcode", and "Number of reads". 
When `preindex` = true, the file is bgzipped and has additional columns include "Sample Barcode", "Cell Barcode", and "Duplicates" (which indicates if a cell barcode matches more than one sample barcode). | +| fragment_file_atac | `_atac.fragments.sorted.tsv.gz` | Bgzipped TSV file containing fragment start and stop coordinates per barcode. The columns are "Chromosome", "Start", "Stop", "Barcode", and "Number of reads". When `preindex` = true, the file has additional columns including "Sample Barcode", "Cell Barcode", and "Duplicates" (which indicates if a cell barcode matches more than one sample barcode). | | snap_metrics_atac | `_atac.metrics.h5ad` | h5ad (Anndata) file containing per-barcode metrics from SnapATAC2. See the [ATAC Count Matrix Overview](../ATAC/count-matrix-overview.md) for more details. If the preindex option is used, the h5ad.obs will contain 3 extra columns: preindex (the sample barcode), CB (cell barcodes), and duplicates (indicates with a 1 if the cell barcode matches more than preindex, otherwise it is 0).| | genomic_reference_version_gex | `.txt` | File containing the Genome build, source and GTF annotation version. | | bam_gex | `_gex.bam` | BAM file containing aligned reads from Optimus workflow. 
| From 70d3e1d66394fa791b7f85c5e3eacef00937531a Mon Sep 17 00:00:00 2001 From: meganshand Date: Wed, 30 Oct 2024 15:37:09 -0400 Subject: [PATCH 4/4] update docker for vcf comparator in tests (#1399) Co-authored-by: Nikelle Petrillo <38223776+nikellepetrillo@users.noreply.github.com> --- verification/VerifyTasks.wdl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/verification/VerifyTasks.wdl b/verification/VerifyTasks.wdl index 683857e6c8..f60ba6f3a6 100644 --- a/verification/VerifyTasks.wdl +++ b/verification/VerifyTasks.wdl @@ -76,8 +76,7 @@ task CompareVCFsVerbosely { } runtime { - #TODO: update docker to next GATK release (after 4.6.0.0) which includes an updated VCFComparator - docker: "us.gcr.io/broad-dsde-methods/gatk-vcfcomparator@sha256:4c1b32dd89c46af52e68ae34f99db483ba07b08def2479d145a185de0b2d9a4a" + docker: "us.gcr.io/broad-gatk/gatk:4.6.1.0" disks: "local-disk 50 HDD" memory: "3 GiB" preemptible: 3