From 5c9148240c7d01c5d27b893ff173bce310154187 Mon Sep 17 00:00:00 2001 From: ekiernan Date: Tue, 22 Oct 2024 15:25:45 -0400 Subject: [PATCH 01/20] changing expected_cells to gex_expected_cells and adding atac metric for percent_target --- pipelines/skylab/atac/atac.wdl | 23 +++++++++++++++++++++-- pipelines/skylab/optimus/Optimus.wdl | 8 ++++---- 2 files changed, 25 insertions(+), 6 deletions(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index 2acb133c2b..6684489387 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -23,6 +23,9 @@ workflow ATAC { # Additional library aliquot ID String? atac_nhash_id + #Expected cells from library preparation + Int atac_expected_cells + # Option for running files with preindex Boolean preindex = false @@ -139,7 +142,8 @@ workflow ATAC { annotations_gtf = annotations_gtf, preindex = preindex, docker_path = docker_prefix + snap_atac_docker, - atac_nhash_id = atac_nhash_id + atac_nhash_id = atac_nhash_id, + atac_expected_cells = atac_expected_cells } } if (!preindex) { @@ -150,7 +154,8 @@ workflow ATAC { annotations_gtf = annotations_gtf, preindex = preindex, docker_path = docker_prefix + snap_atac_docker, - atac_nhash_id = atac_nhash_id + atac_nhash_id = atac_nhash_id, + atac_expected_cells = atac_expected_cells } } @@ -537,6 +542,7 @@ task CreateFragmentFile { atac_gtf = "~{annotations_gtf}" preindex = "~{preindex}" atac_nhash_id = "~{atac_nhash_id}" + expected_cells = ~{atac_expected_cells} # calculate chrom size dictionary based on text file chrom_size_dict={} @@ -560,6 +566,19 @@ task CreateFragmentFile { # Add NHashID to metrics data = OrderedDict({'NHashID': atac_nhash_id, **data}) + + # Calculate atac percent target + if 'Number_of_cells' in data: + number_of_cells = data['Number_of_cells'] + if expected_cells != 0: # Avoid division by zero + atac_percent_target = number_of_cells / expected_cells + else: + atac_percent_target = 0 # or handle this case as needed 
+ # Add the new metric to the dictionary + data['ATAC_percent_target'] = atac_percent_target + else: + print("Error: 'Number_of_cells' not found in the data dictionary") + # Flatten the dictionary flattened_data = [] for category, metrics in data.items(): diff --git a/pipelines/skylab/optimus/Optimus.wdl b/pipelines/skylab/optimus/Optimus.wdl index 70402c6ced..392f484832 100644 --- a/pipelines/skylab/optimus/Optimus.wdl +++ b/pipelines/skylab/optimus/Optimus.wdl @@ -36,7 +36,7 @@ workflow Optimus { File annotations_gtf File? mt_genes String? soloMultiMappers = "Uniform" - Int? expected_cells + Int? gex_expected_cells # Chemistry options include: 2 or 3 Int tenx_chemistry_version @@ -223,7 +223,7 @@ workflow Optimus { input_id = input_id, counting_mode = counting_mode, star_merge_docker_path = docker_prefix + star_merge_docker, - expected_cells = expected_cells, + expected_cells = gex_expected_cells, gex_nhash_id = gex_nhash_id } if (counting_mode == "sc_rna"){ @@ -242,7 +242,7 @@ workflow Optimus { input: input_id = input_id, gex_nhash_id = gex_nhash_id, - expected_cells = expected_cells, + expected_cells = gex_expected_cells, input_name = input_name, input_id_metadata_field = input_id_metadata_field, input_name_metadata_field = input_name_metadata_field, @@ -279,7 +279,7 @@ workflow Optimus { input: input_id = input_id, gex_nhash_id = gex_nhash_id, - expected_cells = expected_cells, + expected_cells = gex_expected_cells, input_name = input_name, counting_mode = counting_mode, input_id_metadata_field = input_id_metadata_field, From 1bb2eebdfb6606230f17627855ef7a377688d716 Mon Sep 17 00:00:00 2001 From: ekiernan Date: Tue, 22 Oct 2024 15:44:25 -0400 Subject: [PATCH 02/20] updates for ATAC CSV --- pipelines/skylab/atac/atac.wdl | 10 ++++++---- pipelines/skylab/multiome/Multiome.wdl | 7 +++++-- pipelines/skylab/optimus/Optimus.wdl | 2 +- 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index 
6684489387..025f66f1ad 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -24,7 +24,7 @@ workflow ATAC { String? atac_nhash_id #Expected cells from library preparation - Int atac_expected_cells + Int atac_expected_cells = 3000 # Option for running files with preindex Boolean preindex = false @@ -566,16 +566,18 @@ task CreateFragmentFile { # Add NHashID to metrics data = OrderedDict({'NHashID': atac_nhash_id, **data}) + # Convert all keys to lowercase + data = OrderedDict({key.lower(): value for key, value in data.items()}) # Calculate atac percent target - if 'Number_of_cells' in data: - number_of_cells = data['Number_of_cells'] + if 'number_of_cells' in data: + number_of_cells = data['number_of_cells'] if expected_cells != 0: # Avoid division by zero atac_percent_target = number_of_cells / expected_cells else: atac_percent_target = 0 # or handle this case as needed # Add the new metric to the dictionary - data['ATAC_percent_target'] = atac_percent_target + data['percent_target'] = atac_percent_target else: print("Error: 'Number_of_cells' not found in the data dictionary") diff --git a/pipelines/skylab/multiome/Multiome.wdl b/pipelines/skylab/multiome/Multiome.wdl index 821e5bead6..232438688a 100644 --- a/pipelines/skylab/multiome/Multiome.wdl +++ b/pipelines/skylab/multiome/Multiome.wdl @@ -18,6 +18,7 @@ workflow Multiome { # Additional library aliquot ID String? gex_nhash_id String? 
atac_nhash_id + Int expected_cells = 3000 # Optimus Inputs String counting_mode = "sn_rna" @@ -102,7 +103,8 @@ workflow Multiome { star_strand_mode = star_strand_mode, count_exons = count_exons, soloMultiMappers = soloMultiMappers, - cloud_provider = cloud_provider + cloud_provider = cloud_provider, + gex_expected_cells = expected_cells } # Call the ATAC workflow @@ -120,7 +122,8 @@ workflow Multiome { vm_size = vm_size, annotations_gtf = annotations_gtf, atac_nhash_id = atac_nhash_id, - adapter_seq_read3 = adapter_seq_read3 + adapter_seq_read3 = adapter_seq_read3, + atac_expected_cells = expected_cells } call H5adUtils.JoinMultiomeBarcodes as JoinBarcodes { input: diff --git a/pipelines/skylab/optimus/Optimus.wdl b/pipelines/skylab/optimus/Optimus.wdl index 392f484832..4b1cbc8c68 100644 --- a/pipelines/skylab/optimus/Optimus.wdl +++ b/pipelines/skylab/optimus/Optimus.wdl @@ -36,7 +36,7 @@ workflow Optimus { File annotations_gtf File? mt_genes String? soloMultiMappers = "Uniform" - Int? gex_expected_cells + Int? 
gex_expected_cells = 3000 # Chemistry options include: 2 or 3 Int tenx_chemistry_version From 497fd5846c0b02fd4316a4462db8059edaaf0e5e Mon Sep 17 00:00:00 2001 From: ekiernan Date: Tue, 22 Oct 2024 17:25:49 -0400 Subject: [PATCH 03/20] added atac_expected_cells as input --- pipelines/skylab/atac/atac.wdl | 1 + 1 file changed, 1 insertion(+) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index 025f66f1ad..9a380d07a6 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -517,6 +517,7 @@ task CreateFragmentFile { String cpuPlatform = "Intel Cascade Lake" String docker_path String atac_nhash_id = "" + Int atac_expected_cells = 3000 } String bam_base_name = basename(bam, ".bam") From 769cbf32ae1bb3f472eecc58d1b6ca37ffb72e38 Mon Sep 17 00:00:00 2001 From: ekiernan Date: Wed, 23 Oct 2024 08:36:06 -0400 Subject: [PATCH 04/20] revising order to lowercase --- pipelines/skylab/atac/atac.wdl | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index 9a380d07a6..54e93c8990 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -565,11 +565,16 @@ task CreateFragmentFile { elif preindex == "false": data = pp.recipe_10x_metrics("~{bam}", "~{bam_base_name}.fragments.tsv", "temp_metrics.h5ad", is_paired=True, barcode_tag="CB", chrom_sizes=chrom_size_dict, gene_anno=atac_gtf, peaks=None) - # Add NHashID to metrics - data = OrderedDict({'NHashID': atac_nhash_id, **data}) + print("Converting to lowercase") + # Convert all keys to lowercase data = OrderedDict({key.lower(): value for key, value in data.items()}) + # Add NHashID to metrics + data = OrderedDict({'NHashID': atac_nhash_id, **data}) + + + # Calculate atac percent target if 'number_of_cells' in data: number_of_cells = data['number_of_cells'] From 14df60435e7b29d02b351d763b1a04e9ac4bc78d Mon Sep 17 00:00:00 2001 From: ekiernan Date: Wed, 23 Oct 2024 
09:24:06 -0400 Subject: [PATCH 05/20] adding logging --- pipelines/skylab/atac/atac.wdl | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index 54e93c8990..ad75884e34 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -565,11 +565,13 @@ task CreateFragmentFile { elif preindex == "false": data = pp.recipe_10x_metrics("~{bam}", "~{bam_base_name}.fragments.tsv", "temp_metrics.h5ad", is_paired=True, barcode_tag="CB", chrom_sizes=chrom_size_dict, gene_anno=atac_gtf, peaks=None) + print("Original keys:", data.keys()) + print("Converting to lowercase") - # Convert all keys to lowercase - data = OrderedDict({key.lower(): value for key, value in data.items()}) - + data = OrderedDict({str(key).lower(): value for key, value in data.items()}) + print("Lowercase keys:", data.keys()) + # Add NHashID to metrics data = OrderedDict({'NHashID': atac_nhash_id, **data}) From 1eba2bdce56b9c0106ab7c80620adb1895c91a9d Mon Sep 17 00:00:00 2001 From: ekiernan Date: Wed, 23 Oct 2024 10:06:23 -0400 Subject: [PATCH 06/20] Update atac.wdl --- pipelines/skylab/atac/atac.wdl | 26 +++++++------------------- 1 file changed, 7 insertions(+), 19 deletions(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index ad75884e34..cd2b3c1b25 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -564,30 +564,15 @@ task CreateFragmentFile { data = pp.recipe_10x_metrics("~{bam}", "~{bam_base_name}.fragments.tsv", "temp_metrics.h5ad", is_paired=True, barcode_tag="BB", chrom_sizes=chrom_size_dict, gene_anno=atac_gtf, peaks=None) elif preindex == "false": data = pp.recipe_10x_metrics("~{bam}", "~{bam_base_name}.fragments.tsv", "temp_metrics.h5ad", is_paired=True, barcode_tag="CB", chrom_sizes=chrom_size_dict, gene_anno=atac_gtf, peaks=None) - - print("Original keys:", data.keys()) - - print("Converting to lowercase") - - data = 
OrderedDict({str(key).lower(): value for key, value in data.items()}) - print("Lowercase keys:", data.keys()) # Add NHashID to metrics data = OrderedDict({'NHashID': atac_nhash_id, **data}) - - # Calculate atac percent target - if 'number_of_cells' in data: - number_of_cells = data['number_of_cells'] - if expected_cells != 0: # Avoid division by zero - atac_percent_target = number_of_cells / expected_cells - else: - atac_percent_target = 0 # or handle this case as needed - # Add the new metric to the dictionary - data['percent_target'] = atac_percent_target - else: - print("Error: 'Number_of_cells' not found in the data dictionary") + number_of_cells = data['Cells']['Number_of_cells'] + atac_percent_target = number_of_cells / expected_cells + data['percent_target'] = atac_percent_target + # Flatten the dictionary flattened_data = [] @@ -598,6 +583,9 @@ task CreateFragmentFile { else: flattened_data.append((category, metrics)) + # Convert the flattened keys to lowercase (except for 'NHashID') + flattened_data = [(metric if metric == 'NHashID' else str(metric).lower(), value) for metric, value in flattened_data] + # Write to CSV csv_file_path = "~{bam_base_name}_~{atac_nhash_id}.atac_metrics.csv" with open(csv_file_path, mode='w', newline='') as file: From b611e260f32a014afbef6b0a5aac08481c43bcbd Mon Sep 17 00:00:00 2001 From: ekiernan Date: Wed, 23 Oct 2024 10:15:28 -0400 Subject: [PATCH 07/20] more logging --- pipelines/skylab/atac/atac.wdl | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index cd2b3c1b25..1c9e7061bc 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -569,9 +569,12 @@ task CreateFragmentFile { data = OrderedDict({'NHashID': atac_nhash_id, **data}) # Calculate atac percent target + print("Calculating percent target") number_of_cells = data['Cells']['Number_of_cells'] + print("Print number of cells", number_of_cells) 
atac_percent_target = number_of_cells / expected_cells - data['percent_target'] = atac_percent_target + print("Setting percent target in nested dictionary") + data['Cells']['percent_target'] = atac_percent_target # Flatten the dictionary From dec4a4d65e73e4ff583e4ede7fb2fbf23ce4c5d0 Mon Sep 17 00:00:00 2001 From: ekiernan Date: Wed, 23 Oct 2024 11:02:23 -0400 Subject: [PATCH 08/20] making percent percent --- pipelines/skylab/atac/atac.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index 1c9e7061bc..f716bbb2bb 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -572,7 +572,7 @@ task CreateFragmentFile { print("Calculating percent target") number_of_cells = data['Cells']['Number_of_cells'] print("Print number of cells", number_of_cells) - atac_percent_target = number_of_cells / expected_cells + atac_percent_target = number_of_cells / expected_cells*100 print("Setting percent target in nested dictionary") data['Cells']['percent_target'] = atac_percent_target From 72c67be0402a74dd30510061d7ad19d7e7f5c3a0 Mon Sep 17 00:00:00 2001 From: ekiernan Date: Wed, 23 Oct 2024 12:01:04 -0400 Subject: [PATCH 09/20] adjusting from bam_basename to input_id --- pipelines/skylab/atac/atac.wdl | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index f716bbb2bb..68cf34f172 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -143,7 +143,8 @@ workflow ATAC { preindex = preindex, docker_path = docker_prefix + snap_atac_docker, atac_nhash_id = atac_nhash_id, - atac_expected_cells = atac_expected_cells + atac_expected_cells = atac_expected_cells, + input_id = input_id } } if (!preindex) { @@ -155,7 +156,8 @@ workflow ATAC { preindex = preindex, docker_path = docker_prefix + snap_atac_docker, atac_nhash_id = atac_nhash_id, - 
atac_expected_cells = atac_expected_cells + atac_expected_cells = atac_expected_cells, + input_id = input_id } } @@ -517,11 +519,10 @@ task CreateFragmentFile { String cpuPlatform = "Intel Cascade Lake" String docker_path String atac_nhash_id = "" + String input_id Int atac_expected_cells = 3000 } - String bam_base_name = basename(bam, ".bam") - parameter_meta { bam: "Aligned bam with CB in CB tag. This is the output of the BWAPairedEndAlignment task." chrom_sizes: "Text file containing chrom_sizes for genome build (i.e. hg38)." @@ -538,7 +539,7 @@ task CreateFragmentFile { # set parameters bam = "~{bam}" - bam_base_name = "~{bam_base_name}" + input_id = "~{input_id}" chrom_sizes = "~{chrom_sizes}" atac_gtf = "~{annotations_gtf}" preindex = "~{preindex}" @@ -561,9 +562,9 @@ task CreateFragmentFile { # extract CB or BB (if preindex is true) tag from bam file to create fragment file if preindex == "true": - data = pp.recipe_10x_metrics("~{bam}", "~{bam_base_name}.fragments.tsv", "temp_metrics.h5ad", is_paired=True, barcode_tag="BB", chrom_sizes=chrom_size_dict, gene_anno=atac_gtf, peaks=None) + data = pp.recipe_10x_metrics("~{bam}", "~{input_id}.fragments.tsv", "temp_metrics.h5ad", is_paired=True, barcode_tag="BB", chrom_sizes=chrom_size_dict, gene_anno=atac_gtf, peaks=None) elif preindex == "false": - data = pp.recipe_10x_metrics("~{bam}", "~{bam_base_name}.fragments.tsv", "temp_metrics.h5ad", is_paired=True, barcode_tag="CB", chrom_sizes=chrom_size_dict, gene_anno=atac_gtf, peaks=None) + data = pp.recipe_10x_metrics("~{bam}", "~{input_id}.fragments.tsv", "temp_metrics.h5ad", is_paired=True, barcode_tag="CB", chrom_sizes=chrom_size_dict, gene_anno=atac_gtf, peaks=None) # Add NHashID to metrics data = OrderedDict({'NHashID': atac_nhash_id, **data}) @@ -590,7 +591,7 @@ task CreateFragmentFile { flattened_data = [(metric if metric == 'NHashID' else str(metric).lower(), value) for metric, value in flattened_data] # Write to CSV - csv_file_path = 
"~{bam_base_name}_~{atac_nhash_id}.atac_metrics.csv" + csv_file_path = "~{input_id}_~{atac_nhash_id}_atac_library_metrics.csv" with open(csv_file_path, mode='w', newline='') as file: writer = csv.writer(file) writer.writerows(flattened_data) # Write data @@ -603,7 +604,7 @@ task CreateFragmentFile { # calculate tsse metrics snap.metrics.tsse(atac_data, atac_gtf) # Write new atac file - atac_data.write_h5ad("~{bam_base_name}.metrics.h5ad") + atac_data.write_h5ad("~{input_id}.metrics.h5ad") CODE >>> @@ -617,8 +618,8 @@ task CreateFragmentFile { } output { - File fragment_file = "~{bam_base_name}.fragments.tsv" - File Snap_metrics = "~{bam_base_name}.metrics.h5ad" - File atac_library_metrics = "~{bam_base_name}_~{atac_nhash_id}.atac_metrics.csv" + File fragment_file = "~{input_id}.fragments.tsv" + File Snap_metrics = "~{input_id}.metrics.h5ad" + File atac_library_metrics = "~{input_id}_~{atac_nhash_id}_atac_library_metrics.csv" } } From ddccb02324a0f12ed1070452941e150985755225 Mon Sep 17 00:00:00 2001 From: ekiernan Date: Wed, 23 Oct 2024 12:12:25 -0400 Subject: [PATCH 10/20] adjusting output filenames --- pipelines/skylab/atac/atac.wdl | 4 ++-- pipelines/skylab/paired_tag/PairedTag.wdl | 2 +- tasks/skylab/H5adUtils.wdl | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index 68cf34f172..08aca5345f 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -591,7 +591,7 @@ task CreateFragmentFile { flattened_data = [(metric if metric == 'NHashID' else str(metric).lower(), value) for metric, value in flattened_data] # Write to CSV - csv_file_path = "~{input_id}_~{atac_nhash_id}_atac_library_metrics.csv" + csv_file_path = "~{input_id}_~{atac_nhash_id}_library_metrics.csv" with open(csv_file_path, mode='w', newline='') as file: writer = csv.writer(file) writer.writerows(flattened_data) # Write data @@ -620,6 +620,6 @@ task CreateFragmentFile { output { File 
fragment_file = "~{input_id}.fragments.tsv" File Snap_metrics = "~{input_id}.metrics.h5ad" - File atac_library_metrics = "~{input_id}_~{atac_nhash_id}_atac_library_metrics.csv" + File atac_library_metrics = "~{input_id}_~{atac_nhash_id}_library_metrics.csv" } } diff --git a/pipelines/skylab/paired_tag/PairedTag.wdl b/pipelines/skylab/paired_tag/PairedTag.wdl index 83b470ba47..6e8a9e2319 100644 --- a/pipelines/skylab/paired_tag/PairedTag.wdl +++ b/pipelines/skylab/paired_tag/PairedTag.wdl @@ -109,7 +109,7 @@ workflow PairedTag { read1_fastq = atac_r1_fastq[idx], read3_fastq = atac_r3_fastq[idx], barcodes_fastq = atac_r2_fastq[idx], - input_id = input_id, + input_id = input_id + "_atac", whitelist = atac_whitelist, preindex = preindex, docker_path = docker_prefix + upstools_docker diff --git a/tasks/skylab/H5adUtils.wdl b/tasks/skylab/H5adUtils.wdl index 0ac5a3dd66..67ecee6f2d 100644 --- a/tasks/skylab/H5adUtils.wdl +++ b/tasks/skylab/H5adUtils.wdl @@ -207,7 +207,7 @@ task SingleNucleusOptimusH5adOutput { --expected_cells ~{expected_cells} - mv library_metrics.csv ~{input_id}_~{gex_nhash_id}_gex_library_metrics.csv + mv library_metrics.csv ~{input_id}_~{gex_nhash_id}_library_metrics.csv >>> runtime { @@ -221,7 +221,7 @@ task SingleNucleusOptimusH5adOutput { output { File h5ad_output = "~{input_id}.h5ad" - File library_metrics = "~{input_id}_~{gex_nhash_id}_gex_library_metrics.csv" + File library_metrics = "~{input_id}_~{gex_nhash_id}_library_metrics.csv" } } From e2a5a6464e442455faa0d5715b5e8f6bf643d1e6 Mon Sep 17 00:00:00 2001 From: ekiernan Date: Wed, 23 Oct 2024 14:11:16 -0400 Subject: [PATCH 11/20] removing optional input --- pipelines/skylab/optimus/Optimus.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/skylab/optimus/Optimus.wdl b/pipelines/skylab/optimus/Optimus.wdl index 4b1cbc8c68..8b345d29bb 100644 --- a/pipelines/skylab/optimus/Optimus.wdl +++ b/pipelines/skylab/optimus/Optimus.wdl @@ -36,7 +36,7 @@ workflow Optimus { File 
annotations_gtf File? mt_genes String? soloMultiMappers = "Uniform" - Int? gex_expected_cells = 3000 + Int gex_expected_cells = 3000 # Chemistry options include: 2 or 3 Int tenx_chemistry_version From 7a34d2880567ce5f376f214a782787325440da7a Mon Sep 17 00:00:00 2001 From: ekiernan Date: Wed, 23 Oct 2024 14:27:19 -0400 Subject: [PATCH 12/20] updated changelogs --- pipelines/skylab/atac/atac.changelog.md | 7 +++++++ pipelines/skylab/atac/atac.wdl | 2 +- pipelines/skylab/multiome/Multiome.changelog.md | 6 ++++++ pipelines/skylab/multiome/Multiome.wdl | 2 +- pipelines/skylab/optimus/Optimus.changelog.md | 8 ++++++++ pipelines/skylab/optimus/Optimus.wdl | 2 +- pipelines/skylab/paired_tag/PairedTag.changelog.md | 10 +++++++++- pipelines/skylab/paired_tag/PairedTag.wdl | 2 +- 8 files changed, 34 insertions(+), 5 deletions(-) diff --git a/pipelines/skylab/atac/atac.changelog.md b/pipelines/skylab/atac/atac.changelog.md index 34b5704e59..401a1bd8bf 100644 --- a/pipelines/skylab/atac/atac.changelog.md +++ b/pipelines/skylab/atac/atac.changelog.md @@ -1,3 +1,10 @@ +# 2.4.0 +2024-10-23 (Date of Last Commit) + +* Added a new input parameter for atac_expected_cells, which describes the numnber of cells used for the library preparation +* Updated the ATAC library CSV to be consistent in file naming convention and to have similar case for metric names to the Optimus workflow library CSV + + # 2.3.2 2024-10-18 (Date of Last Commit) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index 08aca5345f..153f817c6f 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -49,7 +49,7 @@ workflow ATAC { String adapter_seq_read3 = "TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG" } - String pipeline_version = "2.3.2" + String pipeline_version = "2.4.0" # Determine docker prefix based on cloud provider String gcr_docker_prefix = "us.gcr.io/broad-gotc-prod/" diff --git a/pipelines/skylab/multiome/Multiome.changelog.md 
b/pipelines/skylab/multiome/Multiome.changelog.md index 378678f9ba..da21c8046e 100644 --- a/pipelines/skylab/multiome/Multiome.changelog.md +++ b/pipelines/skylab/multiome/Multiome.changelog.md @@ -1,3 +1,9 @@ +# 5.8.0 +2024-10-23 (Date of Last Commit) + +* Updated the workflow to include a new expected_cells input parameter describing the number of cells used as input to the library preparation; this is passed to both the ATAC workflows and Optimus workflows and the default is set to 3000 cells +* Updated the ATAC library CSV and the Gene Expression library CSV to be consistent in file naming convention and to have similar case for metric names + # 5.7.1 2024-10-18 (Date of Last Commit) diff --git a/pipelines/skylab/multiome/Multiome.wdl b/pipelines/skylab/multiome/Multiome.wdl index 232438688a..ca8b16ea3d 100644 --- a/pipelines/skylab/multiome/Multiome.wdl +++ b/pipelines/skylab/multiome/Multiome.wdl @@ -9,7 +9,7 @@ import "../../../tasks/broad/Utilities.wdl" as utils workflow Multiome { - String pipeline_version = "5.7.1" + String pipeline_version = "5.8.0" input { diff --git a/pipelines/skylab/optimus/Optimus.changelog.md b/pipelines/skylab/optimus/Optimus.changelog.md index f8418bce8d..8d82cdf07f 100644 --- a/pipelines/skylab/optimus/Optimus.changelog.md +++ b/pipelines/skylab/optimus/Optimus.changelog.md @@ -1,3 +1,10 @@ +# 7.8.0 +2024-10-23 (Date of Last Commit) + +* Renamed the input expected_cells to gex_expected_cells +* Updated gex_expected_cells to a non-optional input with a default of 3000 +* Reformatted the library CSV output filename to remove an extra gex + # 7.7.0 2024-09-24 (Date of Last Commit) @@ -6,6 +13,7 @@ # 7.6.1 2024-09-11 (Date of Last Commit) + * Updated warp-tools docker which added create_h5ad_snss2.py to the docker image. 
This change does not affect the Optimus pipeline # 7.6.0 diff --git a/pipelines/skylab/optimus/Optimus.wdl b/pipelines/skylab/optimus/Optimus.wdl index 8b345d29bb..f8343388ab 100644 --- a/pipelines/skylab/optimus/Optimus.wdl +++ b/pipelines/skylab/optimus/Optimus.wdl @@ -71,7 +71,7 @@ workflow Optimus { # version of this pipeline - String pipeline_version = "7.7.0" + String pipeline_version = "7.8.0" # this is used to scatter matched [r1_fastq, r2_fastq, i1_fastq] arrays diff --git a/pipelines/skylab/paired_tag/PairedTag.changelog.md b/pipelines/skylab/paired_tag/PairedTag.changelog.md index f6ce64b4ca..b43cabd679 100644 --- a/pipelines/skylab/paired_tag/PairedTag.changelog.md +++ b/pipelines/skylab/paired_tag/PairedTag.changelog.md @@ -1,3 +1,9 @@ +# 1.8.0 +2024-10-23 (Date of Last Commit) + +* Updated the workflow to include a new expected_cells input parameter describing the number of cells used as input to the library preparation; this is passed to both the ATAC workflows and Optimus workflows and the default is set to 3000 cells +* Updated the ATAC library CSV and the Gene Expression library CSV to be consistent in file naming convention and to have similar case for metric names + # 1.7.1 2024-10-18 (Date of Last Commit) @@ -5,11 +11,13 @@ # 1.7.0 2024-09-24 (Date of Last Commit) + * Added a python implementation of DoubletFinder to calculate doublet scores in gene expression data; percent doublets are now available as a library-level metric and individual doublet scores for cell barcodes are in the h5ad * Updated gene_names in the final h5ad to be unique # 1.6.1 2024-09-11 (Date of Last Commit) + * Updated warp-tools docker which added create_h5ad_snss2.py to the docker image. 
This change does not affect the PairedTag pipeline # 1.6.0 @@ -21,6 +29,7 @@ 2024-08-06 (Date of Last Commit) * Updated the warp-tools docker to calculate mitochondrial reads from unique reads in cell and gene metrics; these metrics are in the cell and gene metrics CSV as well as h5ad + # 1.4.1 2024-08-02 (Date of Last Commit) @@ -71,7 +80,6 @@ * Updated the demultiplex task so that some intermediate input names have been renamed. There is no change to the outputs. - # 0.6.0 2024-05-10 (Date) diff --git a/pipelines/skylab/paired_tag/PairedTag.wdl b/pipelines/skylab/paired_tag/PairedTag.wdl index 6e8a9e2319..2cef2bb297 100644 --- a/pipelines/skylab/paired_tag/PairedTag.wdl +++ b/pipelines/skylab/paired_tag/PairedTag.wdl @@ -8,7 +8,7 @@ import "../../../tasks/broad/Utilities.wdl" as utils workflow PairedTag { - String pipeline_version = "1.7.1" + String pipeline_version = "1.8.0" input { From 0e584fa8562c7af3b6d4e34344db2724f396d068 Mon Sep 17 00:00:00 2001 From: ekiernan Date: Wed, 23 Oct 2024 14:31:31 -0400 Subject: [PATCH 13/20] updated typo in changelog --- pipelines/skylab/atac/atac.changelog.md | 1 + pipelines/skylab/multiome/Multiome.changelog.md | 1 + pipelines/skylab/paired_tag/PairedTag.changelog.md | 1 + 3 files changed, 3 insertions(+) diff --git a/pipelines/skylab/atac/atac.changelog.md b/pipelines/skylab/atac/atac.changelog.md index 401a1bd8bf..4b886f32b2 100644 --- a/pipelines/skylab/atac/atac.changelog.md +++ b/pipelines/skylab/atac/atac.changelog.md @@ -3,6 +3,7 @@ * Added a new input parameter for atac_expected_cells, which describes the numnber of cells used for the library preparation * Updated the ATAC library CSV to be consistent in file naming convention and to have similar case for metric names to the Optimus workflow library CSV +* Added a new metric to the ATAC library CSV to calculate percent_target, which is the number of estimated cells by SnapATAC2 divided by expected_cells input # 2.3.2 diff --git 
a/pipelines/skylab/multiome/Multiome.changelog.md b/pipelines/skylab/multiome/Multiome.changelog.md index da21c8046e..1cdbef30a8 100644 --- a/pipelines/skylab/multiome/Multiome.changelog.md +++ b/pipelines/skylab/multiome/Multiome.changelog.md @@ -3,6 +3,7 @@ * Updated the workflow to include a new expected_cells input parameter describing the number of cells used as input to the library preparation; this is passed to both the ATAC workflows and Optimus workflows and the default is set to 3000 cells * Updated the ATAC library CSV and the Gene Expression library CSV to be consistent in file naming convention and to have similar case for metric names +* Added a new metric to the ATAC library CSV to calculate percent_target, which is the number of estimated cells by SnapATAC2 divided by expected_cells input # 5.7.1 2024-10-18 (Date of Last Commit) diff --git a/pipelines/skylab/paired_tag/PairedTag.changelog.md b/pipelines/skylab/paired_tag/PairedTag.changelog.md index b43cabd679..a7071e222f 100644 --- a/pipelines/skylab/paired_tag/PairedTag.changelog.md +++ b/pipelines/skylab/paired_tag/PairedTag.changelog.md @@ -3,6 +3,7 @@ * Updated the workflow to include a new expected_cells input parameter describing the number of cells used as input to the library preparation; this is passed to both the ATAC workflows and Optimus workflows and the default is set to 3000 cells * Updated the ATAC library CSV and the Gene Expression library CSV to be consistent in file naming convention and to have similar case for metric names +* Added a new metric to the ATAC library CSV to calculate percent_target, which is the number of estimated cells by SnapATAC2 divided by expected_cells input # 1.7.1 2024-10-18 (Date of Last Commit) From 52179c11552c15076f5a6b7280443615280fc07e Mon Sep 17 00:00:00 2001 From: GitHub Action Date: Wed, 23 Oct 2024 18:32:29 +0000 Subject: [PATCH 14/20] Updated pipeline_versions.txt with all pipeline version information --- pipeline_versions.txt | 8 ++++---- 1 
file changed, 4 insertions(+), 4 deletions(-) diff --git a/pipeline_versions.txt b/pipeline_versions.txt index 66a514f5a9..e9a26ab1c8 100644 --- a/pipeline_versions.txt +++ b/pipeline_versions.txt @@ -30,11 +30,11 @@ ExomeReprocessing 3.3.1 2024-09-17 BuildIndices 3.0.0 2023-12-06 scATAC 1.3.2 2023-08-03 snm3C 4.0.4 2024-08-06 -Multiome 5.7.1 2024-10-18 -PairedTag 1.7.1 2024-10-18 +Multiome 5.8.0 2024-10-23 +PairedTag 1.8.0 2024-10-23 MultiSampleSmartSeq2 2.2.22 2024-09-11 MultiSampleSmartSeq2SingleNucleus 2.0.1 2024-09-24 -Optimus 7.7.0 2024-09-24 -atac 2.3.2 2024-10-18 +Optimus 7.8.0 2024-10-23 +atac 2.4.0 2024-10-23 SmartSeq2SingleSample 5.1.21 2024-09-11 SlideSeq 3.4.2 2024-09-24 From 2e5d88f24fdd2417b26d171d52b825a8e0e388b4 Mon Sep 17 00:00:00 2001 From: ekiernan Date: Wed, 23 Oct 2024 14:37:00 -0400 Subject: [PATCH 15/20] changingmore library metric output names from h5adutils --- tasks/skylab/H5adUtils.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tasks/skylab/H5adUtils.wdl b/tasks/skylab/H5adUtils.wdl index 67ecee6f2d..f5fb796b49 100644 --- a/tasks/skylab/H5adUtils.wdl +++ b/tasks/skylab/H5adUtils.wdl @@ -101,7 +101,7 @@ task OptimusH5adGeneration { --counting_mode ~{counting_mode} \ --expected_cells ~{expected_cells} - mv library_metrics.csv ~{input_id}_~{gex_nhash_id}_gex_library_metrics.csv + mv library_metrics.csv ~{input_id}_~{gex_nhash_id}_library_metrics.csv >>> @@ -116,7 +116,7 @@ task OptimusH5adGeneration { output { File h5ad_output = "~{input_id}.h5ad" - File library_metrics = "~{input_id}_~{gex_nhash_id}_gex_library_metrics.csv" + File library_metrics = "~{input_id}_~{gex_nhash_id}_library_metrics.csv" } } From 4c88bf89c39db328248ef7aa960d0ad195b55444 Mon Sep 17 00:00:00 2001 From: ekiernan Date: Wed, 23 Oct 2024 14:42:22 -0400 Subject: [PATCH 16/20] more changelog updates --- pipelines/skylab/slideseq/SlideSeq.changelog.md | 5 +++++ pipelines/skylab/slideseq/SlideSeq.wdl | 2 +- 
.../MultiSampleSmartSeq2SingleNucleus.changelog.md | 6 ++++++ .../MultiSampleSmartSeq2SingleNucleus.wdl | 2 +- 4 files changed, 13 insertions(+), 2 deletions(-) diff --git a/pipelines/skylab/slideseq/SlideSeq.changelog.md b/pipelines/skylab/slideseq/SlideSeq.changelog.md index b9cb1f7a56..0835b105a7 100644 --- a/pipelines/skylab/slideseq/SlideSeq.changelog.md +++ b/pipelines/skylab/slideseq/SlideSeq.changelog.md @@ -1,3 +1,8 @@ +# 3.4.3 +2024-10-24 (Date of Last Commit) + +* Updated the h5adUtils WDL to rename the gene expression library CSV filename; this does not impact slideseq + # 3.4.2 2024-09-24 (Date of Last Commit) diff --git a/pipelines/skylab/slideseq/SlideSeq.wdl b/pipelines/skylab/slideseq/SlideSeq.wdl index 0cd1f29e4c..553760e49d 100644 --- a/pipelines/skylab/slideseq/SlideSeq.wdl +++ b/pipelines/skylab/slideseq/SlideSeq.wdl @@ -25,7 +25,7 @@ import "../../../tasks/broad/Utilities.wdl" as utils workflow SlideSeq { - String pipeline_version = "3.4.2" + String pipeline_version = "3.4.3" input { Array[File] r1_fastq diff --git a/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.changelog.md b/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.changelog.md index 16ed6cb5c8..90d6830c49 100644 --- a/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.changelog.md +++ b/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.changelog.md @@ -1,5 +1,11 @@ +# 2.0.2 +2024-10-23 (Date of Last Commit) + +* Updated the h5adUtils WDL to rename the gene expression library CSV filename; this does not impact the snSS2 workflow + # 2.0.1 2024-09-24 (Date of Last Commit) + * Added a python implementation of DoubletFinder to calculate doublet scores in gene expression data; this does not affect the snSS2 workflow # 2.0.0 diff --git a/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.wdl 
b/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.wdl index debce094b0..124820a4a5 100644 --- a/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.wdl +++ b/pipelines/skylab/smartseq2_single_nucleus_multisample/MultiSampleSmartSeq2SingleNucleus.wdl @@ -57,7 +57,7 @@ workflow MultiSampleSmartSeq2SingleNucleus { } # Version of this pipeline - String pipeline_version = "2.0.1" + String pipeline_version = "2.0.2" if (false) { String? none = "None" From ae6cd825d25e6f5cc45c19629e321f37d6e441f2 Mon Sep 17 00:00:00 2001 From: GitHub Action Date: Wed, 23 Oct 2024 18:49:51 +0000 Subject: [PATCH 17/20] Updated pipeline_versions.txt with all pipeline version information --- pipeline_versions.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipeline_versions.txt b/pipeline_versions.txt index e9a26ab1c8..c176a31945 100644 --- a/pipeline_versions.txt +++ b/pipeline_versions.txt @@ -33,8 +33,8 @@ snm3C 4.0.4 2024-08-06 Multiome 5.8.0 2024-10-23 PairedTag 1.8.0 2024-10-23 MultiSampleSmartSeq2 2.2.22 2024-09-11 -MultiSampleSmartSeq2SingleNucleus 2.0.1 2024-09-24 +MultiSampleSmartSeq2SingleNucleus 2.0.2 2024-10-23 Optimus 7.8.0 2024-10-23 atac 2.4.0 2024-10-23 SmartSeq2SingleSample 5.1.21 2024-09-11 -SlideSeq 3.4.2 2024-09-24 +SlideSeq 3.4.3 2024-10-24 From 92871e221793023d2f978e02da21a4a642d7068d Mon Sep 17 00:00:00 2001 From: ekiernan Date: Tue, 29 Oct 2024 09:36:14 -0400 Subject: [PATCH 18/20] Doc updates --- website/docs/Pipelines/ATAC/README.md | 33 ++++++++++--------- .../Pipelines/Multiome_Pipeline/README.md | 11 ++++--- .../docs/Pipelines/Optimus_Pipeline/README.md | 4 +-- .../Pipelines/PairedTag_Pipeline/README.md | 9 ++--- 4 files changed, 30 insertions(+), 27 deletions(-) diff --git a/website/docs/Pipelines/ATAC/README.md b/website/docs/Pipelines/ATAC/README.md index 9f632d8497..86d4b55f11 100644 --- a/website/docs/Pipelines/ATAC/README.md +++ 
b/website/docs/Pipelines/ATAC/README.md @@ -8,7 +8,7 @@ slug: /Pipelines/ATAC/README | Pipeline Version | Date Updated | Documentation Author | Questions or Feedback | | :----: | :---: | :----: | :--------------: | -| [2.3.0](https://github.com/broadinstitute/warp/releases) | September, 2024 | Kaylee Mathews | Please [file an issue in WARP](https://github.com/broadinstitute/warp/issues). | +| [2.4.0](https://github.com/broadinstitute/warp/releases) | October, 2024 | WARP Pipelines | Please [file an issue in WARP](https://github.com/broadinstitute/warp/issues). | ## Introduction to the ATAC workflow @@ -47,22 +47,23 @@ The following describes the inputs of the ATAC workflow. For more details on how | Variable name | Description | | --- |--- | -| read1_fastq_gzipped | Fastq inputs (array of compressed read 1 FASTQ files). | -| read2_fastq_gzipped | Fastq inputs (array of compressed read 2 FASTQ files containing cellular barcodes). | -| read3_fastq_gzipped | Fastq inputs (array of compressed read 3 FASTQ files). | +| read1_fastq_gzipped | Fastq inputs (array of compressed read 1 FASTQ files). | +| read2_fastq_gzipped | Fastq inputs (array of compressed read 2 FASTQ files containing cellular barcodes). | +| read3_fastq_gzipped | Fastq inputs (array of compressed read 3 FASTQ files). | | input_id | Output prefix/base name for all intermediate files and pipeline outputs. | | cloud_provider | String describing the cloud provider that should be used to run the workflow; value should be "gcp" or "azure". | String | -| preindex | Boolean used for paired-tag data and not applicable to ATAC data types; default is set to false. | -| tar_bwa_reference | BWA reference (tar file containing reference fasta and corresponding files). | -| num_threads_bwa | Optional integer defining the number of CPUs per node for the BWA-mem alignment task (default: 128). | -| mem_size_bwa | Optional integer defining the memory size for the BWA-mem alignment task in GB (default: 512). 
| -| cpu_platform_bwa | Optional string defining the CPU platform for the BWA-mem alignment task (default: "Intel Ice Lake"). | -| annotations_gtf | CreateFragmentFile input variable: GTF file for SnapATAC2 to calculate TSS sites of fragment file. | -| chrom_sizes | CreateFragmentFile input variable: Text file containing chrom_sizes for genome build (i.e., hg38) | -| whitelist | Whitelist file for ATAC cellular barcodes. | -| adapter_seq_read1 | TrimAdapters input: Sequence adapter for read 1 fastq. | -| adapter_seq_read3 | TrimAdapters input: Sequence adapter for read 3 fastq. | -| vm_size | String defining the Azure virtual machine family for the workflow (default: "Standard_M128s"). | String | +| preindex | Boolean used for paired-tag data and not applicable to ATAC data types; default is set to false. | +| atac_expected_cells | Number of cells loaded to create the ATAC library; default is set to 3000. | +| tar_bwa_reference | BWA reference (tar file containing reference fasta and corresponding files). | +| num_threads_bwa | Optional integer defining the number of CPUs per node for the BWA-mem alignment task (default: 128). | +| mem_size_bwa | Optional integer defining the memory size for the BWA-mem alignment task in GB (default: 512). | +| cpu_platform_bwa | Optional string defining the CPU platform for the BWA-mem alignment task (default: "Intel Ice Lake"). | +| annotations_gtf | CreateFragmentFile input variable: GTF file for SnapATAC2 to calculate TSS sites of fragment file. | +| chrom_sizes | CreateFragmentFile input variable: Text file containing chrom_sizes for genome build (i.e., hg38) | +| whitelist | Whitelist file for ATAC cellular barcodes. | +| adapter_seq_read1 | TrimAdapters input: Sequence adapter for read 1 fastq. | +| adapter_seq_read3 | TrimAdapters input: Sequence adapter for read 3 fastq. | +| vm_size | String defining the Azure virtual machine family for the workflow (default: "Standard_M128s"). 
| atac_nhash_id | String that represents an optional library aliquot identifier. When used, it is echoed in the h5ad unstructured data. | ## ATAC tasks and tools @@ -94,7 +95,7 @@ To see specific tool parameters, select the task WDL link in the table; then vie | bam_aligned_output | ``.bam | BAM containing aligned reads from ATAC workflow. | | fragment_file | ``.fragments.tsv | TSV containing fragment start and stop coordinates per barcode. In order, the columns are "Chromosome", "Start", "Stop", "ATAC Barcode", and "Number Reads". | | snap_metrics | ``_``.atac_metrics.csv | CSV file containing library-level metrics. Read more in the [Library Metrics Overview](library-metrics.md) + library_metrics | ``_`_library_metrics.csv | CSV file containing library-level metrics. Read more in the [Library Metrics Overview](library-metrics.md) ## Versioning and testing diff --git a/website/docs/Pipelines/Multiome_Pipeline/README.md b/website/docs/Pipelines/Multiome_Pipeline/README.md index 1062b121a4..625d3320d7 100644 --- a/website/docs/Pipelines/Multiome_Pipeline/README.md +++ b/website/docs/Pipelines/Multiome_Pipeline/README.md @@ -7,7 +7,7 @@ slug: /Pipelines/Multiome_Pipeline/README | Pipeline Version | Date Updated | Documentation Author | Questions or Feedback | | :----: | :---: | :----: | :--------------: | -| [Multiome v5.7.0](https://github.com/broadinstitute/warp/releases) | September, 2024 | Kaylee Mathews | Please [file an issue in WARP](https://github.com/broadinstitute/warp/issues). | +| [Multiome v5.8.0](https://github.com/broadinstitute/warp/releases) | October, 2024 | WARP Pipelines | Please [file an issue in WARP](https://github.com/broadinstitute/warp/issues). | ![Multiome_diagram](./multiome_diagram.png) @@ -59,6 +59,7 @@ Multiome can be deployed using [Cromwell](https://cromwell.readthedocs.io/en/sta | cloud_provider | String describing the cloud provider that should be used to run the workflow; value should be "gcp" or "azure". 
| String | | gex_nhash_id | Optional identifier for the library aliquot; when specified, the gene expression workflow will echo the ID in the gene expression output h5ads (in the adata.uns section) and in the library-level metrics CSV. | | atac_nhash_id | Optional identifier for the library aliquot; when specified, the workflow will echo the ID in the ATAC output h5ads (in the adata.uns section) and in the library-level metrics CSV. +| expected_cells | Number of cells loaded for library preparation; default is set to 3000. | Integer | | annotations_gtf | GTF file containing gene annotations used for GEX cell metric calculation and ATAC fragment metrics; must match the GTF used to build the STAR aligner. | File | | gex_r1_fastq | Array of read 1 FASTQ files representing a single GEX 10x library. | Array[File] | | gex_r2_fastq | Array of read 2 FASTQ files representing a single GEX 10x library.| Array[File] | @@ -109,7 +110,7 @@ The Multiome workflow calls two WARP subworkflows, one external subworkflow (opt | fragment_file_atac | `_atac.fragments.sorted.tsv.gz` | Sorted and bgzipped TSV file containing fragment start and stop coordinates per barcode. The columns are "Chromosome", "Start", "Stop", "ATAC Barcode", "Number of reads", and "GEX Barcode". | | fragment_file_index | `_atac.fragments.sorted.tsv.gz.tbi` | tabix index file for the fragment file. | | snap_metrics_atac | `_atac.metrics.h5ad` | h5ad (Anndata) file containing per-barcode metrics from SnapATAC2. Also contains the equivalent gene expression barcode for each ATAC barcode in the `gex_barcodes` column of the `h5ad.obs` property. See the [ATAC Count Matrix Overview](../ATAC/count-matrix-overview.md) for more details. | -| atac_library_metrics | `_.atac.metrics.csv` | CSV with library-level metrics produced by SnapATAC2. See the ATAC [Library Level Metrics Overview](../ATAC/library-metrics.md) for more details. 
| +| atac_library_metrics | `_atac_.metrics.csv` | CSV with library-level metrics produced by SnapATAC2. See the ATAC [Library Level Metrics Overview](../ATAC/library-metrics.md) for more details. | | genomic_reference_version_gex | `.txt` | File containing the Genome build, source and GTF annotation version. | | bam_gex | `_gex.bam` | BAM file containing aligned reads from Optimus workflow. | | matrix_gex | `_gex_sparse_counts.npz` | NPZ file containing raw gene by cell counts. | @@ -123,9 +124,9 @@ The Multiome workflow calls two WARP subworkflows, one external subworkflow (opt | multimappers_Uniform_matrix | `UniqueAndMult-Uniform.mtx` | Optional output produced when `soloMultiMappers` is "Uniform"; see STARsolo [documentation](https://github.com/alexdobin/STAR/blob/master/docs/STARsolo.md#multi-gene-reads) for more information.| | multimappers_Rescue_matrix | `UniqueAndMult-Rescue.mtx` | Optional output produced when `soloMultiMappers` is "Rescue"; see STARsolo [documentation](https://github.com/alexdobin/STAR/blob/master/docs/STARsolo.md#multi-gene-reads) for more information. | | multimappers_PropUnique_matrix | `UniqueAndMult-PropUnique.mtx` | Optional output produced when `soloMultiMappers` is "PropUnique"; see STARsolo [documentation](https://github.com/alexdobin/STAR/blob/master/docs/STARsolo.md#multi-gene-reads) for more information.| -| gex_aligner_metrics | `.star_metrics.tar` | Text file containing per barcode metrics (`CellReads.stats`) produced by the GEX pipeline STARsolo aligner. | -| library_metrics | `__library_metrics.csv` | Optional CSV file containing all library-level metrics calculated with STARsolo for gene expression data. | -| mtx_files | `.mtx_files.tar` | TAR file with STARsolo matrix market files (barcodes.tsv, features.tsv, and matrix.mtx) | TAR | +| gex_aligner_metrics | `_gex.star_metrics.tar` | Text file containing per barcode metrics (`CellReads.stats`) produced by the GEX pipeline STARsolo aligner. 
| +| library_metrics | `_gex__library_metrics.csv` | Optional CSV file containing all library-level metrics calculated with STARsolo for gene expression data. | +| mtx_files | `_gex.mtx_files.tar` | TAR file with STARsolo matrix market files (barcodes.tsv, features.tsv, and matrix.mtx) | TAR | | cell_barcodes_csv | `` | Optional output produced when `run_cellbender` is "true"; see CellBender [documentation](https://cellbender.readthedocs.io/en/latest/usage/index.html) and [GitHub repository](https://github.com/broadinstitute/CellBender/tree/master) for more information.| | checkpoint_file | `` | Optional output produced when `run_cellbender` is "true"; see CellBender [documentation](https://cellbender.readthedocs.io/en/latest/usage/index.html) and [GitHub repository](https://github.com/broadinstitute/CellBender/tree/master) for more information. | | h5_array | `` | Optional output produced when `run_cellbender` is "true"; see CellBender [documentation](https://cellbender.readthedocs.io/en/latest/usage/index.html) and [GitHub repository](https://github.com/broadinstitute/CellBender/tree/master) for more information. 
| diff --git a/website/docs/Pipelines/Optimus_Pipeline/README.md b/website/docs/Pipelines/Optimus_Pipeline/README.md index 9c1395ed4e..607c2b01a5 100644 --- a/website/docs/Pipelines/Optimus_Pipeline/README.md +++ b/website/docs/Pipelines/Optimus_Pipeline/README.md @@ -7,7 +7,7 @@ slug: /Pipelines/Optimus_Pipeline/README | Pipeline Version | Date Updated | Documentation Author | Questions or Feedback | | :----: | :---: | :----: | :--------------: | -| [optimus_v7.7.0](https://github.com/broadinstitute/warp/releases?q=optimus&expanded=true) | September, 2024 | Elizabeth Kiernan | Please [file an issue in WARP](https://github.com/broadinstitute/warp/issues) | +| [optimus_v7.8.0](https://github.com/broadinstitute/warp/releases?q=optimus&expanded=true) | October, 2024 | WARP Pipelines | Please [file an issue in WARP](https://github.com/broadinstitute/warp/issues) | ![Optimus_diagram](Optimus_diagram.png) @@ -107,7 +107,7 @@ The example configuration files also contain metadata for the reference files, d | ignore_r1_read_length | Boolean that overrides a check on the 10x chemistry. Default is set to false. If true, the workflow will not ensure that the 10x_chemistry_version input matches the chemistry in the read 1 FASTQ. | "true" or "false" (default) | | emptydrops_lower | UMI threshold for emptyDrops detection; default is 100. | N/A | | count_exons | Boolean indicating if the workflow should calculate exon counts **when in single-nucleus (sn_rna) mode**. If true, this option will output an additional layer for the h5ad file. By default, it is set to "false". If the parameter is true and used with sc_rnamode, the workflow will return an error. | "true" or "false" (default) | -| expected_cells | Optional integer input for the expected number of cells, which is used calculate library-level metrics. The default is set to 3,000 | +| gex_expected_cells | Optional integer input for the expected number of cells, which is used to calculate library-level metrics.
The default is set to 3,000. | N/A | #### Pseudogene handling The example Optimus reference files are downloaded directly from GENCODE (see Quickstart table) and are not modified to remove pseudogenes. This is in contrast to the [references created for Cell Ranger](https://support.10xgenomics.com/single-cell-multiome-atac-gex/software/release-notes/references#header) which remove pseudogenes and small RNAs. diff --git a/website/docs/Pipelines/PairedTag_Pipeline/README.md b/website/docs/Pipelines/PairedTag_Pipeline/README.md index 64d0b956f8..a1aa934081 100644 --- a/website/docs/Pipelines/PairedTag_Pipeline/README.md +++ b/website/docs/Pipelines/PairedTag_Pipeline/README.md @@ -7,7 +7,7 @@ slug: /Pipelines/PairedTag_Pipeline/README | Pipeline Version | Date Updated | Documentation Author | Questions or Feedback | |:---:| :---: | :---: | :---: | -| [PairedTag_v1.7.0](https://github.com/broadinstitute/warp/releases) | September, 2024 | Kaylee Mathews | Please [file an issue in WARP](https://github.com/broadinstitute/warp/issues). | +| [PairedTag_v1.8.0](https://github.com/broadinstitute/warp/releases) | October, 2024 | WARP Pipelines | Please [file an issue in WARP](https://github.com/broadinstitute/warp/issues). | ## Introduction to the Paired-Tag workflow @@ -105,8 +105,8 @@ The Paired-Tag workflow calls two WARP subworkflows and an additional task which | Output variable name | Filename, if applicable | Output format and description | |--- | --- | --- | | pairedtag_pipeline_version_out | N.A. | String describing the version of the Paired-Tag pipeline used. | -| bam_aligned_output_atac | `_atac.bam` | BAM file containing aligned reads from ATAC workflow; contains sample and cell barcodes stored in the BB tag if `preindex` is “true”. | -| fragment_file_atac | `_atac.fragments.tsv` or if preindexing = true, `_atac.fragments.BB.tsv` | TSV file containing fragment start and stop coordinates per barcode. 
The columns are "Chromosome", "Start", "Stop", "Barcode", and "Number of reads". When preindexing is used, additional columns include "Sample Barcode", "Cell Barcode", and "Duplicates" (which indicates if a cell barcode matches more than one sample barcode). | +| bam_aligned_output_atac | `_atac.bam` or if `preindex` = true, `_atac.bam.BB.bam` | BAM file containing aligned reads from ATAC workflow; contains sample and cell barcodes stored in the BB tag if `preindex` is “true”. | +| fragment_file_atac | `_atac.fragments.tsv` or if `preindex` = true, `_atac.fragments.sorted.tsv.gz` | TSV file containing fragment start and stop coordinates per barcode. The columns are "Chromosome", "Start", "Stop", "Barcode", and "Number of reads". When `preindex` = true, the file is bgzipped and has additional columns include "Sample Barcode", "Cell Barcode", and "Duplicates" (which indicates if a cell barcode matches more than one sample barcode). | | snap_metrics_atac | `_atac.metrics.h5ad` | h5ad (Anndata) file containing per-barcode metrics from SnapATAC2. See the [ATAC Count Matrix Overview](../ATAC/count-matrix-overview.md) for more details. If the preindex option is used, the h5ad.obs will contain 3 extra columns: preindex (the sample barcode), CB (cell barcodes), and duplicates (indicates with a 1 if the cell barcode matches more than preindex, otherwise it is 0).| | genomic_reference_version_gex | `.txt` | File containing the Genome build, source and GTF annotation version. | | bam_gex | `_gex.bam` | BAM file containing aligned reads from Optimus workflow. | @@ -117,7 +117,8 @@ The Paired-Tag workflow calls two WARP subworkflows and an additional task which | gene_metrics_gex | `_gex.gene_metrics.csv.gz` | CSV file containing the per-gene metrics. | | cell_calls_gex | `_gex.emptyDrops` | TSV file containing the EmptyDrops results when the Optimus workflow is run in sc_rna mode. 
| | h5ad_output_file_gex | `_gex.h5ad` | h5ad (Anndata) file containing the raw cell-by-gene count matrix, gene metrics, cell metrics, and global attributes. See the [Optimus Count Matrix Overview](../Optimus_Pipeline/Loom_schema.md) for more details. | -| library_metrics | `__library_metrics.csv` | Optional CSV file containing all library-level metrics calculated with STARsolo for gene expression data. | +| library_metrics | `_gex__library_metrics.csv` | Optional CSV file containing all library-level metrics calculated with STARsolo for gene expression data. | +| atac_library_final | `_atac__library_metrics` | CSV file containing all the library-level metrics calculated by SnapATAC2. | | cloud_provider | String describing the cloud provider that should be used to run the workflow; value should be "gcp" or "azure". | String | | multimappers_EM_matrix | `UniqueAndMult-EM.mtx` | Optional output produced when `soloMultiMappers` is "EM"; see STARsolo [documentation](https://github.com/alexdobin/STAR/blob/master/docs/STARsolo.md#multi-gene-reads) for more information.| | multimappers_Uniform_matrix | `UniqueAndMult-Uniform.mtx` | Optional output produced when `soloMultiMappers` is "Uniform" (default); see STARsolo [documentation](https://github.com/alexdobin/STAR/blob/master/docs/STARsolo.md#multi-gene-reads) for more information.| From 41c6d0413734ce6266e2b1b2ea1774ee7a347dc4 Mon Sep 17 00:00:00 2001 From: ekiernan Date: Tue, 29 Oct 2024 09:44:07 -0400 Subject: [PATCH 19/20] fixing doc table typos --- website/docs/Pipelines/PairedTag_Pipeline/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/website/docs/Pipelines/PairedTag_Pipeline/README.md b/website/docs/Pipelines/PairedTag_Pipeline/README.md index a1aa934081..d1eeb23b11 100644 --- a/website/docs/Pipelines/PairedTag_Pipeline/README.md +++ b/website/docs/Pipelines/PairedTag_Pipeline/README.md @@ -104,7 +104,7 @@ The Paired-Tag workflow calls two WARP subworkflows and an additional task
which | Output variable name | Filename, if applicable | Output format and description | |--- | --- | --- | -| pairedtag_pipeline_version_out | N.A. | String describing the version of the Paired-Tag pipeline used. | +| pairedtag_pipeline_version_out | N/A | String describing the version of the Paired-Tag pipeline used. | | bam_aligned_output_atac | `_atac.bam` or if `preindex` = true, `_atac.bam.BB.bam` | BAM file containing aligned reads from ATAC workflow; contains sample and cell barcodes stored in the BB tag if `preindex` is “true”. | | fragment_file_atac | `_atac.fragments.tsv` or if `preindex` = true, `_atac.fragments.sorted.tsv.gz` | TSV file containing fragment start and stop coordinates per barcode. The columns are "Chromosome", "Start", "Stop", "Barcode", and "Number of reads". When `preindex` = true, the file is bgzipped and has additional columns include "Sample Barcode", "Cell Barcode", and "Duplicates" (which indicates if a cell barcode matches more than one sample barcode). | | snap_metrics_atac | `_atac.metrics.h5ad` | h5ad (Anndata) file containing per-barcode metrics from SnapATAC2. See the [ATAC Count Matrix Overview](../ATAC/count-matrix-overview.md) for more details. If the preindex option is used, the h5ad.obs will contain 3 extra columns: preindex (the sample barcode), CB (cell barcodes), and duplicates (indicates with a 1 if the cell barcode matches more than preindex, otherwise it is 0).| @@ -119,7 +119,7 @@ The Paired-Tag workflow calls two WARP subworkflows and an additional task which | h5ad_output_file_gex | `_gex.h5ad` | h5ad (Anndata) file containing the raw cell-by-gene count matrix, gene metrics, cell metrics, and global attributes. See the [Optimus Count Matrix Overview](../Optimus_Pipeline/Loom_schema.md) for more details. | | library_metrics | `_gex__library_metrics.csv` | Optional CSV file containing all library-level metrics calculated with STARsolo for gene expression data.
| | atac_library_final | `_atac__library_metrics` | CSV file containing all the library-level metrics calculated by SnapATAC2. | -| cloud_provider | String describing the cloud provider that should be used to run the workflow; value should be "gcp" or "azure". | String | +| cloud_provider | N/A | String describing the cloud provider that should be used to run the workflow; value should be "gcp" or "azure". | | multimappers_EM_matrix | `UniqueAndMult-EM.mtx` | Optional output produced when `soloMultiMappers` is "EM"; see STARsolo [documentation](https://github.com/alexdobin/STAR/blob/master/docs/STARsolo.md#multi-gene-reads) for more information.| | multimappers_Uniform_matrix | `UniqueAndMult-Uniform.mtx` | Optional output produced when `soloMultiMappers` is "Uniform" (default); see STARsolo [documentation](https://github.com/alexdobin/STAR/blob/master/docs/STARsolo.md#multi-gene-reads) for more information.| | multimappers_Rescue_matrix | `UniqueAndMult-Rescue.mtx` | Optional output produced when `soloMultiMappers` is "Rescue"; see STARsolo [documentation](https://github.com/alexdobin/STAR/blob/master/docs/STARsolo.md#multi-gene-reads) for more information. | From a2c7df8f9ac2412b7ab2005c8c088498610b21f1 Mon Sep 17 00:00:00 2001 From: ekiernan Date: Tue, 29 Oct 2024 10:31:11 -0400 Subject: [PATCH 20/20] adding metric documentation --- .../docs/Pipelines/ATAC/library-metrics.md | 41 ++++++++++--------- 1 file changed, 21 insertions(+), 20 deletions(-) diff --git a/website/docs/Pipelines/ATAC/library-metrics.md b/website/docs/Pipelines/ATAC/library-metrics.md index 184cfeb8eb..3e80bc85e4 100644 --- a/website/docs/Pipelines/ATAC/library-metrics.md +++ b/website/docs/Pipelines/ATAC/library-metrics.md @@ -10,26 +10,27 @@ The [ATAC pipeline](README.md) uses [SnapATAC2](https://github.com/kaizhang/Snap | Metric | Description | | --- | --- | | NHash_ID | A unique identifier used to track and reference the specific sample or dataset.
| -| Sequenced_reads | The total number of reads generated from the sequencing process, which includes both reads that are mapped and unmapped. | -| Sequenced_read_pairs | The total number of read pairs (two reads per pair) generated from the sequencing process. This is typically half of the total sequenced reads if all reads are paired. | -| Fraction_valid_barcode | The fraction of reads that contain a valid barcode, indicating the proportion of reads that are correctly assigned to a specific cell or sample. | -| Fraction_Q30_bases_in_read_1 | The proportion of bases in Read 1 that have a Phred quality score of 30 or higher, indicating high-confidence base calls. | -| Fraction_Q30_bases_in_read_2 | The proportion of bases in Read 2 that have a Phred quality score of 30 or higher, indicating high-confidence base calls. | -| Number_of_cells | The estimated number of cells captured and sequenced in the experiment, based on the barcodes identified. | -| Mean_raw_read_pairs_per_cell | The average number of raw read pairs associated with each cell, providing an indication of the sequencing depth per cell. | -| Median_high-quality_fragments_per_cell | The median number of high-quality (e.g., confidently mapped) fragments associated with each cell, representing typical fragment quality across cells. | -| Fraction of high-quality fragments in cells | The fraction of high-quality fragments that are associated with identified cells, indicating the proportion of good-quality data that is cell-associated. | -| Fraction_of_transposition_events_in_peaks_in_cells | The fraction of transposition events within identified cells that occur within peaks, which are regions of accessible chromatin. | -| Fraction_duplicates | The fraction of sequenced fragments that are duplicates, which can result from PCR amplification or other factors, indicating the redundancy in the sequencing data. 
| -| Fraction_confidently_mapped | The fraction of sequenced fragments that are confidently mapped to the reference genome, indicating the proportion of reads that align well to the genome. | -| Fraction_unmapped | The fraction of sequenced fragments that could not be mapped to the reference genome, which can indicate sequencing errors, contamination, or regions not covered by the reference. | -| Fraction_nonnuclear | The fraction of sequenced fragments that are mapped to non-nuclear (e.g., mitochondrial or other organellar) DNA, providing insight into contamination or organellar activity. | -| Fraction_fragment_in_nucleosome_free_region | The fraction of sequenced fragments that map to nucleosome-free regions, which are indicative of accessible chromatin. | -| Fraction_fragment_flanking_single_nucleosome | The fraction of sequenced fragments that map to regions flanking single nucleosomes, indicating regions with partial chromatin accessibility. | -| TSS_enrichment_score | A measure of the enrichment of transposition events at transcription start sites (TSS), indicating the accessibility of promoters across the genome. | -| Fraction_of_high-quality_fragments_overlapping_TSS | The fraction of high-quality fragments that overlap transcription start sites (TSS), providing insight into promoter accessibility. | +| sequenced_reads | The total number of reads generated from the sequencing process, which includes both reads that are mapped and unmapped. | +| sequenced_read_pairs | The total number of read pairs (two reads per pair) generated from the sequencing process. This is typically half of the total sequenced reads if all reads are paired. | +| fraction_valid_barcode | The fraction of reads that contain a valid barcode, indicating the proportion of reads that are correctly assigned to a specific cell or sample. 
| +| fraction_Q30_bases_in_read_1 | The proportion of bases in Read 1 that have a Phred quality score of 30 or higher, indicating high-confidence base calls. | +| fraction_Q30_bases_in_read_2 | The proportion of bases in Read 2 that have a Phred quality score of 30 or higher, indicating high-confidence base calls. | +| number_of_cells | The estimated number of cells captured and sequenced in the experiment, based on the barcodes identified. | +| mean_raw_read_pairs_per_cell | The average number of raw read pairs associated with each cell, providing an indication of the sequencing depth per cell. | +| median_high-quality_fragments_per_cell | The median number of high-quality (e.g., confidently mapped) fragments associated with each cell, representing typical fragment quality across cells. | +| fraction of high-quality fragments in cells | The fraction of high-quality fragments that are associated with identified cells, indicating the proportion of good-quality data that is cell-associated. | +| fraction_of_transposition_events_in_peaks_in_cells | The fraction of transposition events within identified cells that occur within peaks, which are regions of accessible chromatin. | +| fraction_duplicates | The fraction of sequenced fragments that are duplicates, which can result from PCR amplification or other factors, indicating the redundancy in the sequencing data. | +| fraction_confidently_mapped | The fraction of sequenced fragments that are confidently mapped to the reference genome, indicating the proportion of reads that align well to the genome. | +| fraction_unmapped | The fraction of sequenced fragments that could not be mapped to the reference genome, which can indicate sequencing errors, contamination, or regions not covered by the reference. | +| fraction_nonnuclear | The fraction of sequenced fragments that are mapped to non-nuclear (e.g., mitochondrial or other organellar) DNA, providing insight into contamination or organellar activity. 
| +| fraction_fragment_in_nucleosome_free_region | The fraction of sequenced fragments that map to nucleosome-free regions, which are indicative of accessible chromatin. | +| fraction_fragment_flanking_single_nucleosome | The fraction of sequenced fragments that map to regions flanking single nucleosomes, indicating regions with partial chromatin accessibility. | +| tss_enrichment_score | A measure of the enrichment of transposition events at transcription start sites (TSS), indicating the accessibility of promoters across the genome. | +| fraction_of_high-quality_fragments_overlapping_TSS | The fraction of high-quality fragments that overlap transcription start sites (TSS), providing insight into promoter accessibility. | | Number_of_peaks | The total number of peaks, or regions of accessible chromatin, identified in the dataset, representing potential regulatory elements. | -| Fraction_of_genome_in_peaks | The fraction of the genome that is covered by identified peaks, indicating the extent of chromatin accessibility across the genome. | -| Fraction_of_high-quality_fragments_overlapping_peaks | The fraction of high-quality fragments that overlap with identified peaks, providing an indication of the efficiency of the assay in capturing accessible regions. | +| fraction_of_genome_in_peaks | The fraction of the genome that is covered by identified peaks, indicating the extent of chromatin accessibility across the genome. | +| fraction_of_high-quality_fragments_overlapping_peaks | The fraction of high-quality fragments that overlap with identified peaks, providing an indication of the efficiency of the assay in capturing accessible regions. | +| percent_target | Percent of cells recovered; value is calculated as estimated_cells/expected_cells. |