Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Lk pd 2786 add atac expectedcells #1398

Merged
merged 22 commits into from
Oct 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions pipeline_versions.txt
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,11 @@ ExomeReprocessing 3.3.1 2024-09-17
BuildIndices 3.0.0 2023-12-06
scATAC 1.3.2 2023-08-03
snm3C 4.0.4 2024-08-06
Multiome 5.7.1 2024-10-18
PairedTag 1.7.1 2024-10-18
Multiome 5.8.0 2024-10-23
PairedTag 1.8.0 2024-10-23
MultiSampleSmartSeq2 2.2.22 2024-09-11
MultiSampleSmartSeq2SingleNucleus 2.0.1 2024-09-24
Optimus 7.7.0 2024-09-24
atac 2.3.2 2024-10-18
MultiSampleSmartSeq2SingleNucleus 2.0.2 2024-10-23
Optimus 7.8.0 2024-10-23
atac 2.4.0 2024-10-23
SmartSeq2SingleSample 5.1.21 2024-09-11
SlideSeq 3.4.2 2024-09-24
SlideSeq 3.4.3 2024-10-24
8 changes: 8 additions & 0 deletions pipelines/skylab/atac/atac.changelog.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,11 @@
# 2.4.0
2024-10-23 (Date of Last Commit)

* Added a new input parameter, atac_expected_cells, which describes the number of cells used for the library preparation
* Updated the ATAC library CSV to follow the same file naming convention as the Optimus workflow library CSV and to use lowercase metric names (except NHashID)
* Added a new metric, percent_target, to the ATAC library CSV; it is the number of cells estimated by SnapATAC2 divided by the atac_expected_cells input, multiplied by 100


# 2.3.2
2024-10-18 (Date of Last Commit)

Expand Down
49 changes: 35 additions & 14 deletions pipelines/skylab/atac/atac.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@ workflow ATAC {
# Additional library aliquot ID
String? atac_nhash_id

#Expected cells from library preparation
Int atac_expected_cells = 3000

# Option for running files with preindex
Boolean preindex = false

Expand All @@ -46,7 +49,7 @@ workflow ATAC {
String adapter_seq_read3 = "TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG"
}

String pipeline_version = "2.3.2"
String pipeline_version = "2.4.0"

# Determine docker prefix based on cloud provider
String gcr_docker_prefix = "us.gcr.io/broad-gotc-prod/"
Expand Down Expand Up @@ -139,7 +142,9 @@ workflow ATAC {
annotations_gtf = annotations_gtf,
preindex = preindex,
docker_path = docker_prefix + snap_atac_docker,
atac_nhash_id = atac_nhash_id
atac_nhash_id = atac_nhash_id,
atac_expected_cells = atac_expected_cells,
input_id = input_id
}
}
if (!preindex) {
Expand All @@ -150,7 +155,9 @@ workflow ATAC {
annotations_gtf = annotations_gtf,
preindex = preindex,
docker_path = docker_prefix + snap_atac_docker,
atac_nhash_id = atac_nhash_id
atac_nhash_id = atac_nhash_id,
atac_expected_cells = atac_expected_cells,
input_id = input_id

}
}
Expand Down Expand Up @@ -512,10 +519,10 @@ task CreateFragmentFile {
String cpuPlatform = "Intel Cascade Lake"
String docker_path
String atac_nhash_id = ""
String input_id
Int atac_expected_cells = 3000
}

String bam_base_name = basename(bam, ".bam")

parameter_meta {
bam: "Aligned bam with CB in CB tag. This is the output of the BWAPairedEndAlignment task."
chrom_sizes: "Text file containing chrom_sizes for genome build (i.e. hg38)."
Expand All @@ -532,11 +539,12 @@ task CreateFragmentFile {

# set parameters
bam = "~{bam}"
bam_base_name = "~{bam_base_name}"
input_id = "~{input_id}"
chrom_sizes = "~{chrom_sizes}"
atac_gtf = "~{annotations_gtf}"
preindex = "~{preindex}"
atac_nhash_id = "~{atac_nhash_id}"
expected_cells = ~{atac_expected_cells}

# calculate chrom size dictionary based on text file
chrom_size_dict={}
Expand All @@ -554,12 +562,22 @@ task CreateFragmentFile {

# extract CB or BB (if preindex is true) tag from bam file to create fragment file
if preindex == "true":
data = pp.recipe_10x_metrics("~{bam}", "~{bam_base_name}.fragments.tsv", "temp_metrics.h5ad", is_paired=True, barcode_tag="BB", chrom_sizes=chrom_size_dict, gene_anno=atac_gtf, peaks=None)
data = pp.recipe_10x_metrics("~{bam}", "~{input_id}.fragments.tsv", "temp_metrics.h5ad", is_paired=True, barcode_tag="BB", chrom_sizes=chrom_size_dict, gene_anno=atac_gtf, peaks=None)
elif preindex == "false":
data = pp.recipe_10x_metrics("~{bam}", "~{bam_base_name}.fragments.tsv", "temp_metrics.h5ad", is_paired=True, barcode_tag="CB", chrom_sizes=chrom_size_dict, gene_anno=atac_gtf, peaks=None)
data = pp.recipe_10x_metrics("~{bam}", "~{input_id}.fragments.tsv", "temp_metrics.h5ad", is_paired=True, barcode_tag="CB", chrom_sizes=chrom_size_dict, gene_anno=atac_gtf, peaks=None)

# Add NHashID to metrics
data = OrderedDict({'NHashID': atac_nhash_id, **data})

# Calculate atac percent target
print("Calculating percent target")
number_of_cells = data['Cells']['Number_of_cells']
print("Print number of cells", number_of_cells)
atac_percent_target = number_of_cells / expected_cells*100
print("Setting percent target in nested dictionary")
data['Cells']['percent_target'] = atac_percent_target


# Flatten the dictionary
flattened_data = []
for category, metrics in data.items():
Expand All @@ -569,8 +587,11 @@ task CreateFragmentFile {
else:
flattened_data.append((category, metrics))

# Convert the flattened keys to lowercase (except for 'NHashID')
flattened_data = [(metric if metric == 'NHashID' else str(metric).lower(), value) for metric, value in flattened_data]

# Write to CSV
csv_file_path = "~{bam_base_name}_~{atac_nhash_id}.atac_metrics.csv"
csv_file_path = "~{input_id}_~{atac_nhash_id}_library_metrics.csv"
with open(csv_file_path, mode='w', newline='') as file:
writer = csv.writer(file)
writer.writerows(flattened_data) # Write data
Expand All @@ -583,7 +604,7 @@ task CreateFragmentFile {
# calculate tsse metrics
snap.metrics.tsse(atac_data, atac_gtf)
# Write new atac file
atac_data.write_h5ad("~{bam_base_name}.metrics.h5ad")
atac_data.write_h5ad("~{input_id}.metrics.h5ad")

CODE
>>>
Expand All @@ -597,8 +618,8 @@ task CreateFragmentFile {
}

output {
File fragment_file = "~{bam_base_name}.fragments.tsv"
File Snap_metrics = "~{bam_base_name}.metrics.h5ad"
File atac_library_metrics = "~{bam_base_name}_~{atac_nhash_id}.atac_metrics.csv"
File fragment_file = "~{input_id}.fragments.tsv"
File Snap_metrics = "~{input_id}.metrics.h5ad"
File atac_library_metrics = "~{input_id}_~{atac_nhash_id}_library_metrics.csv"
}
}
7 changes: 7 additions & 0 deletions pipelines/skylab/multiome/Multiome.changelog.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
# 5.8.0
2024-10-23 (Date of Last Commit)

* Updated the workflow to include a new expected_cells input parameter describing the number of cells used as input to the library preparation; this is passed to both the ATAC workflows and Optimus workflows and the default is set to 3000 cells
* Updated the ATAC library CSV and the Gene Expression library CSV to be consistent in file naming convention and to have similar case for metric names
* Added a new metric, percent_target, to the ATAC library CSV; it is the number of cells estimated by SnapATAC2 divided by the expected_cells input, multiplied by 100

# 5.7.1
2024-10-18 (Date of Last Commit)

Expand Down
9 changes: 6 additions & 3 deletions pipelines/skylab/multiome/Multiome.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ import "../../../tasks/broad/Utilities.wdl" as utils

workflow Multiome {

String pipeline_version = "5.7.1"
String pipeline_version = "5.8.0"


input {
Expand All @@ -18,6 +18,7 @@ workflow Multiome {
# Additional library aliquot ID
String? gex_nhash_id
String? atac_nhash_id
Int expected_cells = 3000

# Optimus Inputs
String counting_mode = "sn_rna"
Expand Down Expand Up @@ -102,7 +103,8 @@ workflow Multiome {
star_strand_mode = star_strand_mode,
count_exons = count_exons,
soloMultiMappers = soloMultiMappers,
cloud_provider = cloud_provider
cloud_provider = cloud_provider,
gex_expected_cells = expected_cells
}

# Call the ATAC workflow
Expand All @@ -120,7 +122,8 @@ workflow Multiome {
vm_size = vm_size,
annotations_gtf = annotations_gtf,
atac_nhash_id = atac_nhash_id,
adapter_seq_read3 = adapter_seq_read3
adapter_seq_read3 = adapter_seq_read3,
atac_expected_cells = expected_cells
}
call H5adUtils.JoinMultiomeBarcodes as JoinBarcodes {
input:
Expand Down
8 changes: 8 additions & 0 deletions pipelines/skylab/optimus/Optimus.changelog.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
# 7.8.0
2024-10-23 (Date of Last Commit)

* Renamed the input expected_cells to gex_expected_cells
* Changed gex_expected_cells from an optional input to a non-optional input with a default value of 3000
* Reformatted the library CSV output filename to remove an extra gex

# 7.7.0
2024-09-24 (Date of Last Commit)

Expand All @@ -6,6 +13,7 @@

# 7.6.1
2024-09-11 (Date of Last Commit)

* Updated warp-tools docker which added create_h5ad_snss2.py to the docker image. This change does not affect the Optimus pipeline

# 7.6.0
Expand Down
10 changes: 5 additions & 5 deletions pipelines/skylab/optimus/Optimus.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ workflow Optimus {
File annotations_gtf
File? mt_genes
String? soloMultiMappers = "Uniform"
Int? expected_cells
Int gex_expected_cells = 3000

# Chemistry options include: 2 or 3
Int tenx_chemistry_version
Expand Down Expand Up @@ -71,7 +71,7 @@ workflow Optimus {
# version of this pipeline


String pipeline_version = "7.7.0"
String pipeline_version = "7.8.0"


# this is used to scatter matched [r1_fastq, r2_fastq, i1_fastq] arrays
Expand Down Expand Up @@ -223,7 +223,7 @@ workflow Optimus {
input_id = input_id,
counting_mode = counting_mode,
star_merge_docker_path = docker_prefix + star_merge_docker,
expected_cells = expected_cells,
expected_cells = gex_expected_cells,
gex_nhash_id = gex_nhash_id
}
if (counting_mode == "sc_rna"){
Expand All @@ -242,7 +242,7 @@ workflow Optimus {
input:
input_id = input_id,
gex_nhash_id = gex_nhash_id,
expected_cells = expected_cells,
expected_cells = gex_expected_cells,
input_name = input_name,
input_id_metadata_field = input_id_metadata_field,
input_name_metadata_field = input_name_metadata_field,
Expand Down Expand Up @@ -279,7 +279,7 @@ workflow Optimus {
input:
input_id = input_id,
gex_nhash_id = gex_nhash_id,
expected_cells = expected_cells,
expected_cells = gex_expected_cells,
input_name = input_name,
counting_mode = counting_mode,
input_id_metadata_field = input_id_metadata_field,
Expand Down
11 changes: 10 additions & 1 deletion pipelines/skylab/paired_tag/PairedTag.changelog.md
Original file line number Diff line number Diff line change
@@ -1,15 +1,24 @@
# 1.8.0
2024-10-23 (Date of Last Commit)

* Updated the workflow to include a new expected_cells input parameter describing the number of cells used as input to the library preparation; this is passed to both the ATAC workflows and Optimus workflows and the default is set to 3000 cells
* Updated the ATAC library CSV and the Gene Expression library CSV to be consistent in file naming convention and to have similar case for metric names
* Added a new metric, percent_target, to the ATAC library CSV; it is the number of cells estimated by SnapATAC2 divided by the expected_cells input, multiplied by 100

# 1.7.1
2024-10-18 (Date of Last Commit)

* Removed the underscore of the NHashID in the ATAC library metrics CSV

# 1.7.0
2024-09-24 (Date of Last Commit)

* Added a python implementation of DoubletFinder to calculate doublet scores in gene expression data; percent doublets are now available as a library-level metric and individual doublet scores for cell barcodes are in the h5ad
* Updated gene_names in the final h5ad to be unique

# 1.6.1
2024-09-11 (Date of Last Commit)

* Updated warp-tools docker which added create_h5ad_snss2.py to the docker image. This change does not affect the PairedTag pipeline

# 1.6.0
Expand All @@ -21,6 +30,7 @@
2024-08-06 (Date of Last Commit)

* Updated the warp-tools docker to calculate mitochondrial reads from unique reads in cell and gene metrics; these metrics are in the cell and gene metrics CSV as well as h5ad

# 1.4.1
2024-08-02 (Date of Last Commit)

Expand Down Expand Up @@ -71,7 +81,6 @@

* Updated the demultiplex task so that some intermediate input names have been renamed. There is no change to the outputs.


# 0.6.0
2024-05-10 (Date)

Expand Down
4 changes: 2 additions & 2 deletions pipelines/skylab/paired_tag/PairedTag.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ import "../../../tasks/broad/Utilities.wdl" as utils

workflow PairedTag {

String pipeline_version = "1.7.1"
String pipeline_version = "1.8.0"


input {
Expand Down Expand Up @@ -109,7 +109,7 @@ workflow PairedTag {
read1_fastq = atac_r1_fastq[idx],
read3_fastq = atac_r3_fastq[idx],
barcodes_fastq = atac_r2_fastq[idx],
input_id = input_id,
input_id = input_id + "_atac",
whitelist = atac_whitelist,
preindex = preindex,
docker_path = docker_prefix + upstools_docker
Expand Down
5 changes: 5 additions & 0 deletions pipelines/skylab/slideseq/SlideSeq.changelog.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
# 3.4.3
2024-10-24 (Date of Last Commit)

* Updated the h5adUtils WDL to rename the gene expression library CSV filename; this does not impact slideseq

# 3.4.2
2024-09-24 (Date of Last Commit)

Expand Down
2 changes: 1 addition & 1 deletion pipelines/skylab/slideseq/SlideSeq.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ import "../../../tasks/broad/Utilities.wdl" as utils

workflow SlideSeq {

String pipeline_version = "3.4.2"
String pipeline_version = "3.4.3"

input {
Array[File] r1_fastq
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
# 2.0.2
2024-10-23 (Date of Last Commit)

* Updated the h5adUtils WDL to rename the gene expression library CSV filename; this does not affect the snSS2 workflow

# 2.0.1
2024-09-24 (Date of Last Commit)

* Added a python implementation of DoubletFinder to calculate doublet scores in gene expression data; this does not affect the snSS2 workflow

# 2.0.0
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ workflow MultiSampleSmartSeq2SingleNucleus {
}

# Version of this pipeline
String pipeline_version = "2.0.1"
String pipeline_version = "2.0.2"

if (false) {
String? none = "None"
Expand Down
Loading
Loading