Skip to content

Commit

Permalink
Merge pull request #1365 from broadinstitute/develop
Browse files Browse the repository at this point in the history
dev -> staging
  • Loading branch information
ekiernan authored Sep 9, 2024
2 parents c49bcc9 + 7a03227 commit e3fc193
Show file tree
Hide file tree
Showing 15 changed files with 285 additions and 53 deletions.
50 changes: 25 additions & 25 deletions pipeline_versions.txt
Original file line number Diff line number Diff line change
@@ -1,42 +1,42 @@
Pipeline Name Version Date of Last Commit
Optimus 7.6.0 2024-08-06
Multiome 5.5.0 2024-08-06
PairedTag 1.5.0 2024-08-06
atac 2.2.3 2024-08-02
SlideSeq 3.4.0 2024-08-06
snm3C 4.0.3 2024-08-05
MultiSampleSmartSeq2SingleNucleus 1.4.2 2024-08-25-02
scATAC 1.3.2 2023-08-03
MultiSampleSmartSeq2 2.2.21 2023-04-19
PairedTag 1.6.0 2024-08-02
Optimus 7.6.0 2024-08-06
atac 2.3.0 2024-08-29
snm3C 4.0.4 2024-08-06
SmartSeq2SingleSample 5.1.20 2023-04-19
Multiome 5.6.0 2024-08-02
scATAC 1.3.2 2023-08-03
BuildIndices 3.0.0 2023-12-06
MultiSampleSmartSeq2 2.2.21 2023-04-19
CEMBA 1.1.6 2023-12-18
SlideSeq 3.4.0 2024-08-06
BuildCembaReferences 1.0.0 2020-11-15
UltimaGenomicsWholeGenomeCramOnly 1.0.20 2024-08-02
CEMBA 1.1.6 2023-12-18
GDCWholeGenomeSomaticSingleSample 1.3.2 2024-08-02
ExomeGermlineSingleSample 3.1.22 2024-06-12
UltimaGenomicsWholeGenomeGermline 1.0.20 2024-08-02
WholeGenomeGermlineSingleSample 3.2.1 2024-06-12
VariantCalling 2.2.1 2024-06-12
UltimaGenomicsWholeGenomeCramOnly 1.0.20 2024-08-02
JointGenotypingByChromosomePartOne 1.4.12 2023-12-18
JointGenotypingByChromosomePartTwo 1.4.11 2023-12-18
UltimaGenomicsJointGenotyping 1.1.7 2023-12-18
JointGenotyping 1.6.10 2023-12-18
ReblockGVCF 2.2.1 2024-06-12
JointGenotypingByChromosomePartTwo 1.4.11 2023-12-18
JointGenotypingByChromosomePartOne 1.4.12 2023-12-18
ExternalExomeReprocessing 3.2.2 2024-08-02
ExternalWholeGenomeReprocessing 2.2.2 2024-08-02
ExomeReprocessing 3.2.2 2024-08-02
CramToUnmappedBams 1.1.3 2024-08-02
WholeGenomeReprocessing 3.2.2 2024-08-02
IlluminaGenotypingArray 1.12.21 2024-08-02
Arrays 2.6.27 2024-08-02
MultiSampleArrays 1.6.2 2024-08-02
VariantCalling 2.2.1 2024-06-12
WholeGenomeGermlineSingleSample 3.2.1 2024-06-12
UltimaGenomicsWholeGenomeGermline 1.0.20 2024-08-02
ExomeGermlineSingleSample 3.1.22 2024-06-12
ValidateChip 1.16.5 2024-08-02
Arrays 2.6.27 2024-08-02
Imputation 1.1.13 2024-05-21
RNAWithUMIsPipeline 1.0.16 2023-12-18
MultiSampleArrays 1.6.2 2024-08-02
BroadInternalUltimaGenomics 1.0.21 2024-08-02
BroadInternalArrays 1.1.11 2024-08-02
BroadInternalImputation 1.1.12 2024-08-02
BroadInternalRNAWithUMIs 1.0.33 2024-08-02
CramToUnmappedBams 1.1.3 2024-08-02
ExternalWholeGenomeReprocessing 2.2.2 2024-08-02
ExternalExomeReprocessing 3.2.2 2024-08-02
WholeGenomeReprocessing 3.2.2 2024-08-02
ExomeReprocessing 3.2.2 2024-08-02
IlluminaGenotypingArray 1.12.21 2024-08-02
CheckFingerprint 1.0.20 2024-08-02
AnnotationFiltration 1.2.5 2023-12-18
RNAWithUMIsPipeline 1.0.16 2023-12-18
7 changes: 7 additions & 0 deletions pipelines/skylab/atac/atac.changelog.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
# 2.3.0
2024-08-29 (Date of Last Commit)

* Updated the SnapATAC2 docker to include v2.7.0; the pipeline will now produce a library-level summary metric CSV for the BAM.

* Increased the default memory (`mem_size`) for the CreateFragmentFile task from 16 to 64

# 2.2.3
2024-08-02 (Date of Last Commit)

Expand Down
39 changes: 30 additions & 9 deletions pipelines/skylab/atac/atac.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ workflow ATAC {
String adapter_seq_read3 = "TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG"
}

String pipeline_version = "2.2.3"
String pipeline_version = "2.3.0"

# Determine docker prefix based on cloud provider
String gcr_docker_prefix = "us.gcr.io/broad-gotc-prod/"
Expand All @@ -58,7 +58,7 @@ workflow ATAC {
String cutadapt_docker = "cutadapt:1.0.0-4.4-1686752919"
String samtools_docker = "samtools-dist-bwa:3.0.0"
String upstools_docker = "upstools:1.0.0-2023.03.03-1704300311"
String snap_atac_docker = "snapatac2:1.0.9-2.6.3-1715865353"
String snap_atac_docker = "snapatac2:1.1.0"

# Make sure either 'gcp' or 'azure' is supplied as cloud_provider input. If not, raise an error
if ((cloud_provider != "gcp") && (cloud_provider != "azure")) {
Expand Down Expand Up @@ -158,11 +158,13 @@ workflow ATAC {
File bam_aligned_output_atac = select_first([BBTag.bb_bam, BWAPairedEndAlignment.bam_aligned_output])
File fragment_file_atac = select_first([BB_fragment.fragment_file, CreateFragmentFile.fragment_file])
File snap_metrics_atac = select_first([BB_fragment.Snap_metrics,CreateFragmentFile.Snap_metrics])
File library_metrics = select_first([BB_fragment.atac_library_metrics, CreateFragmentFile.atac_library_metrics])

output {
File bam_aligned_output = bam_aligned_output_atac
File fragment_file = fragment_file_atac
File snap_metrics = snap_metrics_atac
File library_metrics_file = library_metrics
}
}

Expand Down Expand Up @@ -505,7 +507,7 @@ task CreateFragmentFile {
File annotations_gtf
Boolean preindex
Int disk_size = 500
Int mem_size = 16
Int mem_size = 64
Int nthreads = 4
String cpuPlatform = "Intel Cascade Lake"
String docker_path
Expand Down Expand Up @@ -547,17 +549,35 @@ task CreateFragmentFile {
import snapatac2.preprocessing as pp
import snapatac2 as snap
import anndata as ad
from collections import OrderedDict
import csv
# extract CB or BB (if preindex is true) tag from bam file to create fragment file
if preindex == "true":
pp.make_fragment_file("~{bam}", "~{bam_base_name}.fragments.tsv", is_paired=True, barcode_tag="BB")
data = pp.recipe_10x_metrics("~{bam}", "~{bam_base_name}.fragments.tsv", "temp_metrics.h5ad", is_paired=True, barcode_tag="BB", chrom_sizes=chrom_size_dict, gene_anno=atac_gtf, peaks=None)
elif preindex == "false":
pp.make_fragment_file("~{bam}", "~{bam_base_name}.fragments.tsv", is_paired=True, barcode_tag="CB")
data = pp.recipe_10x_metrics("~{bam}", "~{bam_base_name}.fragments.tsv", "temp_metrics.h5ad", is_paired=True, barcode_tag="CB", chrom_sizes=chrom_size_dict, gene_anno=atac_gtf, peaks=None)
# Add NHashID to metrics
nhash_ID_value = "XXX"
data = OrderedDict({'NHash_ID': atac_nhash_id, **data})
# Flatten the dictionary
flattened_data = []
for category, metrics in data.items():
if isinstance(metrics, dict):
for metric, value in metrics.items():
flattened_data.append((metric, value))
else:
flattened_data.append((category, metrics))
# Write to CSV
csv_file_path = "~{bam_base_name}_~{atac_nhash_id}.atac_metrics.csv"
with open(csv_file_path, mode='w', newline='') as file:
writer = csv.writer(file)
writer.writerows(flattened_data) # Write data
print(f"Dictionary successfully written to {csv_file_path}")
# calculate quality metrics; note min_num_fragments and min_tsse are set to 0 instead of default
# those settings allow us to retain all barcodes
pp.import_data("~{bam_base_name}.fragments.tsv", file="temp_metrics.h5ad", chrom_sizes=chrom_size_dict, min_num_fragments=0)
atac_data = ad.read_h5ad("temp_metrics.h5ad")
# Add nhash_id to h5ad file as unstructured metadata
atac_data.uns['NHashID'] = atac_nhash_id
Expand All @@ -580,5 +600,6 @@ task CreateFragmentFile {
output {
File fragment_file = "~{bam_base_name}.fragments.tsv"
File Snap_metrics = "~{bam_base_name}.metrics.h5ad"
File atac_library_metrics = "~{bam_base_name}_~{atac_nhash_id}.atac_metrics.csv"
}
}
5 changes: 5 additions & 0 deletions pipelines/skylab/multiome/Multiome.changelog.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
# 5.6.0
2024-08-02 (Date of Last Commit)

* Updated the SnapATAC2 docker to include v2.7.0; the pipeline will now produce a library-level summary metric CSV for the BAM.

# 5.5.0
2024-08-06 (Date of Last Commit)

Expand Down
3 changes: 2 additions & 1 deletion pipelines/skylab/multiome/Multiome.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ import "../../../tasks/broad/Utilities.wdl" as utils

workflow Multiome {

String pipeline_version = "5.5.0"
String pipeline_version = "5.6.0"


input {
Expand Down Expand Up @@ -179,6 +179,7 @@ workflow Multiome {
File fragment_file_atac = JoinBarcodes.atac_fragment_tsv
File fragment_file_index = JoinBarcodes.atac_fragment_tsv_tbi
File snap_metrics_atac = JoinBarcodes.atac_h5ad_file
File atac_library_metrics = Atac.library_metrics_file

# optimus outputs
File genomic_reference_version_gex = Optimus.genomic_reference_version
Expand Down
5 changes: 5 additions & 0 deletions pipelines/skylab/paired_tag/PairedTag.changelog.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
# 1.6.0
2024-08-02 (Date of Last Commit)

* Updated the SnapATAC2 docker to include v2.7.0; the pipeline will now produce a library-level summary metric CSV for the BAM.

# 1.5.0
2024-08-06 (Date of Last Commit)

Expand Down
4 changes: 3 additions & 1 deletion pipelines/skylab/paired_tag/PairedTag.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ import "../../../tasks/broad/Utilities.wdl" as utils

workflow PairedTag {

String pipeline_version = "1.5.0"
String pipeline_version = "1.6.0"


input {
Expand Down Expand Up @@ -149,6 +149,7 @@ workflow PairedTag {

File atac_fragment_out = select_first([ParseBarcodes.atac_fragment_tsv,Atac_preindex.fragment_file])
File atac_h5ad_out = select_first([ParseBarcodes.atac_h5ad_file, Atac_preindex.snap_metrics])

output {

String pairedtag_pipeline_version_out = pipeline_version
Expand All @@ -157,6 +158,7 @@ workflow PairedTag {
File bam_aligned_output_atac = Atac_preindex.bam_aligned_output
File fragment_file_atac = atac_fragment_out
File snap_metrics_atac = atac_h5ad_out
File atac_library_final = Atac_preindex.library_metrics_file

# optimus outputs
File genomic_reference_version_gex = Optimus.genomic_reference_version
Expand Down
2 changes: 1 addition & 1 deletion pipelines/skylab/paired_tag/README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
## Announcing a new site for WARP documentation!

Paired-tag documentation has moved! Read more about the [Paired-Tag workflow](https://broadinstitute.github.io/warp/docs/Pipelines/PairedTag_Pipeline/README) on the new [WARP documentation site](https://broadinstitute.github.io/warp/)!
Paired-tag documentation has moved! Read more about the [Paired-Tag workflow](https://broadinstitute.github.io/warp/docs/Pipelines/PairedTag_Pipeline/README) on the new [WARP documentation site](https://broadinstitute.github.io/warp/)!

### Paired-Tag summary

Expand Down
9 changes: 7 additions & 2 deletions pipelines/skylab/snm3C/snm3C.changelog.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,12 @@
# 4.0.4
2024-08-06 (Date of Last Commit)

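* Updated the Demultiplexing task in the snm3C wdl to flag when a file/cell is empty: FASTQ files whose trimmed read count is below `min_threshold` (default 100) or above `max_threshold` (default 10000000) are removed, and the task exits with an error if no R1 FASTQ files remain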

# 4.0.3
2024-08-05 (Date of Last Commit)
2024-08-06 (Date of Last Commit)

* Updated the demultiplexing task in snm3C wdl to dynamically update the batch number based on the number of fastq files present
* Updated the Demultiplexing task in snm3C wdl to dynamically update the batch number based on the number of fastq files present

# 4.0.2
2024-07-09 (Date of Last Commit)
Expand Down
34 changes: 20 additions & 14 deletions pipelines/skylab/snm3C/snm3C.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ workflow snm3C {
}

# version of the pipeline
String pipeline_version = "4.0.3"
String pipeline_version = "4.0.4"

call Demultiplexing {
input:
Expand Down Expand Up @@ -154,6 +154,8 @@ task Demultiplexing {
File random_primer_indexes
String plate_id
Int batch_number
Int min_threshold = 100
Int max_threshold = 10000000
String docker

Int disk_size = 1000
Expand All @@ -179,7 +181,7 @@ task Demultiplexing {
$WORKING_DIR/r2.fastq.gz \
> $WORKING_DIR/~{plate_id}.stats.txt

# remove the fastq files that end in unknown-R1.fq.gz and unknown-R2.fq.gz
# Remove the fastq files that end in unknown-R1.fq.gz and unknown-R2.fq.gz
rm $WORKING_DIR/*-unknown-R{1,2}.fq.gz

python3 <<CODE
Expand All @@ -199,27 +201,31 @@ task Demultiplexing {
trimmed_count = int(adapter_match[1])
adapter_counts[adapter_name] = trimmed_count
# Removing fastq files with trimmed reads greater than 30
threshold = 10000000
# Removing fastq files with trimmed reads greater than 10000000 or less than 100
for filename in os.listdir(working_dir):
if filename.endswith('.fq.gz'):
file_path = os.path.join(working_dir, filename)
adapter_name = re.search(r'A(\d+)-R', filename)
adapter_name = re.search(r'([A-Za-z]\d+)-R', filename).group(1)
if adapter_name:
adapter_name = 'A' + adapter_name.group(1)
if adapter_name in adapter_counts and adapter_counts[adapter_name] > threshold:
os.remove(file_path)
if adapter_name in adapter_counts:
if adapter_counts[adapter_name] < ~{min_threshold} or adapter_counts[adapter_name] > ~{max_threshold}:
print("Removing ", file_path, " with count equal to ", adapter_counts[adapter_name])
os.remove(file_path)
CODE
# Check if the number of *R1.fq.gz files is 0
if [[ $(ls | grep "\-R1.fq.gz" | wc -l) -eq 0 ]]; then
echo "Error: No files found. All fastq files were removed. Exiting."
exit 1
fi
# Batch the fastq files into folders of batch_number size
R1_files=($(ls $WORKING_DIR | grep "\-R1.fq.gz"))
R2_files=($(ls $WORKING_DIR | grep "\-R2.fq.gz"))
batch_number=~{batch_number}
total_files=${#R1_files[@]}
echo "Total files: $total_files"
batch_number=~{batch_number}
if [[ $total_files -lt $batch_number ]]; then
echo "Warning: Number of files is less than the batch number. Updating batch number to $total_files."
batch_number=$total_files
Expand All @@ -229,14 +235,14 @@ task Demultiplexing {
mkdir -p "batch${i}" # Combine batch and i, use -p to create parent dirs
done
# Counter for the folder index
# Counter for the folder index and create emptycells file
folder_index=1
WORKING_DIR=`pwd`
# Distribute the FASTQ files and create TAR files
for file in "${R1_files[@]}"; do
sample_id=$(basename "$file" "-R1.fq.gz")
r2_file="${sample_id}-R2.fq.gz"
mv $WORKING_DIR/$file batch$((folder_index))/$file
mv $WORKING_DIR/$r2_file batch$((folder_index))/$r2_file
# Increment the counter
Expand All @@ -249,7 +255,7 @@ task Demultiplexing {
tar -cf - $WORKING_DIR/batch${i}/*.fq.gz | pigz > ~{plate_id}.${i}.cutadapt_output_files.tar.gz
done
>>>
runtime {
docker: docker
disks: "local-disk ${disk_size} SSD"
Expand Down
Loading

0 comments on commit e3fc193

Please sign in to comment.