Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove fastqprocess and mergebam tasks when running star once #1195

Open
wants to merge 12 commits into
base: develop
Choose a base branch
from
8 changes: 4 additions & 4 deletions pipelines/skylab/multiome/Multiome.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -160,10 +160,10 @@ workflow Multiome {
File gene_metrics_gex = Optimus.gene_metrics
File? cell_calls_gex = Optimus.cell_calls
File h5ad_output_file_gex = JoinBarcodes.gex_h5ad_file
Array[File?] multimappers_EM_matrix = Optimus.multimappers_EM_matrix
Array[File?] multimappers_Uniform_matrix = Optimus.multimappers_Uniform_matrix
Array[File?] multimappers_Rescue_matrix = Optimus.multimappers_Rescue_matrix
Array[File?] multimappers_PropUnique_matrix = Optimus.multimappers_PropUnique_matrix
File? multimappers_EM_matrix = Optimus.multimappers_EM_matrix
File? multimappers_Uniform_matrix = Optimus.multimappers_Uniform_matrix
File? multimappers_Rescue_matrix = Optimus.multimappers_Rescue_matrix
File? multimappers_PropUnique_matrix = Optimus.multimappers_PropUnique_matrix
File? gex_aligner_metrics = Optimus.aligner_metrics
File? library_metrics = Optimus.library_metrics
File? mtx_files = Optimus.mtx_files
Expand Down
72 changes: 25 additions & 47 deletions pipelines/skylab/optimus/Optimus.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -166,23 +166,10 @@ workflow Optimus {
ubuntu_docker_path = ubuntu_docker_prefix + ubuntu_docker
}

call FastqProcessing.FastqProcessing as SplitFastq {
input:
i1_fastq = i1_fastq,
r1_fastq = r1_fastq,
r2_fastq = r2_fastq,
whitelist = whitelist,
chemistry = tenx_chemistry_version,
sample_id = input_id,
read_struct = read_struct,
warp_tools_docker_path = docker_prefix + warp_tools_docker
}

scatter(idx in range(length(SplitFastq.fastq_R1_output_array))) {
call StarAlign.STARsoloFastq as STARsoloFastq {
call StarAlign.STARsoloFastq as STARsoloFastq {
input:
r1_fastq = [SplitFastq.fastq_R1_output_array[idx]],
r2_fastq = [SplitFastq.fastq_R2_output_array[idx]],
r1_fastq = r1_fastq,
r2_fastq = r2_fastq,
star_strand_mode = star_strand_mode,
white_list = whitelist,
tar_star_reference = tar_star_reference,
Expand All @@ -193,18 +180,11 @@ workflow Optimus {
soloMultiMappers = soloMultiMappers,
samtools_star_docker_path = docker_prefix + samtools_star,
is_slidetags = is_slidetags
}
}
call Merge.MergeSortBamFiles as MergeBam {
input:
bam_inputs = STARsoloFastq.bam_output,
output_bam_filename = output_bam_basename + ".bam",
sort_order = "coordinate",
picard_cloud_docker_path = docker_prefix + picard_cloud_docker
}
call Metrics.CalculateGeneMetrics as GeneMetrics {

call Metrics.CalculateGeneMetrics as GeneMetrics {
input:
bam_input = MergeBam.output_bam,
bam_input = STARsoloFastq.bam_output,
mt_genes = mt_genes,
original_gtf = annotations_gtf,
input_id = input_id,
Expand All @@ -213,7 +193,7 @@ workflow Optimus {

call Metrics.CalculateCellMetrics as CellMetrics {
input:
bam_input = MergeBam.output_bam,
bam_input = STARsoloFastq.bam_output,
mt_genes = mt_genes,
original_gtf = annotations_gtf,
input_id = input_id,
Expand All @@ -222,13 +202,13 @@ workflow Optimus {

call StarAlign.MergeStarOutput as MergeStarOutputs {
input:
barcodes = STARsoloFastq.barcodes,
features = STARsoloFastq.features,
matrix = STARsoloFastq.matrix,
cell_reads = STARsoloFastq.cell_reads,
summary = STARsoloFastq.summary,
align_features = STARsoloFastq.align_features,
umipercell = STARsoloFastq.umipercell,
barcodes = [STARsoloFastq.barcodes],
features = [STARsoloFastq.features],
matrix = [STARsoloFastq.matrix],
cell_reads = [STARsoloFastq.cell_reads],
summary = [STARsoloFastq.summary],
align_features = [STARsoloFastq.align_features],
umipercell = [STARsoloFastq.umipercell],
input_id = input_id,
counting_mode = counting_mode,
star_merge_docker_path = docker_prefix + star_merge_docker,
Expand Down Expand Up @@ -272,10 +252,10 @@ workflow Optimus {
if (count_exons && counting_mode=="sn_rna") {
call StarAlign.MergeStarOutput as MergeStarOutputsExons {
input:
barcodes = STARsoloFastq.barcodes_sn_rna,
features = STARsoloFastq.features_sn_rna,
matrix = STARsoloFastq.matrix_sn_rna,
cell_reads = STARsoloFastq.cell_reads_sn_rna,
barcodes = [STARsoloFastq.barcodes_sn_rna],
features = [STARsoloFastq.features_sn_rna],
matrix = [STARsoloFastq.matrix_sn_rna],
cell_reads = [STARsoloFastq.cell_reads_sn_rna],
input_id = input_id,
counting_mode = "sc_rna",
summary = STARsoloFastq.summary_sn_rna,
Expand Down Expand Up @@ -346,28 +326,26 @@ workflow Optimus {
File final_h5ad_output = select_first([OptimusH5adGenerationWithExons.h5ad_output, OptimusH5adGeneration.h5ad_output])
File final_library_metrics = select_first([OptimusH5adGenerationWithExons.library_metrics, OptimusH5adGeneration.library_metrics])


output {
# version of this pipeline
String pipeline_version_out = pipeline_version
File genomic_reference_version = ReferenceCheck.genomic_ref_version
File bam = MergeBam.output_bam
File bam = STARsoloFastq.bam_output
File matrix = MergeStarOutputs.sparse_counts
File matrix_row_index = MergeStarOutputs.row_index
File matrix_col_index = MergeStarOutputs.col_index
File cell_metrics = CellMetrics.cell_metrics
File gene_metrics = GeneMetrics.gene_metrics
File? cell_calls = RunEmptyDrops.empty_drops_result
File? aligner_metrics = MergeStarOutputs.cell_reads_out
File? multimappers_EM_matrix = STARsoloFastq.multimappers_EM_matrix
File? multimappers_Uniform_matrix = STARsoloFastq.multimappers_Uniform_matrix
File? multimappers_Rescue_matrix = STARsoloFastq.multimappers_Rescue_matrix
File? multimappers_PropUnique_matrix = STARsoloFastq.multimappers_PropUnique_matrix
# File? library_metrics = MergeStarOutputs.library_metrics
File library_metrics = final_library_metrics
File? mtx_files = MergeStarOutputs.mtx_files
File? filtered_mtx_files = MergeStarOutputs.filtered_mtx_files

Array[File?] multimappers_EM_matrix = STARsoloFastq.multimappers_EM_matrix
Array[File?] multimappers_Uniform_matrix = STARsoloFastq.multimappers_Uniform_matrix
Array[File?] multimappers_Rescue_matrix = STARsoloFastq.multimappers_Rescue_matrix
Array[File?] multimappers_PropUnique_matrix = STARsoloFastq.multimappers_PropUnique_matrix

File? filtered_mtx_files = MergeStarOutputs.filtered_mtx_files

# h5ad
File h5ad_output_file = final_h5ad_output
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,5 +15,9 @@
"Optimus.tar_star_reference": "gs://gcp-public-data--broad-references/hg38/v0/star/star_2.7.9a_primary_gencode_human_v27.tar",
"Optimus.input_id": "pbmc4k_human",
"Optimus.chemistry": "tenX_v2",
"Optimus.annotations_gtf": "gs://gcp-public-data--broad-references/hg38/v0/gencode.v27.primary_assembly.annotation.gtf"
"Optimus.annotations_gtf": "gs://gcp-public-data--broad-references/hg38/v0/gencode.v27.primary_assembly.annotation.gtf",
"Optimus.ref_genome_fasta": "gs://gcp-public-data--broad-references/hg38/v0/GRCh38.primary_assembly.genome.fa",
"Optimus.STARsoloFastq.cpu_platform":"Intel Cascade Lake",
"Optimus.STARsoloFastq.cpu":"16",
"Optimus.STARsoloFastq.mem_size":"64"
}
Original file line number Diff line number Diff line change
Expand Up @@ -15,5 +15,9 @@
"Optimus.tar_star_reference": "gs://gcp-public-data--broad-references/hg38/v0/star/star_2.7.9a_primary_gencode_human_v27.tar",
"Optimus.input_id": "pbmc_human_v3",
"Optimus.chemistry": "tenX_v3",
"Optimus.annotations_gtf": "gs://gcp-public-data--broad-references/hg38/v0/gencode.v27.primary_assembly.annotation.gtf"
"Optimus.annotations_gtf": "gs://gcp-public-data--broad-references/hg38/v0/gencode.v27.primary_assembly.annotation.gtf",
"Optimus.ref_genome_fasta": "gs://gcp-public-data--broad-references/hg38/v0/GRCh38.primary_assembly.genome.fa",
"Optimus.STARsoloFastq.cpu_platform":"Intel Cascade Lake",
"Optimus.STARsoloFastq.cpu":"16",
"Optimus.STARsoloFastq.mem_size":"64"
}
Original file line number Diff line number Diff line change
Expand Up @@ -27,5 +27,9 @@
"Optimus.tar_star_reference": "gs://gcp-public-data--broad-references/mm10/v0/star/star_2.7.9a_primary_gencode_mouse_vM21.tar",
"Optimus.input_id": "neurons2k_mouse",
"Optimus.chemistry": "tenX_v2",
"Optimus.annotations_gtf": "gs://gcp-public-data--broad-references/mm10/v0/gencode.vM21.primary_assembly.annotation.gtf"
"Optimus.annotations_gtf": "gs://gcp-public-data--broad-references/mm10/v0/gencode.vM21.primary_assembly.annotation.gtf",
"Optimus.ref_genome_fasta": "gs://gcp-public-data--broad-references/mm10/v0/GRCm38.primary_assembly.genome.fa",
"Optimus.STARsoloFastq.cpu_platform":"Intel Cascade Lake",
"Optimus.STARsoloFastq.cpu":"16",
"Optimus.STARsoloFastq.mem_size":"64"
}
Original file line number Diff line number Diff line change
Expand Up @@ -25,5 +25,8 @@
"Optimus.chemistry": "tenX_v2",
"Optimus.annotations_gtf": "gs://gcp-public-data--broad-references/mm10/v0/gencode.vM21.primary_assembly.annotation.gtf",
"Optimus.counting_mode": "sn_rna",
"Optimus.count_exons": true
"Optimus.count_exons": true,
"Optimus.STARsoloFastq.cpu_platform":"Intel Cascade Lake",
"Optimus.STARsoloFastq.cpu":"16",
"Optimus.STARsoloFastq.mem_size":"64"
}
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,10 @@
"Optimus.tenx_chemistry_version": "3",
"Optimus.annotations_gtf": "gs://gcp-public-data--broad-references/hg38/v0/star/v2_7_10a/modified_v43.annotation.gtf",
"Optimus.star_strand_mode": "Forward",
"Optimus.ref_genome_fasta": "gs://gcp-public-data--broad-references/hg38/v0/GRCh38.primary_assembly.genome.fa",
"Optimus.STARsoloFastq.cpu_platform":"Intel Cascade Lake",
"Optimus.STARsoloFastq.cpu":"16",
"Optimus.STARsoloFastq.mem_size":"64",
"Optimus.cloud_provider": "gcp",
"Optimus.gex_nhash_id":"example_1234"
}
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,10 @@
"Optimus.input_id": "neurons2k_mouse",
"Optimus.tenx_chemistry_version": "2",
"Optimus.star_strand_mode": "Unstranded",
"Optimus.ref_genome_fasta": "gs://gcp-public-data--broad-references/GRCm39/GRCm39.primary_assembly.genome.fa.gz",
"Optimus.STARsoloFastq.cpu_platform":"Intel Cascade Lake",
"Optimus.STARsoloFastq.cpu":"16",
"Optimus.STARsoloFastq.mem_size":"64",
"Optimus.cloud_provider": "gcp",
"Optimus.gex_nhash_id":"example_1234",
"Optimus.annotations_gtf": "gs://gcp-public-data--broad-references/GRCm39/star/v2_7_10a/modified_vM32.annotation.gtf"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@
"Optimus.annotations_gtf": "gs://gcp-public-data--broad-references/GRCm39/star/v2_7_10a/modified_vM32.annotation.gtf",
"Optimus.counting_mode": "sn_rna",
"Optimus.count_exons": true,
"Optimus.STARsoloFastq.cpu_platform":"Intel Cascade Lake",
"Optimus.STARsoloFastq.cpu":"16",
"Optimus.STARsoloFastq.mem_size":"64",
"Optimus.cloud_provider": "gcp",
"Optimus.gex_nhash_id":"example_1234"
}
2 changes: 1 addition & 1 deletion tasks/skylab/FastqProcessing.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ task FastqProcessing {
fi

fastqprocess \
--bam-size 30.0 \
--num-output-files 1 \
--sample-id "~{sample_id}" \
$FASTQS \
--white-list "~{whitelist}" \
Expand Down
27 changes: 11 additions & 16 deletions tasks/skylab/StarAlign.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -227,18 +227,14 @@ task STARsoloFastq {

# runtime values
String samtools_star_docker_path
Int machine_mem_mb = 64000
Int cpu = 8
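# maximum number of preemptible attempts before falling back to a non-preemptible machine; set to 0 to request a non-preemptible machine from the start so the slow STAR alignment step is not interrupted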
Int preemptible = 3
String cpu_platform = "Intel Ice Lake"
Int machine_mem_mb = 512000
Int mem_size = 512
Int cpu = 128
Int disk = 2000
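# maximum number of preemptible attempts before falling back to a non-preemptible machine; set to 0 to request a non-preemptible machine from the start so the slow STAR alignment step is not interrupted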
Int preemptible = 1

# if is_slidetags is true, set disk to 1000; otherwise allocate dynamically based on input size.
# dynamic allocation is 3x the size of the STAR reference tarball plus 20x the size of each of the r1 and r2 fastq inputs (sizes measured in Gi, each term rounded up).
Boolean is_slidetags
Int disk = if is_slidetags then 1000 else
ceil(size(tar_star_reference, "Gi") * 3) +
ceil(size(r1_fastq, "Gi") * 20) +
ceil(size(r2_fastq, "Gi") * 20)
}

meta {
Expand Down Expand Up @@ -340,9 +336,9 @@ task STARsoloFastq {
# validate the bam with samtools quickcheck
samtools quickcheck -v Aligned.sortedByCoord.out.bam


echo "UMI LEN " $UMILen

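# NOTE(review): the touch commands below create empty *_sn_rna placeholder files — presumably so the task's sn_rna outputs always exist even when no exon-counting results are produced; confirm whether they are still needed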
touch barcodes_sn_rna.tsv
touch features_sn_rna.tsv
touch matrix_sn_rna.mtx
Expand All @@ -351,7 +347,6 @@ task STARsoloFastq {
touch Summary_sn_rna.csv
touch UMIperCellSorted_sn_rna.txt


if [[ "~{counting_mode}" == "sc_rna" ]]
then
SoloDirectory="Solo.out/Gene/raw"
Expand Down Expand Up @@ -424,12 +419,12 @@ task STARsoloFastq {
>>>

runtime {
docker: samtools_star_docker_path
memory: "~{machine_mem_mb} MiB"
disks: "local-disk ~{disk} HDD"
memory: "~{mem_size} GiB"
disks: "local-disk ~{disk} SSD"
disk: disk + " GB" # TES
cpu: cpu
preemptible: preemptible
cpuPlatform: cpu_platform
}

output {
Expand Down
Loading