diff --git a/beta-pipelines/skylab/m3c/CondensedSnm3C.wdl b/beta-pipelines/skylab/m3c/CondensedSnm3C.wdl
index 232053de2f..5e9ecb80aa 100644
--- a/beta-pipelines/skylab/m3c/CondensedSnm3C.wdl
+++ b/beta-pipelines/skylab/m3c/CondensedSnm3C.wdl
@@ -7,6 +7,10 @@ workflow WDLized_snm3C {
     Array[File] fastq_input_read2
     File random_primer_indexes
     String plate_id
+    # mapping inputs
+    File tarred_index_files
+    File genome_fa
+    File chromosome_sizes
   }
 
   call Demultiplexing {
@@ -22,15 +26,18 @@ workflow WDLized_snm3C {
       tarred_demultiplexed_fastqs = Demultiplexing.tarred_demultiplexed_fastqs
   }
 
-  # call hisat_3n_pair_end_mapping_dna_mode {
-  #   input:
-  #     r1_trimmed = Sort_and_trim_r1_and_r2.r1_trimmed_fq,
-  #     r2_trimmed = Sort_and_trim_r1_and_r2.r2_trimmed_fq
-  # }
-#
+  call Hisat_3n_pair_end_mapping_dna_mode {
+    input:
+      r1_trimmed_tar = Sort_and_trim_r1_and_r2.r1_trimmed_fq_tar,
+      r2_trimmed_tar = Sort_and_trim_r1_and_r2.r2_trimmed_fq_tar,
+      tarred_index_files = tarred_index_files,
+      genome_fa = genome_fa,
+      chromosome_sizes = chromosome_sizes
+  }
+
   # call separate_unmapped_reads {
   #   input:
-  #       hisat3n_bam = hisat_3n_pair_end_mapping_dna_mode.hisat3n_bam
+  #       hisat3n_bam = Hisat_3n_pair_end_mapping_dna_mode.hisat3n_bam
   # }
   #
   # call split_unmapped_reads {
@@ -80,7 +87,7 @@ workflow WDLized_snm3C {
   # call summary {
   #   input:
   #     trimmed_stats = Sort_and_trim_r1_and_r2.trim_stats,
-  #     hisat3n_stats = hisat_3n_pair_end_mapping_dna_mode.hisat3n_stats,
+  #     hisat3n_stats = Hisat_3n_pair_end_mapping_dna_mode.hisat3n_stats,
   #     r1_hisat3n_stats = hisat_single_end_r1_r2_mapping_dna_mode_and_merge_sort_split_reads_by_name.r1_hisat3n_stats,
   #     r2_hisat3n_stats = hisat_single_end_r1_r2_mapping_dna_mode_and_merge_sort_split_reads_by_name.r2_hisat3n_stats,
   #     dedup_stats = dedup_unique_bam_and_index_unique_bam.dedup_stats,
@@ -97,9 +104,11 @@ workflow WDLized_snm3C {
     #File UniqueAlign_cell_parser_picard_dedup = dedup_unique_bam_and_index_unique_bam.dedup_stats
     #File SplitReads_cell_parser_hisat_summary = "?"
     #File hicFiles = call_chromatin_contacts.chromatin_contact_stats
-    File trimmed_stats = Sort_and_trim_r1_and_r2.trim_stats
-    File r1_trimmed_fq = Sort_and_trim_r1_and_r2.r1_trimmed_fq
-    File r2_trimmed_fq = Sort_and_trim_r1_and_r2.r2_trimmed_fq
+    File trimmed_stats = Sort_and_trim_r1_and_r2.trim_stats_tar
+    File r1_trimmed_fq = Sort_and_trim_r1_and_r2.r1_trimmed_fq_tar
+    File r2_trimmed_fq = Sort_and_trim_r1_and_r2.r2_trimmed_fq_tar
+    File hisat3n_stats_tar = Hisat_3n_pair_end_mapping_dna_mode.hisat3n_paired_end_stats_tar
+    File hisat3n_bam_tar = Hisat_3n_pair_end_mapping_dna_mode.hisat3n_paired_end_bam_tar
   }
 }
 
@@ -204,8 +213,8 @@ task Sort_and_trim_r1_and_r2 {
     for file in "${R1_files[@]}"; do
       sample_id=$(basename "$file" "-R1.fq.gz")
       r2_file="${sample_id}-R2.fq.gz"
-      gunzip -c "$file" | paste - - - - | sort -k1,1 -t " " | tr "\t" "\n" > "${sample_id}-R1_sorted.fq"
-      gunzip -c "$r2_file" | paste - - - - | sort -k1,1 -t " " | tr "\t" "\n" > "${sample_id}-R2_sorted.fq"
+      zcat "$file" | paste - - - - | sort -k1,1 -t " " | tr "\t" "\n" > "${sample_id}-R1_sorted.fq"
+      zcat "$r2_file" | paste - - - - | sort -k1,1 -t " " | tr "\t" "\n" > "${sample_id}-R2_sorted.fq"
     done
 
 
@@ -245,30 +254,97 @@ task Sort_and_trim_r1_and_r2 {
     memory: "${mem_size} GiB"
   }
   output {
-    File r1_trimmed_fq = "R1_trimmed_files.tar.gz"
-    File r2_trimmed_fq = "R2_trimmed_files.tar.gz"
-    File trim_stats = "trimmed_stats_files.tar.gz"
+    File r1_trimmed_fq_tar = "R1_trimmed_files.tar.gz"
+    File r2_trimmed_fq_tar = "R2_trimmed_files.tar.gz"
+    File trim_stats_tar = "trimmed_stats_files.tar.gz"
   }
 }
 
-#task hisat_3n_pair_end_mapping_dna_mode{
-#    input {
-#        File r1_trimmed
-#        File r2_trimmed
-#    }
-#    command <<<
-#    >>>
-#    runtime {
-#        docker: "fill_in"
-#        disks: "local-disk ${disk_size} HDD"
-#        cpu: 1
-#        memory: "${mem_size} GiB"
-#    }
-#    output {
-#        File hisat3n_bam = ""
-#        File hisat3n_stats = ""
-#    }
-#}
+# Aligns each cell's trimmed paired-end reads with hisat-3n (C,T base change, reverse
+# directional mapping) and converts the alignments to unsorted BAM with samtools.
+# Inputs are the per-cell trimmed fastq tarballs from Sort_and_trim_r1_and_r2, a tarball of
+# hisat-3n index files (index prefix must be "hg38"), the genome fasta and chromosome sizes.
+# Outputs are one tarball of per-cell BAMs and one tarball of per-cell hisat-3n summaries.
+task Hisat_3n_pair_end_mapping_dna_mode{
+  input {
+    File r1_trimmed_tar
+    File r2_trimmed_tar
+    File tarred_index_files
+    File genome_fa
+    File chromosome_sizes
+
+    String docker = "us.gcr.io/broad-gotc-prod/m3c-yap-hisat:1.0.0-2.2.1"
+    Int disk_size = 100
+    Int mem_size = 100
+    # vCPUs requested from the backend; the same value is passed to hisat-3n --threads
+    Int cpu = 11
+  }
+  command <<<
+    set -euo pipefail
+
+    # absolute paths so that nothing depends on the backend's execution root
+    work_dir=$(pwd)
+    reference_dir="${work_dir}/reference"
+    fastq_dir="${work_dir}/fastq"
+    mkdir "${reference_dir}" "${fastq_dir}"
+
+    # stage the genome fasta and chromosome sizes next to the index files
+    cp "~{genome_fa}" "~{chromosome_sizes}" "${reference_dir}/"
+
+    # unpack the index straight into reference/; no tarball copy is left on disk
+    echo "Untarring the index files"
+    tar -zxvf "~{tarred_index_files}" -C "${reference_dir}"
+
+    # index the fasta that was actually supplied instead of assuming hg38.fa
+    samtools faidx "${reference_dir}/$(basename "~{genome_fa}")"
+
+    # unpack the trimmed fastq files straight into fastq/
+    echo "Untarring the fastq files"
+    tar -zxvf "~{r1_trimmed_tar}" -C "${fastq_dir}"
+    tar -zxvf "~{r2_trimmed_tar}" -C "${fastq_dir}"
+    cd "${fastq_dir}"
+
+    # collect the R1 files with a glob and fail loudly if the tarball held none
+    shopt -s nullglob
+    R1_files=(*-R1_trimmed.fq.gz)
+    shopt -u nullglob
+    if [ "${#R1_files[@]}" -eq 0 ]; then
+      echo "No *-R1_trimmed.fq.gz files found after untarring ~{r1_trimmed_tar}" >&2
+      exit 1
+    fi
+
+    for file in "${R1_files[@]}"; do
+      sample_id=$(basename "$file" "-R1_trimmed.fq.gz")
+      hisat-3n "${reference_dir}/hg38" \
+        -q \
+        -1 "${sample_id}-R1_trimmed.fq.gz" \
+        -2 "${sample_id}-R2_trimmed.fq.gz" \
+        --directional-mapping-reverse \
+        --base-change C,T \
+        --no-repeat-index \
+        --no-spliced-alignment \
+        --no-temp-splicesite \
+        -t \
+        --new-summary \
+        --summary-file "${sample_id}.hisat3n_dna_summary.txt" \
+        --threads ~{cpu} | samtools view -b -q 0 -o "${sample_id}.hisat3n_dna.unsort.bam"
+    done
+
+    # tar up the bam files and stats files into the execution directory
+    tar -zcvf "${work_dir}/hisat3n_paired_end_bam_files.tar.gz" *.bam
+    tar -zcvf "${work_dir}/hisat3n_paired_end_stats_files.tar.gz" *.hisat3n_dna_summary.txt
+  >>>
+  runtime {
+    docker: docker
+    disks: "local-disk ${disk_size} HDD"
+    cpu: cpu
+    memory: "${mem_size} GiB"
+  }
+  output {
+    File hisat3n_paired_end_bam_tar = "hisat3n_paired_end_bam_files.tar.gz"
+    File hisat3n_paired_end_stats_tar = "hisat3n_paired_end_stats_files.tar.gz"
+  }
+}
 
 #task separate_unmapped_reads {
 #    input {