From 757960a290b8eece8b5ffd5b0daed5d27f498059 Mon Sep 17 00:00:00 2001 From: Jason Smith Date: Tue, 19 Mar 2019 09:25:17 -0400 Subject: [PATCH 01/15] add anno and tutorial files --- .gitignore | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/.gitignore b/.gitignore index 49515244..920f1f2c 100644 --- a/.gitignore +++ b/.gitignore @@ -19,3 +19,13 @@ _site/ /_site/ .sass-cache/ .jekyll-metadata + +# Annotation files +anno/hg38_annotations.bed.gz +anno/hg19_annotations.bed.gz +anno/mm10_annotations.bed.gz +anno/mm9_annotations.bed.gz + +# Tutorial files +examples/data/tutorial_r1.fastq.gz +examples/data/tutorial_r2.fastq.gz From 7b0992d92dcdaad53177da4b9ddec273295780e2 Mon Sep 17 00:00:00 2001 From: Jason Smith Date: Thu, 21 Mar 2019 09:58:22 -0400 Subject: [PATCH 02/15] check tools are callable before continuing --- pipelines/pepatac.py | 74 +++++++++++++++++++++++++++++--------------- 1 file changed, 49 insertions(+), 25 deletions(-) diff --git a/pipelines/pepatac.py b/pipelines/pepatac.py index e66a1eca..6710e1b2 100755 --- a/pipelines/pepatac.py +++ b/pipelines/pepatac.py @@ -39,26 +39,6 @@ def parse_arguments(): parser.add_argument("-gs", "--genome-size", default="hs", type=str, help="genome size for MACS2") - parser.add_argument("--frip-ref-peaks", default=None, - dest="frip_ref_peaks", type=str, - help="Reference peak set for calculating FRiP") - - parser.add_argument("--TSS-name", default=None, - dest="TSS_name", type=str, - help="Name of TSS annotation file") - - parser.add_argument("--anno-name", default=None, - dest="anno_name", type=str, - help="Name of reference bed file for calculating FRiF") - - parser.add_argument("--keep", action='store_true', - dest="keep", - help="Keep prealignment BAM files") - - parser.add_argument("--noFIFO", action='store_true', - dest="no_fifo", - help="Do NOT use named pipes during prealignments") - parser.add_argument("--peak-caller", dest="peak_caller", default="macs2", choices=PEAK_CALLERS, help="Name of peak caller") @@ -75,6 +55,30 @@ def parse_arguments(): help="Space-delimited list of reference genomes to " "align to before primary alignment.") + parser.add_argument("--TSS-name", default=None, + dest="TSS_name", type=str, + help="Name of TSS annotation file") + + parser.add_argument("--blacklist", default=None, + dest="blacklist", type=str, + help="Name of peak blacklist file") + + parser.add_argument("--frip-ref-peaks", default=None, + dest="frip_ref_peaks", type=str, + help="Reference peak set for calculating FRiP") + + parser.add_argument("--anno-name", default=None, + dest="anno_name", type=str, + help="Reference bed file for calculating FRiF") + + parser.add_argument("--keep", action='store_true', + dest="keep", + help="Keep prealignment BAM files") + + parser.add_argument("--noFIFO", action='store_true', + dest="no_fifo", + help="Do NOT use named pipes during prealignments") + parser.add_argument("--lite", dest="lite", action='store_true', help="Only keep minimal, essential output to conserve " "disk space.") @@ -479,6 +483,19 @@ def main(): param = pm.config.parameters res = pm.config.resources + # Check that the required tools are callable by the pipeline + is_callable = True + missing_tools = [] + for t, tool in tools.__dict__.items(): + if type(tool) != bool: + if not ngstk.check_command(tool): + missing_tools.append(tool) + is_callable = False + if not is_callable: + err_msg = ("PEPATAC could NOT find these required tools: {}\n" + "Confirm they are installed and in your PATH.") + pm.fail_pipeline(RuntimeError(err_msg.format(' '.join(missing_tools)))) + # Set up reference resource according to genome prefix. gfolder = os.path.join(res.genomes, args.genome_assembly) res.chrom_sizes = os.path.join( @@ -488,8 +505,12 @@ def main(): res.TSS_file = os.path.join(gfolder, args.TSS_name) else: res.TSS_file = os.path.join(gfolder, args.genome_assembly + "_TSS.tsv") - res.blacklist = os.path.join( - gfolder, args.genome_assembly + ".blacklist.bed") + + if args.blacklist: + res.blacklist = os.path.join(gfolder, args.blacklist) + else: + res.blacklist = os.path.join( + gfolder, args.genome_assembly + ".blacklist.bed") # Get bowtie2 indexes res.bt2_genome = _get_bowtie2_index(res.genomes, args.genome_assembly) @@ -1267,7 +1288,7 @@ def report_peak_count(): "_annotations.bed")) if not os.path.exists(anno_file) and not os.path.exists(anno_unzip): - print("Skipping read and peak annotation") + print("Skipping read annotation") print("This requires a {} annotation file." .format(args.genome_assembly)) print("Confirm this file is present in {} or specify using `--anno-name`" @@ -1397,8 +1418,11 @@ def report_peak_count(): anchor_image=tssPNG) pm.report_object("Peak partition distribution", gpPDF, anchor_image=gpPNG) - #else: - # print("Could not find {}".format(anno_local)) + else: + print("Cannot annotate peaks without a {} annotation file" + .format(args.genome_assembly)) + print("Confirm this file is present in {} or specify using `--anno-name`" + .format(str(os.path.dirname(anno_file)))) if args.lite: # Remove everything but ultimate outputs From 5a66c2f503424d1ccd00bb5b739007f7c4ea4ad9 Mon Sep 17 00:00:00 2001 From: nsheff Date: Thu, 21 Mar 2019 17:04:14 -0400 Subject: [PATCH 03/15] add multiple targets for fifo process --- pipelines/pepatac.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipelines/pepatac.py b/pipelines/pepatac.py index e66a1eca..4d052bad 100755 --- a/pipelines/pepatac.py +++ b/pipelines/pepatac.py @@ -259,9 +259,9 @@ def _align_with_bt2(args, tools, paired, useFIFO, unmap_fq1, unmap_fq2, else: if useFIFO and paired: pm.wait = False - pm.run(cmd1, out_fastq_r2, container=pm.container) + pm.run(cmd1, [summary_file, out_fastq_r2], container=pm.container) pm.wait = True - pm.run(cmd2, summary_file, container=pm.container) + pm.run(cmd2, [summary_file, out_fastq_r2], container=pm.container) else: # TODO: switch to this once filter_paired_fq works with SE #pm.run(cmd2, summary_file, container=pm.container) From adf82d1caf56cbf53f532bcd48e8450e2b08e0f5 Mon Sep 17 00:00:00 2001 From: nsheff Date: Fri, 22 Mar 2019 12:41:59 -0400 Subject: [PATCH 04/15] Add check for fifo using exists --- pipelines/pepatac.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pipelines/pepatac.py b/pipelines/pepatac.py index 4d052bad..86250771 100755 --- a/pipelines/pepatac.py +++ b/pipelines/pepatac.py @@ -10,8 +10,8 @@ from argparse import ArgumentParser import os -import sys import re +import sys import tempfile import tarfile import pypiper @@ -208,7 +208,8 @@ def _align_with_bt2(args, tools, paired, useFIFO, unmap_fq1, unmap_fq2, out_fastq_tmp = os.path.join(sub_outdir, assembly_identifier + "_bt2_2") cmd = "mkfifo " + out_fastq_tmp - pm.run(cmd, out_fastq_tmp, container=pm.container) + if not os.path.exists(out_fastq_tmp): + pm.run(cmd, out_fastq_tmp, container=pm.container) elif useFIFO and not paired: out_fastq_tmp = os.path.join(sub_outdir, assembly_identifier + "_bt2") @@ -216,7 +217,8 @@ def _align_with_bt2(args, tools, paired, useFIFO, unmap_fq1, unmap_fq2, out_fastq_tmp = os.path.join(sub_outdir, assembly_identifier + "_bt2_2") cmd = "mkfifo " + out_fastq_tmp - pm.run(cmd, out_fastq_tmp, container=pm.container) + if not os.path.exists(out_fastq_tmp): + pm.run(cmd, out_fastq_tmp, container=pm.container) else: out_fastq_tmp = out_fastq_pre + '_unmap.fq' From 43de0f286b9b70d52e1db2a6029c5f707ad5554d Mon Sep 17 00:00:00 2001 From: nsheff Date: Fri, 22 Mar 2019 13:42:27 -0400 Subject: [PATCH 05/15] use ziptool. Fix #88 --- pipelines/pepatac.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/pepatac.py b/pipelines/pepatac.py index 86250771..584118f6 100755 --- a/pipelines/pepatac.py +++ b/pipelines/pepatac.py @@ -770,7 +770,7 @@ def check_alignment_genome(): for unmapped_fq in to_compress: # Compress unmapped fastq reads if not pypiper.is_gzipped_fastq(unmapped_fq): - cmd = ("gzip " + unmapped_fq) + cmd = (ngstk.ziptool + " " + unmapped_fq) unmapped_fq = unmapped_fq + ".gz" pm.run(cmd, unmapped_fq, container=pm.container) From 80bb222bcb90b6e18e87cf5c9131bf837060d2c1 Mon Sep 17 00:00:00 2001 From: Jason Smith Date: Fri, 22 Mar 2019 15:47:33 -0400 Subject: [PATCH 06/15] check commands are callable --- pipelines/pepatac.py | 181 +++++++++++++++++++++++-------------------- 1 file changed, 97 insertions(+), 84 deletions(-) diff --git a/pipelines/pepatac.py b/pipelines/pepatac.py index 6710e1b2..513742c5 100755 --- a/pipelines/pepatac.py +++ b/pipelines/pepatac.py @@ -36,13 +36,13 @@ def parse_arguments(): required=["input", "genome", "sample-name", "output-parent"]) # Pipeline-specific arguments - parser.add_argument("-gs", "--genome-size", default="hs", type=str, - help="genome size for MACS2") - parser.add_argument("--peak-caller", dest="peak_caller", default="macs2", choices=PEAK_CALLERS, help="Name of peak caller") + parser.add_argument("-gs", "--genome-size", default="hs", type=str, + help="MACS2 genome size") + parser.add_argument("--trimmer", dest="trimmer", default="skewer", choices=TRIMMERS, help="Name of read trimming program") @@ -183,46 +183,37 @@ def _align_with_bt2(args, tools, paired, useFIFO, unmap_fq1, unmap_fq2, pm.clean_add(tempdir) # Build bowtie2 command - if args.keep: - cmd = "(" + tools.bowtie2 + " -p " + str(pm.cores) - cmd += bt2_opts_txt - cmd += " -x " + assembly_bt2 - cmd += " --rg-id " + args.sample_name + if args.keep or not useFIFO: + cmd1 = "(" + tools.bowtie2 + " -p " + str(pm.cores) + cmd1 += bt2_opts_txt + cmd1 += " -x " + assembly_bt2 + cmd1 += " --rg-id " + args.sample_name if paired: - cmd += " -1 " + unmap_fq1 + " -2 " + unmap_fq2 - cmd += " --un-conc-gz " + out_fastq_bt2 + cmd1 += " -1 " + unmap_fq1 + " -2 " + unmap_fq2 + #cmd1 += " --un-conc-gz " + out_fastq_bt2 else: - cmd += " -U " + unmap_fq1 - cmd += " --un-gz " + out_fastq_bt2 - cmd += " | " + tools.samtools + " view -bS - -@ 1" # convert to bam - cmd += " | " + tools.samtools + " sort - -@ 1" # sort output - cmd += " -T " + tempdir - cmd += " -o " + mapped_bam - cmd += ") 2>" + summary_file + cmd1 += " -U " + unmap_fq1 + #cmd1 += " --un-gz " + out_fastq_bt2 + cmd1 += " | " + tools.samtools + " view -bS - -@ 1" # convert to bam + cmd1 += " | " + tools.samtools + " sort - -@ 1" # sort output + cmd1 += " -T " + tempdir + cmd1 += " -o " + mapped_bam + cmd1 += ") 2>" + summary_file + cmd2 = (tools.samtools + " view -f 4 -@ " + str(pm.cores) + + " " + mapped_bam) # In this samtools sort command we print to stdout and then use > to # redirect instead of `+ " -o " + mapped_bam` because then samtools # uses a random temp file, so it won't choke if the job gets # interrupted and restarted at this step. else: - if useFIFO and paired: - out_fastq_tmp = os.path.join(sub_outdir, - assembly_identifier + "_bt2") - if os.path.isfile(out_fastq_tmp): - out_fastq_tmp = os.path.join(sub_outdir, - assembly_identifier + "_bt2_2") - cmd = "mkfifo " + out_fastq_tmp - pm.run(cmd, out_fastq_tmp, container=pm.container) - elif useFIFO and not paired: + out_fastq_tmp = os.path.join(sub_outdir, + assembly_identifier + "_bt2") + if os.path.isfile(out_fastq_tmp): out_fastq_tmp = os.path.join(sub_outdir, - assembly_identifier + "_bt2") - if os.path.isfile(out_fastq_tmp): - out_fastq_tmp = os.path.join(sub_outdir, - assembly_identifier + "_bt2_2") - cmd = "mkfifo " + out_fastq_tmp - pm.run(cmd, out_fastq_tmp, container=pm.container) - else: - out_fastq_tmp = out_fastq_pre + '_unmap.fq' + assembly_identifier + "_bt2_2") + cmd = "mkfifo " + out_fastq_tmp + pm.run(cmd, out_fastq_tmp, container=pm.container) out_fastq_r1 = out_fastq_pre + '_unmap_R1.fq' out_fastq_r2 = out_fastq_pre + '_unmap_R2.fq' @@ -258,10 +249,10 @@ def _align_with_bt2(args, tools, paired, useFIFO, unmap_fq1, unmap_fq2, cmd1 += ") 2>" + summary_file cmd2 = "" - if args.keep: - pm.run(cmd, mapped_bam, container=pm.container) + if args.keep or not useFIFO: + pm.run([cmd1, cmd2], mapped_bam, container=pm.container) else: - if useFIFO and paired: + if paired: pm.wait = False pm.run(cmd1, out_fastq_r2, container=pm.container) pm.wait = True @@ -274,14 +265,21 @@ def _align_with_bt2(args, tools, paired, useFIFO, unmap_fq1, unmap_fq2, pm.clean_add(out_fastq_tmp) - # get concordant aligned read pairs - if args.keep and paired: - cmd = ("grep 'aligned concordantly exactly 1 time' " + + # get aligned read counts + cmd1 = ("grep 'aligned concordantly exactly 1 time' " + summary_file + " | awk '{print $1}'") - else: - cmd = ("grep 'aligned exactly 1 time' " + + cmd2 = ("grep 'aligned exactly 1 time' " + summary_file + " | awk '{print $1}'") - concordant = pm.checkprint(cmd) + if args.keep or not useFIFO and paired: + c1 = pm.checkprint(cmd1) + c2 = pm.checkprint(cmd2) + if c1 and c2: + concordant = float(c1) + float(c2) + else: + concordant = float(c1) + else: + concordant = pm.checkprint(cmd2) + if concordant: ar = float(concordant)*2 else: @@ -303,6 +301,9 @@ def _align_with_bt2(args, tools, paired, useFIFO, unmap_fq1, unmap_fq2, if args.keep and paired: unmap_fq1 = out_fastq_pre + "_unmap_R1.fq.gz" unmap_fq2 = out_fastq_pre + "_unmap_R2.fq.gz" + elif not args.keep and not useFIFO and paired : + unmap_fq1 = out_fastq_pre + "_unmap_R1.fq.gz" + unmap_fq2 = out_fastq_pre + "_unmap_R2.fq.gz" elif not args.keep and paired: unmap_fq1 = out_fastq_r1 unmap_fq2 = out_fastq_r2 @@ -459,6 +460,39 @@ def anno_path(anno_name): ANNO_FOLDER, anno_name) +def check_commands(commands, ignore): + """ + Check if command(s) can be called + + :param attributedict commands: dictionary of commands to check + :param list ignore: list of commands that are optional and can be ignored + """ + + # Use `command` to see if command is callable, store exit code + is_callable = True + uncallable = [] + for name, command in commands.items(): + if command not in ignore: + # if a command is a java file, modify the command + if '.jar' in command: + command = "java -jar " + command + # if an environment variable is not expanded it means it points to + # an uncallable command + if '$' in command: + uncallable.append(command) + + code = os.system("command -v {0} >/dev/null 2>&1 || {{ exit 1; }}".format(command)) + # If exit code is not 0, track which command failed + if code != 0: + uncallable.append(command) + is_callable = False + if is_callable: + return True + else: + print("The following required tool(s) are not callable: {0}".format(' '.join(uncallable))) + return False + + def main(): """ Main pipeline process. @@ -484,17 +518,9 @@ def main(): res = pm.config.resources # Check that the required tools are callable by the pipeline - is_callable = True - missing_tools = [] - for t, tool in tools.__dict__.items(): - if type(tool) != bool: - if not ngstk.check_command(tool): - missing_tools.append(tool) - is_callable = False - if not is_callable: - err_msg = ("PEPATAC could NOT find these required tools: {}\n" - "Confirm they are installed and in your PATH.") - pm.fail_pipeline(RuntimeError(err_msg.format(' '.join(missing_tools)))) + if not check_commands(tools, ["fseq", "${TRIMMOMATIC}", "${PICARD}", "Rscript"]): + err_msg = "Please install missing tools before continuing." + pm.fail_pipeline(RuntimeError(err_msg)) # Set up reference resource according to genome prefix. gfolder = os.path.join(res.genomes, args.genome_assembly) @@ -759,9 +785,9 @@ def main(): # Split genome mapping result bamfile into two: high-quality aligned # reads (keepers) and unmapped reads (in case we want to analyze the # altogether unmapped reads) - # -q 10: skip alignments with MAPQ less than 10 - cmd2 = (tools.samtools + " view -q 10 -b -@ " + str(pm.cores) + - " -U " + failQC_genome_bam + " ") + # Default (samtools.params): skip alignments with MAPQ less than 10 (-q 10) + cmd2 = (tools.samtools + " view -b " + param.samtools.params + " -@ " + + str(pm.cores) + " -U " + failQC_genome_bam + " ") if args.paired_end: # add a step to accept only reads mapped in proper pair cmd2 += "-f 2 " @@ -789,7 +815,7 @@ def check_alignment_genome(): for unmapped_fq in to_compress: # Compress unmapped fastq reads if not pypiper.is_gzipped_fastq(unmapped_fq): - cmd = ("gzip " + unmapped_fq) + cmd = (ngstk.ziptool + unmapped_fq) unmapped_fq = unmapped_fq + ".gz" pm.run(cmd, unmapped_fq, container=pm.container) @@ -802,7 +828,7 @@ def check_alignment_genome(): pm.clean_add(temp_mapping_index) # Determine mitochondrial read counts - mito_name = ["chrM", "chrMT", "M", "MT"] + mito_name = ["chrM", "chrMT", "M", "MT", "rCRSd"] # If first run, use the temp bam file if os.path.isfile(mapping_genome_bam_temp) and os.stat(mapping_genome_bam_temp).st_size > 0: @@ -1150,22 +1176,11 @@ def report_peak_count(): pm.stop_pipeline() else: if args.peak_caller == "fseq": - fseq_cmd_chunks = [tools.fseq, ("-o", peak_folder)] - - # Parse only a subset of fseq options. - for fseq_opt in ["of", "l", "t", "s"]: - fseq_value = param.fseq[fseq_opt] - # TODO: use more natural try/except once PipelineManager parameters - # AD is strict. - if fseq_value == fseq_opt: - # Non-strict pipeline parameters AttributeDict returns key - # itself if missing. - continue - # We're building a command, so even non-text values need no special - # handling. - fseq_optval = ("-{}".format(fseq_opt), fseq_value) - fseq_cmd_chunks.append(fseq_optval) - + fseq_cmd_chunks = [ + tools.fseq, + ("-o", peak_folder), + param.fseq.params + ] # Create the peak calling command fseq_cmd_chunks.append(peak_input_file) fseq_cmd = build_command(fseq_cmd_chunks) @@ -1186,13 +1201,10 @@ def report_peak_count(): macs_cmd_chunks = [ "{} callpeak".format(tools.macs2), ("-t", peak_input_file), - "-f BED", - ("-g", args.genome_size), ("--outdir", peak_folder), ("-n", args.sample_name), - ("-q", param.macs2.q), - ("--shift", param.macs2.shift), - "--nomodel" + ("-g", args.genome_size), + param.macs2.params ] # Note: required input file is non-positional ("treatment" file -t) cmd = build_command(macs_cmd_chunks) @@ -1304,13 +1316,14 @@ def report_peak_count(): if os.path.isfile(anno_local): # Get list of features - cmd1 = ("gunzip -c " + anno_local + " | cut -f 4 | sort -u") + cmd1 = (ngstk.ziptool + " -d -c " + anno_local + + " | cut -f 4 | sort -u") ftList = pm.checkprint(cmd1) ftList = str.splitlines(ftList) # Split annotation file on features - cmd2 = ("gunzip -c " + anno_local + " | awk -F'\t' '{print>\"" + - QC_folder + "/\"$4}'") + cmd2 = (ngstk.ziptool + " -d -c " + anno_local + + " | awk -F'\t' '{print>\"" + QC_folder + "/\"$4}'") if len(ftList) >= 1: for pos, anno in enumerate(ftList): annoFile = os.path.join(QC_folder, anno) @@ -1347,7 +1360,7 @@ def report_peak_count(): pm.timestamp("### Plot FRiP/F") cmd = (tools.samtools + " view -@ " + str(pm.cores) + - " -q 15 -c -F4 " + rmdup_bam) + param.samtools.params + " -c -F4 " + rmdup_bam) totalReads = pm.checkprint(cmd) totalReads = str(totalReads).rstrip() From a4bf65d813ee5bd5d4cf331e0225ec28d198240d Mon Sep 17 00:00:00 2001 From: Jason Smith Date: Fri, 22 Mar 2019 15:48:00 -0400 Subject: [PATCH 07/15] adjust how tool parameters are adjusted and called --- pipelines/pepatac.yaml | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/pipelines/pepatac.yaml b/pipelines/pepatac.yaml index 155d4ea6..71981633 100644 --- a/pipelines/pepatac.yaml +++ b/pipelines/pepatac.yaml @@ -17,7 +17,6 @@ tools: # absolute paths to required tools bedGraphToBigWig: bedGraphToBigWig wigToBigWig: wigToBigWig bigWigCat: bigWigCat - bedSort: bedSort bedToBigBed: bedToBigBed # optional tools fseq: fseq @@ -31,15 +30,20 @@ resources: adapters: null # Set to null to use default adapter file included in repository parameters: # parameters passed to bioinformatic tools, subclassed by tool +# Adjust/Add/Remove parameters for individual tools here samtools: - q: 10 + params: '-q 10' + # -q: skip alignments with MAPQ < 10. macs2: - f: BED - q: 0.01 - shift: 0 + params: '-f BED -q 0.01 --shift 0 --nomodel' + # -f: Format of tag file. + # -q: The qvalue (minimum FDR) cutoff to call significant regions. + # --shift: Assign an arbitrary shift in bp. See MACS documentation. + # --nomodel: Will bybass building the shifting model. fseq: - of: npf # narrowPeak as output format - l: 600 # feature length - t: 4.0 # "threshold" (standard deviations) - s: 1 # wiggle track step + params: '-of npf -l 600 -t 4.0 -s 1' + # -of: narrowPeak as output format. + # -l: feature length. + # -t: "threshold" (standard deviations). + # -s: wiggle track step. \ No newline at end of file From 2d6e6e17d3db657a7406347496550f389d02c36d Mon Sep 17 00:00:00 2001 From: nsheff Date: Fri, 22 Mar 2019 16:43:32 -0400 Subject: [PATCH 08/15] pipeline is now python3 compatible --- pipelines/pepatac.py | 8 ++++++-- tools/bamSitesToWig.py | 8 ++++---- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/pipelines/pepatac.py b/pipelines/pepatac.py index 584118f6..ac95b76a 100755 --- a/pipelines/pepatac.py +++ b/pipelines/pepatac.py @@ -118,6 +118,10 @@ def calc_frip(bamfile, peakfile, frip_func, pipeline_manager, num_peak_reads = pipeline_manager.checkprint(frip_cmd) num_aligned_reads = pipeline_manager.get_stat(aligned_reads_key) print(num_aligned_reads, num_peak_reads) + # python3 requires we be extra careful, and the above commands are returning + # some garbage. + num_peak_reads = re.sub("[^0-9]", "", num_peak_reads.decode('utf-8')) + num_aligned_reads = re.sub("[^0-9]", "", num_aligned_reads) return float(num_peak_reads) / float(num_aligned_reads) @@ -1027,7 +1031,7 @@ def post_dup_aligned_reads(dedup_log): cmd += " -o " + exact_target cmd += " -w " + smooth_target cmd += " -m " + "atac" - cmd += " -p " + str(max(1, int(pm.cores) * 2/3)) + cmd += " -p " + str(int(max(1, int(pm.cores) * 2/3))) cmd2 = "touch " + temp_target pm.run([cmd, cmd2], temp_target, container=pm.container) pm.clean_add(temp_target) @@ -1063,7 +1067,7 @@ def post_dup_aligned_reads(dedup_log): # include in summary stats. This could be done in prettier ways which # I'm open to. Just adding for the idea. with open(Tss_enrich) as f: - floats = map(float, f) + floats = list(map(float, f)) try: # If the TSS enrichment is 0, don't report Tss_score = ((sum(floats[1950:2050]) / 100) / diff --git a/tools/bamSitesToWig.py b/tools/bamSitesToWig.py index 104c733c..8c49760e 100755 --- a/tools/bamSitesToWig.py +++ b/tools/bamSitesToWig.py @@ -199,7 +199,7 @@ def get_shifted_pos(read, shift_factor): header_line = ("fixedStep chrom=" + chrom + " start=" + str(begin) + " step=1\n") - cutsToWigProcess.stdin.write(header_line) + cutsToWigProcess.stdin.write(header_line.encode('utf-8')) if self.smoothbw: if self.variable_step: @@ -209,17 +209,17 @@ def get_shifted_pos(read, shift_factor): str(begin) + " step=" + str(self.step_size) + "\n") - cutsToWigProcessSm.stdin.write(header_line) + cutsToWigProcessSm.stdin.write(header_line.encode('utf-8')) try: for read in reads: shifted_pos = get_shifted_pos(read, shift_factor) if self.exactbw: - cutsToWigProcess.stdin.write(str(shifted_pos) + "\n") + cutsToWigProcess.stdin.write((str(shifted_pos) + "\n").encode('utf-8')) if self.smoothbw: - cutsToWigProcessSm.stdin.write(str(shifted_pos) + "\n") + cutsToWigProcessSm.stdin.write((str(shifted_pos) + "\n").encode('utf-8')) if self.bedout: strand = "-" if read.is_reverse else "+" From ee090127c08fd3adc7a929156c4074d5b391934f Mon Sep 17 00:00:00 2001 From: Jason Smith Date: Fri, 22 Mar 2019 16:54:48 -0400 Subject: [PATCH 09/15] modify prealignments --- pipelines/pepatac.py | 168 ++++++++++++++++--------------------------- 1 file changed, 60 insertions(+), 108 deletions(-) diff --git a/pipelines/pepatac.py b/pipelines/pepatac.py index 73948cef..75b2d899 100755 --- a/pipelines/pepatac.py +++ b/pipelines/pepatac.py @@ -182,117 +182,78 @@ def _align_with_bt2(args, tools, paired, useFIFO, unmap_fq1, unmap_fq2, tempdir = tempfile.mkdtemp(dir=sub_outdir) pm.clean_add(tempdir) + out_fastq_r1 = out_fastq_pre + '_unmap_R1.fq' + out_fastq_r2 = out_fastq_pre + '_unmap_R2.fq' + + if useFIFO: + out_fastq_tmp = os.path.join(sub_outdir, + assembly_identifier + "_bt2") + cmd = "mkfifo " + out_fastq_tmp + if not os.path.exists(out_fastq_tmp): + pm.run(cmd, out_fastq_tmp, container=pm.container) + else: + out_fastq_tmp = out_fastq_pre + '_unmap.fq' + + filter_pair = build_command([tools.perl, + tool_path("filter_paired_fq.pl"), out_fastq_tmp, + unmap_fq1, unmap_fq2, out_fastq_r1, out_fastq_r2]) + # TODO: make filter_paired_fq work with SE data + # cmd = build_command([tools.perl, + # tool_path("filter_paired_fq.pl"), out_fastq_tmp, + # unmap_fq1, out_fastq_r1]) + # For now, revert to old method + # Build bowtie2 command - if args.keep or not useFIFO: - cmd1 = "(" + tools.bowtie2 + " -p " + str(pm.cores) - cmd1 += bt2_opts_txt - cmd1 += " -x " + assembly_bt2 - cmd1 += " --rg-id " + args.sample_name - if paired: - cmd1 += " -1 " + unmap_fq1 + " -2 " + unmap_fq2 - cmd1 += " --un-conc-gz " + out_fastq_bt2 - else: - cmd1 += " -U " + unmap_fq1 - cmd1 += " --un-gz " + out_fastq_bt2 - cmd1 += " | " + tools.samtools + " view -bS - -@ 1" # convert to bam - cmd1 += " | " + tools.samtools + " sort - -@ 1" # sort output - cmd1 += " -T " + tempdir - cmd1 += " -o " + mapped_bam - cmd1 += ") 2>" + summary_file - - cmd2 = (tools.samtools + " view -f 4 -@ " + str(pm.cores) + - " " + mapped_bam) + if args.keep or not paired: + cmd = "(" + tools.bowtie2 + " -p " + str(pm.cores) + cmd += bt2_opts_txt + cmd += " -x " + assembly_bt2 + cmd += " --rg-id " + args.sample_name + cmd += " -U " + unmap_fq1 + cmd += " --un-gz " + out_fastq_bt2 + cmd += " | " + tools.samtools + " view -bS - -@ 1" # convert to bam + cmd += " | " + tools.samtools + " sort - -@ 1" # sort output + cmd += " -T " + tempdir + cmd += " -o " + mapped_bam + cmd += ") 2>" + summary_file # In this samtools sort command we print to stdout and then use > to # redirect instead of `+ " -o " + mapped_bam` because then samtools # uses a random temp file, so it won't choke if the job gets # interrupted and restarted at this step. - else: - if useFIFO and paired: - out_fastq_tmp = os.path.join(sub_outdir, - assembly_identifier + "_bt2") - if os.path.isfile(out_fastq_tmp): - out_fastq_tmp = os.path.join(sub_outdir, - assembly_identifier + "_bt2_2") - cmd = "mkfifo " + out_fastq_tmp - if not os.path.exists(out_fastq_tmp): - pm.run(cmd, out_fastq_tmp, container=pm.container) - elif useFIFO and not paired: - out_fastq_tmp = os.path.join(sub_outdir, - assembly_identifier + "_bt2") - if os.path.isfile(out_fastq_tmp): - out_fastq_tmp = os.path.join(sub_outdir, - assembly_identifier + "_bt2_2") - cmd = "mkfifo " + out_fastq_tmp - if not os.path.exists(out_fastq_tmp): - pm.run(cmd, out_fastq_tmp, container=pm.container) - else: - out_fastq_tmp = out_fastq_pre + '_unmap.fq' - - out_fastq_r1 = out_fastq_pre + '_unmap_R1.fq' - out_fastq_r2 = out_fastq_pre + '_unmap_R2.fq' - - if paired: - cmd1 = build_command([tools.perl, - tool_path("filter_paired_fq.pl"), out_fastq_tmp, - unmap_fq1, unmap_fq2, out_fastq_r1, out_fastq_r2]) - cmd2 = "(" + tools.bowtie2 + " -p " + str(pm.cores) - cmd2 += bt2_opts_txt - cmd2 += " -x " + assembly_bt2 - cmd2 += " --rg-id " + args.sample_name - cmd2 += " -U " + unmap_fq1 - cmd2 += " --un " + out_fastq_tmp - cmd2 += " > /dev/null" - cmd2 += ") 2>" + summary_file - else: - # TODO: make filter_paired_fq work with SE data - # cmd1 = build_command([tools.perl, - # tool_path("filter_paired_fq.pl"), out_fastq_tmp, - # unmap_fq1, out_fastq_r1]) - # For now, revert to old method - cmd1 = "(" + tools.bowtie2 + " -p " + str(pm.cores) - cmd1 += bt2_opts_txt - cmd1 += " -x " + assembly_bt2 - cmd1 += " --rg-id " + args.sample_name - cmd1 += " -U " + unmap_fq1 - cmd1 += " --un-gz " + out_fastq_bt2 - cmd1 += " | " + tools.samtools + " view -bS - -@ 1" # convert to bam - cmd1 += " | " + tools.samtools + " sort - -@ 1" # sort output - cmd1 += " -T " + tempdir - cmd1 += " -o " + mapped_bam - cmd1 += ") 2>" + summary_file - cmd2 = "" - - if args.keep or not useFIFO: - pm.run([cmd1, cmd2], mapped_bam, container=pm.container) + else: + cmd = "(" + tools.bowtie2 + " -p " + str(pm.cores) + cmd += bt2_opts_txt + cmd += " -x " + assembly_bt2 + cmd += " --rg-id " + args.sample_name + cmd += " -U " + unmap_fq1 + cmd += " --un " + out_fastq_tmp + cmd += " > /dev/null" + cmd += ") 2>" + summary_file + + if args.keep: + pm.run([cmd, filter_pair], mapped_bam, container=pm.container) else: if paired: pm.wait = False - pm.run(cmd1, [summary_file, out_fastq_r2], container=pm.container) + pm.run(filter_pair, [summary_file, out_fastq_r2], container=pm.container) pm.wait = True - pm.run(cmd2, [summary_file, out_fastq_r2], container=pm.container) + pm.run(cmd, [summary_file, out_fastq_r2], container=pm.container) else: # TODO: switch to this once filter_paired_fq works with SE #pm.run(cmd2, summary_file, container=pm.container) #pm.run(cmd1, out_fastq_r1, container=pm.container) - pm.run(cmd1, out_fastq_bt2, container=pm.container) + pm.run(cmd, out_fastq_bt2, container=pm.container) pm.clean_add(out_fastq_tmp) # get aligned read counts - cmd1 = ("grep 'aligned concordantly exactly 1 time' " + - summary_file + " | awk '{print $1}'") - cmd2 = ("grep 'aligned exactly 1 time' " + + if args.keep and paired: + cmd = ("grep 'aligned concordantly exactly 1 time' " + summary_file + " | awk '{print $1}'") - if args.keep or not useFIFO and paired: - c1 = pm.checkprint(cmd1) - c2 = pm.checkprint(cmd2) - if c1 and c2: - concordant = float(c1) + float(c2) - else: - concordant = float(c1) else: - concordant = pm.checkprint(cmd2) - + cmd = ("grep 'aligned exactly 1 time' " + + summary_file + " | awk '{print $1}'") + concordant = pm.checkprint(cmd) if concordant: ar = float(concordant)*2 else: @@ -312,17 +273,11 @@ def _align_with_bt2(args, tools, paired, useFIFO, unmap_fq1, unmap_fq2, # filter genome reads not mapped if args.keep and paired: - unmap_fq1 = out_fastq_pre + "_unmap_R1.fq.gz" - unmap_fq2 = out_fastq_pre + "_unmap_R2.fq.gz" - elif not args.keep and not useFIFO and paired : - unmap_fq1 = out_fastq_pre + "_unmap_R1.fq.gz" - unmap_fq2 = out_fastq_pre + "_unmap_R2.fq.gz" - elif not args.keep and paired: unmap_fq1 = out_fastq_r1 unmap_fq2 = out_fastq_r2 - elif args.keep and not paired: - unmap_fq1 = out_fastq_bt2 - unmap_fq2 = "" + elif not args.keep and paired : + unmap_fq1 = out_fastq_pre + "_unmap_R1.fq.gz" + unmap_fq2 = out_fastq_pre + "_unmap_R2.fq.gz" else: # Use alternate once filter_paired_fq is working with SE #unmap_fq1 = out_fastq_r1 @@ -491,11 +446,12 @@ def check_commands(commands, ignore): command = "java -jar " + command # if an environment variable is not expanded it means it points to # an uncallable command - if '$' in command: + if '$' in command: uncallable.append(command) - code = os.system("command -v {0} >/dev/null 2>&1 || {{ exit 1; }}".format(command)) + code = os.system("command -v {0} >/dev/null 2>&1 || {{ exit 1; }}".format(command)) # If exit code is not 0, track which command failed + #print("{} code {}".format(command, code)) # DEBUG if code != 0: uncallable.append(command) is_callable = False @@ -828,11 +784,7 @@ def check_alignment_genome(): for unmapped_fq in to_compress: # Compress unmapped fastq reads if not pypiper.is_gzipped_fastq(unmapped_fq): -<<<<<<< HEAD - cmd = (ngstk.ziptool + unmapped_fq) -======= cmd = (ngstk.ziptool + " " + unmapped_fq) ->>>>>>> 43de0f286b9b70d52e1db2a6029c5f707ad5554d unmapped_fq = unmapped_fq + ".gz" pm.run(cmd, unmapped_fq, container=pm.container) @@ -1079,7 +1031,7 @@ def post_dup_aligned_reads(dedup_log): map_genome_folder, args.sample_name + "_smooth.bw") shift_bed = os.path.join(exact_folder, args.sample_name + "_shift.bed") - wig_cmd_callable = ngstk.check_command("wigToBigWig") + #wig_cmd_callable = ngstk.check_command("wigToBigWig") if wig_cmd_callable: cmd = tool_path("bamSitesToWig.py") From 2a459539907ca09f9f4514bbf44e64110c7853f1 Mon Sep 17 00:00:00 2001 From: Jason Smith Date: Mon, 25 Mar 2019 13:44:49 -0400 Subject: [PATCH 10/15] make order arguments more logical --- pipelines/pepatac.py | 131 ++++++++++++++++++------------------------- 1 file changed, 56 insertions(+), 75 deletions(-) diff --git a/pipelines/pepatac.py b/pipelines/pepatac.py index 62bf9ab9..657e88ee 100755 --- a/pipelines/pepatac.py +++ b/pipelines/pepatac.py @@ -5,7 +5,7 @@ __author__ = ["Jin Xu", "Nathan Sheffield", "Jason Smith"] __email__ = "jasonsmith@virginia.edu" -__version__ = "0.8.5" +__version__ = "0.8.6" from argparse import ArgumentParser @@ -47,14 +47,14 @@ def parse_arguments(): default="skewer", choices=TRIMMERS, help="Name of read trimming program") - parser.add_argument("--deduplicator", dest="deduplicator", - default="samblaster", choices=DEDUPLICATORS, - help="Name of deduplicator program") - parser.add_argument("--prealignments", default=[], type=str, nargs="+", help="Space-delimited list of reference genomes to " "align to before primary alignment.") + parser.add_argument("--deduplicator", dest="deduplicator", + default="samblaster", choices=DEDUPLICATORS, + help="Name of deduplicator program") + parser.add_argument("--TSS-name", default=None, dest="TSS_name", type=str, help="Name of TSS annotation file") @@ -122,10 +122,6 @@ def calc_frip(bamfile, peakfile, frip_func, pipeline_manager, num_peak_reads = pipeline_manager.checkprint(frip_cmd) num_aligned_reads = pipeline_manager.get_stat(aligned_reads_key) print(num_aligned_reads, num_peak_reads) - # python3 requires we be extra careful, and the above commands are returning - # some garbage. - num_peak_reads = re.sub("[^0-9]", "", num_peak_reads.decode('utf-8')) - num_aligned_reads = re.sub("[^0-9]", "", num_aligned_reads) return float(num_peak_reads) / float(num_aligned_reads) @@ -189,7 +185,7 @@ def _align_with_bt2(args, tools, paired, useFIFO, unmap_fq1, unmap_fq2, out_fastq_r1 = out_fastq_pre + '_unmap_R1.fq' out_fastq_r2 = out_fastq_pre + '_unmap_R2.fq' - if useFIFO: + if useFIFO and not args.keep: out_fastq_tmp = os.path.join(sub_outdir, assembly_identifier + "_bt2") cmd = "mkfifo " + out_fastq_tmp @@ -208,62 +204,59 @@ def _align_with_bt2(args, tools, paired, useFIFO, unmap_fq1, unmap_fq2, # For now, revert to old method # Build bowtie2 command - if args.keep or not paired: - cmd = "(" + tools.bowtie2 + " -p " + str(pm.cores) - cmd += bt2_opts_txt - cmd += " -x " + assembly_bt2 - cmd += " --rg-id " + args.sample_name - cmd += " -U " + unmap_fq1 - cmd += " --un-gz " + out_fastq_bt2 + cmd = "(" + tools.bowtie2 + " -p " + str(pm.cores) + cmd += bt2_opts_txt + cmd += " -x " + assembly_bt2 + cmd += " --rg-id " + args.sample_name + cmd += " -U " + unmap_fq1 + cmd += " --un " + out_fastq_tmp + if args.keep: # or not paired + #cmd += " --un-gz " + out_fastq_bt2 # TODO drop this for paired... because repair-ing with filter_paired_fq.pl + # In this samtools sort command we print to stdout and then use > to + # redirect instead of `+ " -o " + mapped_bam` because then samtools + # uses a random temp file, so it won't choke if the job gets + # interrupted and restarted at this step. cmd += " | " + tools.samtools + " view -bS - -@ 1" # convert to bam cmd += " | " + tools.samtools + " sort - -@ 1" # sort output cmd += " -T " + tempdir cmd += " -o " + mapped_bam - cmd += ") 2>" + summary_file - # In this samtools sort command we print to stdout and then use > to - # redirect instead of `+ " -o " + mapped_bam` because then samtools - # uses a random temp file, so it won't choke if the job gets - # interrupted and restarted at this step. - else: - cmd = "(" + tools.bowtie2 + " -p " + str(pm.cores) - cmd += bt2_opts_txt - cmd += " -x " + assembly_bt2 - cmd += " --rg-id " + args.sample_name - cmd += " -U " + unmap_fq1 - cmd += " --un " + out_fastq_tmp + else: cmd += " > /dev/null" - cmd += ") 2>" + summary_file + cmd += ") 2>" + summary_file - if args.keep: - pm.run([cmd, filter_pair], mapped_bam, container=pm.container) - else: - if paired: + if paired: + if args.keep or not useFIFO: + pm.run([cmd, filter_pair], mapped_bam, container=pm.container) + else: pm.wait = False pm.run(filter_pair, [summary_file, out_fastq_r2], container=pm.container) pm.wait = True pm.run(cmd, [summary_file, out_fastq_r2], container=pm.container) + else: + if args.keep: + pm.run(cmd, mapped_bam, container=pm.container) else: # TODO: switch to this once filter_paired_fq works with SE #pm.run(cmd2, summary_file, container=pm.container) #pm.run(cmd1, out_fastq_r1, container=pm.container) pm.run(cmd, out_fastq_bt2, container=pm.container) - pm.clean_add(out_fastq_tmp) - + pm.clean_add(out_fastq_tmp) + # get aligned read counts - if args.keep and paired: - cmd = ("grep 'aligned concordantly exactly 1 time' " + - summary_file + " | awk '{print $1}'") - else: - cmd = ("grep 'aligned exactly 1 time' " + - summary_file + " | awk '{print $1}'") - concordant = pm.checkprint(cmd) - if concordant: - ar = float(concordant)*2 + #if args.keep and paired: + # cmd = ("grep 'aligned concordantly exactly 1 time' " + + # summary_file + " | awk '{print $1}'") + #else: + cmd = ("grep 'aligned exactly 1 time' " + summary_file + + " | awk '{print $1}'") + align_exact = pm.checkprint(cmd) + if align_exact: + ar = float(align_exact)*2 else: ar = 0 - # report concordant aligned reads + # report aligned reads pm.report_result("Aligned_reads_" + assembly_identifier, ar) try: # wrapped in try block in case Trimmed_reads is not reported in this @@ -275,13 +268,9 @@ def _align_with_bt2(args, tools, paired, useFIFO, unmap_fq1, unmap_fq2, res_key = "Alignment_rate_" + assembly_identifier pm.report_result(res_key, round(float(ar) * 100 / float(tr), 2)) - # filter genome reads not mapped - if args.keep and paired: + if paired: unmap_fq1 = out_fastq_r1 unmap_fq2 = out_fastq_r2 - elif not args.keep and paired : - unmap_fq1 = out_fastq_pre + "_unmap_R1.fq.gz" - unmap_fq2 = out_fastq_pre + "_unmap_R2.fq.gz" else: # Use alternate once filter_paired_fq is working with SE #unmap_fq1 = out_fastq_r1 @@ -450,7 +439,7 @@ def check_commands(commands, ignore): command = "java -jar " + command # if an environment variable is not expanded it means it points to # an uncallable command - if '$' in command: + if '$' in command: uncallable.append(command) code = os.system("command -v {0} >/dev/null 2>&1 || {{ exit 1; }}".format(command)) @@ -682,7 +671,6 @@ def main(): trimmed_fastq, args.paired_end, trimmed_fastq_R2, fastqc_folder=os.path.join(param.outfolder, "fastqc")), container=pm.container) - pm.clean_add(os.path.join(fastq_folder, "*.fq"), conditional=True) pm.clean_add(os.path.join(fastq_folder, "*.log"), conditional=True) @@ -810,8 +798,7 @@ def check_alignment_genome(): else: bam_file = mapping_genome_bam - cmd = (tools.samtools + " idxstats " + mapping_genome_bam_temp + - " | grep") + cmd = (tools.samtools + " idxstats " + bam_file + " | grep") for name in mito_name: cmd += " -we '" + name + "'" cmd += "| cut -f 3" @@ -1035,24 +1022,18 @@ def post_dup_aligned_reads(dedup_log): map_genome_folder, args.sample_name + "_smooth.bw") shift_bed = os.path.join(exact_folder, args.sample_name + "_shift.bed") - #wig_cmd_callable = ngstk.check_command("wigToBigWig") - - if wig_cmd_callable: - cmd = tool_path("bamSitesToWig.py") - cmd += " -i " + rmdup_bam - cmd += " -c " + res.chrom_sizes - cmd += " -b " + shift_bed # request bed output - cmd += " -o " + exact_target - cmd += " -w " + smooth_target - cmd += " -m " + "atac" - cmd += " -p " + str(int(max(1, int(pm.cores) * 2/3))) - cmd2 = "touch " + temp_target - pm.run([cmd, cmd2], temp_target, container=pm.container) - pm.clean_add(temp_target) - pm.clean_add(temp_exact_folder) - else: - print("Skipping signal track production -- Could not call \'wigToBigWig\'.") - print("Check that you have the required UCSC tools in your PATH.") + cmd = tool_path("bamSitesToWig.py") + cmd += " -i " + rmdup_bam + cmd += " -c " + res.chrom_sizes + cmd += " -b " + shift_bed # request bed output + cmd += " -o " + exact_target + cmd += " -w " + smooth_target + cmd += " -m " + "atac" + cmd += " -p " + str(max(1, int(pm.cores) * 2/3)) + cmd2 = "touch " + temp_target + pm.run([cmd, cmd2], temp_target, container=pm.container) + pm.clean_add(temp_target) + pm.clean_add(temp_exact_folder) # TSS enrichment if not os.path.exists(res.TSS_file): @@ -1081,7 +1062,7 @@ def post_dup_aligned_reads(dedup_log): # include in summary stats. This could be done in prettier ways which # I'm open to. Just adding for the idea. with open(Tss_enrich) as f: - floats = list(map(float, f)) + floats = map(float, f) try: # If the TSS enrichment is 0, don't report Tss_score = ((sum(floats[1950:2050]) / 100) / @@ -1332,7 +1313,7 @@ def report_peak_count(): pm.timestamp("### Plot FRiP/F") - cmd = (tools.samtools + " view -@ " + str(pm.cores) + + cmd = (tools.samtools + " view -@ " + str(pm.cores) + " " + param.samtools.params + " -c -F4 " + rmdup_bam) totalReads = pm.checkprint(cmd) totalReads = str(totalReads).rstrip() From dcf16d3314b8cf515f4cdb20c07242095e19e8bd Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Tue, 26 Mar 2019 10:23:27 -0400 Subject: [PATCH 11/15] fix unreferenced variable when no prealignments --- pipelines/pepatac.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pipelines/pepatac.py b/pipelines/pepatac.py index 657e88ee..72015d84 100755 --- a/pipelines/pepatac.py +++ b/pipelines/pepatac.py @@ -683,16 +683,16 @@ def main(): unmap_fq2 = trimmed_fastq_R2 # Map to any requested prealignments - # We recommend mapping to chrM first for ATAC-seq data + # We recommend mapping to chrM (i.e. rCRSd) prior to primary genome alignment pm.timestamp("### Prealignments") + # Keep track of the unmapped files in order to compress them after final + # alignment. + to_compress = [] if len(args.prealignments) == 0: print("You may use `--prealignments` to align to references before " "the genome alignment step. See docs.") else: print("Prealignment assemblies: " + str(args.prealignments)) - # Keep track of the unmapped files in order to compress them after final - # alignment. - to_compress = [] # Loop through any prealignment references and map to them sequentially for reference in args.prealignments: if args.no_fifo: From d85bf395ff39435d9cb22e28abf5af050572427c Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Tue, 26 Mar 2019 10:24:35 -0400 Subject: [PATCH 12/15] update usage docs --- docs/usage.md | 46 ++++++++++++++++++++++++---------------------- usage.txt | 44 +++++++++++++++++++++++--------------------- 2 files changed, 47 insertions(+), 43 deletions(-) diff --git a/docs/usage.md b/docs/usage.md index 7f00652d..da128714 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -10,52 +10,54 @@ usage: pepatac.py [-h] [-R] [-N] [-D] [-F] [-C CONFIG_FILE] -O PARENT_OUTPUT_FOLDER [-M MEMORY_LIMIT] [-P NUMBER_OF_CORES] -S SAMPLE_NAME -I INPUT_FILES [INPUT_FILES ...] [-I2 [INPUT_FILES2 [INPUT_FILES2 ...]]] -G GENOME_ASSEMBLY - [-Q SINGLE_OR_PAIRED] [-gs GENOME_SIZE] - [--frip-ref-peaks FRIP_REF_PEAKS] [--TSS-name TSS_NAME] - [--anno-name ANNO_NAME] [--keep] [--noFIFO] - [--peak-caller {fseq,macs2}] - [--trimmer {trimmomatic,pyadapt,skewer}] - [--deduplicator {picard,samblaster}] - [--prealignments PREALIGNMENTS [PREALIGNMENTS ...]] [--lite] - [-V] + [-Q SINGLE_OR_PAIRED] [--peak-caller {fseq,macs2}] + [-gs GENOME_SIZE] [--trimmer {trimmomatic,pyadapt,skewer}] + [--prealignments PREALIGNMENTS [PREALIGNMENTS ...]] + [--deduplicator {picard,samblaster}] [--TSS-name TSS_NAME] + [--blacklist BLACKLIST] [--frip-ref-peaks FRIP_REF_PEAKS] + [--anno-name ANNO_NAME] [--keep] [--noFIFO] [--lite] [-V] -PEPATAC version 0.8.5 +PEPATAC version 0.8.6 optional arguments: -h, --help show this help message and exit -R, --recover Overwrite locks to recover from previous failed run -N, --new-start Overwrite all results to start a fresh run - -D, --dirty Do not auto-delete intermediate files + -D, --dirty Don't auto-delete intermediate files -F, --force-follow Always run 'follow' commands -C CONFIG_FILE, --config CONFIG_FILE Pipeline configuration file (YAML). Relative paths are with respect to the pipeline script. -M MEMORY_LIMIT, --mem MEMORY_LIMIT - Memory limit (in Mb) for processes accepting such + Memory limit for processes accepting such. Default + units are megabytes unless specified using the suffix + [K|M|G|T]. -P NUMBER_OF_CORES, --cores NUMBER_OF_CORES Number of cores for parallelized processes -I2 [INPUT_FILES2 [INPUT_FILES2 ...]], --input2 [INPUT_FILES2 [INPUT_FILES2 ...]] Secondary input files, such as read2 -Q SINGLE_OR_PAIRED, --single-or-paired SINGLE_OR_PAIRED Single- or paired-end sequencing protocol - -gs GENOME_SIZE, --genome-size GENOME_SIZE - genome size for MACS2 - --frip-ref-peaks FRIP_REF_PEAKS - Reference peak set for calculating FRiP - --TSS-name TSS_NAME Name of TSS annotation file - --anno-name ANNO_NAME - Name of reference bed file for calculating FRiF - --keep Keep prealignment BAM files - --noFIFO Do NOT use named pipes during prealignments --peak-caller {fseq,macs2} Name of peak caller + -gs GENOME_SIZE, --genome-size GENOME_SIZE + MACS2 genome size --trimmer {trimmomatic,pyadapt,skewer} Name of read trimming program - --deduplicator {picard,samblaster} - Name of deduplicator program --prealignments PREALIGNMENTS [PREALIGNMENTS ...] Space-delimited list of reference genomes to align to before primary alignment. + --deduplicator {picard,samblaster} + Name of deduplicator program + --TSS-name TSS_NAME Name of TSS annotation file + --blacklist BLACKLIST + Name of peak blacklist file + --frip-ref-peaks FRIP_REF_PEAKS + Reference peak set for calculating FRiP + --anno-name ANNO_NAME + Reference bed file for calculating FRiF + --keep Keep prealignment BAM files + --noFIFO Do NOT use named pipes during prealignments --lite Only keep minimal, essential output to conserve disk space. -V, --version show program's version number and exit diff --git a/usage.txt b/usage.txt index 5d542c9c..6bd9f56f 100644 --- a/usage.txt +++ b/usage.txt @@ -2,16 +2,14 @@ usage: pepatac.py [-h] [-R] [-N] [-D] [-F] [-C CONFIG_FILE] -O PARENT_OUTPUT_FOLDER [-M MEMORY_LIMIT] [-P NUMBER_OF_CORES] -S SAMPLE_NAME -I INPUT_FILES [INPUT_FILES ...] [-I2 [INPUT_FILES2 [INPUT_FILES2 ...]]] -G GENOME_ASSEMBLY - [-Q SINGLE_OR_PAIRED] [-gs GENOME_SIZE] - [--frip-ref-peaks FRIP_REF_PEAKS] [--TSS-name TSS_NAME] - [--anno-name ANNO_NAME] [--keep] [--noFIFO] - [--peak-caller {fseq,macs2}] - [--trimmer {trimmomatic,pyadapt,skewer}] - [--deduplicator {picard,samblaster}] - [--prealignments PREALIGNMENTS [PREALIGNMENTS ...]] [--lite] - [-V] + [-Q SINGLE_OR_PAIRED] [--peak-caller {fseq,macs2}] + [-gs GENOME_SIZE] [--trimmer {trimmomatic,pyadapt,skewer}] + [--prealignments PREALIGNMENTS [PREALIGNMENTS ...]] + [--deduplicator {picard,samblaster}] [--TSS-name TSS_NAME] + [--blacklist BLACKLIST] [--frip-ref-peaks FRIP_REF_PEAKS] + [--anno-name ANNO_NAME] [--keep] [--noFIFO] [--lite] [-V] -PEPATAC version 0.8.5 +PEPATAC version 0.8.6 optional arguments: -h, --help show this help message and exit @@ -23,31 +21,35 @@ optional arguments: Pipeline configuration file (YAML). Relative paths are with respect to the pipeline script. -M MEMORY_LIMIT, --mem MEMORY_LIMIT - Memory limit (in Mb) for processes accepting such + Memory limit for processes accepting such. Default + units are megabytes unless specified using the suffix + [K|M|G|T]. -P NUMBER_OF_CORES, --cores NUMBER_OF_CORES Number of cores for parallelized processes -I2 [INPUT_FILES2 [INPUT_FILES2 ...]], --input2 [INPUT_FILES2 [INPUT_FILES2 ...]] Secondary input files, such as read2 -Q SINGLE_OR_PAIRED, --single-or-paired SINGLE_OR_PAIRED Single- or paired-end sequencing protocol - -gs GENOME_SIZE, --genome-size GENOME_SIZE - genome size for MACS2 - --frip-ref-peaks FRIP_REF_PEAKS - Reference peak set for calculating FRiP - --TSS-name TSS_NAME Name of TSS annotation file - --anno-name ANNO_NAME - Name of reference bed file for calculating FRiF - --keep Keep prealignment BAM files - --noFIFO Do NOT use named pipes during prealignments --peak-caller {fseq,macs2} Name of peak caller + -gs GENOME_SIZE, --genome-size GENOME_SIZE + MACS2 genome size --trimmer {trimmomatic,pyadapt,skewer} Name of read trimming program - --deduplicator {picard,samblaster} - Name of deduplicator program --prealignments PREALIGNMENTS [PREALIGNMENTS ...] Space-delimited list of reference genomes to align to before primary alignment. + --deduplicator {picard,samblaster} + Name of deduplicator program + --TSS-name TSS_NAME Name of TSS annotation file + --blacklist BLACKLIST + Name of peak blacklist file + --frip-ref-peaks FRIP_REF_PEAKS + Reference peak set for calculating FRiP + --anno-name ANNO_NAME + Reference bed file for calculating FRiF + --keep Keep prealignment BAM files + --noFIFO Do NOT use named pipes during prealignments --lite Only keep minimal, essential output to conserve disk space. -V, --version show program's version number and exit From 4ce86660b7e0083921691b4d281c2b80c37690e8 Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Tue, 26 Mar 2019 10:52:40 -0400 Subject: [PATCH 13/15] parameterize bowtie2 for pre and primary alignments --- containers/pepatac.Dockerfile | 2 +- pipelines/pepatac.py | 18 +++++++++++------- pipelines/pepatac.yaml | 14 ++++++++++++++ 3 files changed, 26 insertions(+), 8 deletions(-) diff --git a/containers/pepatac.Dockerfile b/containers/pepatac.Dockerfile index cb1058ad..49e9d2fc 100644 --- a/containers/pepatac.Dockerfile +++ b/containers/pepatac.Dockerfile @@ -5,7 +5,7 @@ FROM phusion/baseimage:0.10.2 LABEL maintainer Jason Smith "jasonsmith@virginia.edu" # Version info -LABEL version 0.9.1 +LABEL version 0.9.2 # Use baseimage-docker's init system. CMD ["/sbin/my_init"] diff --git a/pipelines/pepatac.py b/pipelines/pepatac.py index 72015d84..e09798a0 100755 --- a/pipelines/pepatac.py +++ b/pipelines/pepatac.py @@ -175,8 +175,6 @@ def _align_with_bt2(args, tools, paired, useFIFO, unmap_fq1, unmap_fq2, # Default options bt2_opts_txt = " -k 1" # Return only 1 alignment bt2_opts_txt += " -D 20 -R 3 -N 1 -L 20 -i S,1,0.50" - if paired and args.keep: - bt2_opts_txt += " -X 2000" # samtools sort needs a temporary directory tempdir = tempfile.mkdtemp(dir=sub_outdir) @@ -699,13 +697,15 @@ def main(): unmap_fq1, unmap_fq2 = _align_with_bt2( args, tools, args.paired_end, False, unmap_fq1, unmap_fq2, reference, assembly_bt2=_get_bowtie2_index(res.genomes, reference), - outfolder=param.outfolder, aligndir="prealignments") + outfolder=param.outfolder, aligndir="prealignments", + bt2_opts_txt = param.bowtie2_pre.params) to_compress.extend((unmap_fq1, unmap_fq2)) else: unmap_fq1, unmap_fq2 = _align_with_bt2( args, tools, args.paired_end, True, unmap_fq1, unmap_fq2, reference, assembly_bt2=_get_bowtie2_index(res.genomes, reference), - outfolder=param.outfolder, aligndir="prealignments") + outfolder=param.outfolder, aligndir="prealignments", + bt2_opts_txt = param.bowtie2_pre.params) to_compress.extend((unmap_fq1, unmap_fq2)) pm.timestamp("### Map to genome") @@ -722,15 +722,19 @@ def main(): unmap_genome_bam = os.path.join( map_genome_folder, args.sample_name + "_unmap.bam") - bt2_options = " --very-sensitive" - bt2_options += " -X 2000" + if not param.bowtie2.params: + bt2_options = " --very-sensitive" + if args.paired_end: + bt2_options += " -X 2000" + else: + bt2_options = param.bowtie2.params # samtools sort needs a temporary directory tempdir = tempfile.mkdtemp(dir=map_genome_folder) pm.clean_add(tempdir) cmd = tools.bowtie2 + " -p " + str(pm.cores) - cmd += bt2_options + cmd += " " + bt2_options cmd += " --rg-id " + args.sample_name cmd += " -x " + res.bt2_genome if args.paired_end: diff --git a/pipelines/pepatac.yaml b/pipelines/pepatac.yaml index 71981633..2208f543 100644 --- a/pipelines/pepatac.yaml +++ b/pipelines/pepatac.yaml @@ -31,6 +31,20 @@ resources: parameters: # parameters passed to bioinformatic tools, subclassed by tool # Adjust/Add/Remove parameters for individual tools here + bowtie2_pre: # Modify bowtie2 prealignment settings + params: '' + # pipeline default: -k 1 -D 20 -R 3 -N 1 -L 20 -i S,1,0.50 + # -k 1: report up to <1> alns per read; MAPQ not meaningful + # -D 20: give up extending after <20> failed extends in a row + # -R 3: for reads w/ repetitive seeds, try <3> sets of seeds + # -N 1: max # mismatches in seed alignment; can be 0 or 1 + # -L 20: length of seed substrings; must be >3, <32 + # -i S,1,0.50: interval between seed substrings w/r/t read len + bowtie2: # Modify bowtie2 primary genome alignment settings + params: '' + # pipeline default: --very-sensitive -X 2000 + # --very-sensitive: -D 20 -R 3 -N 0 -L 20 -i S,1,0.50 + # -X 2000: paired-end maximum fragment length samtools: params: '-q 10' # -q: skip alignments with MAPQ < 10. From be865a8b9d0f2f2f1255d98a983979b52539eda8 Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Tue, 26 Mar 2019 10:52:49 -0400 Subject: [PATCH 14/15] update changelog --- docs/changelog.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/docs/changelog.md b/docs/changelog.md index 56f1c9c5..972ab2d6 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -1,6 +1,18 @@ # Change log All notable changes to this project will be documented in this file. +## [0.8.6] -- 2019-03-26 + +### Changed +- Use ngtsk ziptool option universally +- Change how tools are parameterized and include bowtie2 +- Simply and clarify prealignment steps +- Improve argument order for better readability + +### Added +- Perform a pre-check that all required tools are callable +- Add multiple targets and a pre-check for FIFO usage + ## [0.8.5] -- 2019-03-19 ### Changed From 190d41d6a708cf0390a1689110bd83f1279e6732 Mon Sep 17 00:00:00 2001 From: jpsmith5 Date: Tue, 26 Mar 2019 11:25:24 -0400 Subject: [PATCH 15/15] fix whitespace issue with parameterization --- pipelines/pepatac.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipelines/pepatac.py b/pipelines/pepatac.py index e09798a0..38850575 100755 --- a/pipelines/pepatac.py +++ b/pipelines/pepatac.py @@ -173,7 +173,7 @@ def _align_with_bt2(args, tools, paired, useFIFO, unmap_fq1, unmap_fq2, if not bt2_opts_txt: # Default options - bt2_opts_txt = " -k 1" # Return only 1 alignment + bt2_opts_txt = "-k 1" # Return only 1 alignment bt2_opts_txt += " -D 20 -R 3 -N 1 -L 20 -i S,1,0.50" # samtools sort needs a temporary directory @@ -203,7 +203,7 @@ def _align_with_bt2(args, tools, paired, useFIFO, unmap_fq1, unmap_fq2, # Build bowtie2 command cmd = "(" + tools.bowtie2 + " -p " + str(pm.cores) - cmd += bt2_opts_txt + cmd += " " + bt2_opts_txt cmd += " -x " + assembly_bt2 cmd += " --rg-id " + args.sample_name cmd += " -U " + unmap_fq1