From c58c1aa87286ef3a5df24e7a861165b36456c4cd Mon Sep 17 00:00:00 2001 From: nsheff Date: Fri, 21 Jul 2017 17:22:30 -0400 Subject: [PATCH 01/14] version to v0.5.0-dev --- pipelines/ATACseq.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/ATACseq.py b/pipelines/ATACseq.py index 07ab2ca1..2e38294c 100755 --- a/pipelines/ATACseq.py +++ b/pipelines/ATACseq.py @@ -5,7 +5,7 @@ __author__ = ["Jin Xu", "Nathan Sheffield"] __email__ = "xujin937@gmail.com" -__version__ = "0.4.0" +__version__ = "0.5.0-dev" from argparse import ArgumentParser From 47db9835bcf82d7277fbc6949bd7397f7661bc75 Mon Sep 17 00:00:00 2001 From: nsheff Date: Wed, 2 Aug 2017 09:58:11 -0400 Subject: [PATCH 02/14] Fix sort stdout issue. See #22 --- README.md | 14 ++++++++++---- pipelines/ATACseq.py | 8 ++++++-- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index fa27bae9..9540f258 100644 --- a/README.md +++ b/README.md @@ -38,7 +38,7 @@ Version 0.3 of this pipeline requires looper version 0.6 or greater. You can upg There are two configuration options: You can either set up environment variables to fit the default configuration, or change the configuration file to fit your environment. For the Chang lab, you may use the pre-made config file and project template described on the [Chang lab configuration](examples/chang_project) page. For others, choose one: -**Option 1: Default configuration** (recommended; [pipelines/ATACseq.yaml](pipelines/ATACseq.yaml)). +**Configuration option 1: Default configuration** (recommended; [pipelines/ATACseq.yaml](pipelines/ATACseq.yaml)). - Make sure the executable tools (java, samtools, bowtie2, etc.) are in your PATH. - Set up environment variables to point to `jar` files for the java tools (`picard` and `trimmomatic`). ``` @@ -54,18 +54,20 @@ There are two configuration options: You can either set up environment variables - Specify custom sequencing adapter file if desired (in [pipelines/ATACseq.yaml](pipelines/ATACseq.yaml)). -**Option 2: Custom configuration**. Instead, you can also put absolute paths to each tool or resource in the configuration file to fit your local setup. Just change the pipeline configuration file ([pipelines/ATACseq.yaml](pipelines/ATACseq.yaml)) appropriately. +**Configuration option 2: Custom configuration**. Instead, you can also put absolute paths to each tool or resource in the configuration file to fit your local setup. Just change the pipeline configuration file ([pipelines/ATACseq.yaml](pipelines/ATACseq.yaml)) appropriately. ## Running the pipeline You have options for running the pipeline. This is a looper-compatible pipeline, so you never need to interface with the pipeline directly, but you can if you want. -### Option 1: Running the pipeline script directly + + +### Run option 1: Running the pipeline script directly Just run `python pipelines/ATACseq.py -h` to see usage. You just need to pass a few command-line parameters to specify sample_name, reference genome, input files, etc. See example command in [cmd.sh](cmd.sh). -### Option 2: Running the pipeline through looper +### Run option 2: Running the pipeline through looper [Looper](http://looper.readthedocs.io/) is a pipeline submission engine that makes it easy to deploy this pipeline across samples. To use it, you will need to tell looper about your project. @@ -96,6 +98,10 @@ Your annotation file must specify these columns: Run your project as above, by passing your project config file to `looper run`. More detailed instructions and advanced options for how to define your project are in the [Looper documentation on defining a project](http://looper.readthedocs.io/en/latest/define-your-project.html). Of particular interest may be the section on [using looper derived columns](http://looper.readthedocs.io/en/latest/advanced.html#pointing-to-flexible-data-with-derived-columns). +### Arguments + +The arguments to the pipeline are the same whether you run it using looper or on the command line. + ## Outline of analysis steps ### Prealignments diff --git a/pipelines/ATACseq.py b/pipelines/ATACseq.py index 51e48cd9..e79bea17 100755 --- a/pipelines/ATACseq.py +++ b/pipelines/ATACseq.py @@ -140,9 +140,14 @@ def align(unmap_fq1, unmap_fq2, assembly_identifier, assembly_bt2, aligndir=None cmd += " -1 " + unmap_fq1 + " -2 " + unmap_fq2 cmd += " --un-conc-gz " + out_fastq_bt2 cmd += " | " + tools.samtools + " view -bS - -@ 1" # convert to bam - cmd += " | " + tools.samtools + " sort - -@ 1" + " -o " + mapped_bam # sort output + cmd += " | " + tools.samtools + " sort - -@ 1" # sort output cmd += " > " + mapped_bam + # In this samtools sort command we print to stdout and then use > to + # redirect instead of `+ " -o " + mapped_bam` because then samtools + # uses a random temp file, so it won't choke if the job gets + # interrupted and restarted at this step. + pm.run(cmd, mapped_bam, follow = lambda: count_alignment(assembly_identifier, mapped_bam)) # filter genome reads not mapped @@ -417,7 +422,6 @@ def estimate_lib_size(picard_log): # Always plot strand specific TSS enrichment. # added by Ryan 2/10/17 to calculate TSS score as numeric and to include in summary stats # This could be done in prettier ways which I'm open to. Just adding for the idea - # with open("A34912-CaudateNucleus-RepA_hg19.srt.rmdup.flt.RefSeqTSS") as f: with open(Tss_enrich) as f: floats = map(float,f) Tss_score = (sum(floats[1950:2050])/100)/(sum(floats[1:200])/200) From ab3c08184a136c9e51245d7164f8fade09d680ec Mon Sep 17 00:00:00 2001 From: nsheff Date: Wed, 2 Aug 2017 13:49:21 -0400 Subject: [PATCH 03/14] change main alignment to samtools stdout --- pipelines/ATACseq.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pipelines/ATACseq.py b/pipelines/ATACseq.py index 4397d468..9bb6f591 100755 --- a/pipelines/ATACseq.py +++ b/pipelines/ATACseq.py @@ -375,7 +375,8 @@ def align(unmap_fq1, unmap_fq2, assembly_identifier, assembly_bt2, aligndir=None cmd += " -1 " + unmap_fq1 + " -2 " + unmap_fq2 cmd += " | " + tools.samtools + " view -bS - -@ 1 " #cmd += " -f 2 -q 10" # quality and pairing filter - cmd += " | " + tools.samtools + " sort - -@ 1" + " -o " + mapping_genome_bam_temp + cmd += " | " + tools.samtools + " sort - -@ 1" + cmd += " > " + mapping_genome_bam_temp # Split genome mapping result bamfile into two: high-quality aligned reads (keepers) # and unmapped reads (in case we want to analyze the altogether unmapped reads) From e8c4092a1ce0b8e3c49cfc569205991ec3f06798 Mon Sep 17 00:00:00 2001 From: nsheff Date: Thu, 3 Aug 2017 12:55:30 -0400 Subject: [PATCH 04/14] Switch to specifying a tempdir for sort. This makes it so the temp files are placed in a subfolder local to the alignment, and also adds these files to the clean script, so that any temporary files from jobs that got canceled or preempted can be easily cleaned with looper clean --- pipelines/ATACseq.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/pipelines/ATACseq.py b/pipelines/ATACseq.py index 9bb6f591..9adc6211 100755 --- a/pipelines/ATACseq.py +++ b/pipelines/ATACseq.py @@ -11,6 +11,7 @@ from argparse import ArgumentParser import os import sys +import tempfile import pypiper @@ -186,6 +187,10 @@ def align(unmap_fq1, unmap_fq2, assembly_identifier, assembly_bt2, aligndir=None bt2_options += " -D 20 -R 3 -N 1 -L 20 -i S,1,0.50" bt2_options += " -X 2000" + # samtools sort needs a temporary directory + tempdir = tempfile.mkdtemp(dir=sub_outdir) + pm.clean_add(tempdir) + # Build bowtie2 command cmd = tools.bowtie2 + " -p " + str(pm.cores) cmd += bt2_options @@ -195,14 +200,14 @@ def align(unmap_fq1, unmap_fq2, assembly_identifier, assembly_bt2, aligndir=None cmd += " --un-conc-gz " + out_fastq_bt2 cmd += " | " + tools.samtools + " view -bS - -@ 1" # convert to bam cmd += " | " + tools.samtools + " sort - -@ 1" # sort output - cmd += " > " + mapped_bam + cmd += " -T " + tempdir + cmd += " -o " + mapped_bam # In this samtools sort command we print to stdout and then use > to # redirect instead of `+ " -o " + mapped_bam` because then samtools # uses a random temp file, so it won't choke if the job gets # interrupted and restarted at this step. - pm.run(cmd, mapped_bam, follow = lambda: count_alignment(assembly_identifier, mapped_bam, args.paired_end)) # filter genome reads not mapped @@ -368,6 +373,10 @@ def align(unmap_fq1, unmap_fq2, assembly_identifier, assembly_bt2, aligndir=None bt2_options = " --very-sensitive" bt2_options += " -X 2000" + # samtools sort needs a temporary directory + tempdir = tempfile.mkdtemp(dir=sub_outdir) + pm.clean_add(tempdir) + cmd = tools.bowtie2 + " -p " + str(pm.cores) cmd += bt2_options cmd += " --rg-id " + args.sample_name @@ -376,7 +385,8 @@ def align(unmap_fq1, unmap_fq2, assembly_identifier, assembly_bt2, aligndir=None cmd += " | " + tools.samtools + " view -bS - -@ 1 " #cmd += " -f 2 -q 10" # quality and pairing filter cmd += " | " + tools.samtools + " sort - -@ 1" - cmd += " > " + mapping_genome_bam_temp + cmd += " -T " + tempdir + cmd += " -o " + mapping_genome_bam_temp # Split genome mapping result bamfile into two: high-quality aligned reads (keepers) # and unmapped reads (in case we want to analyze the altogether unmapped reads) From 6e3a4cc74828ecb35e9a15f046d3934a98f471f4 Mon Sep 17 00:00:00 2001 From: nsheff Date: Thu, 3 Aug 2017 13:25:28 -0400 Subject: [PATCH 05/14] make skewer quiet. Fix #23 --- pipelines/ATACseq.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pipelines/ATACseq.py b/pipelines/ATACseq.py index 9adc6211..96a9db97 100755 --- a/pipelines/ATACseq.py +++ b/pipelines/ATACseq.py @@ -303,6 +303,7 @@ def align(unmap_fq1, unmap_fq2, assembly_identifier, assembly_bt2, aligndir=None ("-t", str(args.cores)), ("-m", "pe" if args.paired_end else "any"), ("-x", res.adapter), + "--quiet", ("-o", out_fastq_pre), local_input_files[0], local_input_files[1] if args.paired_end else None From 42abff7c3d3c44c55aa22d07cf5fa2917490c79a Mon Sep 17 00:00:00 2001 From: nsheff Date: Thu, 3 Aug 2017 14:03:21 -0400 Subject: [PATCH 06/14] Fix missing dir problem with temp update --- pipelines/ATACseq.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/ATACseq.py b/pipelines/ATACseq.py index 96a9db97..1dc3e5c3 100755 --- a/pipelines/ATACseq.py +++ b/pipelines/ATACseq.py @@ -375,7 +375,7 @@ def align(unmap_fq1, unmap_fq2, assembly_identifier, assembly_bt2, aligndir=None bt2_options += " -X 2000" # samtools sort needs a temporary directory - tempdir = tempfile.mkdtemp(dir=sub_outdir) + tempdir = tempfile.mkdtemp(dir=map_genome_folder) pm.clean_add(tempdir) cmd = tools.bowtie2 + " -p " + str(pm.cores) From 2a7d6c45b503efa1298df282dd45dbb24a14f690 Mon Sep 17 00:00:00 2001 From: nsheff Date: Wed, 16 Aug 2017 13:53:10 -0400 Subject: [PATCH 07/14] make norm_bedGraph.pl quiet. Fix #24 --- tools/norm_bedGraph.pl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/norm_bedGraph.pl b/tools/norm_bedGraph.pl index 3c5f269f..8e071c81 100755 --- a/tools/norm_bedGraph.pl +++ b/tools/norm_bedGraph.pl @@ -31,10 +31,10 @@ open (out, ">$outfile"); while ($line=) { - $count++; - if ($count%10000 eq 0) { - print $line; - } + # $count++; + # if ($count%10000 eq 0) { + # print $line; + # } if (!($line=~/track/)) { chomp $line; From d3c26e075841fe4f2383eab57c3d8517fb3813c6 Mon Sep 17 00:00:00 2001 From: nsheff Date: Wed, 16 Aug 2017 13:56:59 -0400 Subject: [PATCH 08/14] change msg --- pipelines/ATACseq.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/ATACseq.py b/pipelines/ATACseq.py index 1dc3e5c3..c700dbfd 100755 --- a/pipelines/ATACseq.py +++ b/pipelines/ATACseq.py @@ -618,5 +618,5 @@ def report_peak_count(): try: sys.exit(main()) except KeyboardInterrupt: - print("Program canceled by user!") + print("User aborted.") sys.exit(1) From b67ee3b0daf302a2224b2a69c4851fc62d12fb6f Mon Sep 17 00:00:00 2001 From: nsheff Date: Mon, 21 Aug 2017 13:00:39 -0400 Subject: [PATCH 09/14] abort msg --- pipelines/ATACseq.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/ATACseq.py b/pipelines/ATACseq.py index c700dbfd..61c9391c 100755 --- a/pipelines/ATACseq.py +++ b/pipelines/ATACseq.py @@ -618,5 +618,5 @@ def report_peak_count(): try: sys.exit(main()) except KeyboardInterrupt: - print("User aborted.") + print("Pipeline aborted.") sys.exit(1) From 3376067a6284a8b7af4c1a2565a5cec9011c0109 Mon Sep 17 00:00:00 2001 From: nsheff Date: Tue, 22 Aug 2017 15:23:21 -0400 Subject: [PATCH 10/14] by popular demand; make skewer the default trimmer --- pipelines/ATACseq.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/ATACseq.py b/pipelines/ATACseq.py index 61c9391c..daf4dcd6 100755 --- a/pipelines/ATACseq.py +++ b/pipelines/ATACseq.py @@ -44,7 +44,7 @@ def parse_arguments(): help="Name of peak caller") parser.add_argument("--trimmer", dest="trimmer", - default="trimmomatic", choices=TRIMMERS, + default="skewer", choices=TRIMMERS, help="Name of read trimming program") parser.add_argument("--prealignments", default=[], type=str, nargs="+", From ccbbe5f62c179bfb3f273deb36df8730570e190e Mon Sep 17 00:00:00 2001 From: nsheff Date: Tue, 22 Aug 2017 15:24:07 -0400 Subject: [PATCH 11/14] add skewer default to changelog --- CHANGELOG.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 81fd66f6..9b36efc2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,11 @@ # Change log All notable changes to this project will be documented in this file. +## [0.5.0] -- Unreleased + +### Changed +- Changed default trimmer from trimmomatic to skewer + ## [0.4.0] -- 2017-07-21 ### Added From 1c2074b8f3a5d499d6043cf0ca692ccfd96cdb17 Mon Sep 17 00:00:00 2001 From: nsheff Date: Thu, 24 Aug 2017 14:53:04 -0400 Subject: [PATCH 12/14] Add report_figure concept --- pipelines/ATACseq.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pipelines/ATACseq.py b/pipelines/ATACseq.py index daf4dcd6..3ee9213b 100755 --- a/pipelines/ATACseq.py +++ b/pipelines/ATACseq.py @@ -499,7 +499,7 @@ def estimate_lib_size(picard_log): pm.run(cmd, Tss_enrich, nofail=True) #Call Rscript to plot TSS Enrichment - Tss_plot = os.path.join(QC_folder , args.sample_name + ".TssEnrichment.pdf") + Tss_plot = os.path.join(QC_folder, args.sample_name + ".TssEnrichment.pdf") cmd = "Rscript " + os.path.join(tools.scripts_dir, "ATAC_Rscript_TSSenrichmentPlot_pyPiper.R") cmd += " " + Tss_enrich + " pdf" pm.run(cmd, Tss_plot, nofail=True) @@ -511,6 +511,12 @@ def estimate_lib_size(picard_log): floats = map(float,f) Tss_score = (sum(floats[1950:2050])/100)/(sum(floats[1:200])/200) pm.report_result("TSS_Score", Tss_score) + try: + # Just wrapping this in a try temporarily so that old versions of + # pypiper will work. v0.6 release of pypiper adds this function + pm.report_figure("TSS enrichment", Tss_plot) + except: + pass # fragment distribution fragL= os.path.join(QC_folder , args.sample_name + ".fragLen.txt") From a7cf737dbeef2c9d21bcde37b4f0224f71391ea8 Mon Sep 17 00:00:00 2001 From: nsheff Date: Wed, 13 Sep 2017 10:40:42 -0400 Subject: [PATCH 13/14] update changelog --- CHANGELOG.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9b36efc2..6ddf4042 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,10 +1,15 @@ # Change log All notable changes to this project will be documented in this file. -## [0.5.0] -- Unreleased +## [0.5.0] -- 2017-09-13 + +### Added +- Adds rudimentary figure reporting ### Changed - Changed default trimmer from trimmomatic to skewer +- Make output from several tasks less verbose to make logs cleaner +- Fixes an issue that left behind temporary samtools files if the job was killed ## [0.4.0] -- 2017-07-21 From d5697b49759ea631c1fcb4b5859c6e14bc943b55 Mon Sep 17 00:00:00 2001 From: nsheff Date: Wed, 13 Sep 2017 10:41:01 -0400 Subject: [PATCH 14/14] version to 0.5.0 --- pipelines/ATACseq.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/ATACseq.py b/pipelines/ATACseq.py index 3ee9213b..a2ee4686 100755 --- a/pipelines/ATACseq.py +++ b/pipelines/ATACseq.py @@ -5,7 +5,7 @@ __author__ = ["Jin Xu", "Nathan Sheffield"] __email__ = "xujin937@gmail.com" -__version__ = "0.5.0-dev" +__version__ = "0.5.0" from argparse import ArgumentParser