
Commit

Merge pull request #26 from databio/dev
Dev
nsheff authored Sep 13, 2017
2 parents 43cdb39 + d5697b4 commit 8bec5f0
Showing 4 changed files with 55 additions and 16 deletions.
10 changes: 10 additions & 0 deletions CHANGELOG.md
@@ -1,6 +1,16 @@
# Change log
All notable changes to this project will be documented in this file.

## [0.5.0] -- 2017-09-13

### Added
- Added rudimentary figure reporting

### Changed
- Changed default trimmer from trimmomatic to skewer
- Made output from several tasks less verbose to keep logs cleaner
- Fixed an issue that left behind temporary samtools files if the job was killed

## [0.4.0] -- 2017-07-21

### Added
14 changes: 10 additions & 4 deletions README.md
@@ -50,7 +50,7 @@ install.packages(c("ggplot2", "gplots", "reshape2"))

There are two configuration options: You can either set up environment variables to fit the default configuration, or change the configuration file to fit your environment. For the Chang lab, you may use the pre-made config file and project template described on the [Chang lab configuration](examples/chang_project) page. For others, choose one:

**Option 1: Default configuration** (recommended; [pipelines/ATACseq.yaml](pipelines/ATACseq.yaml)).
**Configuration option 1: Default configuration** (recommended; [pipelines/ATACseq.yaml](pipelines/ATACseq.yaml)).
- Make sure the executable tools (java, samtools, bowtie2, etc.) are in your PATH.
- Set up environment variables to point to `jar` files for the java tools (`picard` and `trimmomatic`).
```
@@ -66,19 +66,21 @@ There are two configuration options: You can either set up environment variables
- Specify a custom sequencing adapter file if desired (in [pipelines/ATACseq.yaml](pipelines/ATACseq.yaml)).


**Option 2: Custom configuration**. Instead, you can also put absolute paths to each tool or resource in the configuration file to fit your local setup. Just change the pipeline configuration file ([pipelines/ATACseq.yaml](pipelines/ATACseq.yaml)) appropriately.
**Configuration option 2: Custom configuration**. Alternatively, you can put absolute paths to each tool or resource in the pipeline configuration file ([pipelines/ATACseq.yaml](pipelines/ATACseq.yaml)) to fit your local setup.
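
For a quick sanity check of a custom configuration, the short Python sketch below (not part of the pipeline) loads [pipelines/ATACseq.yaml](pipelines/ATACseq.yaml) and reports whether each tool entry resolves to an executable. It assumes PyYAML is installed and that tool paths live under a `tools` section, which is an inference from how the pipeline refers to them (e.g. `tools.bowtie2`, `tools.samtools`).

```python
# Illustrative sketch only: check that each configured tool can be found.
# Entries may be bare command names (resolved via PATH) or absolute paths.
import os
import shutil
import yaml

with open("pipelines/ATACseq.yaml") as cfg_file:
    config = yaml.safe_load(cfg_file)

for tool, location in config.get("tools", {}).items():
    location = str(location)
    resolved = location if os.path.isabs(location) else shutil.which(location)
    print("{}: {}".format(tool, resolved if resolved else "NOT FOUND"))
```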

## Usage

You have two options for running the pipeline.

### Option 1: Running the pipeline script directly


### Run option 1: Running the pipeline script directly

For the command-line options, see [usage.txt](usage.txt), which you can also print by running `pipelines/ATACseq.py --help`. You just need to pass a few command-line parameters to specify sample_name, the reference genome, input files, etc. See [cmd.sh](cmd.sh) for an example command using the test data.

To run on multiple samples, you can write a loop that processes each sample independently with the pipeline (see the sketch below), or you can use *run option 2*...
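
Here is a minimal sketch of such a loop in Python; the sample names, file paths, genome assembly, and flag spellings are placeholders, so check [usage.txt](usage.txt) or `pipelines/ATACseq.py --help` for the actual argument names before adapting it.

```python
# Sketch of "run option 1" over several samples (hypothetical inputs).
import subprocess

samples = {
    "sample1": ("data/sample1_R1.fastq.gz", "data/sample1_R2.fastq.gz"),
    "sample2": ("data/sample2_R1.fastq.gz", "data/sample2_R2.fastq.gz"),
}

for name, (read1, read2) in samples.items():
    subprocess.check_call([
        "python", "pipelines/ATACseq.py",
        "--sample-name", name,   # placeholder flag names: confirm with --help
        "--genome", "hg38",
        "--input", read1,
        "--input2", read2,
    ])
```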

### Option 2: Running the pipeline through looper
### Run option 2: Running the pipeline through looper

[Looper](http://looper.readthedocs.io/) is a pipeline submission engine that makes it easy to deploy this pipeline across samples. To use it, you will need to tell looper about your project.

@@ -109,6 +111,10 @@ Your annotation file must specify these columns:

Run your project as above, by passing your project config file to `looper run`. More detailed instructions and advanced options for how to define your project are in the [Looper documentation on defining a project](http://looper.readthedocs.io/en/latest/define-your-project.html). Of particular interest may be the section on [using looper derived columns](http://looper.readthedocs.io/en/latest/advanced.html#pointing-to-flexible-data-with-derived-columns).
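
As a small illustration (a sketch only, with a placeholder config path), the whole project can also be submitted from a Python script by shelling out to looper:

```python
# Submit every sample in the project through looper; replace the path with
# your own project config file.
import subprocess

subprocess.check_call(["looper", "run", "metadata/project_config.yaml"])
```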

### Arguments

The arguments to the pipeline are the same whether you run it using looper or on the command line.

## Outline of analysis steps

### Prealignments
39 changes: 31 additions & 8 deletions pipelines/ATACseq.py
@@ -5,12 +5,13 @@

__author__ = ["Jin Xu", "Nathan Sheffield"]
__email__ = "[email protected]"
__version__ = "0.4.0"
__version__ = "0.5.0"


from argparse import ArgumentParser
import os
import sys
import tempfile
import pypiper


@@ -43,7 +44,7 @@ def parse_arguments():
help="Name of peak caller")

parser.add_argument("--trimmer", dest="trimmer",
default="trimmomatic", choices=TRIMMERS,
default="skewer", choices=TRIMMERS,
help="Name of read trimming program")

parser.add_argument("--prealignments", default=[], type=str, nargs="+",
@@ -186,6 +187,10 @@ def align(unmap_fq1, unmap_fq2, assembly_identifier, assembly_bt2, aligndir=None
bt2_options += " -D 20 -R 3 -N 1 -L 20 -i S,1,0.50"
bt2_options += " -X 2000"

# samtools sort needs a temporary directory
tempdir = tempfile.mkdtemp(dir=sub_outdir)
pm.clean_add(tempdir)

# Build bowtie2 command
cmd = tools.bowtie2 + " -p " + str(pm.cores)
cmd += bt2_options
@@ -194,8 +199,14 @@ def align(unmap_fq1, unmap_fq2, assembly_identifier, assembly_bt2, aligndir=None
cmd += " -1 " + unmap_fq1 + " -2 " + unmap_fq2
cmd += " --un-conc-gz " + out_fastq_bt2
cmd += " | " + tools.samtools + " view -bS - -@ 1" # convert to bam
cmd += " | " + tools.samtools + " sort - -@ 1" + " -o " + mapped_bam # sort output
cmd += " > " + mapped_bam
cmd += " | " + tools.samtools + " sort - -@ 1" # sort output
cmd += " -T " + tempdir
cmd += " -o " + mapped_bam

# An earlier version piped samtools sort to stdout and used " > " instead of
# "-o" so that samtools would pick a random temp file name; sorting into an
# explicit temporary directory (-T) that is registered for cleanup serves the
# same purpose without leaving temp files behind if the job is interrupted
# and restarted at this step.

pm.run(cmd, mapped_bam, follow = lambda: count_alignment(assembly_identifier, mapped_bam, args.paired_end))
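
As an aside, the temporary-directory pattern introduced in this hunk can be summarized in isolation. The sketch below is not part of the pipeline; it uses hypothetical file names and only documented pypiper calls (`PipelineManager`, `clean_add`, `run`, `stop_pipeline`).

```python
# Sketch of the cleanup pattern: give samtools sort an explicit temp directory
# and register it with pypiper so it is removed even if the job is killed at
# this step and restarted later.
import tempfile
import pypiper

pm = pypiper.PipelineManager(name="sort_example", outfolder="example_out")
tempdir = tempfile.mkdtemp(dir="example_out")
pm.clean_add(tempdir)  # registered paths are deleted during pipeline cleanup

cmd = ("samtools sort -@ 1 -T " + tempdir +
       " -o example_out/example.sorted.bam example_out/example.bam")
pm.run(cmd, "example_out/example.sorted.bam")
pm.stop_pipeline()
```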

@@ -292,6 +303,7 @@ def align(unmap_fq1, unmap_fq2, assembly_identifier, assembly_bt2, aligndir=None
("-t", str(args.cores)),
("-m", "pe" if args.paired_end else "any"),
("-x", res.adapter),
"--quiet",
("-o", out_fastq_pre),
local_input_files[0],
local_input_files[1] if args.paired_end else None
@@ -362,14 +374,20 @@ def align(unmap_fq1, unmap_fq2, assembly_identifier, assembly_bt2, aligndir=None
bt2_options = " --very-sensitive"
bt2_options += " -X 2000"

# samtools sort needs a temporary directory
tempdir = tempfile.mkdtemp(dir=map_genome_folder)
pm.clean_add(tempdir)

cmd = tools.bowtie2 + " -p " + str(pm.cores)
cmd += bt2_options
cmd += " --rg-id " + args.sample_name
cmd += " -x " + res.bt2_genome
cmd += " -1 " + unmap_fq1 + " -2 " + unmap_fq2
cmd += " | " + tools.samtools + " view -bS - -@ 1 "
#cmd += " -f 2 -q 10" # quality and pairing filter
cmd += " | " + tools.samtools + " sort - -@ 1" + " -o " + mapping_genome_bam_temp
cmd += " | " + tools.samtools + " sort - -@ 1"
cmd += " -T " + tempdir
cmd += " -o " + mapping_genome_bam_temp

# Split genome mapping result bamfile into two: high-quality aligned reads (keepers)
# and unmapped reads (in case we want to analyze the altogether unmapped reads)
@@ -481,19 +499,24 @@ def estimate_lib_size(picard_log):
pm.run(cmd, Tss_enrich, nofail=True)

#Call Rscript to plot TSS Enrichment
Tss_plot = os.path.join(QC_folder , args.sample_name + ".TssEnrichment.pdf")
Tss_plot = os.path.join(QC_folder, args.sample_name + ".TssEnrichment.pdf")
cmd = "Rscript " + os.path.join(tools.scripts_dir, "ATAC_Rscript_TSSenrichmentPlot_pyPiper.R")
cmd += " " + Tss_enrich + " pdf"
pm.run(cmd, Tss_plot, nofail=True)

# Always plot strand specific TSS enrichment.
# added by Ryan 2/10/17 to calculate TSS score as numeric and to include in summary stats
# This could be done more cleanly; adding it here to capture the idea.
# with open("A34912-CaudateNucleus-RepA_hg19.srt.rmdup.flt.RefSeqTSS") as f:
with open(Tss_enrich) as f:
floats = [float(x) for x in f]  # a list, so slicing works under Python 2 and 3
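# TSS score: mean signal in bins 1950-2050 of the profile (around the TSS)
# divided by mean signal in the first ~200 bins (local background).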
Tss_score = (sum(floats[1950:2050])/100)/(sum(floats[1:200])/200)
pm.report_result("TSS_Score", Tss_score)
try:
# Wrapped in a try temporarily so that older versions of pypiper still work;
# the 0.6 release of pypiper adds this function.
pm.report_figure("TSS enrichment", Tss_plot)
except AttributeError:
pass
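
An alternative to the try/except above (shown only as a sketch, reusing the `pm` and `Tss_plot` already defined in this function) is to test for the method explicitly:

```python
# Report the figure only when the installed pypiper (0.6+) provides
# report_figure; older versions simply skip it.
if hasattr(pm, "report_figure"):
    pm.report_figure("TSS enrichment", Tss_plot)
```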

# fragment distribution
fragL = os.path.join(QC_folder, args.sample_name + ".fragLen.txt")
@@ -601,5 +624,5 @@ def report_peak_count():
try:
sys.exit(main())
except KeyboardInterrupt:
print("Program canceled by user!")
print("Pipeline aborted.")
sys.exit(1)
8 changes: 4 additions & 4 deletions tools/norm_bedGraph.pl
@@ -31,10 +31,10 @@
open (out, ">$outfile");
while ($line=<in>) {

$count++;
if ($count%10000 eq 0) {
print $line;
}
# $count++;
# if ($count%10000 eq 0) {
# print $line;
# }

if (!($line=~/track/)) {
chomp $line;
