Skip to content

Commit

Permalink
Merge branch 'develop'
Browse files Browse the repository at this point in the history
  • Loading branch information
jpiper committed Dec 5, 2013
2 parents 676f0ce + 93c392a commit aa94ae0
Show file tree
Hide file tree
Showing 9 changed files with 124 additions and 35 deletions.
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -33,3 +33,9 @@ nosetests.xml
.mr.developer.cfg
.project
.pydevproject

*.DS_Store
*.dylib
*.pyc
.idea/
docs/_*
9 changes: 9 additions & 0 deletions CHANGES
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
0.1.1 - 2013-12-05
==================
Misc. small bug fixes
Fixed Python 2.6 Compatibility
Added JSON export script

0.1.0 - 2013-09-01
==================
Initial Release
4 changes: 2 additions & 2 deletions docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,9 @@
# built documents.
#
# The short X.Y version.
version = "0.1.0"
version = "0.1.1"
# The full version, including alpha/beta/rc tags.
release = "0.1.0"
release = "0.1.1"

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
Expand Down
25 changes: 14 additions & 11 deletions pyDNase/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
__version__ = "0.1.0"
__version__ = "0.1.1"

import os
import numpy as np
Expand Down Expand Up @@ -51,13 +51,16 @@ def __init__(self, filePath,caching=True,chunkSize=1000):
try:
self.samfile = pysam.Samfile(filePath)
except IOError:
errorString = "Unable to load BAM file:{}".format(filePath)
errorString = "Unable to load BAM file:{0}".format(filePath)
raise IOError(errorString)

#Initialise the empty DNase cut cache with the chromosome names from the BAM file
self.cutCache = {i:{"+":{},"-":{}} for i in self.samfile.references}
self.cutCache = {}
#This helps us differentiate between what's been looked up and when there's just no reads
self.lookupCache = {i:[] for i in self.samfile.references}
self.lookupCache = {}
for i in self.samfile.references:
self.cutCache[i] = {"+":{},"-":{}}
self.lookupCache[i] = []
#Do not change the CHUNK_SIZE after object instantiation!
self.CHUNK_SIZE = chunkSize
self.CACHING = caching
Expand Down Expand Up @@ -179,7 +182,7 @@ def FOS(self,interval,bgsize=35):
A float with the FOS - returns 10000 if it can't calculate it
"""

cuts = self["{},{},{},{}".format(interval.chromosome,interval.startbp-bgsize,interval.endbp+bgsize,interval.strand)]
cuts = self["{0},{1},{2},{3}".format(interval.chromosome,interval.startbp-bgsize,interval.endbp+bgsize,interval.strand)]
forwardArray, backwardArray = cuts["+"], cuts["-"]
cutArray = (forwardArray + backwardArray)

Expand Down Expand Up @@ -236,7 +239,7 @@ def loadBEDFile(self,filename):
try:
BEDfile = open(filename, 'rU')
except IOError:
errorString = "Cannot load BED file: {}".format(filename)
errorString = "Cannot load BED file: {0}".format(filename)
raise IOError(errorString)

puts_err("Reading BED File...")
Expand Down Expand Up @@ -264,7 +267,7 @@ def __malformedBEDline(self,BEDString):
Exception
"""
#TODO: Make a new exception class, something like malformedBEDException?
exceptionString = "Malformed BED line: {}".format(BEDString)
exceptionString = "Malformed BED line: {0}".format(BEDString)
raise Exception(exceptionString)

def __parseBEDString(self,BEDString):
Expand Down Expand Up @@ -396,7 +399,7 @@ class GenomicInterval(object):
Basic Object which describes reads region of the genome
"""

#This counts how many GenomicInterval objects have been created so that each GenomicInterval can have reads label.
#This counts how many GenomicInterval objects have been created
counter = 0

def __init__(self, chrom, start, stop, label = 0,score = 0,strand="+"):
Expand Down Expand Up @@ -428,7 +431,7 @@ def __init__(self, chrom, start, stop, label = 0,score = 0,strand="+"):
self.score = float(score)

if self.startbp > self.endbp:
raise Exception("End location of GenomicInterval is bigger than Start location!")
raise Exception("Start location of GenomicInterval is larger than end location!")

# This is from reads bygone era where we ordered the intervals by import order
# self.score = self.__class__.counter
Expand All @@ -437,7 +440,7 @@ def __init__(self, chrom, start, stop, label = 0,score = 0,strand="+"):
if label:
self.label = str(label)
else:
self.label = "Unnamed{}".format(self.__class__.counter)
self.label = "Unnamed{0}".format(self.__class__.counter)

#This contains anything else you want to store about the interval
self.metadata = {}
Expand All @@ -446,7 +449,7 @@ def __str__(self):
"""
BED representation of the interval
"""
return "{}\t{}\t{}\t{}\t{}\t{}".format(self.chromosome, self.startbp, self.endbp, self.label, self.score, self.strand)
return "{0}\t{1}\t{2}\t{3}\t{4}\t{5}".format(self.chromosome, self.startbp, self.endbp, self.label, self.score, self.strand)

def __len__(self):
"""
Expand Down
7 changes: 4 additions & 3 deletions pyDNase/footprinting/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,18 +107,19 @@ def calculate(self,reads,shoulder_sizes=range(35,36),footprint_sizes = range(11,
halffpround = int((fp_size-1)/2)
fw_fpscores_dict[fp_size] = [0] * halffpround + [sum(i) for i in self.window(forwardArray, fp_size)]
rv_fpscores_dict[fp_size] = [0] * halffpround + [sum(i) for i in self.window(backwardArray,fp_size)]

#Empty list of lists for storing the footprint scores
log_probs = [[] for i in range(len(forwardArray))]

if bonferroni:
bonferroni_factor = np.log(1.0/sum(reads.samfile.lengths))

#testing multiple background sizes
for shoulder_size in shoulder_sizes:
#This computes the background cut sums for the specified shoulder_size for all basepairs
f_bindingArray = [0] * (shoulder_size - 1) + [sum(i) for i in self.window(forwardArray,shoulder_size)]
b_bindingArray = [sum(i) for i in self.window(backwardArray,shoulder_size)] + [0] * (shoulder_size - 1)

#Empty list of lists for storing the footprint scores

for fp_size in footprint_sizes:
halffpround = int((fp_size-1)/2)
#This computes the binding cut sums for the specified fp_size for all basepairs
Expand Down Expand Up @@ -190,7 +191,7 @@ def calculate(self,reads,shoulder_sizes=range(35,36),footprint_sizes = range(11,
#This requires that there are actually tags present in these windows
if x:
p = (shoulder_size*2) / float((shoulder_size*2) + fp_size)
#This stores the P values and the fp_size used to calculate them in reads tuple. in the log_probs[] list
#This stores the p values and the fp_size used to calculate them in reads tuple. in the log_probs[] list
score = binom.logsf(int(x - 1), int(n), p)
log_probs[i].append([score,fp_size])
#This iterates over all the base pairs in the region and creates arrays for the best score and footprint size
Expand Down
24 changes: 13 additions & 11 deletions pyDNase/scripts/dnase_average_profile.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -19,20 +19,20 @@
import pyDNase
import numpy as np
import matplotlib as mpl
from clint.textui import progress, puts
#Required for headless operation
mpl.use('Agg')
import matplotlib.pyplot as plt
from matplotlib import rcParams

parser = argparse.ArgumentParser(description='Plots average profile of DNase activity surrounding a list of regions in a BED file')
parser.add_argument("-w", "--window_size", help="Size of flanking area around centre of the regions to plot (default: 200)",default=200,type=int)
parser.add_argument("-w", "--window_size", help="Size of flanking area around centre of the regions to plot (default: 100)",default=100,type=int)
parser.add_argument("-i",action="store_true", help="Ignores any strand information in BED file and plots data relative to reference strand",default=False)
parser.add_argument("regions", help="BED file of the regions you want to generate the average profile for")
parser.add_argument("reads", help="The BAM file containing the DNase-seq data")
parser.add_argument("output", help="filename to write the output to")
args = parser.parse_args()

xsize = args.window_size
reads = pyDNase.BAMHandler(args.reads)
regions = pyDNase.GenomicIntervalSet(args.regions)

Expand All @@ -41,13 +41,13 @@
for each in regions:
each.strand = "+"

for each in regions:
each.score = reads.FOS(each)
regions.resizeRegions(xsize)
puts("Resizing Regions to {}".format(args.window_size))
regions.resizeRegions(args.window_size)

fw = []
rv = []
for each in regions:
puts("Reading Data from BAM file...")
for each in progress.bar(regions):
if reads[each]["+"].sum() and reads[each]["-"].sum():
fw.append(reads[each]["+"])
rv.append(reads[each]["-"])
Expand All @@ -58,18 +58,20 @@
rcParams['xtick.major.pad'] = 20
rcParams['ytick.major.pad'] = 20

#Sort out the X axis
ticks = [0,xsize,xsize*2]
labels = [-xsize,0,xsize]
#Sort out the X axis ticks
ticks = [0,args.window_size,args.window_size*2]
labels = [-args.window_size,0,args.window_size]
plt.xticks(ticks, labels)

#Make the yaxis start from 0
plt.gca().set_ylim(0)

#Makes ticks only appear on the left hand side
plt.gca().yaxis.set_ticks_position('left')

#Remove top, right, and bottom borders
#Remove top and right borders
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
#plt.gca().spines['bottom'].set_visible(False)

plt.gca().tick_params(axis='both', which='major', labelsize=32)

Expand Down
49 changes: 49 additions & 0 deletions pyDNase/scripts/dnase_to_JSON.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#!/usr/bin/env python

# Copyright (C) 2013 Jason Piper - [email protected]
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Export per-base DNase I cut counts for a set of BED regions as JSON.

For each interval in the BED file, looks up the cut counts on each strand
in the BAM file and emits one record of the form::

    {"location": "chrom:start:end",
     "positive strand cuts": [...],
     "negative strand cuts": [...]}
"""

import argparse, json, pyDNase
from clint.textui import puts, progress

parser = argparse.ArgumentParser(description='Writes a JSON file of DNase I cuts for regions from a BED file')
parser.add_argument("-w", "--window_size", help="Resize all regions to a specific length",default = 0, type=int)
parser.add_argument("-i",action="store_true", help="Ignores strand information in BED file",default=False)
parser.add_argument("regions", help="BED file of the regions")
parser.add_argument("reads", help="BAM file containing the read data")
parser.add_argument("output", help="filename to write the JSON output to")
args = parser.parse_args()

reads = pyDNase.BAMHandler(args.reads)
regions = pyDNase.GenomicIntervalSet(args.regions)

# Optionally ignore strand information: treat every region as reference strand.
if args.i:
    for each in regions:
        each.strand = "+"

# Optionally resize every region to a fixed width.
if args.window_size:
    puts("Resizing Regions to {0}".format(args.window_size))
    regions.resizeRegions(toSize=args.window_size)

#TODO: this loads everything into memory; it's probably worth adding an option to stream directly to disk

outarr = []
puts("Generating JSON output...")
# Sort by import order so the output records follow the input BED file order.
for i in progress.bar(sorted(regions, key = lambda x : x.importorder)):
    cuts = reads[i]
    outarr.append({"location":i.chromosome + ":" + str(i.startbp) + ":" + str(i.endbp), "positive strand cuts":cuts["+"].tolist() , "negative strand cuts":cuts["-"].tolist()})
puts("Writing JSON to disk...")
# json.dump, not str(): repr() of a Python list uses single quotes,
# which is not valid JSON and would break any downstream JSON parser.
with open(args.output, "w") as outfile:
    json.dump(outarr, outfile)
30 changes: 24 additions & 6 deletions pyDNase/scripts/dnase_to_javatreeview.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,17 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.

import argparse, pyDNase, csv
import numpy as np
from clint.textui import puts, progress
import warnings

parser = argparse.ArgumentParser(description='Writes a JavaTreeView file based on the regions in reads BED file and the reads in reads BAM file')
parser.add_argument("-w", "--window_size", help="Size of flanking area around centre of the regions to plot (default: 100)",default=100,type=int)
parser.add_argument("-i",action="store_true", help="Ignores strand information in BED file",default=False)
parser.add_argument("-o",action="store_true", help="Orders output the same as the input (default: orders by FOS)",default=False)
parser.add_argument("-a",action="store_true", help="Write absolute cut counts instead strand imbalanced counts",default=False)
parser.add_argument("-n",action="store_true", help="Normalise the cut data for each region between 0 and 1",default=False)
parser.add_argument("-c",action="store_true", help="Disable memory caching (low RAM mode)",default=False)
parser.add_argument("regions", help="BED file of the regions you want to generate the heatmap for")
parser.add_argument("reads", help="The BAM file containing the read data")
parser.add_argument("output", help="filename to write the CSV output to")
Expand All @@ -34,13 +39,16 @@
for each in regions:
each.strand = "+"

puts("Caching reads...")
for i in progress.bar(regions):
reads[i]
if not args.c:
puts("Caching reads...")
for i in progress.bar(regions):
reads[i]

if args.o:
sorter = lambda x : x.importorder
else:
if args.c:
warnings.warn("You've disabled memory caching, which can cause a ~2x slowdown when sorting by FOS")
sorter = lambda x : x.score
puts("Ordering intervals by FOS")
for i in progress.bar(regions):
Expand All @@ -53,12 +61,22 @@
#Writes the header file
outfile.writerow(["GID", "ID", "NAME"] + [i+1 for i in range(len(reads[regions[0]]["+"]))])

def normalise_cuts(cuts):
    """Min-max normalise a cut-count array into the range [0, 1].

    Parameters:
        cuts: sequence of per-base cut counts (anything np.array accepts).

    Returns:
        A float numpy array of the same length. A constant input
        (max == min) is returned as all zeros rather than dividing by
        zero, which in the original expression produced NaNs.
    """
    normalised_cuts = np.array(cuts, dtype="float")
    value_range = normalised_cuts.max() - normalised_cuts.min()
    if value_range == 0:
        # Flat signal: 0/0 would yield NaN and a numpy runtime warning.
        return np.zeros_like(normalised_cuts)
    return (normalised_cuts - normalised_cuts.min()) / value_range

puts("Writing to JTV...")
for i in progress.bar(sorted(regions, key = sorter)):
cuts = reads[i]
if args.a:
newarray = (cuts["+"] + cuts["-"]).tolist()
newarray = (cuts["+"] + cuts["-"])
if args.n:
newarray = normalise_cuts(newarray)
else:
newarray = (cuts["+"] - cuts["-"]).tolist()
outfile.writerow(["NULL","NULL",i.chromosome + ":" + str(i.startbp) + ":" + str(i.endbp)] + newarray)
if args.n:
cuts["+"] = normalise_cuts(cuts["+"])
cuts["-"] = normalise_cuts(cuts["-"])
newarray = (cuts["+"] - cuts["-"])

outfile.writerow(["NULL","NULL",i.chromosome + ":" + str(i.startbp) + ":" + str(i.endbp)] + newarray.tolist())
5 changes: 3 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

setup(
name='pyDNase',
version="0.1.0",
version="0.1.1",
description='DNase-seq analysis library',
long_description=open('README.rst',"rt").read(),
author='Jason Piper',
Expand Down Expand Up @@ -47,7 +47,8 @@
"pyDNase/scripts/dnase_to_javatreeview.py",
"pyDNase/scripts/dnase_wig_tracks.py",
"pyDNase/scripts/wellington_footprints.py",
"pyDNase/scripts/examples/example_footprint_scores.py"],
"pyDNase/scripts/examples/example_footprint_scores.py",
"pyDNase/scripts/dnase_to_JSON.py"],

test_suite="test",
)

0 comments on commit aa94ae0

Please sign in to comment.