Skip to content

Commit

Permalink
Merge branch 'develop'
Browse files Browse the repository at this point in the history
  • Loading branch information
jpiper committed Dec 5, 2013
2 parents 676f0ce + 93c392a commit aa94ae0
Show file tree
Hide file tree
Showing 9 changed files with 124 additions and 35 deletions.
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -33,3 +33,9 @@ nosetests.xml
.mr.developer.cfg
.project
.pydevproject

*.DS_Store
*.dylib
*.pyc
.idea/
docs/_*
9 changes: 9 additions & 0 deletions CHANGES
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
0.1.1 - 2013-12-05
==================
Misc. small bug fixes
Fixed Python 2.6 Compatibility
Added JSON export script

0.1.0 - 2013-09-01
==================
Initial Release
4 changes: 2 additions & 2 deletions docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,9 @@
# built documents.
#
# The short X.Y version.
version = "0.1.0"
version = "0.1.1"
# The full version, including alpha/beta/rc tags.
release = "0.1.0"
release = "0.1.1"

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
Expand Down
25 changes: 14 additions & 11 deletions pyDNase/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
__version__ = "0.1.0"
__version__ = "0.1.1"

import os
import numpy as np
Expand Down Expand Up @@ -51,13 +51,16 @@ def __init__(self, filePath,caching=True,chunkSize=1000):
try:
self.samfile = pysam.Samfile(filePath)
except IOError:
errorString = "Unable to load BAM file:{}".format(filePath)
errorString = "Unable to load BAM file:{0}".format(filePath)
raise IOError(errorString)

#Initialise the empty DNase cut cache with the chromosome names from the BAM file
self.cutCache = {i:{"+":{},"-":{}} for i in self.samfile.references}
self.cutCache = {}
#This helps us differentiate between what's been looked up and when there's just no reads
self.lookupCache = {i:[] for i in self.samfile.references}
self.lookupCache = {}
for i in self.samfile.references:
self.cutCache[i] = {"+":{},"-":{}}
self.lookupCache[i] = []
#Do not change the CHUNK_SIZE after object instantiation!
self.CHUNK_SIZE = chunkSize
self.CACHING = caching
Expand Down Expand Up @@ -179,7 +182,7 @@ def FOS(self,interval,bgsize=35):
A float with the FOS - returns 10000 if it can't calculate it
"""

cuts = self["{},{},{},{}".format(interval.chromosome,interval.startbp-bgsize,interval.endbp+bgsize,interval.strand)]
cuts = self["{0},{1},{2},{3}".format(interval.chromosome,interval.startbp-bgsize,interval.endbp+bgsize,interval.strand)]
forwardArray, backwardArray = cuts["+"], cuts["-"]
cutArray = (forwardArray + backwardArray)

Expand Down Expand Up @@ -236,7 +239,7 @@ def loadBEDFile(self,filename):
try:
BEDfile = open(filename, 'rU')
except IOError:
errorString = "Cannot load BED file: {}".format(filename)
errorString = "Cannot load BED file: {0}".format(filename)
raise IOError(errorString)

puts_err("Reading BED File...")
Expand Down Expand Up @@ -264,7 +267,7 @@ def __malformedBEDline(self,BEDString):
Exception
"""
#TODO: Make a new exception class, something like malformedBEDException?
exceptionString = "Malformed BED line: {}".format(BEDString)
exceptionString = "Malformed BED line: {0}".format(BEDString)
raise Exception(exceptionString)

def __parseBEDString(self,BEDString):
Expand Down Expand Up @@ -396,7 +399,7 @@ class GenomicInterval(object):
Basic Object which describes reads region of the genome
"""

#This counts how many GenomicInterval objects have been created so that each GenomicInterval can have reads label.
#This counts how many GenomicInterval objects have been created
counter = 0

def __init__(self, chrom, start, stop, label = 0,score = 0,strand="+"):
Expand Down Expand Up @@ -428,7 +431,7 @@ def __init__(self, chrom, start, stop, label = 0,score = 0,strand="+"):
self.score = float(score)

if self.startbp > self.endbp:
raise Exception("End location of GenomicInterval is bigger than Start location!")
raise Exception("Start location of GenomicInterval is larger than end location!")

# This is from reads bygone era where we ordered the intervals by import order
# self.score = self.__class__.counter
Expand All @@ -437,7 +440,7 @@ def __init__(self, chrom, start, stop, label = 0,score = 0,strand="+"):
if label:
self.label = str(label)
else:
self.label = "Unnamed{}".format(self.__class__.counter)
self.label = "Unnamed{0}".format(self.__class__.counter)

#This contains anything else you want to store about the interval
self.metadata = {}
Expand All @@ -446,7 +449,7 @@ def __str__(self):
"""
BED representation of the interval
"""
return "{}\t{}\t{}\t{}\t{}\t{}".format(self.chromosome, self.startbp, self.endbp, self.label, self.score, self.strand)
return "{0}\t{1}\t{2}\t{3}\t{4}\t{5}".format(self.chromosome, self.startbp, self.endbp, self.label, self.score, self.strand)

def __len__(self):
"""
Expand Down
7 changes: 4 additions & 3 deletions pyDNase/footprinting/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,18 +107,19 @@ def calculate(self,reads,shoulder_sizes=range(35,36),footprint_sizes = range(11,
halffpround = int((fp_size-1)/2)
fw_fpscores_dict[fp_size] = [0] * halffpround + [sum(i) for i in self.window(forwardArray, fp_size)]
rv_fpscores_dict[fp_size] = [0] * halffpround + [sum(i) for i in self.window(backwardArray,fp_size)]

#Empty list of lists for storing the footprint scores
log_probs = [[] for i in range(len(forwardArray))]

if bonferroni:
bonferroni_factor = np.log(1.0/sum(reads.samfile.lengths))

#testing multiple background sizes
for shoulder_size in shoulder_sizes:
#This computes the background cut sums for the specified shoulder_size for all basepairs
f_bindingArray = [0] * (shoulder_size - 1) + [sum(i) for i in self.window(forwardArray,shoulder_size)]
b_bindingArray = [sum(i) for i in self.window(backwardArray,shoulder_size)] + [0] * (shoulder_size - 1)

#Empty list of lists for storing the footprint scores

for fp_size in footprint_sizes:
halffpround = int((fp_size-1)/2)
#This computes the binding cut sums for the specified fp_size for all basepairs
Expand Down Expand Up @@ -190,7 +191,7 @@ def calculate(self,reads,shoulder_sizes=range(35,36),footprint_sizes = range(11,
#This requires that there are actually tags present in these windows
if x:
p = (shoulder_size*2) / float((shoulder_size*2) + fp_size)
#This stores the P values and the fp_size used to calculate them in reads tuple. in the log_probs[] list
#This stores the p values and the fp_size used to calculate them in reads tuple. in the log_probs[] list
score = binom.logsf(int(x - 1), int(n), p)
log_probs[i].append([score,fp_size])
#This iterates over all the base pairs in the region and creates arrays for the best score and footprint size
Expand Down
24 changes: 13 additions & 11 deletions pyDNase/scripts/dnase_average_profile.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -19,20 +19,20 @@
import pyDNase
import numpy as np
import matplotlib as mpl
from clint.textui import progress, puts
#Required for headless operation
mpl.use('Agg')
import matplotlib.pyplot as plt
from matplotlib import rcParams

parser = argparse.ArgumentParser(description='Plots average profile of DNase activity surrounding a list of regions in a BED file')
parser.add_argument("-w", "--window_size", help="Size of flanking area around centre of the regions to plot (default: 200)",default=200,type=int)
parser.add_argument("-w", "--window_size", help="Size of flanking area around centre of the regions to plot (default: 100)",default=100,type=int)
parser.add_argument("-i",action="store_true", help="Ignores any strand information in BED file and plots data relative to reference strand",default=False)
parser.add_argument("regions", help="BED file of the regions you want to generate the average profile for")
parser.add_argument("reads", help="The BAM file containing the DNase-seq data")
parser.add_argument("output", help="filename to write the output to")
args = parser.parse_args()

xsize = args.window_size
reads = pyDNase.BAMHandler(args.reads)
regions = pyDNase.GenomicIntervalSet(args.regions)

Expand All @@ -41,13 +41,13 @@
for each in regions:
each.strand = "+"

for each in regions:
each.score = reads.FOS(each)
regions.resizeRegions(xsize)
puts("Resizing Regions to {}".format(args.window_size))
regions.resizeRegions(args.window_size)

fw = []
rv = []
for each in regions:
puts("Reading Data from BAM file...")
for each in progress.bar(regions):
if reads[each]["+"].sum() and reads[each]["-"].sum():
fw.append(reads[each]["+"])
rv.append(reads[each]["-"])
Expand All @@ -58,18 +58,20 @@
rcParams['xtick.major.pad'] = 20
rcParams['ytick.major.pad'] = 20

#Sort out the X axis
ticks = [0,xsize,xsize*2]
labels = [-xsize,0,xsize]
#Sort out the X axis ticks
ticks = [0,args.window_size,args.window_size*2]
labels = [-args.window_size,0,args.window_size]
plt.xticks(ticks, labels)

#Make the yaxis start from 0
plt.gca().set_ylim(0)

#Makes ticks only appear on the left hand side
plt.gca().yaxis.set_ticks_position('left')

#Remove top, right, and bottom borders
#Remove top and right borders
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
#plt.gca().spines['bottom'].set_visible(False)

plt.gca().tick_params(axis='both', which='major', labelsize=32)

Expand Down
49 changes: 49 additions & 0 deletions pyDNase/scripts/dnase_to_JSON.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#!/usr/bin/env python

# Copyright (C) 2013 Jason Piper - [email protected]
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

"""Export per-base DNase I cut counts for a set of BED regions as JSON.

For each interval in the BED file, looks up the cut counts on each strand
in the BAM file and emits one record of the form::

    {"location": "chrom:start:end",
     "positive strand cuts": [...],
     "negative strand cuts": [...]}
"""

import argparse, json, pyDNase
from clint.textui import puts, progress

parser = argparse.ArgumentParser(description='Writes a JSON file of DNase I cuts for regions from a BED file')
parser.add_argument("-w", "--window_size", help="Resize all regions to a specific length",default = 0, type=int)
parser.add_argument("-i",action="store_true", help="Ignores strand information in BED file",default=False)
parser.add_argument("regions", help="BED file of the regions")
parser.add_argument("reads", help="BAM file containing the read data")
parser.add_argument("output", help="filename to write the JSON output to")
args = parser.parse_args()

reads = pyDNase.BAMHandler(args.reads)
regions = pyDNase.GenomicIntervalSet(args.regions)

# Optionally ignore strand information: treat every region as reference strand.
if args.i:
    for each in regions:
        each.strand = "+"

# Optionally resize every region to a fixed width.
if args.window_size:
    puts("Resizing Regions to {0}".format(args.window_size))
    regions.resizeRegions(toSize=args.window_size)

#TODO: this loads everything into memory; it's probably worth adding an option to stream directly to disk

outarr = []
puts("Generating JSON output...")
# Sort by import order so the output records follow the input BED file order.
for i in progress.bar(sorted(regions, key = lambda x : x.importorder)):
    cuts = reads[i]
    outarr.append({"location":i.chromosome + ":" + str(i.startbp) + ":" + str(i.endbp), "positive strand cuts":cuts["+"].tolist() , "negative strand cuts":cuts["-"].tolist()})
puts("Writing JSON to disk...")
# json.dump, not str(): repr() of a Python list uses single quotes,
# which is not valid JSON and would break any downstream JSON parser.
with open(args.output, "w") as outfile:
    json.dump(outarr, outfile)
30 changes: 24 additions & 6 deletions pyDNase/scripts/dnase_to_javatreeview.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,17 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.

import argparse, pyDNase, csv
import numpy as np
from clint.textui import puts, progress
import warnings

parser = argparse.ArgumentParser(description='Writes a JavaTreeView file based on the regions in reads BED file and the reads in reads BAM file')
parser.add_argument("-w", "--window_size", help="Size of flanking area around centre of the regions to plot (default: 100)",default=100,type=int)
parser.add_argument("-i",action="store_true", help="Ignores strand information in BED file",default=False)
parser.add_argument("-o",action="store_true", help="Orders output the same as the input (default: orders by FOS)",default=False)
parser.add_argument("-a",action="store_true", help="Write absolute cut counts instead strand imbalanced counts",default=False)
parser.add_argument("-n",action="store_true", help="Normalise the cut data for each region between 0 and 1",default=False)
parser.add_argument("-c",action="store_true", help="Disable memory caching (low RAM mode)",default=False)
parser.add_argument("regions", help="BED file of the regions you want to generate the heatmap for")
parser.add_argument("reads", help="The BAM file containing the read data")
parser.add_argument("output", help="filename to write the CSV output to")
Expand All @@ -34,13 +39,16 @@
for each in regions:
each.strand = "+"

puts("Caching reads...")
for i in progress.bar(regions):
reads[i]
if not args.c:
puts("Caching reads...")
for i in progress.bar(regions):
reads[i]

if args.o:
sorter = lambda x : x.importorder
else:
if args.c:
warnings.warn("You've disabled memory caching, which can cause a ~2x slowdown when sorting by FOS")
sorter = lambda x : x.score
puts("Ordering intervals by FOS")
for i in progress.bar(regions):
Expand All @@ -53,12 +61,22 @@
#Writes the header file
outfile.writerow(["GID", "ID", "NAME"] + [i+1 for i in range(len(reads[regions[0]]["+"]))])

def normalise_cuts(cuts):
    """Min-max normalise a cut-count array into the range [0, 1].

    Parameters:
        cuts: sequence of per-base cut counts (anything np.array accepts).

    Returns:
        A float numpy array of the same length. A constant input
        (max == min) is returned as all zeros rather than dividing by
        zero, which in the original expression produced NaNs.
    """
    normalised_cuts = np.array(cuts, dtype="float")
    value_range = normalised_cuts.max() - normalised_cuts.min()
    if value_range == 0:
        # Flat signal: 0/0 would yield NaN and a numpy runtime warning.
        return np.zeros_like(normalised_cuts)
    return (normalised_cuts - normalised_cuts.min()) / value_range

puts("Writing to JTV...")
for i in progress.bar(sorted(regions, key = sorter)):
cuts = reads[i]
if args.a:
newarray = (cuts["+"] + cuts["-"]).tolist()
newarray = (cuts["+"] + cuts["-"])
if args.n:
newarray = normalise_cuts(newarray)
else:
newarray = (cuts["+"] - cuts["-"]).tolist()
outfile.writerow(["NULL","NULL",i.chromosome + ":" + str(i.startbp) + ":" + str(i.endbp)] + newarray)
if args.n:
cuts["+"] = normalise_cuts(cuts["+"])
cuts["-"] = normalise_cuts(cuts["-"])
newarray = (cuts["+"] - cuts["-"])

outfile.writerow(["NULL","NULL",i.chromosome + ":" + str(i.startbp) + ":" + str(i.endbp)] + newarray.tolist())
5 changes: 3 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

setup(
name='pyDNase',
version="0.1.0",
version="0.1.1",
description='DNase-seq analysis library',
long_description=open('README.rst',"rt").read(),
author='Jason Piper',
Expand Down Expand Up @@ -47,7 +47,8 @@
"pyDNase/scripts/dnase_to_javatreeview.py",
"pyDNase/scripts/dnase_wig_tracks.py",
"pyDNase/scripts/wellington_footprints.py",
"pyDNase/scripts/examples/example_footprint_scores.py"],
"pyDNase/scripts/examples/example_footprint_scores.py",
"pyDNase/scripts/dnase_to_JSON.py"],

test_suite="test",
)

0 comments on commit aa94ae0

Please sign in to comment.