Skip to content

Commit

Permalink
fixes for de results
Browse files Browse the repository at this point in the history
  • Loading branch information
dmnfarrell committed Jul 29, 2017
1 parent d820737 commit 7638ba2
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 17 deletions.
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,11 @@
name = 'smallrnaseq',
version = '0.4.0',
description = 'Package for short RNA-seq analysis',
long_description = 'smallrnaseq is a Python package for processing of small RNA seq data.',
url='https://github.com/dmnfarrell/smallrnaseq',
license='GPL v3',
author = 'Damien Farrell',
author_email = 'farrell.damien[at]gmail.com',
author_email = 'farrell.damien@gmail.com',
packages = ['smallrnaseq'],
package_data={'smallrnaseq': ['data/*','*.R']},
install_requires=inst_requires,
Expand Down
22 changes: 16 additions & 6 deletions smallrnaseq/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -333,11 +333,15 @@ def diff_expression(opts):
sep = '\t'
labels = pd.read_csv(labelsfile, sep=sep)
counts = pd.read_csv(countsfile)
counts = counts.drop_duplicates('name')

#define sample/factor cols and conditions for de
samplecol = opts['sample_col']
factorcol = opts['factors_col']
conds = opts['conditions'].split(',')
print ('using these labels:')
print (labels[[samplecol, factorcol]].sort_values(factorcol))

#tell user about possible errors
if len(conds) < 2 or conds[0] == '':
print ('you need to provide 2 conditions to compare')
Expand All @@ -350,10 +354,11 @@ def diff_expression(opts):

print ('conditions:', ' vs '.join(conds))
#get the samples needed for the required conditions we want to compare
data = de.get_factor_samples(counts,
data, samples = de.get_factor_samples(counts,
labels, [(factorcol,conds[0]),(factorcol,conds[1])],
samplecol=samplecol, index='name')
#print (data[:4])
#print (samples)

res = de.run_edgeR(data=data, cutoff=logfccutoff)
res.to_csv(os.path.join(path,'de_genes_edger.csv'), float_format='%.4g')
print ('genes above log-fold cutoff using edgeR:')
Expand All @@ -367,13 +372,20 @@ def diff_expression(opts):
print (res2)

both = res[res.name.isin(res2.name)]
if len(both) == 0:
print ('no significant genes found above cutoff')
return
#limit number of plots
if len(both)>40:
names = both[(both.logFC>1.5) | (both.logFC<-1.5)].name[:50]
else:
names = both.name
#plot these genes with seaborn factor plot
xorder=conds
xorder = conds
counts = counts.set_index('name')[samples]
#normalize counts (don't rely on norm cols as they may be missing)
counts = base.total_library_normalize(counts)

m = de.melt_samples(counts, labels, names, samplecol=samplecol)
import seaborn as sns
kind = opts['de_plot']
Expand All @@ -388,9 +400,7 @@ def diff_expression(opts):
import pylab as plt
plt.savefig(os.path.join(path,'MD_plot.png'))

scols,ncols = base.get_column_names(counts)
c = counts.set_index('name')[ncols]
de.cluster_map(c, names)
de.cluster_map(counts, names)
plt.savefig(os.path.join(path,'de_clustermap.png'),bbox_inches='tight')

print ('wrote plots to %s' %path)
Expand Down
26 changes: 16 additions & 10 deletions smallrnaseq/de.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,8 @@ def get_columns_by_label(labels, samplecol, filters=[], querystr=None):
q=[]
for f in filters:
print (f)
if type(f[1]) in ['int','float']:
dtype = labels[f[0]].dtype.kind
if dtype in 'bifc':
s = "%s==%s" %(f[0],f[1])
else:
s = "%s=='%s'" %(f[0],f[1])
Expand All @@ -64,18 +65,21 @@ def get_factor_samples(df, labels, factors, filters=[], samplecol='filename', in
samplecol: name of column holding sample/file names
Returns:
dataframe of data columns with header labels for edgeR script
a dict mapping condition to column names
"""

if index != None:
df=df.set_index(index)
l=0
res = None

samples = []
for f in factors:
cond = f[1]
f = filters + [f]
cols = get_columns_by_label(labels, samplecol, f)
print (cols)
cols = list(set(cols) & set(df.columns))
samples.extend(cols)
x = df[cols]
print ('%s samples, %s genes' %(len(cols),len(x)))
if len(x.columns)==0:
Expand All @@ -90,7 +94,7 @@ def get_factor_samples(df, labels, factors, filters=[], samplecol='filename', in
else:
res = res.join(x)
res=res.dropna()
return res
return res, samples

def run_edgeR(countsfile=None, data=None, cutoff=1.5):
"""Run edgeR from R script"""
Expand Down Expand Up @@ -179,26 +183,28 @@ def rpyEdgeR(data, groups, sizes, genes):
pvals = list(tags.r['adj.P.Val'][0])
return

def melt_samples(df, labels, names, samplecol='filename', index='name'):
def melt_samples(df, labels, names, samplecol='filename', index=None):
"""Melt sample data by factor labels so we can plot with seaborn"""

df=df.set_index(index)
scols,ncols = base.get_column_names(df)
df = df.ix[names][ncols]
if index is not None:
df=df.set_index(index)
df = df.ix[names]
t=df.T
t.index = scols
t.index = df.columns
t = t.merge(labels,left_index=True,right_on=samplecol)
m = pd.melt(t,id_vars=list(labels.columns),
var_name='name',value_name='read count')
return m

def cluster_map(data, names):
"""Cluster map of genes"""

import seaborn as sns
import pylab as plt
data = data.ix[names]
X = np.log(data).fillna(0)
X = X.apply(lambda x: x-x.mean(), 1)
cg = sns.clustermap(X,cmap='RdYlBu',figsize=(8,10),lw=1,linecolor='gray')
cg = sns.clustermap(X,cmap='RdYlBu_r',figsize=(8,10),lw=.5,linecolor='gray')
mt=plt.setp(cg.ax_heatmap.yaxis.get_majorticklabels(), rotation=0)
mt=plt.setp(cg.ax_heatmap.xaxis.get_majorticklabels(), rotation=90)
return cg
return cg

0 comments on commit 7638ba2

Please sign in to comment.