diff --git a/setup.py b/setup.py index 259705c..641be6a 100644 --- a/setup.py +++ b/setup.py @@ -18,10 +18,11 @@ name = 'smallrnaseq', version = '0.4.0', description = 'Package for short RNA-seq analysis', + long_description = 'smallrnaseq is a Python package for processing of small RNA seq data.', url='https://github.com/dmnfarrell/smallrnaseq', license='GPL v3', author = 'Damien Farrell', - author_email = 'farrell.damien[at]gmail.com', + author_email = 'farrell.damien@gmail.com', packages = ['smallrnaseq'], package_data={'smallrnaseq': ['data/*','*.R']}, install_requires=inst_requires, diff --git a/smallrnaseq/app.py b/smallrnaseq/app.py index 753d618..0b6716a 100755 --- a/smallrnaseq/app.py +++ b/smallrnaseq/app.py @@ -333,11 +333,15 @@ def diff_expression(opts): sep = '\t' labels = pd.read_csv(labelsfile, sep=sep) counts = pd.read_csv(countsfile) + counts = counts.drop_duplicates('name') #define sample/factor cols and conditions for de samplecol = opts['sample_col'] factorcol = opts['factors_col'] conds = opts['conditions'].split(',') + print ('using these labels:') + print (labels[[samplecol, factorcol]].sort_values(factorcol)) + #tell user about possible errors if len(conds) < 2 or conds[0] == '': print ('you need to provide 2 conditions to compare') @@ -350,10 +354,11 @@ def diff_expression(opts): print ('conditions:', ' vs '.join(conds)) #get the samples needed for the required conditions we want to compare - data = de.get_factor_samples(counts, + data, samples = de.get_factor_samples(counts, labels, [(factorcol,conds[0]),(factorcol,conds[1])], samplecol=samplecol, index='name') - #print (data[:4]) + #print (samples) + res = de.run_edgeR(data=data, cutoff=logfccutoff) res.to_csv(os.path.join(path,'de_genes_edger.csv'), float_format='%.4g') print ('genes above log-fold cutoff using edgeR:') @@ -367,13 +372,20 @@ def diff_expression(opts): print (res2) both = res[res.name.isin(res2.name)] + if len(both) == 0: + print ('no significant genes found above cutoff') + return #limit number of plots if len(both)>40: names = both[(both.logFC>1.5) | (both.logFC<-1.5)].name[:50] else: names = both.name #plot these genes with seaborn factor plot - xorder=conds + xorder = conds + counts = counts.set_index('name')[samples] + #normalize counts (don't rely on norm cols as they may be missing) + counts = base.total_library_normalize(counts) + m = de.melt_samples(counts, labels, names, samplecol=samplecol) import seaborn as sns kind = opts['de_plot'] @@ -388,9 +400,7 @@ def diff_expression(opts): import pylab as plt plt.savefig(os.path.join(path,'MD_plot.png')) - scols,ncols = base.get_column_names(counts) - c = counts.set_index('name')[ncols] - de.cluster_map(c, names) + de.cluster_map(counts, names) plt.savefig(os.path.join(path,'de_clustermap.png'),bbox_inches='tight') print ('wrote plots to %s' %path) diff --git a/smallrnaseq/de.py b/smallrnaseq/de.py index b4731c5..5d41592 100644 --- a/smallrnaseq/de.py +++ b/smallrnaseq/de.py @@ -43,7 +43,8 @@ def get_columns_by_label(labels, samplecol, filters=[], querystr=None): q=[] for f in filters: print (f) - if type(f[1]) in ['int','float']: + dtype = labels[f[0]].dtype.kind + if dtype in 'bifc': s = "%s==%s" %(f[0],f[1]) else: s = "%s=='%s'" %(f[0],f[1]) @@ -64,18 +65,21 @@ def get_factor_samples(df, labels, factors, filters=[], samplecol='filename', in samplecol: name of column holding sample/file names Returns: dataframe of data columns with header labels for edgeR script + a dict mapping condition to column names """ if index != None: df=df.set_index(index) l=0 res = None - + samples = [] for f in factors: + cond = f[1] f = filters + [f] cols = get_columns_by_label(labels, samplecol, f) print (cols) cols = list(set(cols) & set(df.columns)) + samples.extend(cols) x = df[cols] print ('%s samples, %s genes' %(len(cols),len(x))) if len(x.columns)==0: @@ -90,7 +94,7 @@ def get_factor_samples(df, labels, factors, filters=[], samplecol='filename', in else: res = res.join(x) res=res.dropna() - return res + return res, samples def run_edgeR(countsfile=None, data=None, cutoff=1.5): """Run edgeR from R script""" @@ -179,26 +183,28 @@ def rpyEdgeR(data, groups, sizes, genes): pvals = list(tags.r['adj.P.Val'][0]) return -def melt_samples(df, labels, names, samplecol='filename', index='name'): +def melt_samples(df, labels, names, samplecol='filename', index=None): """Melt sample data by factor labels so we can plot with seaborn""" - df=df.set_index(index) - scols,ncols = base.get_column_names(df) - df = df.ix[names][ncols] + if index is not None: + df=df.set_index(index) + df = df.ix[names] t=df.T - t.index = scols + t.index = df.columns t = t.merge(labels,left_index=True,right_on=samplecol) m = pd.melt(t,id_vars=list(labels.columns), var_name='name',value_name='read count') return m def cluster_map(data, names): + """Cluster map of genes""" + import seaborn as sns import pylab as plt data = data.ix[names] X = np.log(data).fillna(0) X = X.apply(lambda x: x-x.mean(), 1) - cg = sns.clustermap(X,cmap='RdYlBu',figsize=(8,10),lw=1,linecolor='gray') + cg = sns.clustermap(X,cmap='RdYlBu_r',figsize=(8,10),lw=.5,linecolor='gray') mt=plt.setp(cg.ax_heatmap.yaxis.get_majorticklabels(), rotation=0) mt=plt.setp(cg.ax_heatmap.xaxis.get_majorticklabels(), rotation=90) - return cg \ No newline at end of file + return cg