From 5342c89f6ef17efe03bc36a1520c148cf2cb92b0 Mon Sep 17 00:00:00 2001 From: Greg Way Date: Wed, 1 Nov 2017 13:01:50 -0400 Subject: [PATCH] calculate effect size of phenocopy experiment (#62) save output to results folder --- results/tp53_phenocopy_results.tsv | 11 +++ scripts/tp53_phenocopy.ipynb | 114 ++++++++++++++++++++++------- 2 files changed, 97 insertions(+), 28 deletions(-) create mode 100644 results/tp53_phenocopy_results.tsv diff --git a/results/tp53_phenocopy_results.tsv b/results/tp53_phenocopy_results.tsv new file mode 100644 index 0000000..bdd9730 --- /dev/null +++ b/results/tp53_phenocopy_results.tsv @@ -0,0 +1,11 @@ +gene gene_b event neg_logp p effect_size +ATM TP53 Mutation 6.7 2.16E-7 0.34 +CREBBP TP53 Mutation 3.0 1.03E-3 0.31 +CDKN2A TP53 Deletion 67.2 6.71E-68 0.62 +ATR TP53 Mutation 1.2 NS 0.22 +CHEK2 TP53 Mutation 0.4 NS 0.14 +CHEK1 TP53 Mutation 0.4 NS 0.27 +MDM2 TP53 Amplification 6.4 3.78E-7 0.19 +RPS6KA3 TP53 Mutation 0.8 NS 0.25 +MDM4 TP53 Amplification 13.2 6.88E-14 0.23 +PPM1D TP53 Amplification 14.7 1.93E-15 0.31 diff --git a/scripts/tp53_phenocopy.ipynb b/scripts/tp53_phenocopy.ipynb index 9337f46..1749f47 100644 --- a/scripts/tp53_phenocopy.ipynb +++ b/scripts/tp53_phenocopy.ipynb @@ -60,16 +60,50 @@ "collapsed": true }, "outputs": [], + "source": [ + "def get_cohens_d(pos_samples, neg_samples):\n", + " \"\"\"\n", + " Find the effect size of aberrant vs. wildtype genes according to classifier score\n", + " Modified from:\n", + " https://github.com/AllenDowney/CompStats/blob/master/effect_size.ipynb\n", + "\n", + " Arguments:\n", + " pos_samples, neg_samples - aberrant or wildtype event dataframe storing classifier scores\n", + "\n", + " Output:\n", + " Returns the Cohen's d effect size estimate statistic for two status groups\n", + " \"\"\"\n", + " pos_samples = pos_samples['weight']\n", + " neg_samples = neg_samples['weight']\n", + " \n", + " mean_diff = pos_samples.mean() - neg_samples.mean()\n", + " n_pos, n_neg = len(pos_samples), len(neg_samples)\n", + "\n", + " var_pos = pos_samples.var()\n", + " var_neg = neg_samples.var()\n", + "\n", + " sd_pool = np.sqrt((n_pos * var_pos + n_neg * var_neg) / (n_pos + n_neg))\n", + " cohen_d = mean_diff / sd_pool\n", + " return cohen_d " + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": true + }, + "outputs": [], "source": [ "# genes of interest\n", "amplifications = ['MDM2', 'MDM4', 'PPM1D']\n", "deletions = ['CDKN2A']\n", - "mutations = ['ATM', 'CHEK1', 'CHEK2', 'ATR', 'RPS6KA3']" + "mutations = ['ATM', 'ATR', 'CHEK1', 'CHEK2', 'CREBBP', 'RPS6KA3']" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": { "collapsed": false }, @@ -87,7 +121,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": { "collapsed": false }, @@ -202,7 +236,7 @@ "TCGA-02-0055-01 TP53 c.646G>A p.Val216Met Missense_Mutation " ] }, - "execution_count": 6, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -217,7 +251,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": { "collapsed": false }, @@ -340,7 +374,7 @@ "[2 rows x 25128 columns]" ] }, - "execution_count": 7, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -368,7 +402,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "metadata": { "collapsed": false }, @@ -392,7 +426,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "metadata": { "collapsed": false }, @@ -401,7 +435,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "(86994, 9)\n" + "(96660, 9)\n" ] }, { @@ -475,7 +509,7 @@ "1 IDHwt 0.0 " ] }, - "execution_count": 9, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -494,7 +528,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "metadata": { "collapsed": false }, @@ -503,7 +537,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "(48690, 9)\n" + "(54100, 9)\n" ] } ], @@ -523,7 +557,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "metadata": { "collapsed": false }, @@ -531,19 +565,20 @@ { "data": { "text/plain": [ + "PPM1D 9666\n", "RPS6KA3 9666\n", - "CHEK1 9666\n", + "CREBBP 9666\n", "ATR 9666\n", "MDM4 9666\n", + "CDKN2A 9666\n", + "CHEK1 9666\n", "ATM 9666\n", - "CHEK2 9666\n", - "PPM1D 9666\n", "MDM2 9666\n", - "CDKN2A 9666\n", + "CHEK2 9666\n", "Name: gene, dtype: int64" ] }, - "execution_count": 11, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -555,7 +590,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "metadata": { "collapsed": false }, @@ -578,7 +613,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 14, "metadata": { "collapsed": false }, @@ -605,7 +640,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 15, "metadata": { "collapsed": false }, @@ -613,19 +648,20 @@ { "data": { "text/plain": [ - "RPS6KA3 5410\n", "PPM1D 5410\n", "CHEK1 5410\n", - "ATR 5410\n", - "MDM4 5410\n", "ATM 5410\n", + "RPS6KA3 5410\n", "MDM2 5410\n", - "CDKN2A 5410\n", + "CREBBP 5410\n", + "ATR 5410\n", "CHEK2 5410\n", + "MDM4 5410\n", + "CDKN2A 5410\n", "Name: gene, dtype: int64" ] }, - "execution_count": 14, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -644,7 +680,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 16, "metadata": { "collapsed": false }, @@ -653,6 +689,7 @@ "# Mutation plotting parameters\n", "x1_mut, x2_mut = -0.2, 0.2\n", "y_mut, h_mut = 1.01, 0.03\n", + "output_results = []\n", "\n", "for gene in set(plot_ready_df['gene']):\n", " gene_df = plot_ready_df[plot_ready_df['gene'] == gene]\n", @@ -675,7 +712,7 @@ "\n", " # Perform an independent t-test against:\n", " # Amplification vs. Wild-type weights\n", - " if gene == 'CDKN2A':\n", + " if gene in ['CDKN2A']:\n", " mutant_weight = gene_df[(gene_df['value'] == -1) | (gene_df['value'] == -2)]\n", " x1_amp, x2_amp = -0.25, 0\n", " y_amp, h_mut = 1.01, 0.03\n", @@ -716,6 +753,13 @@ " else:\n", " add_text = \"{:.2E}\".format(Decimal(p_val))\n", " \n", + " # Get Cohen's D effect size between the groups\n", + " effect_size = get_cohens_d(mutant_weight, wt_weight)\n", + " log_p = np.round(-1 * np.log10(p_val), decimals=1)\n", + " effect = \"{:.2}\".format(Decimal(effect_size))\n", + " gene_results = [gene, 'TP53', event, log_p, add_text, effect]\n", + " output_results.append(gene_results)\n", + " \n", " # Generate Plots\n", " ax = sns.boxplot(x=\"gene\", y=\"weight\", hue='value',\n", " data=gene_df,\n", @@ -782,6 +826,20 @@ " plt.savefig(phenocopy_fig_file)\n", " plt.close()" ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "output_df = pd.DataFrame(output_results, columns=['gene', 'gene_b', 'event',\n", + " 'neg_logp', 'p', 'effect_size'])\n", + "output_file = os.path.join('..', 'results', 'tp53_phenocopy_results.tsv')\n", + "output_df.to_csv(output_file, sep='\\t', index=False)" + ] } ], "metadata": {