Skip to content

Commit

Permalink
calculate effect size of phenocopy experiment (greenelab#62)
Browse files Browse the repository at this point in the history
save output to results folder
  • Loading branch information
gwaybio authored Nov 1, 2017
1 parent e85f925 commit 5342c89
Show file tree
Hide file tree
Showing 2 changed files with 97 additions and 28 deletions.
11 changes: 11 additions & 0 deletions results/tp53_phenocopy_results.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
gene gene_b event neg_logp p effect_size
ATM TP53 Mutation 6.7 2.16E-7 0.34
CREBBP TP53 Mutation 3.0 1.03E-3 0.31
CDKN2A TP53 Deletion 67.2 6.71E-68 0.62
ATR TP53 Mutation 1.2 NS 0.22
CHEK2 TP53 Mutation 0.4 NS 0.14
CHEK1 TP53 Mutation 0.4 NS 0.27
MDM2 TP53 Amplification 6.4 3.78E-7 0.19
RPS6KA3 TP53 Mutation 0.8 NS 0.25
MDM4 TP53 Amplification 13.2 6.88E-14 0.23
PPM1D TP53 Amplification 14.7 1.93E-15 0.31
114 changes: 86 additions & 28 deletions scripts/tp53_phenocopy.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -60,16 +60,50 @@
"collapsed": true
},
"outputs": [],
"source": [
"def get_cohens_d(pos_samples, neg_samples):\n",
" \"\"\"\n",
" Find the effect size of aberrant vs. wildtype genes according to classifier score\n",
" Modified from:\n",
" https://github.com/AllenDowney/CompStats/blob/master/effect_size.ipynb\n",
"\n",
" Arguments:\n",
" pos_samples, neg_samples - aberrant or wildtype event dataframe storing classifier scores\n",
"\n",
" Output:\n",
" Returns the Cohen's d effect size estimate statistic for two status groups\n",
" \"\"\"\n",
" pos_samples = pos_samples['weight']\n",
" neg_samples = neg_samples['weight']\n",
" \n",
" mean_diff = pos_samples.mean() - neg_samples.mean()\n",
" n_pos, n_neg = len(pos_samples), len(neg_samples)\n",
"\n",
" var_pos = pos_samples.var()\n",
" var_neg = neg_samples.var()\n",
"\n",
" sd_pool = np.sqrt((n_pos * var_pos + n_neg * var_neg) / (n_pos + n_neg))\n",
" cohen_d = mean_diff / sd_pool\n",
" return cohen_d "
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# genes of interest\n",
"amplifications = ['MDM2', 'MDM4', 'PPM1D']\n",
"deletions = ['CDKN2A']\n",
"mutations = ['ATM', 'CHEK1', 'CHEK2', 'ATR', 'RPS6KA3']"
"mutations = ['ATM', 'ATR', 'CHEK1', 'CHEK2', 'CREBBP', 'RPS6KA3']"
]
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 6,
"metadata": {
"collapsed": false
},
Expand All @@ -87,7 +121,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 7,
"metadata": {
"collapsed": false
},
Expand Down Expand Up @@ -202,7 +236,7 @@
"TCGA-02-0055-01 TP53 c.646G>A p.Val216Met Missense_Mutation "
]
},
"execution_count": 6,
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -217,7 +251,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 8,
"metadata": {
"collapsed": false
},
Expand Down Expand Up @@ -340,7 +374,7 @@
"[2 rows x 25128 columns]"
]
},
"execution_count": 7,
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
Expand Down Expand Up @@ -368,7 +402,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 9,
"metadata": {
"collapsed": false
},
Expand All @@ -392,7 +426,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 10,
"metadata": {
"collapsed": false
},
Expand All @@ -401,7 +435,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"(86994, 9)\n"
"(96660, 9)\n"
]
},
{
Expand Down Expand Up @@ -475,7 +509,7 @@
"1 IDHwt 0.0 "
]
},
"execution_count": 9,
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -494,7 +528,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 11,
"metadata": {
"collapsed": false
},
Expand All @@ -503,7 +537,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"(48690, 9)\n"
"(54100, 9)\n"
]
}
],
Expand All @@ -523,27 +557,28 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"PPM1D 9666\n",
"RPS6KA3 9666\n",
"CHEK1 9666\n",
"CREBBP 9666\n",
"ATR 9666\n",
"MDM4 9666\n",
"CDKN2A 9666\n",
"CHEK1 9666\n",
"ATM 9666\n",
"CHEK2 9666\n",
"PPM1D 9666\n",
"MDM2 9666\n",
"CDKN2A 9666\n",
"CHEK2 9666\n",
"Name: gene, dtype: int64"
]
},
"execution_count": 11,
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -555,7 +590,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 13,
"metadata": {
"collapsed": false
},
Expand All @@ -578,7 +613,7 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 14,
"metadata": {
"collapsed": false
},
Expand All @@ -605,27 +640,28 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 15,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"RPS6KA3 5410\n",
"PPM1D 5410\n",
"CHEK1 5410\n",
"ATR 5410\n",
"MDM4 5410\n",
"ATM 5410\n",
"RPS6KA3 5410\n",
"MDM2 5410\n",
"CDKN2A 5410\n",
"CREBBP 5410\n",
"ATR 5410\n",
"CHEK2 5410\n",
"MDM4 5410\n",
"CDKN2A 5410\n",
"Name: gene, dtype: int64"
]
},
"execution_count": 14,
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -644,7 +680,7 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 16,
"metadata": {
"collapsed": false
},
Expand All @@ -653,6 +689,7 @@
"# Mutation plotting parameters\n",
"x1_mut, x2_mut = -0.2, 0.2\n",
"y_mut, h_mut = 1.01, 0.03\n",
"output_results = []\n",
"\n",
"for gene in set(plot_ready_df['gene']):\n",
" gene_df = plot_ready_df[plot_ready_df['gene'] == gene]\n",
Expand All @@ -675,7 +712,7 @@
"\n",
" # Perform an independent t-test against:\n",
" # Amplification vs. Wild-type weights\n",
" if gene == 'CDKN2A':\n",
" if gene in ['CDKN2A']:\n",
" mutant_weight = gene_df[(gene_df['value'] == -1) | (gene_df['value'] == -2)]\n",
" x1_amp, x2_amp = -0.25, 0\n",
" y_amp, h_mut = 1.01, 0.03\n",
Expand Down Expand Up @@ -716,6 +753,13 @@
" else:\n",
" add_text = \"{:.2E}\".format(Decimal(p_val))\n",
" \n",
" # Get Cohen's D effect size between the groups\n",
" effect_size = get_cohens_d(mutant_weight, wt_weight)\n",
" log_p = np.round(-1 * np.log10(p_val), decimals=1)\n",
" effect = \"{:.2}\".format(Decimal(effect_size))\n",
" gene_results = [gene, 'TP53', event, log_p, add_text, effect]\n",
" output_results.append(gene_results)\n",
" \n",
" # Generate Plots\n",
" ax = sns.boxplot(x=\"gene\", y=\"weight\", hue='value',\n",
" data=gene_df,\n",
Expand Down Expand Up @@ -782,6 +826,20 @@
" plt.savefig(phenocopy_fig_file)\n",
" plt.close()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"output_df = pd.DataFrame(output_results, columns=['gene', 'gene_b', 'event',\n",
" 'neg_logp', 'p', 'effect_size'])\n",
"output_file = os.path.join('..', 'results', 'tp53_phenocopy_results.tsv')\n",
"output_df.to_csv(output_file, sep='\\t', index=False)"
]
}
],
"metadata": {
Expand Down

0 comments on commit 5342c89

Please sign in to comment.