From e85f92514d2c62a084cc7391a27d69ca8792f12d Mon Sep 17 00:00:00 2001
From: Greg Way <gregory.way@gmail.com>
Date: Fri, 20 Oct 2017 12:55:50 -0400
Subject: [PATCH] add tp53 phenocopy analysis (#61)

---
 scripts/tp53_phenocopy.ipynb | 808 +++++++++++++++++++++++++++++++++++
 1 file changed, 808 insertions(+)
 create mode 100644 scripts/tp53_phenocopy.ipynb
diff --git a/scripts/tp53_phenocopy.ipynb b/scripts/tp53_phenocopy.ipynb
new file mode 100644
index 0000000..9337f46
--- /dev/null
+++ b/scripts/tp53_phenocopy.ipynb
@@ -0,0 +1,808 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# TP53 Phenocopying Events\n",
+    "\n",
+    "In the following script, we visualize and determine if specific gene events phenocopy TP53 loss of function. We focus on _MDM2_, _MDM4_, and _PPM1D_ amplifications, _CDKN2A_ deletions, and _ATM_, _CHEK1_, _CHEK2_, _ATR_, and _RPS6KA3_ mutations. \n",
+    "\n",
+    "We first filter out samples with _TP53_ mutations and deep deletions as well as hypermutated samples. We then visualize the difference of the TP53 classifier score across tumors with the given events above. We perform t-tests to determine if the classifier scores are significantly different in the direction of a priori assumptions."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "from decimal import Decimal\n",
+    "from scipy.stats import ttest_ind\n",
+    "\n",
+    "import matplotlib.pyplot as plt\n",
+    "import seaborn as sns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "np.random.seed(123)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "sns.set(style='whitegrid')\n",
+    "sns.set_context('paper', rc={'font.size':11, 'axes.titlesize':11, 'axes.labelsize':16,\n",
+    "                             'xtick.labelsize':11, 'ytick.labelsize':11})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# genes of interest\n",
+    "amplifications = ['MDM2', 'MDM4', 'PPM1D']\n",
+    "deletions = ['CDKN2A']\n",
+    "mutations = ['ATM', 'CHEK1', 'CHEK2', 'ATR', 'RPS6KA3']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# Load mutation, copy number, and sample freeze data\n",
+    "mut_file = os.path.join('..', 'data', 'pancan_mutation_freeze.tsv')\n",
+    "gistic_file = os.path.join('..', 'data', 'raw', 'pancan_GISTIC_threshold.tsv')\n",
+    "sample_freeze_file = os.path.join('..', 'data', 'sample_freeze.tsv')\n",
+    "\n",
+    "mutation_df = pd.read_table(mut_file, index_col=0)\n",
+    "copy_df = pd.read_table(gistic_file, index_col=0)\n",
+    "sample_freeze_df = pd.read_table(sample_freeze_file)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style>\n",
+       "    .dataframe thead tr:only-child th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: left;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>log10_mut</th>\n",
+       "      <th>total_status</th>\n",
+       "      <th>weight</th>\n",
+       "      <th>TP53</th>\n",
+       "      <th>TP53_loss</th>\n",
+       "      <th>PATIENT_BARCODE</th>\n",
+       "      <th>DISEASE</th>\n",
+       "      <th>SUBTYPE</th>\n",
+       "      <th>hypermutated</th>\n",
+       "      <th>include</th>\n",
+       "      <th>Hugo_Symbol</th>\n",
+       "      <th>HGVSc</th>\n",
+       "      <th>HGVSp</th>\n",
+       "      <th>Variant_Classification</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>ID</th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>TCGA-02-0047-01</th>\n",
+       "      <td>1.812913</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.723395</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>TCGA-02-0047</td>\n",
+       "      <td>GBM</td>\n",
+       "      <td>IDHwt</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Wild-Type</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>TCGA-02-0055-01</th>\n",
+       "      <td>1.707570</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.698910</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>TCGA-02-0055</td>\n",
+       "      <td>GBM</td>\n",
+       "      <td>IDHwt</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>TP53</td>\n",
+       "      <td>c.646G&gt;A</td>\n",
+       "      <td>p.Val216Met</td>\n",
+       "      <td>Missense_Mutation</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                 log10_mut  total_status    weight  TP53  TP53_loss  \\\n",
+       "ID                                                                    \n",
+       "TCGA-02-0047-01   1.812913           0.0  0.723395   0.0        0.0   \n",
+       "TCGA-02-0055-01   1.707570           1.0  0.698910   1.0        0.0   \n",
+       "\n",
+       "                PATIENT_BARCODE DISEASE SUBTYPE  hypermutated  include  \\\n",
+       "ID                                                                       \n",
+       "TCGA-02-0047-01    TCGA-02-0047     GBM   IDHwt           0.0      1.0   \n",
+       "TCGA-02-0055-01    TCGA-02-0055     GBM   IDHwt           0.0      1.0   \n",
+       "\n",
+       "                Hugo_Symbol     HGVSc        HGVSp Variant_Classification  \n",
+       "ID                                                                         \n",
+       "TCGA-02-0047-01         NaN       NaN          NaN              Wild-Type  \n",
+       "TCGA-02-0055-01        TP53  c.646G>A  p.Val216Met      Missense_Mutation  "
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# The prediction file stores the TP53 classifier scores for all samples\n",
+    "prediction_file = os.path.join('..', 'classifiers', 'TP53', 'tables',\n",
+    "                               'TP53_loss_prediction_scores.tsv')\n",
+    "prediction_df = pd.read_table(prediction_file, index_col=0)\n",
+    "prediction_df.head(2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style>\n",
+       "    .dataframe thead tr:only-child th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: left;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th>Gene Symbol</th>\n",
+       "      <th>ACAP3</th>\n",
+       "      <th>ACTRT2</th>\n",
+       "      <th>AGRN</th>\n",
+       "      <th>ANKRD65</th>\n",
+       "      <th>ATAD3A</th>\n",
+       "      <th>ATAD3B</th>\n",
+       "      <th>ATAD3C</th>\n",
+       "      <th>AURKAIP1</th>\n",
+       "      <th>B3GALT6</th>\n",
+       "      <th>C1orf159</th>\n",
+       "      <th>...</th>\n",
+       "      <th>RP13-228J13.1</th>\n",
+       "      <th>SMIM9</th>\n",
+       "      <th>SNORA36A</th>\n",
+       "      <th>SNORA56</th>\n",
+       "      <th>TMLHE</th>\n",
+       "      <th>VBP1</th>\n",
+       "      <th>DDX11L16|chrX</th>\n",
+       "      <th>IL9R|chrX</th>\n",
+       "      <th>SPRY3|chrX</th>\n",
+       "      <th>VAMP7|chrX</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>TCGA-02-0047-01</th>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>TCGA-02-0055-01</th>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>...</td>\n",
+       "      <td>-1</td>\n",
+       "      <td>-1</td>\n",
+       "      <td>-1</td>\n",
+       "      <td>-1</td>\n",
+       "      <td>-1</td>\n",
+       "      <td>-1</td>\n",
+       "      <td>-1</td>\n",
+       "      <td>-1</td>\n",
+       "      <td>-1</td>\n",
+       "      <td>-1</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>2 rows × 25128 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "Gene Symbol      ACAP3  ACTRT2  AGRN  ANKRD65  ATAD3A  ATAD3B  ATAD3C  \\\n",
+       "TCGA-02-0047-01      0       0     0        0       0       0       0   \n",
+       "TCGA-02-0055-01      1       1     1        1       1       1       1   \n",
+       "\n",
+       "Gene Symbol      AURKAIP1  B3GALT6  C1orf159     ...      RP13-228J13.1  \\\n",
+       "TCGA-02-0047-01         0        0         0     ...                  0   \n",
+       "TCGA-02-0055-01         1        1         1     ...                 -1   \n",
+       "\n",
+       "Gene Symbol      SMIM9  SNORA36A  SNORA56  TMLHE  VBP1  DDX11L16|chrX  \\\n",
+       "TCGA-02-0047-01      0         0        0      0     0              0   \n",
+       "TCGA-02-0055-01     -1        -1       -1     -1    -1             -1   \n",
+       "\n",
+       "Gene Symbol      IL9R|chrX  SPRY3|chrX  VAMP7|chrX  \n",
+       "TCGA-02-0047-01          0           0           0  \n",
+       "TCGA-02-0055-01         -1          -1          -1  \n",
+       "\n",
+       "[2 rows x 25128 columns]"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Process and subset the GISTIC copy number data\n",
+    "# We do not use the preprocessed copy number data because we are also \n",
+    "# interested in intermediate copy number events\n",
+    "copy_df.drop(['Locus ID', 'Cytoband'], axis=1, inplace=True)\n",
+    "copy_df.columns = copy_df.columns.str[0:15]\n",
+    "\n",
+    "copy_df = copy_df.T\n",
+    "copy_df = copy_df.loc[sorted(sample_freeze_df['SAMPLE_BARCODE'])]\n",
+    "copy_df = copy_df.fillna(0)\n",
+    "copy_df = copy_df.astype(int)\n",
+    "copy_df.head(2)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Subset corresponding data type according to predicted phenocopying mechanism"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "del_phenocopy_df = copy_df.loc[:, deletions]\n",
+    "del_phenocopy_df = del_phenocopy_df.reset_index().melt(id_vars=['index'])\n",
+    "del_phenocopy_df.columns = ['sample_id', 'gene', 'value']\n",
+    "del_phenocopy_df = del_phenocopy_df.assign(event = 'Deletion')\n",
+    "\n",
+    "amp_phenocopy_df = copy_df.loc[:, amplifications]\n",
+    "amp_phenocopy_df = amp_phenocopy_df.reset_index().melt(id_vars=['index'])\n",
+    "amp_phenocopy_df.columns = ['sample_id', 'gene', 'value']\n",
+    "amp_phenocopy_df = amp_phenocopy_df.assign(event = 'Amplification')\n",
+    "\n",
+    "mut_phenocopy_df = mutation_df.loc[:, mutations]\n",
+    "mut_phenocopy_df = mut_phenocopy_df.reset_index().melt(id_vars=['SAMPLE_BARCODE'])\n",
+    "mut_phenocopy_df.columns = ['sample_id', 'gene', 'value']\n",
+    "mut_phenocopy_df = mut_phenocopy_df.assign(event = 'Mutation')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(86994, 9)\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style>\n",
+       "    .dataframe thead tr:only-child th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: left;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>sample_id</th>\n",
+       "      <th>gene</th>\n",
+       "      <th>value</th>\n",
+       "      <th>event</th>\n",
+       "      <th>weight</th>\n",
+       "      <th>total_status</th>\n",
+       "      <th>DISEASE</th>\n",
+       "      <th>SUBTYPE</th>\n",
+       "      <th>hypermutated</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>TCGA-02-0047-01</td>\n",
+       "      <td>CDKN2A</td>\n",
+       "      <td>-2</td>\n",
+       "      <td>Deletion</td>\n",
+       "      <td>0.723395</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>GBM</td>\n",
+       "      <td>IDHwt</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>TCGA-02-0055-01</td>\n",
+       "      <td>CDKN2A</td>\n",
+       "      <td>0</td>\n",
+       "      <td>Deletion</td>\n",
+       "      <td>0.698910</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>GBM</td>\n",
+       "      <td>IDHwt</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "         sample_id    gene  value     event    weight  total_status DISEASE  \\\n",
+       "0  TCGA-02-0047-01  CDKN2A     -2  Deletion  0.723395           0.0     GBM   \n",
+       "1  TCGA-02-0055-01  CDKN2A      0  Deletion  0.698910           1.0     GBM   \n",
+       "\n",
+       "  SUBTYPE  hypermutated  \n",
+       "0   IDHwt           0.0  \n",
+       "1   IDHwt           0.0  "
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Combine the gene events of interest\n",
+    "event_df = pd.concat([del_phenocopy_df, amp_phenocopy_df, mut_phenocopy_df], axis=0)\n",
+    "\n",
+    "# Add predictions and covariate information for each sample in the event_df\n",
+    "pred_cols = ['weight', 'total_status', 'DISEASE', 'SUBTYPE', 'hypermutated']\n",
+    "prediction_sub_df = prediction_df.loc[:, pred_cols]\n",
+    "combined_df = event_df.merge(prediction_sub_df, how='left', left_on='sample_id', right_index=True)\n",
+    "print(combined_df.shape)\n",
+    "combined_df.head(2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(48690, 9)\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Filter TP53 mutants and hypermutated samples\n",
+    "plot_ready_df = combined_df[combined_df['total_status'] == 0.0]\n",
+    "plot_ready_df = plot_ready_df[plot_ready_df['hypermutated'] == 0.0]\n",
+    "print(plot_ready_df.shape)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## How many samples were filtered?"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "RPS6KA3    9666\n",
+       "CHEK1      9666\n",
+       "ATR        9666\n",
+       "MDM4       9666\n",
+       "ATM        9666\n",
+       "CHEK2      9666\n",
+       "PPM1D      9666\n",
+       "MDM2       9666\n",
+       "CDKN2A     9666\n",
+       "Name: gene, dtype: int64"
+      ]
+     },
+     "execution_count": 11,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# The same samples have equivalent representation across genes\n",
+    "combined_df['gene'].value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Filter 4037 TP53 mutated or deep deleted samples out of 9666 total samples\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Subset to a gene and observe filtering TP53 inactive tumors\n",
+    "tp53_status_df = combined_df[combined_df['gene'] == 'CDKN2A']\n",
+    "tp53_status = tp53_status_df['total_status'].value_counts()\n",
+    "print('Filter {} TP53 mutated or deep deleted samples out of {} total samples'\n",
+    "      .format(tp53_status[1], tp53_status_df.shape[0]))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Filter additional 219 hypermutated samples out of the remaining 5629 total samples\n",
+      "Final sample size: n = 5410\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Observe effects of hypermutated tumor filtering\n",
+    "hyper_df = tp53_status_df[tp53_status_df['total_status'] == 0.0]\n",
+    "hyper_status = hyper_df['hypermutated'].value_counts()           \n",
+    "print('Filter additional {} hypermutated samples out of the remaining {} total samples'\n",
+    "      .format(hyper_status[1], hyper_df.shape[0]))\n",
+    "\n",
+    "use_sample_df = hyper_df[hyper_df['hypermutated'] == 0]\n",
+    "print('Final sample size: n = {}'.format(use_sample_df.shape[0]))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "RPS6KA3    5410\n",
+       "PPM1D      5410\n",
+       "CHEK1      5410\n",
+       "ATR        5410\n",
+       "MDM4       5410\n",
+       "ATM        5410\n",
+       "MDM2       5410\n",
+       "CDKN2A     5410\n",
+       "CHEK2      5410\n",
+       "Name: gene, dtype: int64"
+      ]
+     },
+     "execution_count": 14,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Confirm sample size\n",
+    "plot_ready_df['gene'].value_counts()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Perform visualizations and include t-test results"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# Mutation plotting parameters\n",
+    "x1_mut, x2_mut = -0.2, 0.2\n",
+    "y_mut, h_mut = 1.01, 0.03\n",
+    "\n",
+    "for gene in set(plot_ready_df['gene']):\n",
+    "    gene_df = plot_ready_df[plot_ready_df['gene'] == gene]\n",
+    "    event = gene_df['event'].tolist()[0]\n",
+    "    \n",
+    "    if event == 'Mutation':\n",
+    "        plt.rcParams['figure.figsize']=(3, 4)\n",
+    "        use_palette = {0: '#d8b365', 1: '#5ab4ac'}\n",
+    "        legend_label_key = {0: 'Wild-Type',  1: 'Mutant'}\n",
+    "        \n",
+    "        # Perform an independent t-test against:\n",
+    "        # Mutant vs. Wild-type weights\n",
+    "        mutant_weight = gene_df[gene_df['value'] == 1]\n",
+    "\n",
+    "    else:\n",
+    "        plt.rcParams['figure.figsize']=(6, 4)\n",
+    "        use_palette = {-2: '#2c7bb6', -1: '#abd9e9', 0: '#fee08b', 1: '#fdae61', 2: '#d7191c'}\n",
+    "        legend_label_key = {-2: 'Deep Deletion', -1: 'Mild Deletion', 0: 'Wild-Type',\n",
+    "                            1: 'Mild Amplification', 2: 'High Amplification'}\n",
+    "\n",
+    "        # Perform an independent t-test against:\n",
+    "        # Amplification vs. Wild-type weights\n",
+    "        if gene == 'CDKN2A':\n",
+    "            mutant_weight = gene_df[(gene_df['value'] == -1) | (gene_df['value'] == -2)]\n",
+    "            x1_amp, x2_amp = -0.25, 0\n",
+    "            y_amp, h_mut = 1.01, 0.03\n",
+    "            y_text_amp = -0.125\n",
+    "            \n",
+    "            \n",
+    "            deep_del_weight = gene_df[gene_df['value'] == -2]\n",
+    "            mild_del_weight = gene_df[gene_df['value'] == -1]\n",
+    "            \n",
+    "            t_del_results = ttest_ind(a = mild_del_weight['weight'],\n",
+    "                                      b = deep_del_weight['weight'], equal_var = False)\n",
+    "            \n",
+    "            p_val_del = t_del_results.pvalue\n",
+    "            if p_val_del > 0.05:\n",
+    "                add_text_del = 'NS'\n",
+    "            else:\n",
+    "                add_text_del = \"{:.2E}\".format(Decimal(p_val_del))\n",
+    "        \n",
+    "        else:\n",
+    "            mutant_weight = gene_df[(gene_df['value'] == 1) | (gene_df['value'] == 2)]\n",
+    "            x1_amp, x2_amp = 0, 0.25\n",
+    "            y_amp, h_mut = 1.01, 0.03\n",
+    "            y_text_amp = 0.125\n",
+    "        \n",
+    "        if gene == 'MDM2':\n",
+    "            x1_amp, x2_amp = -0.1, 0.2\n",
+    "            y_text_amp = 0.05\n",
+    "\n",
+    "    wt_weight = gene_df[gene_df['value'] == 0]\n",
+    "\n",
+    "    # Output t-test results\n",
+    "    t_results = ttest_ind(a = wt_weight['weight'],\n",
+    "                          b = mutant_weight['weight'], equal_var = False)\n",
+    "\n",
+    "    p_val = t_results.pvalue\n",
+    "    if p_val > 0.05:\n",
+    "        add_text = 'NS'\n",
+    "    else:\n",
+    "        add_text = \"{:.2E}\".format(Decimal(p_val))\n",
+    "    \n",
+    "    # Generate Plots\n",
+    "    ax = sns.boxplot(x=\"gene\", y=\"weight\", hue='value',\n",
+    "                     data=gene_df,\n",
+    "                     palette=use_palette,\n",
+    "                     fliersize=0)\n",
+    "    \n",
+    "    handles, labels = ax.get_legend_handles_labels()\n",
+    "\n",
+    "    ax = sns.stripplot(x='gene', y='weight', hue='value',\n",
+    "                       data=gene_df, \n",
+    "                       dodge=True,\n",
+    "                       palette=use_palette,\n",
+    "                       edgecolor='black',\n",
+    "                       jitter=0.3,\n",
+    "                       size=2,\n",
+    "                       alpha=0.75)\n",
+    "    \n",
+    "    # Set Legend labels\n",
+    "    use_labels = []\n",
+    "    for lab in labels:\n",
+    "        use_labels.append(legend_label_key[int(lab)])\n",
+    "\n",
+    "    # Get x axis tick labels\n",
+    "    x_axis_tick_labels = []\n",
+    "    for key, value in legend_label_key.items():\n",
+    "        n = gene_df['value'].value_counts()\n",
+    "        try:\n",
+    "            n = n[key]\n",
+    "            result = 'n = {}\\n{}'.format(n, value)\n",
+    "            x_axis_tick_labels.append(result)\n",
+    "        except:\n",
+    "            next\n",
+    "    \n",
+    "    l = plt.legend(handles, use_labels, borderaxespad=0.)\n",
+    "    l.set_title('Status')\n",
+    "    ax.axes.set_ylim(-0.003, 1.2)\n",
+    "    ax.set_yticklabels(['', 0, 0.2, 0.4, 0.6, 0.8, 1, ''])\n",
+    "    ax.set_xticklabels('')\n",
+    "    ax.set_ylabel('TP53 Classifier Score')\n",
+    "    ax.legend\n",
+    "    ax.set_title(gene)\n",
+    "    plt.axhline(0.5, color='black', linestyle='dashed', linewidth=1)\n",
+    "    plt.tight_layout()\n",
+    "    \n",
+    "    if event == 'Mutation':\n",
+    "        plt.plot([x1_mut, x1_mut, x2_mut, x2_mut], [y_mut, y_mut + h_mut, y_mut + h_mut, y_mut],\n",
+    "                 lw=1.2, c='black')\n",
+    "        plt.text(0, y_mut + h_mut, add_text, ha='center', va='bottom', color=\"black\")\n",
+    "    else:\n",
+    "        plt.plot([x1_amp, x1_amp, x2_amp, x2_amp], [y_amp, y_amp + h_mut, y_amp + h_mut, y_amp],\n",
+    "                  lw=1.2, c='black')\n",
+    "        plt.text(y_text_amp, y_amp + h_mut, add_text, ha='center', va='bottom', color=\"black\")\n",
+    "        \n",
+    "        if gene == 'CDKN2A':\n",
+    "            y_add = 0.1\n",
+    "            plt.plot([x1_amp - 0.075, x1_amp - 0.075, x2_amp - 0.16, x2_amp - 0.16],\n",
+    "                     [y_amp + y_add, y_amp + h_mut + y_add, y_amp + h_mut + y_add, y_amp + y_add],\n",
+    "                      lw=1.2, c='black')\n",
+    "            plt.text(y_text_amp - 0.1, y_amp + h_mut + 0.1,\n",
+    "                     add_text_del, ha='center', va='bottom', color=\"black\")\n",
+    "    \n",
+    "    phenocopy_fig_file = os.path.join('..', 'classifiers', 'TP53', 'figures',\n",
+    "                                      '{}_gene_phenocopy.pdf'.format(gene))\n",
+    "    plt.savefig(phenocopy_fig_file)\n",
+    "    plt.close()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python (pancancer-classifier)",
+   "language": "python",
+   "name": "pancancer-classifier"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.5.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

	log10_mut	total_status	weight	TP53	TP53_loss	PATIENT_BARCODE	DISEASE	SUBTYPE	hypermutated	include	Hugo_Symbol	HGVSc	HGVSp	Variant_Classification
ID
TCGA-02-0047-01	1.812913	0.0	0.723395	0.0	0.0	TCGA-02-0047	GBM	IDHwt	0.0	1.0	NaN	NaN	NaN	Wild-Type
TCGA-02-0055-01	1.707570	1.0	0.698910	1.0	0.0	TCGA-02-0055	GBM	IDHwt	0.0	1.0	TP53	c.646G>A	p.Val216Met	Missense_Mutation
	sample_id	gene	value	event	weight	total_status	DISEASE	SUBTYPE	hypermutated
0	TCGA-02-0047-01	CDKN2A	-2	Deletion	0.723395	0.0	GBM	IDHwt	0.0
1	TCGA-02-0055-01	CDKN2A	0	Deletion	0.698910	1.0	GBM	IDHwt	0.0