From e85f92514d2c62a084cc7391a27d69ca8792f12d Mon Sep 17 00:00:00 2001 From: Greg Way Date: Fri, 20 Oct 2017 12:55:50 -0400 Subject: [PATCH] add tp53 phenocopy analysis (#61) --- scripts/tp53_phenocopy.ipynb | 808 +++++++++++++++++++++++++++++++++++ 1 file changed, 808 insertions(+) create mode 100644 scripts/tp53_phenocopy.ipynb diff --git a/scripts/tp53_phenocopy.ipynb b/scripts/tp53_phenocopy.ipynb new file mode 100644 index 0000000..9337f46 --- /dev/null +++ b/scripts/tp53_phenocopy.ipynb @@ -0,0 +1,808 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# TP53 Phenocopying Events\n", + "\n", + "In the following script, we visualize and determine if specific gene events phenocopy TP53 loss of function. We focus on _MDM2_, _MDM4_, and _PPM1D_ amplifications, _CDKN2A_ deletions, and _ATM_, _CHEK1_, _CHEK2_, _ATR_, and _RPS6KA3_ mutations. \n", + "\n", + "We first filter out samples with _TP53_ mutations and deep deletions as well as hypermutated samples. We then visualize the difference of the TP53 classifier score across tumors with the given events above. We perform t-tests to determine if the classifier scores are significantly different in the direction of a priori assumptions." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import os\n", + "import pandas as pd\n", + "import numpy as np\n", + "from decimal import Decimal\n", + "from scipy.stats import ttest_ind\n", + "\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "np.random.seed(123)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "sns.set(style='whitegrid')\n", + "sns.set_context('paper', rc={'font.size':11, 'axes.titlesize':11, 'axes.labelsize':16,\n", + " 'xtick.labelsize':11, 'ytick.labelsize':11})" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# genes of interest\n", + "amplifications = ['MDM2', 'MDM4', 'PPM1D']\n", + "deletions = ['CDKN2A']\n", + "mutations = ['ATM', 'CHEK1', 'CHEK2', 'ATR', 'RPS6KA3']" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# Load mutation, copy number, and sample freeze data\n", + "mut_file = os.path.join('..', 'data', 'pancan_mutation_freeze.tsv')\n", + "gistic_file = os.path.join('..', 'data', 'raw', 'pancan_GISTIC_threshold.tsv')\n", + "sample_freeze_file = os.path.join('..', 'data', 'sample_freeze.tsv')\n", + "\n", + "mutation_df = pd.read_table(mut_file, index_col=0)\n", + "copy_df = pd.read_table(gistic_file, index_col=0)\n", + "sample_freeze_df = pd.read_table(sample_freeze_file)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
log10_muttotal_statusweightTP53TP53_lossPATIENT_BARCODEDISEASESUBTYPEhypermutatedincludeHugo_SymbolHGVScHGVSpVariant_Classification
ID
TCGA-02-0047-011.8129130.00.7233950.00.0TCGA-02-0047GBMIDHwt0.01.0NaNNaNNaNWild-Type
TCGA-02-0055-011.7075701.00.6989101.00.0TCGA-02-0055GBMIDHwt0.01.0TP53c.646G>Ap.Val216MetMissense_Mutation
\n", + "
" + ], + "text/plain": [ + " log10_mut total_status weight TP53 TP53_loss \\\n", + "ID \n", + "TCGA-02-0047-01 1.812913 0.0 0.723395 0.0 0.0 \n", + "TCGA-02-0055-01 1.707570 1.0 0.698910 1.0 0.0 \n", + "\n", + " PATIENT_BARCODE DISEASE SUBTYPE hypermutated include \\\n", + "ID \n", + "TCGA-02-0047-01 TCGA-02-0047 GBM IDHwt 0.0 1.0 \n", + "TCGA-02-0055-01 TCGA-02-0055 GBM IDHwt 0.0 1.0 \n", + "\n", + " Hugo_Symbol HGVSc HGVSp Variant_Classification \n", + "ID \n", + "TCGA-02-0047-01 NaN NaN NaN Wild-Type \n", + "TCGA-02-0055-01 TP53 c.646G>A p.Val216Met Missense_Mutation " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# The prediction file stores the TP53 classifier scores for all samples\n", + "prediction_file = os.path.join('..', 'classifiers', 'TP53', 'tables',\n", + " 'TP53_loss_prediction_scores.tsv')\n", + "prediction_df = pd.read_table(prediction_file, index_col=0)\n", + "prediction_df.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Gene SymbolACAP3ACTRT2AGRNANKRD65ATAD3AATAD3BATAD3CAURKAIP1B3GALT6C1orf159...RP13-228J13.1SMIM9SNORA36ASNORA56TMLHEVBP1DDX11L16|chrXIL9R|chrXSPRY3|chrXVAMP7|chrX
TCGA-02-0047-010000000000...0000000000
TCGA-02-0055-011111111111...-1-1-1-1-1-1-1-1-1-1
\n", + "

2 rows × 25128 columns

\n", + "
" + ], + "text/plain": [ + "Gene Symbol ACAP3 ACTRT2 AGRN ANKRD65 ATAD3A ATAD3B ATAD3C \\\n", + "TCGA-02-0047-01 0 0 0 0 0 0 0 \n", + "TCGA-02-0055-01 1 1 1 1 1 1 1 \n", + "\n", + "Gene Symbol AURKAIP1 B3GALT6 C1orf159 ... RP13-228J13.1 \\\n", + "TCGA-02-0047-01 0 0 0 ... 0 \n", + "TCGA-02-0055-01 1 1 1 ... -1 \n", + "\n", + "Gene Symbol SMIM9 SNORA36A SNORA56 TMLHE VBP1 DDX11L16|chrX \\\n", + "TCGA-02-0047-01 0 0 0 0 0 0 \n", + "TCGA-02-0055-01 -1 -1 -1 -1 -1 -1 \n", + "\n", + "Gene Symbol IL9R|chrX SPRY3|chrX VAMP7|chrX \n", + "TCGA-02-0047-01 0 0 0 \n", + "TCGA-02-0055-01 -1 -1 -1 \n", + "\n", + "[2 rows x 25128 columns]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Process and subset the GISTIC copy number data\n", + "# We do not use the preprocessed copy number data because we are also \n", + "# interested in intermediate copy number events\n", + "copy_df.drop(['Locus ID', 'Cytoband'], axis=1, inplace=True)\n", + "copy_df.columns = copy_df.columns.str[0:15]\n", + "\n", + "copy_df = copy_df.T\n", + "copy_df = copy_df.loc[sorted(sample_freeze_df['SAMPLE_BARCODE'])]\n", + "copy_df = copy_df.fillna(0)\n", + "copy_df = copy_df.astype(int)\n", + "copy_df.head(2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Subset corresponding data type according to predicted phenocopying mechanism" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "del_phenocopy_df = copy_df.loc[:, deletions]\n", + "del_phenocopy_df = del_phenocopy_df.reset_index().melt(id_vars=['index'])\n", + "del_phenocopy_df.columns = ['sample_id', 'gene', 'value']\n", + "del_phenocopy_df = del_phenocopy_df.assign(event = 'Deletion')\n", + "\n", + "amp_phenocopy_df = copy_df.loc[:, amplifications]\n", + "amp_phenocopy_df = amp_phenocopy_df.reset_index().melt(id_vars=['index'])\n", + "amp_phenocopy_df.columns = ['sample_id', 'gene', 'value']\n", + "amp_phenocopy_df = amp_phenocopy_df.assign(event = 'Amplification')\n", + "\n", + "mut_phenocopy_df = mutation_df.loc[:, mutations]\n", + "mut_phenocopy_df = mut_phenocopy_df.reset_index().melt(id_vars=['SAMPLE_BARCODE'])\n", + "mut_phenocopy_df.columns = ['sample_id', 'gene', 'value']\n", + "mut_phenocopy_df = mut_phenocopy_df.assign(event = 'Mutation')" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(86994, 9)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sample_idgenevalueeventweighttotal_statusDISEASESUBTYPEhypermutated
0TCGA-02-0047-01CDKN2A-2Deletion0.7233950.0GBMIDHwt0.0
1TCGA-02-0055-01CDKN2A0Deletion0.6989101.0GBMIDHwt0.0
\n", + "
" + ], + "text/plain": [ + " sample_id gene value event weight total_status DISEASE \\\n", + "0 TCGA-02-0047-01 CDKN2A -2 Deletion 0.723395 0.0 GBM \n", + "1 TCGA-02-0055-01 CDKN2A 0 Deletion 0.698910 1.0 GBM \n", + "\n", + " SUBTYPE hypermutated \n", + "0 IDHwt 0.0 \n", + "1 IDHwt 0.0 " + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Combine the gene events of interest\n", + "event_df = pd.concat([del_phenocopy_df, amp_phenocopy_df, mut_phenocopy_df], axis=0)\n", + "\n", + "# Add predictions and covariate information for each sample in the event_df\n", + "pred_cols = ['weight', 'total_status', 'DISEASE', 'SUBTYPE', 'hypermutated']\n", + "prediction_sub_df = prediction_df.loc[:, pred_cols]\n", + "combined_df = event_df.merge(prediction_sub_df, how='left', left_on='sample_id', right_index=True)\n", + "print(combined_df.shape)\n", + "combined_df.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(48690, 9)\n" + ] + } + ], + "source": [ + "# Filter TP53 mutants and hypermutated samples\n", + "plot_ready_df = combined_df[combined_df['total_status'] == 0.0]\n", + "plot_ready_df = plot_ready_df[plot_ready_df['hypermutated'] == 0.0]\n", + "print(plot_ready_df.shape)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## How many samples were filtered?" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "RPS6KA3 9666\n", + "CHEK1 9666\n", + "ATR 9666\n", + "MDM4 9666\n", + "ATM 9666\n", + "CHEK2 9666\n", + "PPM1D 9666\n", + "MDM2 9666\n", + "CDKN2A 9666\n", + "Name: gene, dtype: int64" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# The same samples have equivalent representation across genes\n", + "combined_df['gene'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Filter 4037 TP53 mutated or deep deleted samples out of 9666 total samples\n" + ] + } + ], + "source": [ + "# Subset to a gene and observe filtering TP53 inactive tumors\n", + "tp53_status_df = combined_df[combined_df['gene'] == 'CDKN2A']\n", + "tp53_status = tp53_status_df['total_status'].value_counts()\n", + "print('Filter {} TP53 mutated or deep deleted samples out of {} total samples'\n", + " .format(tp53_status[1], tp53_status_df.shape[0]))" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Filter additional 219 hypermutated samples out of the remaining 5629 total samples\n", + "Final sample size: n = 5410\n" + ] + } + ], + "source": [ + "# Observe effects of hypermutated tumor filtering\n", + "hyper_df = tp53_status_df[tp53_status_df['total_status'] == 0.0]\n", + "hyper_status = hyper_df['hypermutated'].value_counts() \n", + "print('Filter additional {} hypermutated samples out of the remaining {} total samples'\n", + " .format(hyper_status[1], hyper_df.shape[0]))\n", + "\n", + "use_sample_df = hyper_df[hyper_df['hypermutated'] == 0]\n", + "print('Final sample size: n = {}'.format(use_sample_df.shape[0]))" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "RPS6KA3 5410\n", + "PPM1D 5410\n", + "CHEK1 5410\n", + "ATR 5410\n", + "MDM4 5410\n", + "ATM 5410\n", + "MDM2 5410\n", + "CDKN2A 5410\n", + "CHEK2 5410\n", + "Name: gene, dtype: int64" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Confirm sample size\n", + "plot_ready_df['gene'].value_counts()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Perform visualizations and include t-test results" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# Mutation plotting parameters\n", + "x1_mut, x2_mut = -0.2, 0.2\n", + "y_mut, h_mut = 1.01, 0.03\n", + "\n", + "for gene in set(plot_ready_df['gene']):\n", + " gene_df = plot_ready_df[plot_ready_df['gene'] == gene]\n", + " event = gene_df['event'].tolist()[0]\n", + " \n", + " if event == 'Mutation':\n", + " plt.rcParams['figure.figsize']=(3, 4)\n", + " use_palette = {0: '#d8b365', 1: '#5ab4ac'}\n", + " legend_label_key = {0: 'Wild-Type', 1: 'Mutant'}\n", + " \n", + " # Perform an independent t-test against:\n", + " # Mutant vs. Wild-type weights\n", + " mutant_weight = gene_df[gene_df['value'] == 1]\n", + "\n", + " else:\n", + " plt.rcParams['figure.figsize']=(6, 4)\n", + " use_palette = {-2: '#2c7bb6', -1: '#abd9e9', 0: '#fee08b', 1: '#fdae61', 2: '#d7191c'}\n", + " legend_label_key = {-2: 'Deep Deletion', -1: 'Mild Deletion', 0: 'Wild-Type',\n", + " 1: 'Mild Amplification', 2: 'High Amplification'}\n", + "\n", + " # Perform an independent t-test against:\n", + " # Amplification vs. Wild-type weights\n", + " if gene == 'CDKN2A':\n", + " mutant_weight = gene_df[(gene_df['value'] == -1) | (gene_df['value'] == -2)]\n", + " x1_amp, x2_amp = -0.25, 0\n", + " y_amp, h_mut = 1.01, 0.03\n", + " y_text_amp = -0.125\n", + " \n", + " \n", + " deep_del_weight = gene_df[gene_df['value'] == -2]\n", + " mild_del_weight = gene_df[gene_df['value'] == -1]\n", + " \n", + " t_del_results = ttest_ind(a = mild_del_weight['weight'],\n", + " b = deep_del_weight['weight'], equal_var = False)\n", + " \n", + " p_val_del = t_del_results.pvalue\n", + " if p_val_del > 0.05:\n", + " add_text_del = 'NS'\n", + " else:\n", + " add_text_del = \"{:.2E}\".format(Decimal(p_val_del))\n", + " \n", + " else:\n", + " mutant_weight = gene_df[(gene_df['value'] == 1) | (gene_df['value'] == 2)]\n", + " x1_amp, x2_amp = 0, 0.25\n", + " y_amp, h_mut = 1.01, 0.03\n", + " y_text_amp = 0.125\n", + " \n", + " if gene == 'MDM2':\n", + " x1_amp, x2_amp = -0.1, 0.2\n", + " y_text_amp = 0.05\n", + "\n", + " wt_weight = gene_df[gene_df['value'] == 0]\n", + "\n", + " # Output t-test results\n", + " t_results = ttest_ind(a = wt_weight['weight'],\n", + " b = mutant_weight['weight'], equal_var = False)\n", + "\n", + " p_val = t_results.pvalue\n", + " if p_val > 0.05:\n", + " add_text = 'NS'\n", + " else:\n", + " add_text = \"{:.2E}\".format(Decimal(p_val))\n", + " \n", + " # Generate Plots\n", + " ax = sns.boxplot(x=\"gene\", y=\"weight\", hue='value',\n", + " data=gene_df,\n", + " palette=use_palette,\n", + " fliersize=0)\n", + " \n", + " handles, labels = ax.get_legend_handles_labels()\n", + "\n", + " ax = sns.stripplot(x='gene', y='weight', hue='value',\n", + " data=gene_df, \n", + " dodge=True,\n", + " palette=use_palette,\n", + " edgecolor='black',\n", + " jitter=0.3,\n", + " size=2,\n", + " alpha=0.75)\n", + " \n", + " # Set Legend labels\n", + " use_labels = []\n", + " for lab in labels:\n", + " use_labels.append(legend_label_key[int(lab)])\n", + "\n", + " # Get x axis tick labels\n", + " x_axis_tick_labels = []\n", + " for key, value in legend_label_key.items():\n", + " n = gene_df['value'].value_counts()\n", + " try:\n", + " n = n[key]\n", + " result = 'n = {}\\n{}'.format(n, value)\n", + " x_axis_tick_labels.append(result)\n", + " except:\n", + " next\n", + " \n", + " l = plt.legend(handles, use_labels, borderaxespad=0.)\n", + " l.set_title('Status')\n", + " ax.axes.set_ylim(-0.003, 1.2)\n", + " ax.set_yticklabels(['', 0, 0.2, 0.4, 0.6, 0.8, 1, ''])\n", + " ax.set_xticklabels('')\n", + " ax.set_ylabel('TP53 Classifier Score')\n", + " ax.legend\n", + " ax.set_title(gene)\n", + " plt.axhline(0.5, color='black', linestyle='dashed', linewidth=1)\n", + " plt.tight_layout()\n", + " \n", + " if event == 'Mutation':\n", + " plt.plot([x1_mut, x1_mut, x2_mut, x2_mut], [y_mut, y_mut + h_mut, y_mut + h_mut, y_mut],\n", + " lw=1.2, c='black')\n", + " plt.text(0, y_mut + h_mut, add_text, ha='center', va='bottom', color=\"black\")\n", + " else:\n", + " plt.plot([x1_amp, x1_amp, x2_amp, x2_amp], [y_amp, y_amp + h_mut, y_amp + h_mut, y_amp],\n", + " lw=1.2, c='black')\n", + " plt.text(y_text_amp, y_amp + h_mut, add_text, ha='center', va='bottom', color=\"black\")\n", + " \n", + " if gene == 'CDKN2A':\n", + " y_add = 0.1\n", + " plt.plot([x1_amp - 0.075, x1_amp - 0.075, x2_amp - 0.16, x2_amp - 0.16],\n", + " [y_amp + y_add, y_amp + h_mut + y_add, y_amp + h_mut + y_add, y_amp + y_add],\n", + " lw=1.2, c='black')\n", + " plt.text(y_text_amp - 0.1, y_amp + h_mut + 0.1,\n", + " add_text_del, ha='center', va='bottom', color=\"black\")\n", + " \n", + " phenocopy_fig_file = os.path.join('..', 'classifiers', 'TP53', 'figures',\n", + " '{}_gene_phenocopy.pdf'.format(gene))\n", + " plt.savefig(phenocopy_fig_file)\n", + " plt.close()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python (pancancer-classifier)", + "language": "python", + "name": "pancancer-classifier" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}