Skip to content

Commit

Permalink
Fix sample freeze (greenelab#94)
Browse files (browse the repository at this point in the history)
* update sample freeze data to correct file

* run ras heatmap and alternative genes notebooks with jupyter nbconvert

* fix sample freeze and modernize pandas in ras heatmaps

* fix sample freeze in pathwaymapper script
  • Loading branch information
gwaybio authored Dec 3, 2018
1 parent 9fd9afb commit 2a0683b
Show file tree
Hide file tree
Showing 7 changed files with 9,182 additions and 10,261 deletions.
9,075 changes: 9,075 additions & 0 deletions data/sample_freeze.tsv

Large diffs are not rendered by default.

9,081 changes: 0 additions & 9,081 deletions data/sampleset_freeze.csv

This file was deleted.

13 changes: 11 additions & 2 deletions ras_analysis.sh
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,12 @@ python scripts/visualize_decisions.py --scores 'classifiers/RAS'
###############
python scripts/map_mutation_class.py --scores 'classifiers/RAS' \
--genes 'data/ras_genes.csv'
python scripts/alternative_genes_pathwaymapper.py

jupyter nbconvert --to=script \
--FilesWriter.build_directory=scripts \
--ExecutePreprocessor.kernel_name=python3 \
--ExecutePreprocessor.timeout=100000 \
--execute scripts/alternative_genes_pathwaymapper.ipynb

###############
# Step 6. Rerun Ras classifier without THCA and SKCM and perform analysis
Expand Down Expand Up @@ -183,7 +188,11 @@ python scripts/pancancer_classifier.py --genes 'KRAS,HRAS,NRAS' \
# Step 8. Plot additional Ras, NF1, and BRAF results
###############
# Plot Ras pathway heatmaps
python scripts/ras_count_heatmaps.py
jupyter nbconvert --to=script \
--FilesWriter.build_directory=scripts \
--ExecutePreprocessor.kernel_name=python3 \
--ExecutePreprocessor.timeout=100000 \
--execute scripts/ras_count_heatmaps.ipynb

# Visualize CCLE predictions
jupyter nbconvert --to=script \
Expand Down
74 changes: 19 additions & 55 deletions scripts/alternative_genes_pathwaymapper.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@
"source": [
"# Load Datasets\n",
"mut_file = os.path.join('..', 'data', 'pancan_mutation_freeze.tsv.gz')\n",
"sample_freeze_file = os.path.join('..', 'data', 'sampleset_freeze.csv')\n",
"sample_freeze_file = os.path.join('..', 'data', 'sample_freeze.tsv')\n",
"copy_loss_file = os.path.join('..', 'data', 'copy_number_loss_status.tsv.gz')\n",
"copy_gain_file = os.path.join('..', 'data', 'copy_number_gain_status.tsv.gz')\n",
"\n",
Expand Down Expand Up @@ -114,8 +114,8 @@
" <th>total_status</th>\n",
" <th>weight</th>\n",
" <th>NRAS</th>\n",
" <th>HRAS</th>\n",
" <th>KRAS</th>\n",
" <th>HRAS</th>\n",
" <th>HRAS_gain</th>\n",
" <th>KRAS_gain</th>\n",
" <th>NRAS_gain</th>\n",
Expand Down Expand Up @@ -222,7 +222,7 @@
"</div>"
],
"text/plain": [
" SAMPLE_BARCODE log10_mut total_status weight NRAS HRAS KRAS \\\n",
" SAMPLE_BARCODE log10_mut total_status weight NRAS KRAS HRAS \\\n",
"0 TCGA-02-0047-01 1.812913 0 0.357117 0 0 0 \n",
"1 TCGA-02-0055-01 1.707570 0 0.530723 0 0 0 \n",
"2 TCGA-02-2483-01 1.662758 0 0.642091 0 0 0 \n",
Expand Down Expand Up @@ -524,28 +524,6 @@
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/gway/anaconda3/envs/pancancer-classifier/lib/python3.6/site-packages/ipykernel_launcher.py:2: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version\n",
"of pandas will change to not sort by default.\n",
"\n",
"To accept the future behavior, pass 'sort=True'.\n",
"\n",
"To retain the current behavior and silence the warning, pass sort=False\n",
"\n",
" \n",
"/home/gway/anaconda3/envs/pancancer-classifier/lib/python3.6/site-packages/ipykernel_launcher.py:8: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version\n",
"of pandas will change to not sort by default.\n",
"\n",
"To accept the future behavior, pass 'sort=True'.\n",
"\n",
"To retain the current behavior and silence the warning, pass sort=False\n",
"\n",
" \n"
]
},
{
"data": {
"text/html": [
Expand Down Expand Up @@ -596,30 +574,30 @@
" <td>12.584116</td>\n",
" </tr>\n",
" <tr>\n",
" <th>CBL</th>\n",
" <td>9.166522</td>\n",
" <td>9.345175</td>\n",
" <td>2.555843</td>\n",
" <td>3.376221</td>\n",
" </tr>\n",
" <tr>\n",
" <th>EGFR</th>\n",
" <td>-4.273536</td>\n",
" <td>-3.678272</td>\n",
" <td>5.194929</td>\n",
" <td>5.384128</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ERBB2</th>\n",
" <td>8.547825</td>\n",
" <td>10.703101</td>\n",
" <td>7.275319</td>\n",
" <td>7.803525</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" ras_auroc no_ras_auroc ras_auprc no_ras_auprc\n",
"ALK 10.336711 9.806159 5.136642 4.880183\n",
"ARAF 9.596890 9.838528 2.657702 2.588441\n",
"BRAF -5.287740 -4.301018 8.447411 12.584116\n",
"CBL 9.166522 9.345175 2.555843 3.376221\n",
"EGFR -4.273536 -3.678272 5.194929 5.384128"
" ras_auroc no_ras_auroc ras_auprc no_ras_auprc\n",
"ALK 10.336711 9.806159 5.136642 4.880183\n",
"ARAF 9.596890 9.838528 2.657702 2.588441\n",
"BRAF -5.287740 -4.301018 8.447411 12.584116\n",
"EGFR -4.273536 -3.678272 5.194929 5.384128\n",
"ERBB2 8.547825 10.703101 7.275319 7.803525"
]
},
"execution_count": 14,
Expand All @@ -629,13 +607,13 @@
],
"source": [
"# Get output metrics for Ras classification\n",
"output_ras_metrics = pd.concat([full_auroc, full_auroc_remove], axis=1)\n",
"output_ras_metrics = pd.concat([full_auroc, full_auroc_remove], axis=1, sort=False)\n",
"output_ras_metrics = output_ras_metrics * 100 # To get percent\n",
"output_ras_metrics = output_ras_metrics - 50 # Subtract 50 from AUROC only\n",
"\n",
"# Combine with AUPRC\n",
"output_ras_metrics = pd.concat([output_ras_metrics, full_auprc * 100,\n",
" full_auprc_remove * 100], axis=1)\n",
" full_auprc_remove * 100], axis=1, sort=False)\n",
"output_ras_metrics.columns = ['ras_auroc', 'no_ras_auroc', 'ras_auprc', 'no_ras_auprc']\n",
"\n",
"# Fill removed Ras metrics with included metrics\n",
Expand Down Expand Up @@ -1015,20 +993,6 @@
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/gway/anaconda3/envs/pancancer-classifier/lib/python3.6/site-packages/ipykernel_launcher.py:11: DeprecationWarning: \n",
".ix is deprecated. Please use\n",
".loc for label based indexing or\n",
".iloc for positional indexing\n",
"\n",
"See the documentation here:\n",
"http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated\n",
" # This is added back by InteractiveShellApp.init_path()\n"
]
},
{
"data": {
"text/html": [
Expand Down Expand Up @@ -1183,7 +1147,7 @@
"all_genes_auroc_df = all_genes_auroc_df.assign(auroc_rank = list(range(0, all_genes_auprc_df.shape[0])))\n",
"\n",
"all_genes_auprc_df = all_genes_auprc_df.assign(ras = 0)\n",
"all_genes_auprc_df.ix[all_genes_auprc_df.index.isin(ras_genes_df['genes']), 'ras'] = 1\n",
"all_genes_auprc_df.loc[all_genes_auprc_df.index.isin(ras_genes_df['genes']), 'ras'] = 1\n",
"\n",
"all_genes_metrics_df = all_genes_auprc_df.reset_index().merge(all_genes_auroc_df,\n",
" left_on='index', right_index=True)\n",
Expand Down
11 changes: 6 additions & 5 deletions scripts/alternative_genes_pathwaymapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@

# Ensure that the path is starting in the scripts directory
if not cwd.split('/')[-1] == 'scripts':
os.chdir(os.path.join(cwd, 'scripts'))
sys.path.append(os.path.join(cwd, 'scripts'))


# In[3]:
Expand All @@ -48,7 +48,7 @@ def get_gene_auprc(x, w):

# Load Datasets
mut_file = os.path.join('..', 'data', 'pancan_mutation_freeze.tsv.gz')
sample_freeze_file = os.path.join('..', 'data', 'sampleset_freeze.csv')
sample_freeze_file = os.path.join('..', 'data', 'sample_freeze.tsv')
copy_loss_file = os.path.join('..', 'data', 'copy_number_loss_status.tsv.gz')
copy_gain_file = os.path.join('..', 'data', 'copy_number_gain_status.tsv.gz')

Expand Down Expand Up @@ -151,13 +151,13 @@ def get_gene_auprc(x, w):


# Get output metrics for Ras classification
output_ras_metrics = pd.concat([full_auroc, full_auroc_remove], axis=1)
output_ras_metrics = pd.concat([full_auroc, full_auroc_remove], axis=1, sort=False)
output_ras_metrics = output_ras_metrics * 100 # To get percent
output_ras_metrics = output_ras_metrics - 50 # Subtract 50 from AUROC only

# Combine with AUPRC
output_ras_metrics = pd.concat([output_ras_metrics, full_auprc * 100,
full_auprc_remove * 100], axis=1)
full_auprc_remove * 100], axis=1, sort=False)
output_ras_metrics.columns = ['ras_auroc', 'no_ras_auroc', 'ras_auprc', 'no_ras_auprc']

# Fill removed Ras metrics with included metrics
Expand Down Expand Up @@ -254,11 +254,12 @@ def get_gene_auprc(x, w):
all_genes_auroc_df = all_genes_auroc_df.assign(auroc_rank = list(range(0, all_genes_auprc_df.shape[0])))

all_genes_auprc_df = all_genes_auprc_df.assign(ras = 0)
all_genes_auprc_df.ix[all_genes_auprc_df.index.isin(ras_genes_df['genes']), 'ras'] = 1
all_genes_auprc_df.loc[all_genes_auprc_df.index.isin(ras_genes_df['genes']), 'ras'] = 1

all_genes_metrics_df = all_genes_auprc_df.reset_index().merge(all_genes_auroc_df,
left_on='index', right_index=True)

all_genes_metrics_df.columns = ['Gene', 'AUPRC', 'AUPRC Rank', 'ras', 'AUROC', 'AUROC Rank']
all_genes_metrics_df.to_csv(all_gene_metrics_file, sep='\t', index=False)
all_genes_metrics_df.head(10)

1,121 changes: 38 additions & 1,083 deletions scripts/ras_count_heatmaps.ipynb

Large diffs are not rendered by default.

Loading

0 comments on commit 2a0683b

Please sign in to comment.