Fix sample freeze (greenelab#94)

* Update sample freeze data to the correct file
* Run the Ras heatmap and alternative genes scripts with Jupyter
* Fix sample freeze and modernize pandas usage in the Ras heatmaps
* Fix sample freeze in the pathwaymapper script
nvk747 · Dec 3, 2018 · 2a0683b · 2a0683b
1 parent 9fd9afb
commit 2a0683b
Show file tree

Hide file tree

Showing 7 changed files with 9,182 additions and 10,261 deletions.
diff --git a/data/sample_freeze.tsv b/data/sample_freeze.tsv
diff --git a/data/sampleset_freeze.csv b/data/sampleset_freeze.csv
diff --git a/ras_analysis.sh b/ras_analysis.sh
@@ -76,7 +76,12 @@ python scripts/visualize_decisions.py --scores 'classifiers/RAS'
 ###############
 python scripts/map_mutation_class.py --scores 'classifiers/RAS' \
         --genes 'data/ras_genes.csv'
-python scripts/alternative_genes_pathwaymapper.py
+
+jupyter nbconvert --to=script \
+        --FilesWriter.build_directory=scripts \
+        --ExecutePreprocessor.kernel_name=python3 \
+        --ExecutePreprocessor.timeout=100000 \
+        --execute scripts/alternative_genes_pathwaymapper.ipynb
 
 ###############
 # Step 6. Rerun Ras classifier without THCA and SKCM and perform analysis
@@ -183,7 +188,11 @@ python scripts/pancancer_classifier.py --genes 'KRAS,HRAS,NRAS' \
 # Step 8. Plot additional Ras, NF1, and BRAF results
 ###############
 # Plot Ras pathway heatmaps
-python scripts/ras_count_heatmaps.py
+jupyter nbconvert --to=script \
+        --FilesWriter.build_directory=scripts \
+        --ExecutePreprocessor.kernel_name=python3 \
+        --ExecutePreprocessor.timeout=100000 \
+        --execute scripts/ras_count_heatmaps.ipynb
 
 # Visualize CCLE predictions
 jupyter nbconvert --to=script \

diff --git a/scripts/alternative_genes_pathwaymapper.ipynb b/scripts/alternative_genes_pathwaymapper.ipynb
@@ -62,7 +62,7 @@
    "source": [
     "# Load Datasets\n",
     "mut_file = os.path.join('..', 'data', 'pancan_mutation_freeze.tsv.gz')\n",
-    "sample_freeze_file = os.path.join('..', 'data', 'sampleset_freeze.csv')\n",
+    "sample_freeze_file = os.path.join('..', 'data', 'sample_freeze.tsv')\n",
     "copy_loss_file = os.path.join('..', 'data', 'copy_number_loss_status.tsv.gz')\n",
     "copy_gain_file = os.path.join('..', 'data', 'copy_number_gain_status.tsv.gz')\n",
     "\n",
@@ -114,8 +114,8 @@
        "      <th>total_status</th>\n",
        "      <th>weight</th>\n",
        "      <th>NRAS</th>\n",
-       "      <th>HRAS</th>\n",
        "      <th>KRAS</th>\n",
+       "      <th>HRAS</th>\n",
        "      <th>HRAS_gain</th>\n",
        "      <th>KRAS_gain</th>\n",
        "      <th>NRAS_gain</th>\n",
@@ -222,7 +222,7 @@
        "</div>"
       ],
       "text/plain": [
-       "    SAMPLE_BARCODE  log10_mut  total_status    weight  NRAS  HRAS  KRAS  \\\n",
+       "    SAMPLE_BARCODE  log10_mut  total_status    weight  NRAS  KRAS  HRAS  \\\n",
        "0  TCGA-02-0047-01   1.812913             0  0.357117     0     0     0   \n",
        "1  TCGA-02-0055-01   1.707570             0  0.530723     0     0     0   \n",
        "2  TCGA-02-2483-01   1.662758             0  0.642091     0     0     0   \n",
@@ -524,28 +524,6 @@
    "execution_count": 14,
    "metadata": {},
    "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/home/gway/anaconda3/envs/pancancer-classifier/lib/python3.6/site-packages/ipykernel_launcher.py:2: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version\n",
-      "of pandas will change to not sort by default.\n",
-      "\n",
-      "To accept the future behavior, pass 'sort=True'.\n",
-      "\n",
-      "To retain the current behavior and silence the warning, pass sort=False\n",
-      "\n",
-      "  \n",
-      "/home/gway/anaconda3/envs/pancancer-classifier/lib/python3.6/site-packages/ipykernel_launcher.py:8: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version\n",
-      "of pandas will change to not sort by default.\n",
-      "\n",
-      "To accept the future behavior, pass 'sort=True'.\n",
-      "\n",
-      "To retain the current behavior and silence the warning, pass sort=False\n",
-      "\n",
-      "  \n"
-     ]
-    },
     {
      "data": {
       "text/html": [
@@ -596,30 +574,30 @@
        "      <td>12.584116</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>CBL</th>\n",
-       "      <td>9.166522</td>\n",
-       "      <td>9.345175</td>\n",
-       "      <td>2.555843</td>\n",
-       "      <td>3.376221</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
        "      <th>EGFR</th>\n",
        "      <td>-4.273536</td>\n",
        "      <td>-3.678272</td>\n",
        "      <td>5.194929</td>\n",
        "      <td>5.384128</td>\n",
        "    </tr>\n",
+       "    <tr>\n",
+       "      <th>ERBB2</th>\n",
+       "      <td>8.547825</td>\n",
+       "      <td>10.703101</td>\n",
+       "      <td>7.275319</td>\n",
+       "      <td>7.803525</td>\n",
+       "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "text/plain": [
-       "      ras_auroc  no_ras_auroc  ras_auprc  no_ras_auprc\n",
-       "ALK   10.336711      9.806159   5.136642      4.880183\n",
-       "ARAF   9.596890      9.838528   2.657702      2.588441\n",
-       "BRAF  -5.287740     -4.301018   8.447411     12.584116\n",
-       "CBL    9.166522      9.345175   2.555843      3.376221\n",
-       "EGFR  -4.273536     -3.678272   5.194929      5.384128"
+       "       ras_auroc  no_ras_auroc  ras_auprc  no_ras_auprc\n",
+       "ALK    10.336711      9.806159   5.136642      4.880183\n",
+       "ARAF    9.596890      9.838528   2.657702      2.588441\n",
+       "BRAF   -5.287740     -4.301018   8.447411     12.584116\n",
+       "EGFR   -4.273536     -3.678272   5.194929      5.384128\n",
+       "ERBB2   8.547825     10.703101   7.275319      7.803525"
       ]
      },
      "execution_count": 14,
@@ -629,13 +607,13 @@
    ],
    "source": [
     "# Get output metrics for Ras classification\n",
-    "output_ras_metrics = pd.concat([full_auroc, full_auroc_remove], axis=1)\n",
+    "output_ras_metrics = pd.concat([full_auroc, full_auroc_remove], axis=1, sort=False)\n",
     "output_ras_metrics = output_ras_metrics * 100  # To get percent\n",
     "output_ras_metrics = output_ras_metrics - 50  # Subtract 50 from AUROC only\n",
     "\n",
     "# Combine with AUPRC\n",
     "output_ras_metrics = pd.concat([output_ras_metrics, full_auprc * 100,\n",
-    "                                full_auprc_remove * 100], axis=1)\n",
+    "                                full_auprc_remove * 100], axis=1, sort=False)\n",
     "output_ras_metrics.columns = ['ras_auroc', 'no_ras_auroc', 'ras_auprc', 'no_ras_auprc']\n",
     "\n",
     "# Fill removed Ras metrics with included metrics\n",
@@ -1015,20 +993,6 @@
    "execution_count": 20,
    "metadata": {},
    "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/home/gway/anaconda3/envs/pancancer-classifier/lib/python3.6/site-packages/ipykernel_launcher.py:11: DeprecationWarning: \n",
-      ".ix is deprecated. Please use\n",
-      ".loc for label based indexing or\n",
-      ".iloc for positional indexing\n",
-      "\n",
-      "See the documentation here:\n",
-      "http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated\n",
-      "  # This is added back by InteractiveShellApp.init_path()\n"
-     ]
-    },
     {
      "data": {
       "text/html": [
@@ -1183,7 +1147,7 @@
     "all_genes_auroc_df = all_genes_auroc_df.assign(auroc_rank = list(range(0, all_genes_auprc_df.shape[0])))\n",
     "\n",
     "all_genes_auprc_df = all_genes_auprc_df.assign(ras = 0)\n",
-    "all_genes_auprc_df.ix[all_genes_auprc_df.index.isin(ras_genes_df['genes']), 'ras'] = 1\n",
+    "all_genes_auprc_df.loc[all_genes_auprc_df.index.isin(ras_genes_df['genes']), 'ras'] = 1\n",
     "\n",
     "all_genes_metrics_df = all_genes_auprc_df.reset_index().merge(all_genes_auroc_df,\n",
     "                                                              left_on='index', right_index=True)\n",

diff --git a/scripts/alternative_genes_pathwaymapper.py b/scripts/alternative_genes_pathwaymapper.py
@@ -22,7 +22,7 @@
 
 # Ensure that the path is starting in the scripts directory
 if not cwd.split('/')[-1] == 'scripts':
-    os.chdir(os.path.join(cwd, 'scripts'))
+    sys.path.append(os.path.join(cwd, 'scripts'))
 
 
 # In[3]:
@@ -48,7 +48,7 @@ def get_gene_auprc(x, w):
 
 # Load Datasets
 mut_file = os.path.join('..', 'data', 'pancan_mutation_freeze.tsv.gz')
-sample_freeze_file = os.path.join('..', 'data', 'sampleset_freeze.csv')
+sample_freeze_file = os.path.join('..', 'data', 'sample_freeze.tsv')
 copy_loss_file = os.path.join('..', 'data', 'copy_number_loss_status.tsv.gz')
 copy_gain_file = os.path.join('..', 'data', 'copy_number_gain_status.tsv.gz')
 
@@ -151,13 +151,13 @@ def get_gene_auprc(x, w):
 
 
 # Get output metrics for Ras classification
-output_ras_metrics = pd.concat([full_auroc, full_auroc_remove], axis=1)
+output_ras_metrics = pd.concat([full_auroc, full_auroc_remove], axis=1, sort=False)
 output_ras_metrics = output_ras_metrics * 100  # To get percent
 output_ras_metrics = output_ras_metrics - 50  # Subtract 50 from AUROC only
 
 # Combine with AUPRC
 output_ras_metrics = pd.concat([output_ras_metrics, full_auprc * 100,
-                                full_auprc_remove * 100], axis=1)
+                                full_auprc_remove * 100], axis=1, sort=False)
 output_ras_metrics.columns = ['ras_auroc', 'no_ras_auroc', 'ras_auprc', 'no_ras_auprc']
 
 # Fill removed Ras metrics with included metrics
@@ -254,11 +254,12 @@ def get_gene_auprc(x, w):
 all_genes_auroc_df = all_genes_auroc_df.assign(auroc_rank = list(range(0, all_genes_auprc_df.shape[0])))
 
 all_genes_auprc_df = all_genes_auprc_df.assign(ras = 0)
-all_genes_auprc_df.ix[all_genes_auprc_df.index.isin(ras_genes_df['genes']), 'ras'] = 1
+all_genes_auprc_df.loc[all_genes_auprc_df.index.isin(ras_genes_df['genes']), 'ras'] = 1
 
 all_genes_metrics_df = all_genes_auprc_df.reset_index().merge(all_genes_auroc_df,
                                                               left_on='index', right_index=True)
 
 all_genes_metrics_df.columns = ['Gene', 'AUPRC', 'AUPRC Rank', 'ras', 'AUROC', 'AUROC Rank']
 all_genes_metrics_df.to_csv(all_gene_metrics_file, sep='\t', index=False)
 all_genes_metrics_df.head(10)
+
diff --git a/scripts/ras_count_heatmaps.ipynb b/scripts/ras_count_heatmaps.ipynb