From a399e60c819ac1faf616b9ffadb78b15b120c5a5 Mon Sep 17 00:00:00 2001
From: Yu Han <yuhanbrain@gmail.com>
Date: Wed, 22 Sep 2021 16:38:43 -0400
Subject: [PATCH] addressed Greg's additional comments and ready to merge to
 master branch

---
 2.describe-data/1.merge-datasets-gct.ipynb    | 14 ---------
 .../0.training-test-split.ipynb               |  4 ++-
 ...one-signature-application_validation.ipynb | 31 ++++---------------
 .../3.get-performance-metrics.ipynb           |  7 -----
 .../8.compile-otherclone-dataset.ipynb        | 16 ----------
 .../nbconverted/0.training-test-split.py      |  4 ++-
 ...erclone-signature-application_validation.r | 28 +++--------------
 .../nbconverted/3.get-performance-metrics.py  |  6 ----
 .../8.compile-otherclone-dataset.py           | 12 -------
 9 files changed, 17 insertions(+), 105 deletions(-)

diff --git a/2.describe-data/1.merge-datasets-gct.ipynb b/2.describe-data/1.merge-datasets-gct.ipynb
index 0d63434..8392b99 100644
--- a/2.describe-data/1.merge-datasets-gct.ipynb
+++ b/2.describe-data/1.merge-datasets-gct.ipynb
@@ -969,20 +969,6 @@
     "output_gct_file = os.path.join(gct_dir, \"consensus_profiles.gct\")\n",
     "write_gct(profiles=median_consensus_df, output_file=output_gct_file)"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {
diff --git a/3.resistance-signature/0.training-test-split.ipynb b/3.resistance-signature/0.training-test-split.ipynb
index b16353a..2b6079e 100644
--- a/3.resistance-signature/0.training-test-split.ipynb
+++ b/3.resistance-signature/0.training-test-split.ipynb
@@ -46,7 +46,9 @@
     "Yu Han, 2021\n",
     "\n",
     "https://github.com/broadinstitute/profiling-resistance-mechanisms/issues/116 \n",
-    "We dropped the inference set (batch 3) because of overly confluent plates and suboptimal plate design. In the notebook, I still need to output the bortezomib signature analytical set, but I can also include the new batches of data, which will serve as a better experimentally designed inference set as included in pull request #114\n",
+    "We dropped the inference set (batch 3) because of overly confluent plates and suboptimal plate design. \n",
+    "\n",
+    "In the notebook, I still need to output the bortezomib signature analytical set, but I can also include the new batches of data, which will serve as a better experimentally designed inference set as included in pull request #114\n",
     "\n"
    ]
   },
diff --git a/3.resistance-signature/11.1.last-batch-visualize-otherclone-signature-application_validation.ipynb b/3.resistance-signature/11.1.last-batch-visualize-otherclone-signature-application_validation.ipynb
index 071040a..66d412d 100644
--- a/3.resistance-signature/11.1.last-batch-visualize-otherclone-signature-application_validation.ipynb
+++ b/3.resistance-signature/11.1.last-batch-visualize-otherclone-signature-application_validation.ipynb
@@ -195,11 +195,7 @@
     "    max_permuted_value = readr::col_double()\n",
     ")\n",
     "\n",
-    "#inference_df <- readr::read_tsv(sig_results_file, col_types = sig_cols) %>%\n",
-    "    #dplyr::filter(Metadata_model_split == \"inference\")\n",
-    "\n",
-    "signature_df <- readr::read_tsv(sig_results_file, col_types = sig_cols) %>%\n",
-    "    #dplyr::filter(Metadata_model_split != \"inference\")\n",
+    "signature_df <- readr::read_tsv(sig_results_file, col_types = sig_cols) \n",
     "\n",
     "print(dim(signature_df))\n",
     "head(signature_df, 4)"
@@ -262,19 +258,12 @@
     "    shuffled = readr::col_character()\n",
     ")\n",
     "\n",
-    "roc_df <- readr::read_tsv(roc_auc_file, col_types=roc_cols) %>%\n",
+    "roc_df <- readr::read_tsv(roc_auc_file, col_types = roc_cols) %>%\n",
     "    dplyr::arrange(shuffled) %>%\n",
     "    dplyr::mutate(roc_auc = round(roc_auc, 3))\n",
     "\n",
     "roc_df <- roc_df[, c(\"shuffled\", \"roc_auc\")]\n",
-    "colnames(roc_df) <- c(\"AUROC\\nShuffled:\", \"roc_auc\")\n",
-    "\n",
-    "#roc_df <- roc_df %>%\n",
-    "    #tidyr::spread(`AUROC\\nShuffled:`, roc_auc, sep=\"\") %>%\n",
-    "    #dplyr::arrange(desc(Split))\n",
-    "\n",
-    "#roc_df$Split <- dplyr::recode(roc_df$Split, !!!legend_labels)\n",
-    "#roc_df"
+    "colnames(roc_df) <- c(\"AUROC\\nShuffled:\", \"roc_auc\")\n"
    ]
   },
   {
@@ -302,15 +291,11 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "threshold_points_df <- roc_curve_df %>%\n",
-    "    dplyr::filter(shuffled == \"False\") %>%\n",
-    "    #dplyr::group_by(model_split) %>%\n",
-    "    dplyr::filter(abs(threshold) == min(abs(threshold))) %>%\n",
+    "threshold_points_df <- roc_curve_df %>%\n",
+    "    dplyr::filter(shuffled == \"False\") %>%\n",
+    "    dplyr::filter(abs(threshold) == min(abs(threshold))) %>%\n",
     "    dplyr::ungroup()\n",
     "\n",
-    "#legend_colors <- c(\"#D41159\")\n",
-    "#legend_labels <- c(\"WT\" = \"WT\", \"BZ\" = \"BZ\")\n",
-    "\n",
     "roc_gg <- (\n",
     "    ggplot(roc_curve_df, aes(x = fpr, y = tpr))\n",
     "    + geom_line(aes(linetype = shuffled), size=0.2)\n",
@@ -357,10 +342,6 @@
     "        shape = 21,\n",
     "        lwd = 0.5\n",
     "    )\n",
-    "    #+ facet_wrap(\n",
-    "       # \"~Metadata_dataset\",\n",
-    "       # ncol = 1,\n",
-    "       # scales = \"free_y\")\n",
     "    \n",
     "    + xlab(\"\")\n",
     "    + ylab(\"Signature score\\n(singscore)\")\n",
diff --git a/3.resistance-signature/3.get-performance-metrics.ipynb b/3.resistance-signature/3.get-performance-metrics.ipynb
index 9c408b9..df6983b 100644
--- a/3.resistance-signature/3.get-performance-metrics.ipynb
+++ b/3.resistance-signature/3.get-performance-metrics.ipynb
@@ -426,13 +426,6 @@
     "output_file = pathlib.Path(f\"{output_dir}/{dataset}_roc_curve.tsv\")\n",
     "roc_curve_data_df.to_csv(output_file, sep=\"\\t\", index=False)"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {
diff --git a/3.resistance-signature/8.compile-otherclone-dataset.ipynb b/3.resistance-signature/8.compile-otherclone-dataset.ipynb
index 28892e7..e92a660 100644
--- a/3.resistance-signature/8.compile-otherclone-dataset.ipynb
+++ b/3.resistance-signature/8.compile-otherclone-dataset.ipynb
@@ -2230,22 +2230,6 @@
     "output_file = pathlib.Path(f\"{output_dir}/otherclones_normalized_profiles.tsv.gz\")\n",
     "full_df.to_csv(output_file, sep=\"\\t\", index=False)"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "66f2fed6",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "34488b99",
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {
diff --git a/3.resistance-signature/scripts/nbconverted/0.training-test-split.py b/3.resistance-signature/scripts/nbconverted/0.training-test-split.py
index 7487b31..2b6d3bd 100644
--- a/3.resistance-signature/scripts/nbconverted/0.training-test-split.py
+++ b/3.resistance-signature/scripts/nbconverted/0.training-test-split.py
@@ -42,7 +42,9 @@
 # Yu Han, 2021
 # 
 # https://github.com/broadinstitute/profiling-resistance-mechanisms/issues/116 
-# We dropped the inference set (batch 3) because of overly confluent plates and suboptimal plate design. In the notebook, I still need to output the bortezomib signature analytical set, but I can also include the new batches of data, which will serve as a better experimentally designed inference set as included in pull request #114
+# We dropped the inference set (batch 3) because of overly confluent plates and suboptimal plate design. 
+# 
+# In the notebook, I still need to output the bortezomib signature analytical set, but I can also include the new batches of data, which will serve as a better experimentally designed inference set as included in pull request #114
 # 
 # 
 
diff --git a/3.resistance-signature/scripts/nbconverted/11.1.last-batch-visualize-otherclone-signature-application_validation.r b/3.resistance-signature/scripts/nbconverted/11.1.last-batch-visualize-otherclone-signature-application_validation.r
index 97bbb6d..e607d81 100644
--- a/3.resistance-signature/scripts/nbconverted/11.1.last-batch-visualize-otherclone-signature-application_validation.r
+++ b/3.resistance-signature/scripts/nbconverted/11.1.last-batch-visualize-otherclone-signature-application_validation.r
@@ -38,11 +38,7 @@ sig_cols <- readr::cols(
     max_permuted_value = readr::col_double()
 )
 
-#inference_df <- readr::read_tsv(sig_results_file, col_types = sig_cols) %>%
-    #dplyr::filter(Metadata_model_split == "inference")
-
-signature_df <- readr::read_tsv(sig_results_file, col_types = sig_cols) %>%
-    #dplyr::filter(Metadata_model_split != "inference")
+signature_df <- readr::read_tsv(sig_results_file, col_types = sig_cols) 
 
 print(dim(signature_df))
 head(signature_df, 4)
@@ -87,19 +83,13 @@ roc_cols <- readr::cols(
     shuffled = readr::col_character()
 )
 
-roc_df <- readr::read_tsv(roc_auc_file, col_types=roc_cols) %>%
+roc_df <- readr::read_tsv(roc_auc_file, col_types = roc_cols) %>%
     dplyr::arrange(shuffled) %>%
     dplyr::mutate(roc_auc = round(roc_auc, 3))
 
 roc_df <- roc_df[, c("shuffled", "roc_auc")]
 colnames(roc_df) <- c("AUROC\nShuffled:", "roc_auc")
 
-#roc_df <- roc_df %>%
-    #tidyr::spread(`AUROC\nShuffled:`, roc_auc, sep="") %>%
-    #dplyr::arrange(desc(Split))
-
-#roc_df$Split <- dplyr::recode(roc_df$Split, !!!legend_labels)
-#roc_df
 
 table_theme <- gridExtra::ttheme_default(
     core = list(fg_params=list(cex = 0.3)),
@@ -111,15 +101,11 @@ table_gg <- gridExtra::tableGrob(roc_df,
                                  theme = table_theme,
                                  rows = NULL)
 
-threshold_points_df <- roc_curve_df %>%
-    dplyr::filter(shuffled == "False") %>%
-    #dplyr::group_by(model_split) %>%
-    dplyr::filter(abs(threshold) == min(abs(threshold))) %>%
+threshold_points_df <- roc_curve_df %>%
+    dplyr::filter(shuffled == "False") %>%
+    dplyr::filter(abs(threshold) == min(abs(threshold))) %>%
     dplyr::ungroup()
 
-#legend_colors <- c("#D41159")
-#legend_labels <- c("WT" = "WT", "BZ" = "BZ")
-
 roc_gg <- (
     ggplot(roc_curve_df, aes(x = fpr, y = tpr))
     + geom_line(aes(linetype = shuffled), size=0.2)
@@ -150,10 +136,6 @@ box_plot_gg <- (
         shape = 21,
         lwd = 0.5
     )
-    #+ facet_wrap(
-       # "~Metadata_dataset",
-       # ncol = 1,
-       # scales = "free_y")
     
     + xlab("")
     + ylab("Signature score\n(singscore)")
diff --git a/3.resistance-signature/scripts/nbconverted/3.get-performance-metrics.py b/3.resistance-signature/scripts/nbconverted/3.get-performance-metrics.py
index ecde79e..fa61ad8 100644
--- a/3.resistance-signature/scripts/nbconverted/3.get-performance-metrics.py
+++ b/3.resistance-signature/scripts/nbconverted/3.get-performance-metrics.py
@@ -161,9 +161,3 @@
 output_file = pathlib.Path(f"{output_dir}/{dataset}_roc_curve.tsv")
 roc_curve_data_df.to_csv(output_file, sep="\t", index=False)
 
-
-# In[ ]:
-
-
-
-
diff --git a/3.resistance-signature/scripts/nbconverted/8.compile-otherclone-dataset.py b/3.resistance-signature/scripts/nbconverted/8.compile-otherclone-dataset.py
index 0458e32..c6b558a 100644
--- a/3.resistance-signature/scripts/nbconverted/8.compile-otherclone-dataset.py
+++ b/3.resistance-signature/scripts/nbconverted/8.compile-otherclone-dataset.py
@@ -138,15 +138,3 @@
 output_file = pathlib.Path(f"{output_dir}/otherclones_normalized_profiles.tsv.gz")
 full_df.to_csv(output_file, sep="\t", index=False)
 
-
-# In[ ]:
-
-
-
-
-
-# In[ ]:
-
-
-
-