From a399e60c819ac1faf616b9ffadb78b15b120c5a5 Mon Sep 17 00:00:00 2001 From: Yu Han Date: Wed, 22 Sep 2021 16:38:43 -0400 Subject: [PATCH] addressed Greg's additional comments and ready to merge to master branch --- 2.describe-data/1.merge-datasets-gct.ipynb | 14 --------- .../0.training-test-split.ipynb | 4 ++- ...one-signature-application_validation.ipynb | 31 ++++--------------- .../3.get-performance-metrics.ipynb | 7 ----- .../8.compile-otherclone-dataset.ipynb | 16 ---------- .../nbconverted/0.training-test-split.py | 4 ++- ...erclone-signature-application_validation.r | 28 +++-------------- .../nbconverted/3.get-performance-metrics.py | 6 ---- .../8.compile-otherclone-dataset.py | 12 ------- 9 files changed, 17 insertions(+), 105 deletions(-) diff --git a/2.describe-data/1.merge-datasets-gct.ipynb b/2.describe-data/1.merge-datasets-gct.ipynb index 0d63434..8392b99 100644 --- a/2.describe-data/1.merge-datasets-gct.ipynb +++ b/2.describe-data/1.merge-datasets-gct.ipynb @@ -969,20 +969,6 @@ "output_gct_file = os.path.join(gct_dir, \"consensus_profiles.gct\")\n", "write_gct(profiles=median_consensus_df, output_file=output_gct_file)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/3.resistance-signature/0.training-test-split.ipynb b/3.resistance-signature/0.training-test-split.ipynb index b16353a..2b6079e 100644 --- a/3.resistance-signature/0.training-test-split.ipynb +++ b/3.resistance-signature/0.training-test-split.ipynb @@ -46,7 +46,9 @@ "Yu Han, 2021\n", "\n", "https://github.com/broadinstitute/profiling-resistance-mechanisms/issues/116 \n", - "We dropped the inference set (batch 3) because of overly confluent plates and suboptimal plate design. 
In the notebook, I still need to output the bortezomib signature analytical set, but I can also include the new batches of data, which will serve as a better experimentally designed inference set as included in pull request #114\n", + "We dropped the inference set (batch 3) because of overly confluent plates and suboptimal plate design. \n", + "\n", + "In the notebook, I still need to output the bortezomib signature analytical set, but I can also include the new batches of data, which will serve as a better experimentally designed inference set as included in pull request #114\n", "\n" ] }, diff --git a/3.resistance-signature/11.1.last-batch-visualize-otherclone-signature-application_validation.ipynb b/3.resistance-signature/11.1.last-batch-visualize-otherclone-signature-application_validation.ipynb index 071040a..66d412d 100644 --- a/3.resistance-signature/11.1.last-batch-visualize-otherclone-signature-application_validation.ipynb +++ b/3.resistance-signature/11.1.last-batch-visualize-otherclone-signature-application_validation.ipynb @@ -195,11 +195,7 @@ " max_permuted_value = readr::col_double()\n", ")\n", "\n", - "#inference_df <- readr::read_tsv(sig_results_file, col_types = sig_cols) %>%\n", - " #dplyr::filter(Metadata_model_split == \"inference\")\n", - "\n", - "signature_df <- readr::read_tsv(sig_results_file, col_types = sig_cols) %>%\n", - " #dplyr::filter(Metadata_model_split != \"inference\")\n", + "signature_df <- readr::read_tsv(sig_results_file, col_types = sig_cols) \n", "\n", "print(dim(signature_df))\n", "head(signature_df, 4)" @@ -262,19 +258,12 @@ " shuffled = readr::col_character()\n", ")\n", "\n", - "roc_df <- readr::read_tsv(roc_auc_file, col_types=roc_cols) %>%\n", + "roc_df <- readr::read_tsv(roc_auc_file, col_types=roc_cols) %>%\n", " dplyr::arrange(shuffled) %>%\n", " dplyr::mutate(roc_auc = round(roc_auc, 3))\n", "\n", "roc_df <- roc_df[, c(\"shuffled\", \"roc_auc\")]\n", - "colnames(roc_df) <- c(\"AUROC\\nShuffled:\", \"roc_auc\")\n", - 
"\n", - "#roc_df <- roc_df %>%\n", - " #tidyr::spread(`AUROC\\nShuffled:`, roc_auc, sep=\"\") %>%\n", - " #dplyr::arrange(desc(Split))\n", - "\n", - "#roc_df$Split <- dplyr::recode(roc_df$Split, !!!legend_labels)\n", - "#roc_df" + "colnames(roc_df) <- c(\"AUROC\\nShuffled:\", \"roc_auc\")\n" ] }, { @@ -302,15 +291,11 @@ "metadata": {}, "outputs": [], "source": [ - "threshold_points_df <- roc_curve_df %>%\n", - " dplyr::filter(shuffled == \"False\") %>%\n", - " #dplyr::group_by(model_split) %>%\n", - " dplyr::filter(abs(threshold) == min(abs(threshold))) %>%\n", + "threshold_points_df <- roc_curve_df %>%\n", + " dplyr::filter(shuffled == \"False\") %>%\n", + " dplyr::filter(abs(threshold) == min(abs(threshold))) %>%\n", " dplyr::ungroup()\n", "\n", - "#legend_colors <- c(\"#D41159\")\n", - "#legend_labels <- c(\"WT\" = \"WT\", \"BZ\" = \"BZ\")\n", - "\n", "roc_gg <- (\n", " ggplot(roc_curve_df, aes(x = fpr, y = tpr))\n", " + geom_line(aes(linetype = shuffled), size=0.2)\n", @@ -357,10 +342,6 @@ " shape = 21,\n", " lwd = 0.5\n", " )\n", - " #+ facet_wrap(\n", - " # \"~Metadata_dataset\",\n", - " # ncol = 1,\n", - " # scales = \"free_y\")\n", " \n", " + xlab(\"\")\n", " + ylab(\"Signature score\\n(singscore)\")\n", diff --git a/3.resistance-signature/3.get-performance-metrics.ipynb b/3.resistance-signature/3.get-performance-metrics.ipynb index 9c408b9..df6983b 100644 --- a/3.resistance-signature/3.get-performance-metrics.ipynb +++ b/3.resistance-signature/3.get-performance-metrics.ipynb @@ -426,13 +426,6 @@ "output_file = pathlib.Path(f\"{output_dir}/{dataset}_roc_curve.tsv\")\n", "roc_curve_data_df.to_csv(output_file, sep=\"\\t\", index=False)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/3.resistance-signature/8.compile-otherclone-dataset.ipynb b/3.resistance-signature/8.compile-otherclone-dataset.ipynb index 28892e7..e92a660 100644 --- 
a/3.resistance-signature/8.compile-otherclone-dataset.ipynb +++ b/3.resistance-signature/8.compile-otherclone-dataset.ipynb @@ -2230,22 +2230,6 @@ "output_file = pathlib.Path(f\"{output_dir}/otherclones_normalized_profiles.tsv.gz\")\n", "full_df.to_csv(output_file, sep=\"\\t\", index=False)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "66f2fed6", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "34488b99", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/3.resistance-signature/scripts/nbconverted/0.training-test-split.py b/3.resistance-signature/scripts/nbconverted/0.training-test-split.py index 7487b31..2b6d3bd 100644 --- a/3.resistance-signature/scripts/nbconverted/0.training-test-split.py +++ b/3.resistance-signature/scripts/nbconverted/0.training-test-split.py @@ -42,7 +42,9 @@ # Yu Han, 2021 # # https://github.com/broadinstitute/profiling-resistance-mechanisms/issues/116 -# We dropped the inference set (batch 3) because of overly confluent plates and suboptimal plate design. In the notebook, I still need to output the bortezomib signature analytical set, but I can also include the new batches of data, which will serve as a better experimentally designed inference set as included in pull request #114 +# We dropped the inference set (batch 3) because of overly confluent plates and suboptimal plate design. 
+# +# In the notebook, I still need to output the bortezomib signature analytical set, but I can also include the new batches of data, which will serve as a better experimentally designed inference set as included in pull request #114 # # diff --git a/3.resistance-signature/scripts/nbconverted/11.1.last-batch-visualize-otherclone-signature-application_validation.r b/3.resistance-signature/scripts/nbconverted/11.1.last-batch-visualize-otherclone-signature-application_validation.r index 97bbb6d..e607d81 100644 --- a/3.resistance-signature/scripts/nbconverted/11.1.last-batch-visualize-otherclone-signature-application_validation.r +++ b/3.resistance-signature/scripts/nbconverted/11.1.last-batch-visualize-otherclone-signature-application_validation.r @@ -38,11 +38,7 @@ sig_cols <- readr::cols( max_permuted_value = readr::col_double() ) -#inference_df <- readr::read_tsv(sig_results_file, col_types = sig_cols) %>% - #dplyr::filter(Metadata_model_split == "inference") - -signature_df <- readr::read_tsv(sig_results_file, col_types = sig_cols) %>% - #dplyr::filter(Metadata_model_split != "inference") +signature_df <- readr::read_tsv(sig_results_file, col_types = sig_cols) print(dim(signature_df)) head(signature_df, 4) @@ -87,19 +83,13 @@ roc_cols <- readr::cols( shuffled = readr::col_character() ) -roc_df <- readr::read_tsv(roc_auc_file, col_types=roc_cols) %>% +roc_df <- readr::read_tsv(roc_auc_file, col_types=roc_cols) %>% dplyr::arrange(shuffled) %>% dplyr::mutate(roc_auc = round(roc_auc, 3)) roc_df <- roc_df[, c("shuffled", "roc_auc")] colnames(roc_df) <- c("AUROC\nShuffled:", "roc_auc") -#roc_df <- roc_df %>% - #tidyr::spread(`AUROC\nShuffled:`, roc_auc, sep="") %>% - #dplyr::arrange(desc(Split)) - -#roc_df$Split <- dplyr::recode(roc_df$Split, !!!legend_labels) -#roc_df table_theme <- gridExtra::ttheme_default( core = list(fg_params=list(cex = 0.3)), @@ -111,15 +101,11 @@ table_gg <- gridExtra::tableGrob(roc_df, theme = table_theme, rows = NULL) -threshold_points_df <- 
roc_curve_df %>% - dplyr::filter(shuffled == "False") %>% - #dplyr::group_by(model_split) %>% - dplyr::filter(abs(threshold) == min(abs(threshold))) %>% +threshold_points_df <- roc_curve_df %>% + dplyr::filter(shuffled == "False") %>% + dplyr::filter(abs(threshold) == min(abs(threshold))) %>% dplyr::ungroup() -#legend_colors <- c("#D41159") -#legend_labels <- c("WT" = "WT", "BZ" = "BZ") - roc_gg <- ( ggplot(roc_curve_df, aes(x = fpr, y = tpr)) + geom_line(aes(linetype = shuffled), size=0.2) @@ -150,10 +136,6 @@ box_plot_gg <- ( shape = 21, lwd = 0.5 ) - #+ facet_wrap( - # "~Metadata_dataset", - # ncol = 1, - # scales = "free_y") + xlab("") + ylab("Signature score\n(singscore)") diff --git a/3.resistance-signature/scripts/nbconverted/3.get-performance-metrics.py b/3.resistance-signature/scripts/nbconverted/3.get-performance-metrics.py index ecde79e..fa61ad8 100644 --- a/3.resistance-signature/scripts/nbconverted/3.get-performance-metrics.py +++ b/3.resistance-signature/scripts/nbconverted/3.get-performance-metrics.py @@ -161,9 +161,3 @@ output_file = pathlib.Path(f"{output_dir}/{dataset}_roc_curve.tsv") roc_curve_data_df.to_csv(output_file, sep="\t", index=False) - -# In[ ]: - - - - diff --git a/3.resistance-signature/scripts/nbconverted/8.compile-otherclone-dataset.py b/3.resistance-signature/scripts/nbconverted/8.compile-otherclone-dataset.py index 0458e32..c6b558a 100644 --- a/3.resistance-signature/scripts/nbconverted/8.compile-otherclone-dataset.py +++ b/3.resistance-signature/scripts/nbconverted/8.compile-otherclone-dataset.py @@ -138,15 +138,3 @@ output_file = pathlib.Path(f"{output_dir}/otherclones_normalized_profiles.tsv.gz") full_df.to_csv(output_file, sep="\t", index=False) - -# In[ ]: - - - - - -# In[ ]: - - - -