Copy Burden Analysis (#22)

* make comparing within model figure smaller * update tedious arrows for pointing at coefficient plot * add segmented copy number data to initialize logic * scripts to process and then plot copy burden data
greenelab · Mar 16, 2017 · f010dfd · f010dfd
1 parent 60881df
commit f010dfd
Show file tree

Hide file tree

Showing 6 changed files with 196 additions and 46 deletions.
diff --git a/scripts/compare_within_models.R b/scripts/compare_within_models.R
@@ -46,7 +46,7 @@ process_classifier_summary <- function(summary_list, model_type) {
 # Process PanCancer Classifier and summary files
 pan_summary <- file.path(pan_summary_dir, "classifier_summary.txt")
 pancan_list <- parse_summary(pan_summary)
-pancan_df <- process_classifier_summary(pancan_list, "Pan Cancer")
+pancan_df <- process_classifier_summary(pancan_list, "Pan")
 
 # Process Within Cancer Results
 within_tissue_files <- list.files(within_folder,
@@ -55,7 +55,7 @@ within_tissue_files <- list.files(within_folder,
 within_tissue_data <- data.frame()
 for (file in within_tissue_files) {
   file_summary <- parse_summary(file)
-  file_frame <- process_classifier_summary(file_summary, "Within Disease")
+  file_frame <- process_classifier_summary(file_summary, "Within")
   within_tissue_data <- rbind(within_tissue_data, file_frame)
 }
 
@@ -65,7 +65,7 @@ plot_ready$AUROC <- as.numeric(paste(plot_ready$AUROC))
 ggplot(plot_ready, aes(x = Disease, y = AUROC, fill = Model)) +
   geom_bar(position = "dodge", stat = "identity") +
   theme_bw() + within_theme +
-  theme(legend.position = c(1.09, 0.65),
+  theme(legend.position = c(1.07, 0.65),
         legend.background = element_rect(fill = alpha("white", 0)),
         plot.margin = unit(c(0.2, 1.5, 0, 0.1), "cm")) +
   scale_fill_manual(values = c("brown", "gold")) +
@@ -76,4 +76,4 @@ ggplot(plot_ready, aes(x = Disease, y = AUROC, fill = Model)) +
   scale_y_continuous(breaks = seq(0.4, 1, 0.1))
 
 ggsave(file.path(pan_summary_dir, "figures", "comparison.pdf"), units = "in",
-       height = 1.8, width = 4.2, dpi = 600)
+       height = 1.4, width = 4.2, dpi = 600)
diff --git a/scripts/copy_burden_figures.R b/scripts/copy_burden_figures.R
@@ -0,0 +1,104 @@
+# Gregory Way 2017
+# PanCancer Classifier
+# scripts/copy_burden_figures.R
+#
+# Generate figures for visualizing copy burden across different samples
+# stratified by TP53 mutation status
+#
+# Usage: Run in command line
+#
+#     Rscript --vanilla scripts/copy_burden_figures.R
+#
+# Output:
+# Two figures summarizing copy burden across TCGA Pan Can samples
+
+library(ggplot2)
+
+# Set File Names
+base_file <- file.path("classifiers", "TP53")
+burden_file <- file.path(base_file, "tables", "copy_burden_predictions.tsv")
+snaptron_file <- file.path("scripts", "snaptron",
+                           "junctions_with_mutations.csv")
+frac_alt_plot <- file.path(base_file, "figures", "fraction_altered_plot.pdf")
+violin_plot <- file.path(base_file, "figures", "seg_altered_violin_plot.pdf")
+
+# Load Files
+copy_burden <- readr::read_tsv(burden_file)
+junc_df <- readr::read_csv(snaptron_file)
+junc_df <- junc_df[,-1]
+junc_df <- junc_df[!duplicated(junc_df), ]
+
+# Location of the silent mutation and truncation
+junc_exon_df = junc_df[junc_df$start == "7675237", ]
+silent_junc <- junc_exon_df[junc_exon_df$Variant_Classification == "Silent", ]
+silent_junc <- silent_junc[silent_junc$snaptron_id == "13945701", ]
+silent_junc <- silent_junc[silent_junc$TP53 %in% 0, ]
+silent_junc <- silent_junc[silent_junc$include %in% 1, ]
+
+ggplot(copy_burden, aes(weight, frac_altered, color = factor(TP53))) +
+  geom_point(alpha = 0.6, size = 0.3) + theme_bw() +
+  xlab("TP53 Inactivation Probability") +
+  ylab("CNV Burden (Fraction Altered)") +
+  labs(color = "TP53 Status")
+ggsave(frac_alt_plot, width = 5, height = 4, dpi = 600)
+
+# Build and Process Copy Burden DataFrame
+copy_burden$silent <- 0
+copy_burden[copy_burden$Sample %in% silent_junc$tcga_id, "silent"] <- 1
+silent_and_junc <-  copy_burden[copy_burden$silent == 1, ]
+silent_and_junc$TP53 <- "c.375G>T Mutation"
+
+copy_burden[copy_burden$total_status == 0, "TP53"] = "Wild-Type"
+copy_burden[copy_burden$total_status == 1, "TP53"] = "TP53 Loss of Function"
+
+plot_ready <- copy_burden[, c("frac_altered", "TP53")]
+plot_ready <- rbind(plot_ready, silent_and_junc[, c("frac_altered", "TP53")])
+
+false_negatives <- copy_burden[(copy_burden$total_status == 1) &
+                               (copy_burden$weight < 0.5), ]
+false_negatives$TP53 <- "False Negative"
+plot_ready <- rbind(plot_ready, false_negatives[, c("frac_altered", "TP53")])
+
+false_positives <- copy_burden[(copy_burden$total_status == 0) &
+                               (copy_burden$weight >= 0.5), ]
+false_positives$TP53 <- "False Positive"
+plot_ready <- rbind(plot_ready, false_positives[, c("frac_altered", "TP53")])
+
+predicted_neg <- copy_burden[copy_burden$weight < 0.5, ]
+predicted_neg$TP53 <- "Predicted Wild-Type"
+plot_ready <- rbind(plot_ready, predicted_neg[, c("frac_altered", "TP53")])
+
+predicted_pos <- copy_burden[copy_burden$weight >= 0.5, ]
+predicted_pos$TP53 <- "Predicted Loss"
+plot_ready <- rbind(plot_ready, predicted_pos[, c("frac_altered", "TP53")])
+
+plot_levels <- c("c.375G>T Mutation", "False Positive",
+                 "Predicted Loss", "TP53 Loss of Function",
+                 "False Negative", "Predicted Wild-Type", "Wild-Type")
+
+plot_ready$TP53 <- factor(plot_ready$TP53, levels = plot_levels)
+
+# Build violin plots for copy number alterations comparison
+ggplot(plot_ready, aes(x = TP53, y = frac_altered)) +
+  ylab("CNV Burden (Fraction Altered)") + xlab("TP53 Status") +
+  labs(fill = "") + geom_violin(aes(fill = TP53), size = 0.3, alpha = 0.3,
+                                adjust = 0.7, trim = TRUE) + 
+  geom_boxplot(aes(fill = TP53), size = 0.3, width = 0.1, outlier.size = 0.3) +
+  coord_flip() + geom_hline(yintercept = 0.5, linetype = "dashed",
+                            color = "red") +
+  theme(legend.position = c(1.4, 0.7), axis.text.y = element_blank(),
+        axis.text.x = element_text(size = rel(0.7)),
+        axis.title = element_text(size = rel(0.7)),
+        legend.text = element_text(size = rel(0.35)),
+        legend.key = element_blank(),
+        legend.key.size = unit(0.8, "lines"),
+        legend.background = element_rect(fill = alpha("white", 0)),
+        panel.grid.major = element_line(color = "white", size = 0.3),
+        panel.grid.minor = element_line(color = "white", size = 0.3),
+        panel.background = element_rect(fill = "white"),
+        plot.margin = unit(c(0.1, 2.6, 0.2, 0.2),"cm"),
+        panel.border = element_rect(fill = NA, size = 0.4)) +
+  guides(fill = guide_legend(reverse = TRUE, ncol = 1), color = FALSE)
+
+ggsave(violin_plot, height = 2.25, width = 2.5, dpi = 600)
+
diff --git a/scripts/copy_burden_merge.py b/scripts/copy_burden_merge.py
@@ -0,0 +1,40 @@
+"""
+Gregory Way 2017
+PanCancer Classifier
+scripts/copy_burden_merge.py
+
+Merge per sample classifier scores with segment based scores
+
+Usage: Run in command line with required command argument:
+
+        python scripts/copy_burden_merge.py --classifier_folder
+
+classifier_folder is a string pointing to the location of the classifier data
+
+Output:
+.tsv file of classifier scores merged with segment based copy number scores
+"""
+
+import os
+import argparse
+import pandas as pd
+
+parser = argparse.ArgumentParser()
+parser.add_argument('-c', '--classifier_folder',
+                    help='string of the location of classifier data')
+args = parser.parse_args()
+
+# Load command arguments
+pred_fild = os.path.join(args.classifier_folder, 'classifier_decisions.tsv')
+burden_file = os.path.join('data', 'seg_based_scores.tsv')
+out_file = os.path.join(os.path.dirname(pred_fild), 'tables',
+                        'copy_burden_predictions.tsv')
+
+# Load and process data
+copy_burden_df = pd.read_table(burden_file)
+classifier_df = pd.read_table(pred_fild, index_col=0)
+
+combined_df = classifier_df.merge(copy_burden_df, left_index=True,
+                                  right_on='Sample')
+combined_df.index = combined_df['Sample']
+combined_df.to_csv(out_file, sep='\t')
diff --git a/scripts/ddr_summary_figures.R b/scripts/ddr_summary_figures.R
@@ -64,57 +64,58 @@ coef_df <- coef_df[order(coef_df$weight, decreasing = FALSE), ]
 coef_df$rank <- 1:nrow(coef_df)
 
 p <- ggplot(coef_df, aes(x = 1:nrow(coef_df), y = weight)) +
-  geom_point(aes(fill = "black"), size = 0.01) +
+  geom_point(fill = "black", size = 0.01) +
   base_theme + theme(axis.line.x = element_line(),
                      axis.line.y = element_line(),
                      axis.ticks = element_line(),
-                     plot.margin = unit(c(0.25, 0.25, 0.1, 0.1),"cm")) +
+                     axis.title = element_text(size = rel(1.5)),
+                     plot.margin = unit(c(0.25, 0.25, 0.1, 0.1), "cm")) +
   labs(list(x = "Rank", y = "Weight")) +
   scale_y_continuous(breaks = seq(-0.25, 0.25, 0.05)) +
-  scale_x_continuous(breaks = seq(0, 8000, 1000)) +
+  scale_x_continuous(breaks = seq(0, 8000, 2000)) +
   geom_segment(aes(x = 0, y = 0, yend = 0, xend = nrow(coef_df)),
                colour = "red", linetype = "dashed", size = 0.2)
 
-p <- add_arrow_label(p = p, x = 650, y = -0.205, label = "DDB2",
-                     offset = c(70, 0.001, -445, -.0002))
-p <- add_arrow_label(p = p, x = 750, y = -0.190, label = "AEN",
-                     offset = c(80, 0.001, -350, -.002))
-p <- add_arrow_label(p = p, x = 1080, y = -0.172, label = "RPS27L",
-                     offset = c(60, 0.001, -610, .0003))
+p <- add_arrow_label(p = p, x = 1050, y = -0.205, label = "DDB2",
+                     offset = c(80, 0.001, -645, -.0002))
+p <- add_arrow_label(p = p, x = 1150, y = -0.190, label = "AEN",
+                     offset = c(80, 0.001, -450, 0))
+p <- add_arrow_label(p = p, x = 1480, y = -0.172, label = "RPS27L",
+                     offset = c(80, 0.001, -890, .0003))
 p <- add_arrow_label(p = p, x = 950, y = -0.155, label = "MDM2",
-                     offset = c(50, -0.0001, -480, -.00013))
-p <- add_arrow_label(p = p, x = 1300, y = -0.14, label = "BAX",
-                     offset = c(80, -0.0001, -340, .0002))
-p <- add_arrow_label(p = p, x = 1300, y = -0.12, label = "CDKN1A",
-                offset = c(90, -0.0001, -710, -.00015))
-p <- add_arrow_label(p = p, x = 1200, y = -0.1, label = "XPC",
-                     offset = c(80, 0, -450, -.0002))
-p <- add_arrow_label(p = p, x = 1700, y = -0.085, label = "MPDU1",
-                     offset = c(80, 0, -600, -.0002))
-p <- add_arrow_label(p = p, x = 1400, y = -0.07, label = "FDXR",
-                     offset = c(80, 0, -500, -.0006))
-p <- add_arrow_label(p = p, x = 1500, y = -0.055, label = "CYB5D2",
-                     offset = c(80, 0, -650, -.0002))
-
-p <- add_arrow_label(p = p, x = 7200, y = 0.1, label = "KIF1B",
-                     offset = c(-50, .001, 450, -.006))
-p <- add_arrow_label(p = p, x = 6900, y = 0.085, label = "EEPD1",
-                     offset = c(-50, .0006, 360, -.006))
-p <- add_arrow_label(p = p, x = 6600, y = 0.07, label = "SPATS2L",
-                     offset = c(-50, .0004, 760, -.0015))
-p <- add_arrow_label(p = p, x = 6250, y = 0.054, label = "CDC123",
-                     offset = c(-50, .0004, 700, -.0015))
-p <- add_arrow_label(p = p, x = 5950, y = 0.036, label = "ID4",
-                     offset = c(-50, .0004, 390, -.0015))
-p <- add_arrow_label(p = p, x = 5550, y = 0.021, label = "MIIP",
-                     offset = c(-50, .0004, 390, -.0015))
-p <- add_arrow_label(p = p, x = 6950, y = 0.012, label = "DCAF13",
-                     offset = c(-50, -.001, 470, .0055))
-
-p <- add_arrow_label(p = p, x = 2500, y = -0.03, label = "log10_mut",
-                     offset = c(50, 0, -900, 0))
-ggsave(file = file.path(results_folder, "figures", "ddr_coefficient_plot.pdf"),
-       plot = p, height = 2.5, width = 2.75, dpi = 600)
+                     offset = c(80, -0.001, -680, -.00013))
+p <- add_arrow_label(p = p, x = 1700, y = -0.14, label = "BAX",
+                     offset = c(80, -0.001, -510, .0002))
+p <- add_arrow_label(p = p, x = 2000, y = -0.12, label = "CDKN1A",
+                offset = c(90, 0.0001, -950, -.00015))
+p <- add_arrow_label(p = p, x = 1800, y = -0.1, label = "XPC",
+                     offset = c(80, 0, -550, -.0002))
+p <- add_arrow_label(p = p, x = 2100, y = -0.085, label = "MPDU1",
+                     offset = c(80, 0, -800, -.0002))
+p <- add_arrow_label(p = p, x = 1800, y = -0.07, label = "FDXR",
+                     offset = c(80, 0, -650, -.0006))
+p <- add_arrow_label(p = p, x = 1900, y = -0.055, label = "PHLDA3",
+                     offset = c(80, 0, -890, -.0002))
+
+p <- add_arrow_label(p = p, x = 7000, y = 0.1, label = "KIF1B",
+                     offset = c(-80, .001, 550, -.006))
+p <- add_arrow_label(p = p, x = 6700, y = 0.085, label = "EEPD1",
+                     offset = c(-80, .0006, 460, -.006))
+p <- add_arrow_label(p = p, x = 6400, y = 0.07, label = "MIIP",
+                     offset = c(-80, .0004, 500, -.0015))
+p <- add_arrow_label(p = p, x = 6050, y = 0.054, label = "ID4",
+                     offset = c(-80, .0004, 490, -.0015))
+p <- add_arrow_label(p = p, x = 5750, y = 0.036, label = "CDC123",
+                     offset = c(-80, .0004, 900, -.0015))
+p <- add_arrow_label(p = p, x = 5350, y = 0.021, label = "DDX27",
+                     offset = c(-80, .0004, 790, -.0015))
+p <- add_arrow_label(p = p, x = 6750, y = 0.012, label = "DCAF13",
+                     offset = c(-80, -.001, 570, .007))
+
+p <- add_arrow_label(p = p, x = 6500, y = -0.03, label = "log10_mut",
+                     offset = c(-50, -0.002, 900, 0.005))
+ggsave(file.path(results_folder, "figures", "ddr_coefficient_plot.pdf"),
+       plot = p, height = 2.5, width = 2.25, dpi = 600)
 
 # 3) Plot distributions of predictions according to variant classification
 mut_df <- readr::read_tsv(file.path(results_folder, "tables",

diff --git a/scripts/initialize/download_data.sh b/scripts/initialize/download_data.sh
@@ -50,6 +50,10 @@ data/raw/pancan_normalized_rnaseq.tsv
 synapse get syn5049520
 mv all_thresholded.by_genes_whitelisted.tsv data/raw/pancan_GISTIC_threshold.tsv
 
+# Segment based scores - measurement of total copy number burden
+synapse get syn7415024
+mv seg_based_scores.tsv data/seg_based_scores.tsv
+
 ####################################
 # Checksums
 ####################################

diff --git a/scripts/initialize/md5sums.txt b/scripts/initialize/md5sums.txt
@@ -2,3 +2,4 @@
 6d983890c5693a556621142eead3d3bb  data/raw/pancan_GISTIC_threshold.tsv
 91f16c866b069a0b607638fdb0f111de  data/raw/mc3.v0.2.8.PUBLIC.maf
 9d3b6f3e369944043d7855c013554e18  data/mutation-load.txt
+24ad6e2c29beb206ca068828ec99da74  data/seg_based_scores.tsv