Skip to content

Commit

Permalink
Copy Burden Analysis (#22)
Browse files Browse the repository at this point in the history
* make comparing within model figure smaller

* update tedious arrows for pointing at coefficient plot

* add segmented copy number data to initialize logic

* scripts to process and then plot copy burden data
  • Loading branch information
gwaybio authored Mar 16, 2017
1 parent 60881df commit f010dfd
Show file tree
Hide file tree
Showing 6 changed files with 196 additions and 46 deletions.
8 changes: 4 additions & 4 deletions scripts/compare_within_models.R
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ process_classifier_summary <- function(summary_list, model_type) {
# Process PanCancer Classifier and summary files
pan_summary <- file.path(pan_summary_dir, "classifier_summary.txt")
pancan_list <- parse_summary(pan_summary)
pancan_df <- process_classifier_summary(pancan_list, "Pan Cancer")
pancan_df <- process_classifier_summary(pancan_list, "Pan")

# Process Within Cancer Results
within_tissue_files <- list.files(within_folder,
Expand All @@ -55,7 +55,7 @@ within_tissue_files <- list.files(within_folder,
within_tissue_data <- data.frame()
for (file in within_tissue_files) {
file_summary <- parse_summary(file)
file_frame <- process_classifier_summary(file_summary, "Within Disease")
file_frame <- process_classifier_summary(file_summary, "Within")
within_tissue_data <- rbind(within_tissue_data, file_frame)
}

Expand All @@ -65,7 +65,7 @@ plot_ready$AUROC <- as.numeric(paste(plot_ready$AUROC))
ggplot(plot_ready, aes(x = Disease, y = AUROC, fill = Model)) +
geom_bar(position = "dodge", stat = "identity") +
theme_bw() + within_theme +
theme(legend.position = c(1.09, 0.65),
theme(legend.position = c(1.07, 0.65),
legend.background = element_rect(fill = alpha("white", 0)),
plot.margin = unit(c(0.2, 1.5, 0, 0.1), "cm")) +
scale_fill_manual(values = c("brown", "gold")) +
Expand All @@ -76,4 +76,4 @@ ggplot(plot_ready, aes(x = Disease, y = AUROC, fill = Model)) +
scale_y_continuous(breaks = seq(0.4, 1, 0.1))

ggsave(file.path(pan_summary_dir, "figures", "comparison.pdf"), units = "in",
height = 1.8, width = 4.2, dpi = 600)
height = 1.4, width = 4.2, dpi = 600)
104 changes: 104 additions & 0 deletions scripts/copy_burden_figures.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
# Gregory Way 2017
# PanCancer Classifier
# scripts/copy_burden_figures.R
#
# Generate figures for visualizing copy burden across different samples
# stratified by TP53 mutation status
#
# Usage: Run in command line
#
# Rscript --vanilla scripts/copy_burden_figures.R
#
# Output:
# Two figures summarizing copy burden across TCGA Pan Can samples

library(ggplot2)

# Set File Names
base_file <- file.path("classifiers", "TP53")
burden_file <- file.path(base_file, "tables", "copy_burden_predictions.tsv")
snaptron_file <- file.path("scripts", "snaptron",
"junctions_with_mutations.csv")
frac_alt_plot <- file.path(base_file, "figures", "fraction_altered_plot.pdf")
violin_plot <- file.path(base_file, "figures", "seg_altered_violin_plot.pdf")

# Load Files
copy_burden <- readr::read_tsv(burden_file)
junc_df <- readr::read_csv(snaptron_file)
junc_df <- junc_df[,-1]
junc_df <- junc_df[!duplicated(junc_df), ]

# Location of the silent mutation and truncation
junc_exon_df = junc_df[junc_df$start == "7675237", ]
silent_junc <- junc_exon_df[junc_exon_df$Variant_Classification == "Silent", ]
silent_junc <- silent_junc[silent_junc$snaptron_id == "13945701", ]
silent_junc <- silent_junc[silent_junc$TP53 %in% 0, ]
silent_junc <- silent_junc[silent_junc$include %in% 1, ]

ggplot(copy_burden, aes(weight, frac_altered, color = factor(TP53))) +
geom_point(alpha = 0.6, size = 0.3) + theme_bw() +
xlab("TP53 Inactivation Probability") +
ylab("CNV Burden (Fraction Altered)") +
labs(color = "TP53 Status")
ggsave(frac_alt_plot, width = 5, height = 4, dpi = 600)

# Build and Process Copy Burden DataFrame
copy_burden$silent <- 0
copy_burden[copy_burden$Sample %in% silent_junc$tcga_id, "silent"] <- 1
silent_and_junc <- copy_burden[copy_burden$silent == 1, ]
silent_and_junc$TP53 <- "c.375G>T Mutation"

copy_burden[copy_burden$total_status == 0, "TP53"] = "Wild-Type"
copy_burden[copy_burden$total_status == 1, "TP53"] = "TP53 Loss of Function"

plot_ready <- copy_burden[, c("frac_altered", "TP53")]
plot_ready <- rbind(plot_ready, silent_and_junc[, c("frac_altered", "TP53")])

false_negatives <- copy_burden[(copy_burden$total_status == 1) &
(copy_burden$weight < 0.5), ]
false_negatives$TP53 <- "False Negative"
plot_ready <- rbind(plot_ready, false_negatives[, c("frac_altered", "TP53")])

false_positives <- copy_burden[(copy_burden$total_status == 0) &
(copy_burden$weight >= 0.5), ]
false_positives$TP53 <- "False Positive"
plot_ready <- rbind(plot_ready, false_positives[, c("frac_altered", "TP53")])

predicted_neg <- copy_burden[copy_burden$weight < 0.5, ]
predicted_neg$TP53 <- "Predicted Wild-Type"
plot_ready <- rbind(plot_ready, predicted_neg[, c("frac_altered", "TP53")])

predicted_pos <- copy_burden[copy_burden$weight >= 0.5, ]
predicted_pos$TP53 <- "Predicted Loss"
plot_ready <- rbind(plot_ready, predicted_pos[, c("frac_altered", "TP53")])

plot_levels <- c("c.375G>T Mutation", "False Positive",
"Predicted Loss", "TP53 Loss of Function",
"False Negative", "Predicted Wild-Type", "Wild-Type")

plot_ready$TP53 <- factor(plot_ready$TP53, levels = plot_levels)

# Build violin plots for copy number alterations comparison
ggplot(plot_ready, aes(x = TP53, y = frac_altered)) +
ylab("CNV Burden (Fraction Altered)") + xlab("TP53 Status") +
labs(fill = "") + geom_violin(aes(fill = TP53), size = 0.3, alpha = 0.3,
adjust = 0.7, trim = TRUE) +
geom_boxplot(aes(fill = TP53), size = 0.3, width = 0.1, outlier.size = 0.3) +
coord_flip() + geom_hline(yintercept = 0.5, linetype = "dashed",
color = "red") +
theme(legend.position = c(1.4, 0.7), axis.text.y = element_blank(),
axis.text.x = element_text(size = rel(0.7)),
axis.title = element_text(size = rel(0.7)),
legend.text = element_text(size = rel(0.35)),
legend.key = element_blank(),
legend.key.size = unit(0.8, "lines"),
legend.background = element_rect(fill = alpha("white", 0)),
panel.grid.major = element_line(color = "white", size = 0.3),
panel.grid.minor = element_line(color = "white", size = 0.3),
panel.background = element_rect(fill = "white"),
plot.margin = unit(c(0.1, 2.6, 0.2, 0.2),"cm"),
panel.border = element_rect(fill = NA, size = 0.4)) +
guides(fill = guide_legend(reverse = TRUE, ncol = 1), color = FALSE)

ggsave(violin_plot, height = 2.25, width = 2.5, dpi = 600)

40 changes: 40 additions & 0 deletions scripts/copy_burden_merge.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
"""
Gregory Way 2017
PanCancer Classifier
scripts/copy_burden_merge.py
Merge per sample classifier scores with segment based scores
Usage: Run in command line with required command argument:
python scripts/copy_burden_merge.py --classifier_folder
classifier_folder is a string pointing to the location of the classifier data
Output:
.tsv file of classifier scores merged with segment based copy number scores
"""

import os
import argparse
import pandas as pd

parser = argparse.ArgumentParser()
parser.add_argument('-c', '--classifier_folder',
help='string of the location of classifier data')
args = parser.parse_args()

# Load command arguments
pred_fild = os.path.join(args.classifier_folder, 'classifier_decisions.tsv')
burden_file = os.path.join('data', 'seg_based_scores.tsv')
out_file = os.path.join(os.path.dirname(pred_fild), 'tables',
'copy_burden_predictions.tsv')

# Load and process data
copy_burden_df = pd.read_table(burden_file)
classifier_df = pd.read_table(pred_fild, index_col=0)

combined_df = classifier_df.merge(copy_burden_df, left_index=True,
right_on='Sample')
combined_df.index = combined_df['Sample']
combined_df.to_csv(out_file, sep='\t')
85 changes: 43 additions & 42 deletions scripts/ddr_summary_figures.R
Original file line number Diff line number Diff line change
Expand Up @@ -64,57 +64,58 @@ coef_df <- coef_df[order(coef_df$weight, decreasing = FALSE), ]
coef_df$rank <- 1:nrow(coef_df)

p <- ggplot(coef_df, aes(x = 1:nrow(coef_df), y = weight)) +
geom_point(aes(fill = "black"), size = 0.01) +
geom_point(fill = "black", size = 0.01) +
base_theme + theme(axis.line.x = element_line(),
axis.line.y = element_line(),
axis.ticks = element_line(),
plot.margin = unit(c(0.25, 0.25, 0.1, 0.1),"cm")) +
axis.title = element_text(size = rel(1.5)),
plot.margin = unit(c(0.25, 0.25, 0.1, 0.1), "cm")) +
labs(list(x = "Rank", y = "Weight")) +
scale_y_continuous(breaks = seq(-0.25, 0.25, 0.05)) +
scale_x_continuous(breaks = seq(0, 8000, 1000)) +
scale_x_continuous(breaks = seq(0, 8000, 2000)) +
geom_segment(aes(x = 0, y = 0, yend = 0, xend = nrow(coef_df)),
colour = "red", linetype = "dashed", size = 0.2)

p <- add_arrow_label(p = p, x = 650, y = -0.205, label = "DDB2",
offset = c(70, 0.001, -445, -.0002))
p <- add_arrow_label(p = p, x = 750, y = -0.190, label = "AEN",
offset = c(80, 0.001, -350, -.002))
p <- add_arrow_label(p = p, x = 1080, y = -0.172, label = "RPS27L",
offset = c(60, 0.001, -610, .0003))
p <- add_arrow_label(p = p, x = 1050, y = -0.205, label = "DDB2",
offset = c(80, 0.001, -645, -.0002))
p <- add_arrow_label(p = p, x = 1150, y = -0.190, label = "AEN",
offset = c(80, 0.001, -450, 0))
p <- add_arrow_label(p = p, x = 1480, y = -0.172, label = "RPS27L",
offset = c(80, 0.001, -890, .0003))
p <- add_arrow_label(p = p, x = 950, y = -0.155, label = "MDM2",
offset = c(50, -0.0001, -480, -.00013))
p <- add_arrow_label(p = p, x = 1300, y = -0.14, label = "BAX",
offset = c(80, -0.0001, -340, .0002))
p <- add_arrow_label(p = p, x = 1300, y = -0.12, label = "CDKN1A",
offset = c(90, -0.0001, -710, -.00015))
p <- add_arrow_label(p = p, x = 1200, y = -0.1, label = "XPC",
offset = c(80, 0, -450, -.0002))
p <- add_arrow_label(p = p, x = 1700, y = -0.085, label = "MPDU1",
offset = c(80, 0, -600, -.0002))
p <- add_arrow_label(p = p, x = 1400, y = -0.07, label = "FDXR",
offset = c(80, 0, -500, -.0006))
p <- add_arrow_label(p = p, x = 1500, y = -0.055, label = "CYB5D2",
offset = c(80, 0, -650, -.0002))

p <- add_arrow_label(p = p, x = 7200, y = 0.1, label = "KIF1B",
offset = c(-50, .001, 450, -.006))
p <- add_arrow_label(p = p, x = 6900, y = 0.085, label = "EEPD1",
offset = c(-50, .0006, 360, -.006))
p <- add_arrow_label(p = p, x = 6600, y = 0.07, label = "SPATS2L",
offset = c(-50, .0004, 760, -.0015))
p <- add_arrow_label(p = p, x = 6250, y = 0.054, label = "CDC123",
offset = c(-50, .0004, 700, -.0015))
p <- add_arrow_label(p = p, x = 5950, y = 0.036, label = "ID4",
offset = c(-50, .0004, 390, -.0015))
p <- add_arrow_label(p = p, x = 5550, y = 0.021, label = "MIIP",
offset = c(-50, .0004, 390, -.0015))
p <- add_arrow_label(p = p, x = 6950, y = 0.012, label = "DCAF13",
offset = c(-50, -.001, 470, .0055))

p <- add_arrow_label(p = p, x = 2500, y = -0.03, label = "log10_mut",
offset = c(50, 0, -900, 0))
ggsave(file = file.path(results_folder, "figures", "ddr_coefficient_plot.pdf"),
plot = p, height = 2.5, width = 2.75, dpi = 600)
offset = c(80, -0.001, -680, -.00013))
p <- add_arrow_label(p = p, x = 1700, y = -0.14, label = "BAX",
offset = c(80, -0.001, -510, .0002))
p <- add_arrow_label(p = p, x = 2000, y = -0.12, label = "CDKN1A",
offset = c(90, 0.0001, -950, -.00015))
p <- add_arrow_label(p = p, x = 1800, y = -0.1, label = "XPC",
offset = c(80, 0, -550, -.0002))
p <- add_arrow_label(p = p, x = 2100, y = -0.085, label = "MPDU1",
offset = c(80, 0, -800, -.0002))
p <- add_arrow_label(p = p, x = 1800, y = -0.07, label = "FDXR",
offset = c(80, 0, -650, -.0006))
p <- add_arrow_label(p = p, x = 1900, y = -0.055, label = "PHLDA3",
offset = c(80, 0, -890, -.0002))

p <- add_arrow_label(p = p, x = 7000, y = 0.1, label = "KIF1B",
offset = c(-80, .001, 550, -.006))
p <- add_arrow_label(p = p, x = 6700, y = 0.085, label = "EEPD1",
offset = c(-80, .0006, 460, -.006))
p <- add_arrow_label(p = p, x = 6400, y = 0.07, label = "MIIP",
offset = c(-80, .0004, 500, -.0015))
p <- add_arrow_label(p = p, x = 6050, y = 0.054, label = "ID4",
offset = c(-80, .0004, 490, -.0015))
p <- add_arrow_label(p = p, x = 5750, y = 0.036, label = "CDC123",
offset = c(-80, .0004, 900, -.0015))
p <- add_arrow_label(p = p, x = 5350, y = 0.021, label = "DDX27",
offset = c(-80, .0004, 790, -.0015))
p <- add_arrow_label(p = p, x = 6750, y = 0.012, label = "DCAF13",
offset = c(-80, -.001, 570, .007))

p <- add_arrow_label(p = p, x = 6500, y = -0.03, label = "log10_mut",
offset = c(-50, -0.002, 900, 0.005))
ggsave(file.path(results_folder, "figures", "ddr_coefficient_plot.pdf"),
plot = p, height = 2.5, width = 2.25, dpi = 600)

# 3) Plot distributions of predictions according to variant classification
mut_df <- readr::read_tsv(file.path(results_folder, "tables",
Expand Down
4 changes: 4 additions & 0 deletions scripts/initialize/download_data.sh
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,10 @@ data/raw/pancan_normalized_rnaseq.tsv
synapse get syn5049520
mv all_thresholded.by_genes_whitelisted.tsv data/raw/pancan_GISTIC_threshold.tsv

# Segment based scores - measurement of total copy number burden
synapse get syn7415024
mv seg_based_scores.tsv data/seg_based_scores.tsv

####################################
# Checksums
####################################
Expand Down
1 change: 1 addition & 0 deletions scripts/initialize/md5sums.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@
6d983890c5693a556621142eead3d3bb data/raw/pancan_GISTIC_threshold.tsv
91f16c866b069a0b607638fdb0f111de data/raw/mc3.v0.2.8.PUBLIC.maf
9d3b6f3e369944043d7855c013554e18 data/mutation-load.txt
24ad6e2c29beb206ca068828ec99da74 data/seg_based_scores.tsv

0 comments on commit f010dfd

Please sign in to comment.