week9_work.Rmd

---
title: "week9_work"
author: "Eric Weine"
date: "4/28/2022"
output:
  html_document:
    df_print: paged
---

```{r setup, include=FALSE}
# Echo chunk source in the knitted output by default; the chunks below
# override this individually with include=FALSE / echo=FALSE.
knitr::opts_chunk$set(echo = TRUE)
```

```{r, include=FALSE}
# Fix the RNG seed so the random row shuffles further down are reproducible.
set.seed(1)
# ggplot2 is attached for the plotting chunks.
library(ggplot2)
# Make the pipe available without attaching the rest of dplyr.
`%>%` <- dplyr::`%>%`
```

# Pallares Classification

```{r, include=FALSE}
# Smallest p-value kept as-is; anything below (including exact zeros) is
# raised to this floor so the quantile computation below stays finite.
pval_floor <- 1e-11

# Read the per-site summary statistics, keep the columns used in this
# notebook, clamp the p-values, and back out a standard error for each
# condition as |coef| / z, where z is the two-sided normal quantile
# implied by the p-value.
# NOTE(review): a p-value of exactly 1 gives qnorm(0.5) = 0 and therefore a
# division by zero here -- confirm the input never contains p = 1.
summary_table <- read.delim("data/SummaryTable_allsites_12Nov20.txt") %>%
  dplyr::select(site, pval_CTRL, pval_HS, coef_CTRL, coef_HS, sig_cat) %>%
  dplyr::mutate(
    pval_CTRL = pmax(pval_floor, pval_CTRL),
    pval_HS = pmax(pval_floor, pval_HS),
    std_error_ctrl = abs(coef_CTRL) / qnorm((2 - pval_CTRL) / 2),
    std_error_hs = abs(coef_HS) / qnorm((2 - pval_HS) / 2)
  )

# Split "<chromosome>:<site_id>" into its two parts (split on the first ":").
site_parts <- stringr::str_split_fixed(summary_table$site, ":", 2)
sites_df <- data.frame(
  chromosome = site_parts[, 1],
  site_id = as.numeric(site_parts[, 2])
)

#' Assign every row of `df` to a fixed-width block along `site_id`.
#'
#' Break points are laid out every `block_length` units starting at the
#' smallest `site_id`. A row's `block_id` is the number of break points
#' strictly below its `site_id`, so the smallest site gets block 0.
#'
#' @param df Data frame (or tibble) with a numeric `site_id` column.
#' @param block_length Positive block width, in the same units as `site_id`.
#' @return `df` with an integer `block_id` column added. Rows whose
#'   `site_id` is `NA` get `NA`; an input with no non-missing `site_id`
#'   (including zero rows) is returned with an all-`NA` / empty `block_id`
#'   instead of raising an error.
split_into_LD_blocks <- function(df, block_length) {
  stopifnot(
    is.numeric(block_length),
    length(block_length) == 1L,
    block_length > 0
  )

  positions <- df$site_id
  known <- positions[!is.na(positions)]

  # Nothing to anchor the break points on: min()/max() would be undefined.
  if (length(known) == 0L) {
    df$block_id <- rep(NA_integer_, length(positions))
    return(df)
  }

  block_range <- seq(from = min(known), to = max(known), by = block_length)

  # With left.open = TRUE, findInterval() counts the break points strictly
  # less than each position -- the same value as sum(x > block_range), but
  # computed for the whole vector at once.
  df$block_id <- findInterval(positions, block_range, left.open = TRUE)
  df
}

# Assign block ids separately within each chromosome, using blocks of
# width 1e4 in site_id units. The result stays grouped by chromosome.
sites_df <- dplyr::group_modify(
  dplyr::group_by(sites_df, chromosome),
  function(chrom_sites, chrom_key) {
    split_into_LD_blocks(chrom_sites, block_length = 1e4)
  }
)

# Sample one SNP from each (chromosome, block_id) group: shuffle all rows,
# then keep the first row encountered for every group.
sites_sample_df <- sites_df %>%
  dplyr::ungroup() %>%
  dplyr::sample_frac() %>% # randomly shuffle df
  dplyr::distinct(chromosome, block_id, .keep_all = TRUE) %>%
  dplyr::select(chromosome, site_id)

# Reconstruct the "<chromosome>:<site_id>" names used in summary_table$site.
# site_id is numeric, and default number-to-string conversion renders some
# round values in scientific notation (100000 becomes "1e+05"), which would
# never match the original site string and would silently drop that SNP.
# "%.0f" always prints plain digits.
# NOTE(review): assumes site_id values are whole numbers -- confirm.
selected_sites <- sprintf(
  "%s:%.0f",
  as.character(sites_sample_df$chromosome),
  sites_sample_df$site_id
)

summary_table_samp <- summary_table %>%
  dplyr::filter(site %in% selected_sites)

# Every site whose sig_cat is anything other than "NS".
summary_table_signif <- dplyr::filter(summary_table, sig_cat != "NS")

# Generate a test df: an independent second draw of one SNP per
# (chromosome, block_id) group, using the same shuffle-then-deduplicate
# scheme as the first sample.
sites_test_df <- sites_df %>%
  dplyr::ungroup() %>%
  dplyr::sample_frac() %>% # randomly shuffle df
  dplyr::distinct(chromosome, block_id, .keep_all = TRUE) %>%
  dplyr::select(chromosome, site_id)

# Reconstruct the "<chromosome>:<site_id>" names used in summary_table$site.
# "%.0f" prints plain digits; default number-to-string conversion would turn
# round positions such as 100000 into "1e+05", which never matches the
# original site string and silently drops that SNP from the test set.
# NOTE(review): assumes site_id values are whole numbers -- confirm.
selected_sites_test <- sprintf(
  "%s:%.0f",
  as.character(sites_test_df$chromosome),
  sites_test_df$site_id
)

summary_table_test <- summary_table %>%
  dplyr::filter(site %in% selected_sites_test)

# Two-column (ctrl, hs) matrices with one row per site, for each of the three
# site sets (sample, test, significant).

# Estimated effects.
reg_fx_samp_mat <- cbind(
  ctrl = summary_table_samp$coef_CTRL,
  hs = summary_table_samp$coef_HS
)
reg_fx_mat_test <- cbind(
  ctrl = summary_table_test$coef_CTRL,
  hs = summary_table_test$coef_HS
)
reg_fx_mat_signif <- cbind(
  ctrl = summary_table_signif$coef_CTRL,
  hs = summary_table_signif$coef_HS
)

# Standard errors, in the same row order as the effect matrices.
reg_se_samp_mat <- cbind(
  ctrl = summary_table_samp$std_error_ctrl,
  hs = summary_table_samp$std_error_hs
)
reg_se_mat_test <- cbind(
  ctrl = summary_table_test$std_error_ctrl,
  hs = summary_table_test$std_error_hs
)
reg_se_mat_signif <- cbind(
  ctrl = summary_table_signif$std_error_ctrl,
  hs = summary_table_signif$std_error_hs
)

# Previously fitted mixture, loaded from disk and passed to mash() as `g`.
mash_fit <- readr::read_rds("rds_data/12k_fitted_g_loose.rds")

# Wrap each effect / standard-error matrix pair as a mash data object.
# mash_test_data is built here but not used in the chunks shown below.
mash_samp_data <- mashr::mash_set_data(
  Bhat = reg_fx_samp_mat,
  Shat = reg_se_samp_mat
)
mash_test_data <- mashr::mash_set_data(
  Bhat = reg_fx_mat_test,
  Shat = reg_se_mat_test
)
mash_signif_data <- mashr::mash_set_data(
  Bhat = reg_fx_mat_signif,
  Shat = reg_se_mat_signif
)

# Posterior computations with the loaded mixture supplied as g and
# fixg = TRUE, for the sampled and the significant site sets.
mash_posterior_samp <- mashr::mash(mash_samp_data, g = mash_fit, fixg = TRUE)
mash_posterior_signif <- mashr::mash(
  mash_signif_data,
  g = mash_fit,
  fixg = TRUE
)

# if null weight is > .05, classify as null: overwrite that weight with 1 so
# that "null" wins the row-wise maximum taken below.
mash_posterior_samp$posterior_weights[, "null"] <- ifelse(
  mash_posterior_samp$posterior_weights[, "null"] > .05,
  1,
  mash_posterior_samp$posterior_weights[, "null"]
)

# Name of the highest-weight component in each row. ties.method = "first" is
# set explicitly: the max.col() default breaks ties at random (treating
# entries within a small relative tolerance of the maximum as tied), which
# makes the classification depend on the RNG state.
cov_mat_names_samp <- colnames(mash_posterior_samp$posterior_weights)
map_mash_samp <- cov_mat_names_samp[
  max.col(mash_posterior_samp$posterior_weights, ties.method = "first")
]
map_mash_samp_df <- data.frame(
  site = summary_table_samp$site,
  cov_map = map_mash_samp
)

# The posterior rows come from matrices built column-by-column from
# summary_table_samp, so they are already in its row order. Attach the label
# positionally rather than joining on `site`: a join would multiply rows if a
# site id occurred more than once, and would create cov_map.x / cov_map.y if
# this chunk were re-run.
stopifnot(length(map_mash_samp) == nrow(summary_table_samp))
summary_table_samp <- summary_table_samp %>%
  dplyr::mutate(cov_map = map_mash_samp)

# if null weight is > .05, classify as null: overwrite that weight with 1 so
# that "null" wins the row-wise maximum taken below.
mash_posterior_signif$posterior_weights[, "null"] <- ifelse(
  mash_posterior_signif$posterior_weights[, "null"] > .05,
  1,
  mash_posterior_signif$posterior_weights[, "null"]
)

# Name of the highest-weight component in each row. ties.method = "first" is
# set explicitly: the max.col() default breaks ties at random (treating
# entries within a small relative tolerance of the maximum as tied), which
# makes the classification depend on the RNG state.
cov_mat_names_sig <- colnames(mash_posterior_signif$posterior_weights)
map_mash_sig <- cov_mat_names_sig[
  max.col(mash_posterior_signif$posterior_weights, ties.method = "first")
]
map_mash_sig_df <- data.frame(
  site = summary_table_signif$site,
  cov_map = map_mash_sig
)

# The posterior rows come from matrices built column-by-column from
# summary_table_signif, so they are already in its row order. Attach the
# label positionally rather than joining on `site`: a join would multiply
# rows if a site id occurred more than once.
stopifnot(length(map_mash_sig) == nrow(summary_table_signif))
summary_table_sig <- summary_table_signif %>%
  dplyr::mutate(cov_map = map_mash_sig)
```

We will use the sparse mash model to classify signals this week. Below is a comparison of the classifications for a sample of 12030 SNPs (one from each LD block). The rows are the Pallares classifications and the columns are the mash classifications. This time, if the posterior weight on the null component exceeds 0.05, we classify the signal as null.

```{r, include = FALSE}
# Collapse each MAP covariance name to its mixture component by dropping the
# trailing ".<digits>" index (e.g. "equal_corr_1.11" -> "equal_corr_1",
# "hs_amp_3_corr_-1.14" -> "hs_amp_3_corr_-1"); "null" has no such suffix and
# is left unchanged. Unlike an explicit list of index values, this does not
# turn an index that was not anticipated into NA.
# NOTE(review): assumes every non-null name ends in ".<index>" -- confirm
# against colnames(mash_posterior_samp$posterior_weights).
summary_table_samp <- summary_table_samp %>%
  dplyr::mutate(mixt_comp = sub("\\.[0-9]+$", "", cov_map))

# Row labels: Pallares sig_cat code -> display name, in display order.
# NOTE(review): the control-specific code appears as both "C" and "CTRL" in
# this notebook; both are accepted here -- confirm which one the data uses.
pallares_codes <- c("NS", "shared", "HS", "C", "CTRL")
pallares_names <- c("null", "shared", "hs_spec", "ctrl_spec", "ctrl_spec")

# Column order of the displayed table.
mash_levels <- c(
  "equal_corr_1", "hs_amp_1.5_corr_1", "hs_amp_3_corr_-1",
  "null", "ctrl_spec", "hs_spec"
)

# Tabulating factors with fixed levels yields the full 4 x 6 layout directly:
# categories absent from the sample appear as all-zero rows / columns, with
# no manual zero-padding that depends on which categories happen to occur.
tab <- table(
  factor(
    summary_table_samp$sig_cat,
    levels = pallares_codes,
    labels = pallares_names
  ),
  factor(summary_table_samp$mixt_comp, levels = mash_levels)
)
mat <- as.matrix(tab)
```

```{r, echo=FALSE}
# Render the Pallares-by-mash count table built in the previous chunk.
knitr::kable(tab)
```

Below is a plot of the regression coefficients estimated by Pallares, colored by mash classification. Note that the columns in the table above that sum to zero are not shown in the plot legend; I felt that this made the plot easier to understand, but we can certainly change this. The dashed line is the x-axis ($y = 0$) and the solid line is the least-squares regression of `coef_CTRL` on `coef_HS`.

```{r, echo=FALSE}
# Lookup tables from mixture component to its correlation label and to its
# amplification factor; components not listed map to NA.
corr_by_comp <- c(
  "equal_corr_1" = "non_null_corr_1",
  "hs_amp_1.5_corr_1" = "non_null_corr_1",
  "hs_amp_3_corr_-1" = "non_null_corr_-1",
  "null" = "null"
)
amp_by_comp <- c(
  "equal_corr_1" = 1.0,
  "hs_amp_1.5_corr_1" = 1.5,
  "hs_amp_3_corr_-1" = 3.0,
  "null" = 1.0
)
summary_table_samp <- summary_table_samp %>%
  dplyr::mutate(
    corr = unname(corr_by_comp[as.character(mixt_comp)]),
    amplification = unname(amp_by_comp[as.character(mixt_comp)])
  )

# Least-squares fit of the control coefficient on the HS coefficient; its
# coefficient table supplies the solid line drawn below.
reg_line <- lm(coef_CTRL ~ coef_HS, data = summary_table_samp)
reg_coefs <- coef(summary(reg_line))

# One colour per mixture component shown in this plot.
plot_colors <- c(
  "equal_corr_1" = "orange",
  "hs_amp_1.5_corr_1" = "green",
  "hs_amp_3_corr_-1" = "red",
  "null" = "blue"
)

ggplot(
  data = summary_table_samp,
  aes(x = coef_HS, y = coef_CTRL, color = mixt_comp)
) +
  geom_point(size = 1) +
  scale_color_manual(name = "mixt_comp", values = plot_colors) +
  # dashed horizontal reference line at y = 0
  geom_abline(slope = 0, intercept = 0, linetype = "dashed") +
  # fitted regression line
  geom_abline(
    intercept = reg_coefs["(Intercept)", "Estimate"],
    slope = reg_coefs["coef_HS", "Estimate"]
  )
```


It is also instructive to look only at the SNPs that Pallares classifies as significant.

```{r, include=FALSE}
# Collapse each MAP covariance name to its mixture component by dropping the
# trailing ".<digits>" index (e.g. "hs_amp_1.5_corr_1.15" ->
# "hs_amp_1.5_corr_1", "ctrl_spec.14" -> "ctrl_spec"); "null" has no such
# suffix and is left unchanged. Unlike an explicit list of index values, this
# does not turn an index that was not anticipated into NA.
# NOTE(review): assumes every non-null name ends in ".<index>" -- confirm
# against colnames(mash_posterior_signif$posterior_weights).
summary_table_sig <- summary_table_sig %>%
  dplyr::mutate(mixt_comp = sub("\\.[0-9]+$", "", cov_map))

# Row labels: Pallares sig_cat code -> display name, in display order.
# Mapping by code (rather than renaming table rows by position) does not
# depend on the sort order of the observed codes or on all three categories
# being present.
# NOTE(review): the control-specific code appears as both "C" and "CTRL" in
# this notebook; both are accepted here -- confirm which one the data uses.
pallares_codes <- c("shared", "HS", "C", "CTRL")
pallares_names <- c("shared", "hs_spec", "ctrl_spec", "ctrl_spec")

# Column order of the displayed table.
mash_levels <- c(
  "equal_corr_1", "hs_amp_1.5_corr_1", "hs_amp_3_corr_-1",
  "null", "ctrl_spec", "hs_spec"
)

# Fixed factor levels give the full 3 x 6 layout directly; components that
# never occur show up as all-zero columns.
tab <- table(
  factor(
    summary_table_sig$sig_cat,
    levels = pallares_codes,
    labels = pallares_names
  ),
  factor(summary_table_sig$mixt_comp, levels = mash_levels)
)
mat <- as.matrix(tab)
```

```{r, echo=FALSE}
# Render the count table for the significant SNPs built in the previous chunk.
knitr::kable(tab)
```

Again, below is a plot of the regression coefficients colored by mash classification. 

```{r, echo=FALSE}
# Correlation label and amplification factor per mixture component;
# components not listed map to NA. "null" is included so the mapping agrees
# with the one used for the sampled SNPs.
summary_table_sig <- summary_table_sig %>%
  dplyr::mutate(
    corr = dplyr::case_when(
      mixt_comp %in% c("equal_corr_1", "hs_amp_1.5_corr_1") ~ "non_null_corr_1",
      mixt_comp == "hs_amp_3_corr_-1" ~ "non_null_corr_-1",
      mixt_comp == "ctrl_spec" ~ "non_null_ctrl_spec",
      mixt_comp == "null" ~ "null"
    ),
    amplification = dplyr::case_when(
      mixt_comp == "equal_corr_1" ~ 1.0,
      mixt_comp == "hs_amp_1.5_corr_1" ~ 1.5,
      mixt_comp == "hs_amp_3_corr_-1" ~ 3.0,
      mixt_comp == "ctrl_spec" ~ 1.0,
      mixt_comp == "null" ~ 1.0
    )
  )

# Fit the regression on the significant SNPs, i.e. on the same points that
# are plotted below. (It was previously fitted on summary_table_samp, so the
# line drawn over this scatter belonged to a different data set.)
reg_line <- lm(coef_CTRL ~ coef_HS, data = summary_table_sig)
reg_coefs <- coef(summary(reg_line))

# Components that also appear in the sampled-SNP plot keep the colours used
# there (orange / green / red). ctrl_spec gets its own colour rather than
# reusing green or blue, which that plot assigns to hs_amp_1.5_corr_1 and
# null.
plot_colors <- c(
  "equal_corr_1" = "orange",
  "hs_amp_1.5_corr_1" = "green",
  "hs_amp_3_corr_-1" = "red",
  "ctrl_spec" = "purple"
)

ggplot(
  data = summary_table_sig,
  aes(x = coef_HS, y = coef_CTRL, color = mixt_comp)
) +
  geom_point(size = .75) +
  scale_color_manual(name = "mixt_comp", values = plot_colors) +
  # dashed horizontal reference line at y = 0
  geom_abline(slope = 0, intercept = 0, linetype = "dashed") +
  # fitted regression line
  geom_abline(
    intercept = reg_coefs["(Intercept)", "Estimate"],
    slope = reg_coefs["coef_HS", "Estimate"]
  )
```