bugfix in theta

snaketron · Feb 14, 2024 · 1bbb804 · 1bbb804
1 parent 6247dc0
commit 1bbb804
Show file tree

Hide file tree

Showing 8 changed files with 118 additions and 44 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -33,7 +33,8 @@ Suggests:
     ggplot2,
     ggforce,
     gridExtra,
-    ggrepel
+    ggrepel,
+    MASS
 LinkingTo: 
     BH (>= 1.66.0),
     Rcpp (>= 0.12.0),

diff --git a/R/dgu.R b/R/dgu.R
@@ -39,18 +39,15 @@ DGU <- function(ud,
                          pars = m$pars,
                          refresh = 50)
 
-  if(m$model_type=="GU") {
-    message("Computing summaries ... \n")
-    gu <- get_gu_summary_gu(glm = glm, hdi_lvl = hdi_lvl, ud = ud)
-    dgu <- NA
-    dgu_prob <- NA
-  }
-  if(m$model_type=="DGU") {
-    message("Computing summaries ... \n")
-    gu <- get_gu_summary_dgu(glm = glm, hdi_lvl = hdi_lvl, ud = ud)
-    dgu <- get_dgu_summary(glm = glm, hdi_lvl = hdi_lvl, ud = ud)
-    dgu_prob <- get_dgu_prob_summary(glm = glm, hdi_lvl = hdi_lvl, ud = ud)
-  }
+  message("Computing summaries ... \n")
+  gu <- get_condition_prop(glm = glm, hdi_lvl = hdi_lvl, ud = ud, 
+                           model_type = m$model_type)
+  dgu <- get_dgu(glm = glm, hdi_lvl = hdi_lvl, ud = ud, 
+                 model_type = m$model_type)
+  dgu_prob <- get_dgu_prob(glm = glm, hdi_lvl = hdi_lvl, ud = ud, 
+                           model_type = m$model_type)
+  theta <- get_sample_prop_gu(glm = glm, hdi_lvl = hdi_lvl, ud = ud)
+
 
   # ppc
   message("Computing posterior predictions ... \n")
@@ -62,7 +59,8 @@ DGU <- function(ud,
   return (list(dgu = dgu,
                dgu_prob = dgu_prob,
                gu = gu,
-               glm = glm,
+               theta = theta,
                ppc = ppc,
-               ud = ud))
+               ud = ud,
+               fit = glm))
 }
diff --git a/R/loo.R b/R/loo.R
@@ -1,11 +1,6 @@
 
 # Description:
 # LOO = leave-one-out
-# ud: 4 columns
-#   * sample_id: char column
-#   * condition: char column
-#   * gene_name: char column
-#   * gene_usage_count: num column
 LOO <- function(ud,
                 mcmc_warmup = 500,
                 mcmc_steps = 1500,
@@ -28,8 +23,7 @@ LOO <- function(ud,
   ud <- udp$proc_ud
 
   # setup control list
-  control_list <- list(adapt_delta = adapt_delta,
-                       max_treedepth = max_treedepth)
+  control_list <- list(adapt_delta = adapt_delta, max_treedepth = max_treedepth)
 
   # unique repertoire names
   ud$loo_id <- ud$sample_id
@@ -74,6 +68,9 @@ LOO <- function(ud,
     if(is.data.frame(out$dgu_prob)==TRUE) {
       out$dgu_prob$loo_id <- rs[r]
     }
+    if(is.data.frame(out$theta)==TRUE) {
+      out$theta$loo_id <- rs[r]
+    }
 
     # collect results
     loo_out[[rs[r]]] <- out
@@ -82,4 +79,3 @@ LOO <- function(ud,
   return (loo_out)
 }
 
-
diff --git a/R/utils_dgu.R b/R/utils_dgu.R
@@ -22,7 +22,10 @@ get_contrast_map <- function(ud) {
 }
 
 
-get_dgu_summary <- function(glm, hdi_lvl, ud) {
+get_dgu <- function(glm, hdi_lvl, ud, model_type) {
+  if(model_type == "GU") {
+    return(NA)
+  }
   contrast_map <- get_contrast_map(ud)
 
   dgu_summary <- summary(object = glm, 
@@ -69,7 +72,10 @@ get_dgu_summary <- function(glm, hdi_lvl, ud) {
 }
 
 
-get_dgu_prob_summary <- function(glm, hdi_lvl, ud) {
+get_dgu_prob <- function(glm, hdi_lvl, ud, model_type) {
+  if(model_type == "GU") {
+    return(NA)
+  }
   contrast_map <- get_contrast_map(ud)
 
   dgu_summary <- summary(object = glm, 

diff --git a/R/utils_gu.R b/R/utils_gu.R
@@ -1,5 +1,15 @@
 
-get_gu_summary_dgu <- function(glm, hdi_lvl, ud) {
+get_condition_prop <- function(glm, hdi_lvl, ud, model_type) {
+  if(model_type == "DGU") {
+    return(get_condition_prop_dgu(glm = glm, hdi_lvl = hdi_lvl, ud = ud))
+  }
+  if(model_type == "GU") {
+    return(get_condition_prop_gu(glm = glm, hdi_lvl = hdi_lvl, ud = ud))
+  }
+}
+
+
+get_condition_prop_dgu <- function(glm, hdi_lvl, ud) {
 
   gu_summary <- summary(object = glm, 
                         digits = 4,
@@ -38,7 +48,7 @@ get_gu_summary_dgu <- function(glm, hdi_lvl, ud) {
 }
 
 
-get_gu_summary_gu <- function(glm, hdi_lvl, ud) {
+get_condition_prop_gu <- function(glm, hdi_lvl, ud) {
 
   gu_summary <- summary(object = glm, 
                         digits = 4,
@@ -69,3 +79,39 @@ get_gu_summary_gu <- function(glm, hdi_lvl, ud) {
   return(gu_summary)
 }
 
+
+get_sample_prop_gu <- function(glm, hdi_lvl, ud) {
+
+  gu <- summary(object = glm, digits = 4, pars = "theta",
+                prob = c(0.5, (1-hdi_lvl)/2, 1-(1-hdi_lvl)/2))
+  gu <- data.frame(gu$summary)
+  colnames(gu) <- c("theta_mean", "theta_mean_se",
+                    "theta_sd", "theta_median",
+                    "theta_L", "theta_H",
+                    "Neff", "Rhat")
+  gu[, c("Rhat", "Neff")] <- NULL
+
+  par <- rownames(gu)
+  par <- gsub(pattern = "theta|\\[|\\]", replacement = '', x = par)
+  par <- do.call(rbind, strsplit(x = par, split = ','))
+
+  gu$gene_id <- as.numeric(par[,2])
+  gu$sample_id <- as.numeric(par[,1])
+
+  gu$gene_name <- ud$gene_names[gu$gene_id]
+  gu$sample_name <- ud$sample_names[gu$sample_id]
+
+  m <- ud$proc_ud[, c("sample_id", "individual_id", "individual_org_name",
+                      "gene_name", "condition", "gene_usage_prop")]
+
+  gu <- merge(x = gu, y = m, 
+              by.x = c("sample_id", "gene_name"), 
+              by.y = c("sample_id", "gene_name"))
+
+  # remove unused vars
+  gu$gene_id <- NULL
+  gu$sample_id <- NULL
+  rownames(gu) <- NULL
+  return(gu)
+}
+
diff --git a/R/utils_usage.R b/R/utils_usage.R
@@ -106,7 +106,7 @@ get_model <- function(has_replicates, has_conditions, debug = FALSE) {
                 "sigma_individual", "sigma_replicate",
                 "beta_sample", "beta_individual", "beta_condition",
                 "Yhat_rep", "Yhat_rep_prop", "Yhat_condition_prop", 
-                "log_lik")
+                "log_lik", "theta")
       model_name <- "GU_rep"
     } 
     else {
@@ -119,7 +119,7 @@ get_model <- function(has_replicates, has_conditions, debug = FALSE) {
                 "sigma_individual",
                 "beta_individual", "beta_condition",
                 "Yhat_rep", "Yhat_rep_prop", "Yhat_condition_prop", 
-                "log_lik")
+                "log_lik", "theta")
       model_name <- "GU"
     }
   } 
@@ -134,7 +134,7 @@ get_model <- function(has_replicates, has_conditions, debug = FALSE) {
                 "sigma_condition", "sigma_individual", "sigma_replicate",
                 "beta_sample", "beta_individual", "beta_condition", 
                 "Yhat_rep", "Yhat_rep_prop", "Yhat_condition_prop", 
-                "log_lik", "dgu", "dgu_prob")
+                "log_lik", "dgu", "dgu_prob", "theta")
       model_name <- "DGU_rep"
     } 
     else {
@@ -147,7 +147,7 @@ get_model <- function(has_replicates, has_conditions, debug = FALSE) {
                 "sigma_condition", "sigma_individual",
                 "beta_individual", "beta_condition", 
                 "Yhat_rep", "Yhat_rep_prop", "Yhat_condition_prop", 
-                "log_lik", "dgu", "dgu_prob")
+                "log_lik", "dgu", "dgu_prob", "theta")
       model_name <- "DGU"
     }
   }

diff --git a/man/DGU.Rd b/man/DGU.Rd
@@ -58,7 +58,7 @@ of HDI), H (high boundary of HDI); 2) contrast = direction of the effect; 3)
 pmax = probability of DGU. This summary is only available if the input data
 contains at least two conditions}
 \item{gu}{gene usage (GU) summary of each gene in each condition}
-\item{glm}{stanfit object}
+\item{fit}{stanfit object}
 \item{ppc}{two types of posterior predictive checks: 1) repertoire-
 specific, 2) condition-specific}
 \item{ud}{processed gene usage data used for the model}
@@ -86,7 +86,7 @@ M <- DGU(ud = d_zibb_2,
          max_treedepth = 10)
 
 # look at DGU results
-head(M$glm)
+head(M$fit)
 
 # look at posterior predictive checks (PPC)
 head(M$ppc)

diff --git a/vignettes/User_Manual.Rmd b/vignettes/User_Manual.Rmd
@@ -28,6 +28,7 @@ require(ggforce)
 require(gridExtra)
 require(ggrepel)
 require(reshape2)
+require(MASS)
 ```
 
 
@@ -221,15 +222,15 @@ summary(M)
   * none found
 
 ```{r}
-rstan::check_hmc_diagnostics(M$glm)
+rstan::check_hmc_diagnostics(M$fit)
 ```
 
   * rhat < 1.03 and n_eff > 0
 
 
 ```{r, fig.height = 3, fig.width = 6}
-gridExtra::grid.arrange(rstan::stan_rhat(object = M$glm),
-                        rstan::stan_ess(object = M$glm),
+gridExtra::grid.arrange(rstan::stan_rhat(object = M$fit),
+                        rstan::stan_ess(object = M$fit),
                         nrow = 1)
 ```
 
@@ -271,7 +272,8 @@ ggplot(data = M$ppc$ppc_condition)+
   theme_bw(base_size = 11)+
   theme(legend.position = "top")+
   xlab(label = "Observed usage [%]")+
-  ylab(label = "PPC usage [%]")
+  ylab(label = "PPC usage [%]")+
+  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.4))
 ```
 
 
@@ -288,7 +290,7 @@ For `es` we also have the mean, median standard error (se), standard
 deviation (sd), L (low bound of 95% HDI), H (high bound of 95% HDI)
 
 ```{r}
-kable(x = head(M$dgu), row.names = FALSE, digits = 3)
+kable(x = head(M$dgu), row.names = FALSE, digits = 2)
 ```
 
 ### DGU: differential gene usage
@@ -352,6 +354,7 @@ ggplot()+
                 position = position_dodge(width = 0.35), width = 0.15)+
   theme_bw(base_size = 11)+
   theme(legend.position = "top")+
+  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.4))+
   ylab(label = "PPC usage [%]")+
   xlab(label = '')
 ```
@@ -371,7 +374,8 @@ ggplot()+
   theme_bw(base_size = 11)+
   theme(legend.position = "top")+
   ylab(label = "PPC usage [count]")+
-  xlab(label = '')
+  xlab(label = '')+
+  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.4))
 ```
 
 ## GU: gene usage summary
@@ -389,7 +393,8 @@ ggplot(data = M$gu)+
              position = position_dodge(width = 0.4))+
   theme_bw(base_size = 11)+
   theme(legend.position = "top")+
-  ylab(label = "GU [probability]")
+  ylab(label = "GU [probability]")+
+  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.4))
 ```
 
 # Leave-one-out (LOO) analysis
@@ -439,7 +444,8 @@ ggplot(data = L_dgu)+
              position = position_dodge(width = 0.5))+
   theme_bw(base_size = 11)+
   theme(legend.position = "top")+
-  ylab(expression(gamma))
+  ylab(expression(gamma))+
+  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.4))
 ```
 
 ## LOO-DGU: variability of $\pi$
@@ -451,13 +457,14 @@ ggplot(data = L_dgu)+
              position = position_dodge(width = 0.5))+
   theme_bw(base_size = 11)+
   theme(legend.position = "top")+
-  ylab(expression(pi))
+  ylab(expression(pi))+
+  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.4))
 ```
 
 
 ## LOO-GU: variability of the gene usage
 
-```{r, fig.width=6.5, fig.height=4}
+```{r, fig.width=6, fig.height=4}
 ggplot(data = L_gu)+
   geom_hline(yintercept = 0, linetype = "dashed", col = "gray")+
   geom_errorbar(aes(x = gene_name, y = prob_mean, ymin = prob_L,
@@ -469,9 +476,29 @@ ggplot(data = L_gu)+
              position = position_dodge(width = 0.5))+
   theme_bw(base_size = 11)+
   theme(legend.position = "top")+
-  ylab("GU [probability]")
+  ylab("GU [probability]")+
+  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.4))
 ```
 
+# Multidimensional scaling (MDS) analaysis based 
+
+```{r, fig.width=5, fig.height=4}
+# x <- M$theta
+x <- acast(individual_id~gene_name, 
+           data = M$theta, value.var = "theta_mean")
+
+nmds <- isoMDS(dist(x), k=2)
+
+x <- data.frame(x = nmds$points[,1], y = nmds$points[,2], id = row.names(x))
+
+ggplot(data = x)+
+  geom_point(aes(x = x, y = y), shape = 21, size = 2)+
+  geom_text_repel(aes(x = x, y = y, label = id), 
+                  size = 3.5, min.segment.length = 0)+
+  theme_bw()
+```
+
+
 # Session
 
 ```{r}