diff --git a/lect17-scRNA-topic-model/Setup.R b/lect17-scRNA-topic-model/Setup.R new file mode 100644 index 0000000..ebb6081 --- /dev/null +++ b/lect17-scRNA-topic-model/Setup.R @@ -0,0 +1,231 @@ +options(stringsAsFactors = FALSE) + +`%&%` <- function(a,b) paste0(a, b) +`%r%` <- function(mat,rr) mat[rr, , drop = FALSE] +`%c%` <- function(mat,cc) mat[, cc, drop = FALSE] + +library(tidyverse) +library(data.table) +library(ggrepel) + +num.int <- function(...) format(..., justify="none", big.mark=",", drop0trailing = TRUE) + +num.sci <- function(...) format(..., justify="none", digits=2, scientific = TRUE) + +row.order <- function(mat) { + require(cba) + require(proxy) + + if(nrow(mat) < 3) { + return(1:nrow(mat)) + } + + D = proxy::dist(mat, method <- function(a,b) 1 - cor(a,b, method = 'spearman')) + D[!is.finite(D)] = 0 + h.out = hclust(D) + o.out = cba::order.optimal(D, h.out$merge) + return(o.out$order) +} + +col.order <- function(pair.tab, .ro, ret.tab = FALSE) { + + M = pair.tab %>% + dplyr::select(row, col, weight) %>% + dplyr::mutate(row = factor(row, .ro)) %>% + tidyr::spread(key = col, value = weight, fill = 0) + + co = order(apply(M[, -1], 2, which.max), decreasing = TRUE) + .co = colnames(M)[-1][co] + if(ret.tab) { + ret = pair.tab %>% + dplyr::mutate(row = factor(row, .ro)) %>% + dplyr::mutate(col = factor(col, .co)) + } else { + ret = .co + } + return(ret) +} + +order.pair <- function(pair.tab, ret.tab=FALSE) { + + require(tidyr) + require(dplyr) + + .tab = pair.tab %>% dplyr::select(row, col, weight) + + M = .tab %>% tidyr::spread(key = col, value = weight, fill = 0) + rr = M[, 1] %>% unlist(use.names = FALSE) + cc = colnames(M)[-1] %>% unlist(use.names = FALSE) + + ## log.msg('Built the Mat: %d x %d', nrow(M), ncol(M)) + ro = row.order(M %>% dplyr::select(-row) %>% as.matrix()) + + ## log.msg('Sort the rows: %d', length(ro)) + co = order(apply(M[ro, -1], 2, which.max), decreasing = TRUE) + + ## co = row.order(t(M %>% dplyr::select(-row) %>% as.matrix())) + ## log.msg('Sort the columns: %d', length(co)) + + if(ret.tab){ + ret = pair.tab %>% + dplyr::mutate(row = factor(row, rr[ro])) %>% + dplyr::mutate(col = factor(col, cc[co])) + } else { + ret = list(rows = rr[ro], cols = cc[co], M = M) + } + + return(ret) +} + + +.sort.matrix <- function(.X) { + as.matrix(.X) %>% + reshape2::melt() %>% + rename(row = Var1, col = Var2, weight = value) %>% + order.pair(ret.tab=TRUE) %>% + as.data.table %>% + dcast(row ~ col, value.var = "weight") %>% + dplyr::select(-row) %>% + as.matrix +} + +.rnorm <- function(d1, d2) { + matrix(rnorm(d1 * d2), d1, d2) +} + +############################################################### +.matshow <- function(.mat, .lab=0, .axis.lab=0, .lw=0, .scale=TRUE) { + + library(ggrastr) + + .mat <- as.matrix(.mat) + .cols <- colnames(.mat) + .rows <- rownames(.mat) + if(length(.cols) < ncol(.mat)){ + colnames(.mat) <- str_c("c", 1:ncol(.mat)) + } + if(length(.rows) < nrow(.mat)){ + rownames(.mat) <- str_c("r", 1:nrow(.mat)) + } + .cols <- colnames(.mat) + .rows <- rownames(.mat) + + .dt <- + as.data.frame(.mat) %>% + rownames_to_column("row") %>% + as.data.table %>% + melt(id.vars = "row", variable.name = "col") %>% + dplyr::mutate(row = factor(as.character(row), rev(.rows))) %>% + dplyr::mutate(col = factor(as.character(col), .cols)) + + ret <- + ggplot(.dt, aes(y = row, x = col, fill = value)) + + theme(legend.position = "none") + if(.axis.lab > 0){ + ret <- + ret + + theme(axis.text.x = element_text(size=.axis.lab, angle=90, vjust=1, hjust=1)) + + theme(axis.text.y = element_text(size=.axis.lab)) + } else { + ret <- + ret + + theme(axis.text = element_blank()) + + theme(axis.ticks = element_blank()) + } + ret <- ret + + theme(axis.title = element_blank()) + + if(.lw > 0){ + ret <- ret + + ggrastr::rasterise(geom_tile(linewidth = .lw, colour = "black"), dpi=300) + } else { + ret <- ret + ggrastr::rasterise(geom_tile(), dpi=300) + } + + if(.scale){ + ret <- ret + + scale_fill_distiller(palette = "RdBu", direction = -1) + } else { + ret <- ret + + scale_fill_distiller(palette = "Greys", direction = 1) + } + + if(.lab > 0) { + ret <- ret + + geom_text(aes(label = round(value,1)), size = .lab) + } + + return(ret) +} + +################################################################ +.gg.plot <- function(...) { + ggplot(...) + + theme_classic() + + theme(axis.title = element_text(size=8)) + + theme(axis.text = element_text(size=6)) + + theme(legend.spacing = unit(.1, "lines"), + legend.key.size = unit(.5, "lines"), + legend.text = element_text(size=5), + legend.title = element_text(size=6), + panel.background = element_rect(fill='transparent'), + plot.background = element_rect(fill='transparent', color=NA), + legend.background = element_rect(fill='transparent', linewidth=0.05), + legend.box.background = element_rect(fill='transparent')) +} + +as.dt <- function(x, col.names=NULL) { + .mat <- as.matrix(x) + if(is.null(col.names)) col.names <- str_c("V",1:ncol(.mat)) + colnames(.mat) <- col.names + as.data.table(.mat) +} + +################################################################ +if.needed <- function(.file, .code) { + if(!all(file.exists(unlist(.file)))){ .code } + stopifnot(all(file.exists(unlist(.file)))) +} + +################################################################ +setup.env <- function(fig.dir) { + + ## save figures here + dir.create(fig.dir, showWarnings = FALSE, recursive = TRUE) + knitr::opts_chunk$set(warning = FALSE, message = FALSE, fig.path = fig.dir) + knitr::opts_chunk$set(echo=FALSE, fig.align="center") + + ## allow the code to chunk set size="tiny" ## + hook.chunk <- knitr::knit_hooks$get("chunk") + + ## Default: normalsize -> scriptsize + ## This will redefine normalsize + knitr::knit_hooks$set(chunk = function(x, options) { + x <- hook.chunk(x, options) + ifelse(options$size != "normalsize", + paste0("\n \\", options$size, "\n\n", x, "\n\n \\normalsize"), + paste0("\n \\", "scriptsize", "\n\n", x, "\n\n \\normalsize")) + }) + + ## show plot one by one in beamer ## + hook.plot <- knitr::knit_hooks$get("plot") + + knitr::knit_hooks$set(plot = function(x, options) { + if (!is.null(options$onslide.plot)) { + bf <- paste0("\\onslide<", options$onslide.plot, ">{") + ret <- paste(c(bf, knitr::hook_plot_tex(x, options), "}"), + collapse = "\n") + return(ret) + } else if (!is.null(options$only.plot)) { + bf <- paste0("\\only<", options$only.plot, ">{") + ret <- paste(c(bf, knitr::hook_plot_tex(x, options), "}"), + collapse = "\n") + return(ret) + } + return(hook.plot(x, options)) + }) + + knitr::opts_chunk$set(tidy.opts = list(width.cutoff = 60), tidy = TRUE) + knitr::opts_chunk$set(dev = "cairo_pdf") +} + diff --git a/lect17-scRNA-topic-model/Util.R b/lect17-scRNA-topic-model/Util.R new file mode 100644 index 0000000..0726bb2 --- /dev/null +++ b/lect17-scRNA-topic-model/Util.R @@ -0,0 +1,169 @@ +sort.theta <- function(.dt){ + + .co <- order(apply(as.matrix(.dt[, -1]), 1, which.max)) + .to <- colnames(.dt)[-1] + + .melt <- .dt %>% + mutate(col = factor(1:n(), rev(.co))) %>% + melt(id.vars = c("col", "celltype")) %>% + mutate(variable = factor(variable, .to)) + + .cto <- + .melt[, + .(value = mean(value)), + by = .(celltype, variable)] %>% + dcast(celltype ~ variable, value.var = "value") %>% + (function(x){ + .mat <- as.matrix(x[,-1]) + .o <- order(apply(.mat, 1, which.max), decreasing = T) + rev(x$celltype[.o]) + }) + + .melt[, celltype := factor(celltype, .cto)] + return(.melt) +} + +read.theta <- function(.file){ + return(sort.theta(fread(.file))) +} + +plot.theta <- function(.file, with.ct = F){ + .dt <- read.theta(.file) + .thm <- + theme(axis.text.y = element_blank()) + + theme(axis.ticks.y = element_blank()) + + theme(legend.title = element_blank()) + + theme(legend.position = c(1,1)) + + theme(legend.justification = c(1,1)) + + ret <- + ggplot(.dt, aes(variable, col, fill = value)) + + ggrastr::rasterise(geom_tile(), dpi=300) + + scale_fill_distiller(direction=1) + + .thm + + if(with.ct){ + ret <- ret + + theme(strip.text.y = element_text(angle=0, hjust=0)) + + facet_grid(celltype ~ ., scales="free", space="free") + } + return(ret) +} + +################################################################ + +sort.beta <- function(.dt, genes.selected){ + + .genes <- rownames(.dt) + .mat <- as.matrix(.dt) + colnames(.mat) <- 1:ncol(.mat) + rownames(.mat) <- .genes + + .mat <- .mat[rownames(.mat) %in% genes.selected, , drop = F] + .melt <- reshape2::melt(.mat) %>% as.data.table() + .melt <- .melt[order(`value`, decreasing = T), head(.SD, 1), + by = .(Var1, Var2)] # remove redundancy + + .mat <- dcast(.melt, Var1 ~ Var2, value.var="value", fun.aggregate = max) + .rows <- unlist(.mat[,1], use.names = F) + .mat <- .mat[,-1] + + .ro <- apply(t(.mat), 2, which.max) %>% + order(decreasing = F) %>% + (function(x) .rows[x]) + .to <- colnames(.mat) + + .melt[, row := factor(`Var1`, .ro)] + .melt[, variable := factor(`Var2`, .to)] + + return(.melt) +} + +read.beta <- function(.file, ...){ + return(sort.beta(fread(.file), ...)) +} + +################################################################ +make.gs.lol <- function(.dt) { + .dt <- as.data.table(.dt) %>% unique() + .list <- + .dt[, .(gene = .(gene_symbol)), by = .(gs_name)] %>% + as.list() + .names <- .list$gs_name + .ret <- .list$gene + names(.ret) <- .names + return(.ret) +} + +run.beta.gsea <- function(.dt, .db){ + + vars <- unique(.dt$Var2) + ret <- data.table() + for(v in vars){ + + .scores <- as.vector(scale(.dt[Var2 == v]$value)) + names(.scores) <- .dt[Var2 == v]$Var1 + + .gsea <- fgsea::fgsea(pathways = .db, + stats = .scores, + scoreType = "pos") %>% + as.data.table() + .gsea[, Var2 := v] + ret <- rbind(ret, .gsea) + } + return(ret) +} + +read.panglao <- function(.organs = NULL, .file = "../data/PanglaoDB_markers_27_Mar_2020.tsv.gz"){ + + if(is.null(.organs)){ + .dt <- fread(.file) + } else { + .dt <- fread(.file)[organ %in% .organs] + } + + ret <- .dt %>% + rename(gene_symbol = `official gene symbol`, gs_name = `cell type`) %>% + mutate(gs_name = gsub(pattern="[ ]*cells", replacement = "", gs_name)) %>% + make.gs.lol() +} + +select.gsea.beta <- function(.gsea, .beta, ntop = 3, nmax = 30, sd.cutoff = 1e-4, padj.cutoff = .1){ + + ## cell types (pathways) to show + .cts <- unlist(.gsea[order(pval), + head(.SD, ntop), + by = .(Var2)][padj < padj.cutoff, + .(pathway)]) + + .gsea.show <- .gsea[pathway %in% .cts] + .gsea.show[, variable := factor(`Var2`, 0:100)] # 100 should be enough + + .temp <- dcast(.gsea.show, variable ~ pathway, + value.var = "pval", + fun.aggregate = min) + + .po <- order(apply(-log10(1e-20 + .temp[,-1]), 2, which.max)) + .po <- colnames(.temp)[-1][.po] + .gsea.show[, ct := factor(pathway, .po)] + + ## genes to show + .sd <- .beta[, .(s = sd(`value`)), by = .(Var1)] + + .genes.show <- intersect(unlist(.gsea.show$leadingEdge), + .sd[s > sd.cutoff]$Var1) + .beta.show <- .beta[Var1 %in% .genes.show] + + ## leading edges to show + .temp <- copy(.gsea.show) + .temp[, r := as.character(.I)] + .leading.edges <- .temp[, leadingEdge[[1]], by=r] %>% + left_join(.temp[, .(r, ct)]) %>% + mutate(row = factor(V1, sort(unique(.beta.show$row)))) + + .leading.genes <- .beta.show[row %in% .leading.edges$row][order(value, decreasing = T), head(.SD, 10), by = .(variable)][order(value, decreasing = T)]$row + .leading.genes <- head(unique(.leading.genes), nmax) + + list(gsea = .gsea.show, beta = .beta.show, leading.edges = .leading.edges, + leading.genes = .leading.genes) +} diff --git a/lect17-scRNA-topic-model/lect17-scRNA-topic-model-combined.pdf b/lect17-scRNA-topic-model/lect17-scRNA-topic-model-combined.pdf new file mode 100644 index 0000000..d210f0e Binary files /dev/null and b/lect17-scRNA-topic-model/lect17-scRNA-topic-model-combined.pdf differ diff --git a/lect17-scRNA-topic-model/lect17-scRNA-topic-model.Rmd b/lect17-scRNA-topic-model/lect17-scRNA-topic-model.Rmd new file mode 100644 index 0000000..3fe8bda --- /dev/null +++ b/lect17-scRNA-topic-model/lect17-scRNA-topic-model.Rmd @@ -0,0 +1,1272 @@ +--- +title: "Statistical Methods for single cell data analysis 2" +author: "Yongjin Park, UBC Path + Stat, BC Cancer" +date: "`r format(Sys.time(), '%d %B %Y')`" +mainfont: Roboto +classoption: "aspectratio=169" +fontsize: 12pt +output: + powerpoint_presentation: + reference_doc: "_template.pptx" + html_document: + self_contained: true + beamer_presentation: + theme: Madrid + keep_md: true + keep_tex: true + latex_engine: xelatex + slide_level: 2 +header-includes: + - \usepackage{cancel} + - \definecolor{UBCblue}{rgb}{0.04706, 0.13725, 0.26667} + - \usecolortheme[named=UBCblue]{structure} + - \setbeamertemplate{frametitle}{\color{UBCblue}\bfseries\insertframetitle\par\vskip-6pt\hrulefill} + - \setbeamercolor{title in head/foot}{bg=white,fg=gray} + - \setbeamercolor{section in head/foot}{bg=white,fg=gray} + - \setbeamercolor{author in head/foot}{bg=white,fg=gray} + - \setbeamercolor{date in head/foot}{bg=white,fg=gray} + - \setbeamertemplate{page number in head/foot}{} + - \setbeamertemplate{frame numbering}[none] + - \setbeamercolor{alerted text}{bg=yellow} + - \AtBeginSection[]{\begin{frame}\frametitle{Today's lecture}{\Large\tableofcontents[currentsection]}\end{frame}} + - | + \makeatletter + \def\ps@titlepage{% + \setbeamertemplate{footline}{} + } + \addtobeamertemplate{title page}{\thispagestyle{titlepage}}{} + \makeatother + \include{toc} +--- + + +```{r setup, include=FALSE} +setwd("~/work/teaching/stat540_lectures/lect17-scRNA-topic-model/") +library(data.table) +library(tidyverse) +library(patchwork) +library(matrixTests) +library(mmutilR) +library(asapR) +source("Setup.R") +source("Util.R") +fig.dir <- "../Fig/scRNA_topic/" +dir.create(fig.dir, showWarnings=FALSE) +setup.env(fig.dir) +dir.create("../data", showWarnings=FALSE) +library(extrafont) +library(xkcd) +extrafont::font_import(pattern="[X/x]kcd", prompt=F) +extrafont::loadfonts() +theme_set(theme_xkcd() + + theme(title = element_text(size=10), + legend.background = element_blank()) + ) +``` + +## + +\large + +Source code available: + +`https://github.com/stat540-UBC/lectures` + +## Overview of single-cell data analysis + +\centerline{\includegraphics[width=\textwidth]{img/scRNA_overview/Karchenko_1.pdf}} + +\vfill + +\flushright +\tiny +Karchenko, \emph{Nature Methods} (2021) + +## Overview of single-cell data analysis cont'd + +\centerline{\includegraphics[width=\textwidth]{img/scRNA_overview/Karchenko_2.pdf}} + +\vfill + +\flushright +\tiny +Karchenko, \emph{Nature Methods} (2021) + +# (review) Principal Component Analysis + +## The goal is to find hidden patterns + +:::::: {.columns} +::: {.column width=.5} + + +```{r echo=FALSE} +U <- .rnorm(100, 2); V <- .rnorm(100, 2) +X <- U %*% t(V) + .rnorm(100, 100) +``` + +A data matrix was generated by the following: + +$$\mathbf{x}_{i} = \mathbf{u}_{i} V^{\top} + \epsilon_{i}$$ + +for all $i$ with $\epsilon_{i} \sim \mathcal{N}\!\left(\mathbf{0},I\right)$ + +```{r echo=FALSE} +X <- .sort.matrix(X) +``` + +```{r echo=FALSE, fig.width=2.5, fig.height=2, onslide.plot="1-"} +.matshow(X, .lab=0, .lw=0) +``` + +::: +::: {.column width=.5} + +```{r echo=FALSE} +.svd <- rsvd::rsvd(X, k=5) +u.hat <- sweep(.svd$u, 2, .svd$d, `*`) +``` + +Can we reverse engineer the $U$ and $V$ matrices from the data $X$? + +$$X \to U V^{\top}$$ + +```{r echo=FALSE, fig.width=2.7, fig.height=2.3, only.plot="2"} +p1 <- .matshow(u.hat, .lab=0, .lw=0) + + ggtitle("features") +p.times <- ggplot() + theme_void() + + geom_text(aes(0,0,label="X"), size=5) +p2 <- wrap_plots(.matshow(t(.svd$v), .lab=0, .lw=0), + ggplot(), + ncol=1, heights=c(1,9)) + + ggtitle("feature loading") +wrap_plots(list(p1, p.times, p2), nrow=1, widths=c(1,2,8)) +``` + +::: +:::::: + + +## Working Example: Pancreatic cells (Muraro _et al._ 2016) + +:::::: {.columns} +::: {.column width=.4} + +\centerline{\includegraphics[width=.9\linewidth]{img/Muraro_GSE85241.png}} + +::: +::: {.column width=.6} + +```{r read_raw_data_gse85241} +raw.data <- fileset.list("../data/GSE85241/GSE85241") +.info <- rcpp_mmutil_info(raw.data$mtx) +cell.index <- rcpp_mmutil_read_index(raw.data$idx) +X <- round(read.sparse(raw.data$mtx)) +.rows <- fread(raw.data$row, header=F) +.rows[, c("gene","chr") := tstrsplit(`V1`, split="[_]+", keep=1:2)] +rownames(X) <- .rows$gene +colnames(X) <- readLines(raw.data$col) +## remove ERCC spike in +.ercc <- which(str_detect(rownames(X), "ERCC")) +X <- X[-.ercc, , drop = F] +``` + +It's a sparse matrix... + +${}$ + +```{r echo = T} +X[1:10, 1:5] +``` + +* rows: `r num.int(nrow(X))` genes +* columns: `r num.int(ncol(X))` cells + +::: +:::::: + +## Why do we need to reduce dimensionality? + +\large + +\begin{enumerate} +\item<2-> High dimensionality, robust downstream analysis +\item<3-> Pattern recognition: meta genes, meta cells, etc. +\item<4-> Reduce unwanted variation of outliers +\item<5> Aggregate weak contributions of many features +\end{enumerate} + +## Visualization with or without dimensionality reduction + +```{r} +.file <- fig.dir %&% "/lect17_tSNE_raw_pca.rds" + +if.needed(.file, { + set.seed(1) + xx <- apply(log1p(t(as.matrix(X))), 2, scale) + .tsne.raw <- Rtsne::Rtsne(xx, num_threads=16, pca=F, check_duplicates=F, verbose=T) + .umap.raw <- uwot::umap(xx, n_threads = 16, init = "random", verbose=T) + + set.seed(1) + .rsvd <- rsvd::rsvd(xx, k = 10) + .tsne.pca <- Rtsne::Rtsne(.rsvd$u, num_threads=16, pca=F, check_duplicates=F, verbose=T) + .umap.pca <- uwot::umap(.rsvd$u, n_threads = 16, init = "random", verbose=T) + + saveRDS(list(tsne.raw = .tsne.raw$Y, + tsne.pca = .tsne.pca$Y, + umap.raw = .umap.raw, + umap.pca = .umap.pca), + .file) +}) +.tsne <- readRDS(.file) +``` + +```{r} +nnz <- apply(X > 0, 2, sum) +``` + +```{r fig.width=6, fig.height=3, only.plot="1"} +p1 <- ggplot(as.data.frame(cbind(.tsne$tsne.raw, nnz)), aes(V1, V2, color=nnz)) + + ggrastr::rasterise(geom_point(stroke=0, alpha=.7, size=.7), dpi=300) + + scale_color_distiller(palette = "Spectral", direction=-1, trans="sqrt", guide="none") + + ggtitle("tSNE on the raw data") + +p2 <- ggplot(as.data.frame(cbind(.tsne$tsne.pca, nnz)), aes(V1, V2, color=nnz)) + + ggrastr::rasterise(geom_point(stroke=0, alpha=.7, size=.7), dpi=300) + + scale_color_distiller(palette = "Spectral", direction=-1, trans="sqrt", guide="none") + + ggtitle("tSNE on the top 10 PC") + +p1 | p2 +``` + +```{r fig.width=6, fig.height=3, only.plot="2"} +p1 <- ggplot(as.data.frame(cbind(.tsne$umap.raw, nnz)), aes(V1, V2, color=nnz)) + + ggrastr::rasterise(geom_point(stroke=0, alpha=.7, size=.7), dpi=300) + + scale_color_distiller(palette = "Spectral", direction=-1, trans="sqrt", guide="none") + + ggtitle("umap on the raw data") + +p2 <- ggplot(as.data.frame(cbind(.tsne$umap.pca, nnz)), aes(V1, V2, color=nnz)) + + ggrastr::rasterise(geom_point(stroke=0, alpha=.7, size=.7), dpi=300) + + scale_color_distiller(palette = "Spectral", direction=-1, trans="sqrt", guide="none") + + ggtitle("umap on the top 10 PC") + +p1 | p2 +``` + +## Let's just use top 200 most variable genes and cells + +```{r} +.sd <- apply(X, 1, sd, na.rm=TRUE) +msv <- order(.sd, decreasing = TRUE)[1:200] +.sd <- apply(X, 2, sd, na.rm=TRUE) +msw <- order(.sd, decreasing = TRUE)[1:200] +``` + +Top 200 genes and cells + +:::::: {.columns} +::: {.column width=.25} + +```{r fig.width=2, fig.height=2, only.plot="1"} +x.sub <- as.matrix(X)[msv, msw, drop = FALSE] +.matshow(x.sub, .scale=FALSE, .lab=0, .lw=0) +``` + +```{r fig.width=2, fig.height=2, only.plot="2"} +x.sub <- t(scale(t(log1p(x.sub)))) +.matshow(x.sub, .scale=TRUE, .lab=0, .lw=0) +``` + +::: +::: {.column width=.65} + +* We will call such a high-dimensional matrix $X$ ($m \times n$) + +* X has $m$=`r num.int(nrow(X))` rows (transcripts/genes/features) + +* X has $n$=`r num.int(ncol(X))` columns (cells/#data points) + +* The rows were log-transformed and scaled by `scale()` for visualization + +* Each cell is a `r num.int(nrow(X))`-dimensional vector! + +* Each gene is a `r num.int(ncol(X))`-dimensional vector... + +::: +:::::: + + +## 1-dimensional representations/summary of data matrix + +:::::: {.columns} +::: {.column width=.25} + +Top 200 genes and cells: + +```{r fig.width=2, fig.height=2} +.matshow(x.sub, .scale=FALSE, .lab=0, .lw=0) +``` + +::: +::: {.column width=.65} + +```{r fig.width=4, fig.height=1.25, onslide.plot="1-"} +ggplot(as.data.table(X[msv[1],]), + aes(x=colnames(X), y=V1)) + + ggtitle("Most variable cell (column):") + + ylab("expression") + + xlab("cells") + + theme(axis.text.y = element_text(size=4)) + + theme(axis.text.x = element_blank()) + + theme(axis.ticks.x = element_blank()) + + geom_point(pch=21) +``` + +```{r fig.width=4, fig.height=1.25, onslide.plot="2"} +ggplot(as.data.table(X[,msw[1]]), + aes(x=1:nrow(X), y=V1)) + + ggtitle("Most variable gene (row):") + + ylab("expression") + + xlab("genes") + + theme(axis.text.y = element_text(size=4)) + + theme(axis.text.x = element_blank()) + + theme(axis.ticks.x = element_blank()) + + geom_point(pch=21, size=.2, alpha=.5) +``` + +::: +:::::: + + +## Can you see some common patterns? + +:::::: {.columns} +::: {.column width=.45} + +```{r fig.width=2, fig.height=2.5, onslide.plot="1-"} +.matshow(x.sub, .scale=F, .lab=0, .lw=0) + + ggtitle("200 genes and cells") +``` + +::: +::: {.column width=.45} + +```{r fig.width=2, fig.height=2.5, onslide.plot="2"} +x.sorted <- .sort.matrix(x.sub) +.matshow(x.sorted, .scale=F, .lab=0, .lw=0) + + ggtitle("Rearrange them...") +``` + +::: +:::::: + +## PCA: How do we recover "common" patterns from data? + +Principal Component Analysis + +:::::: {.columns} +::: {.column width=.45} + +::: {.block} + +### (Pearson 1901) + +* Projection [of the original data] that minimizes the projection cost between the original and projected + +* The cost = mean squared distance + +::: + +::: +::: {.column width=.45} + +::: {.block} + +### (Hotelling 1933) + +* Orthogonal projection of data into a lower-dimensional **[principal]** sub-space, + +* such that the total **variation of the projected** is maximized + +::: + +::: +:::::: + +## Let's see how much the first eigenvector can explain + +:::::: {.columns} +::: {.column width=.35} + +```{r fig.width=2, fig.height=2.5, only.plot="1"} +.matshow(x.sorted, .scale=F, .lab=0, .lw=0) + + ggtitle("Top 200 genes and cells") +``` + +```{r} +.eigen <- eigen(x.sorted %*% t(x.sorted)) +U <- .eigen$vector[,1,drop=FALSE] +.lm <- lm(x.sorted ~ U -1) +``` + +```{r fig.width=2, fig.height=2.5, only.plot="2"} +.matshow(.lm$fitted.values, .lab=0, .lw=0) + + ggtitle("eigen proj.") +``` + +```{r fig.width=2, fig.height=2.5, only.plot="3"} +.matshow(x.sorted, .scale=F, .lab=0, .lw=0) + + ggtitle("Top 200 genes and 200 cells") +``` + +::: +::: {.column width=.65} + +```{r fig.width=3.2, fig.height=2.7, only.plot="2"} +p0 <- ggplot() + theme_void() +p.eq <- ggplot() + theme_void() + geom_text(aes(0,0,label="="), size=10) +p.prod <- ggplot() + theme_void() + geom_text(aes(0,0,label="x"), size=10) +p1 <- .matshow(U, .lab=0, .lw=0, .scale=TRUE) + + ggtitle("e-vec") +p2 <- .matshow(matrix(.lm$coefficients, nrow=1), + .lab=0, .lw=0, .scale=TRUE) + + ggtitle("loading") +p2 <- wrap_plots(p2, p0, ncol=1, heights = c(1,13)) +wrap_plots(p.eq, p0, p1, p0, p.prod, p2, nrow=1, widths=c(1, 1, 1, 1, 1, 11)) +``` + +```{r fig.width=2, fig.height=2.5, only.plot="3"} +var.exp <- mean(apply(.lm$fitted, 2, var) / apply(x.sorted, 2, var)) +.matshow(.lm$fitted.values, .lab=0, .lw=0) + + ggtitle(round(var.exp*100) %&% "%") +``` + +::: +:::::: + + +## Top two eigen-vectors + +:::::: {.columns} +::: {.column width=.35} + +```{r fig.width=2, fig.height=2.5, only.plot="1"} +.matshow(x.sorted, .scale=F, .lab=0, .lw=0) + + ggtitle("Top 200 genes and 200 cells") +``` + +```{r} +U <- .eigen$vector[,1:2,drop=FALSE] +.lm <- lm(x.sorted ~ U -1) +``` + +```{r fig.width=2, fig.height=2.5, only.plot="2"} +.matshow(.lm$fitted.values, .lab=0, .lw=0) + + ggtitle("eigen proj.") +``` + +```{r fig.width=2, fig.height=2.5, only.plot="3"} +.matshow(x.sorted, .scale=F, .lab=0, .lw=0) + + ggtitle("Top 200 genes and 200 cells") +``` + +::: +::: {.column width=.65} + +```{r fig.width=3.2, fig.height=2.7, only.plot="2"} +p0 <- ggplot() + theme_void() +p.eq <- ggplot() + theme_void() + geom_text(aes(0,0,label="="), size=10) +p.prod <- ggplot() + theme_void() + geom_text(aes(0,0,label="x"), size=10) +p1 <- .matshow(U, .lab=0, .lw=0, .scale=TRUE) + + ggtitle("e-vec") +p2 <- .matshow(as.matrix(.lm$coefficients), + .lab=0, .lw=0, .scale=TRUE) + + ggtitle("loading") +p2 <- wrap_plots(p2, p0, ncol=1, heights = c(1,13)) +wrap_plots(p.eq, p0, p1, p0, p.prod, p2, nrow=1, widths=c(1, 1, 1, 1, 1, 11)) +``` + +```{r fig.width=2, fig.height=2.5, only.plot="3"} +var.exp <- mean(apply(.lm$fitted, 2, var) / apply(x.sorted, 2, var)) +.matshow(.lm$fitted.values, .lab=0, .lw=0) + + ggtitle(round(var.exp*100) %&% "%") +``` + +::: +:::::: + +## Top three eigen-vectors + +:::::: {.columns} +::: {.column width=.35} + +```{r fig.width=2, fig.height=2.5, only.plot="1"} +.matshow(x.sorted, .scale=F, .lab=0, .lw=0) + + ggtitle("Top 200 genes and 200 cells") +``` + +```{r} +U <- .eigen$vector[,1:3,drop=FALSE] +.lm <- lm(x.sorted ~ U -1) +``` + +```{r fig.width=2, fig.height=2.5, only.plot="2"} +.matshow(.lm$fitted.values, .lab=0, .lw=0) + + ggtitle("eigen proj.") +``` + +```{r fig.width=2, fig.height=2.5, only.plot="3"} +.matshow(x.sorted, .scale=F, .lab=0, .lw=0) + + ggtitle("Top 200 genes and 200 cells") +``` + +::: +::: {.column width=.65} + +```{r fig.width=3.2, fig.height=2.7, only.plot="2"} +p0 <- ggplot() + theme_void() +p.eq <- ggplot() + theme_void() + geom_text(aes(0,0,label="="), size=10) +p.prod <- ggplot() + theme_void() + geom_text(aes(0,0,label="x"), size=10) +p1 <- .matshow(U, .lab=0, .lw=0, .scale=TRUE) + + ggtitle("e-vec") +p2 <- .matshow(as.matrix(.lm$coefficients), + .lab=0, .lw=0, .scale=TRUE) + + ggtitle("loading") +p2 <- wrap_plots(p2, p0, ncol=1, heights = c(1,13)) +wrap_plots(p.eq, p0, p1, p0, p.prod, p2, nrow=1, widths=c(1, 1, 1, 1, 1, 11)) +``` + +```{r fig.width=2, fig.height=2.5, only.plot="3"} +var.exp <- mean(apply(.lm$fitted, 2, var) / apply(x.sorted, 2, var)) +.matshow(.lm$fitted.values, .lab=0, .lw=0) + + ggtitle(round(var.exp*100) %&% "%") +``` + +::: +:::::: + +## Singular Value Decomposition can find a PCA solution + +::: {.block} +### Singular Value Decomposition + +SVD identifies three matrices of $X$: + +$$X = U D V^{\top}$$ + +where both $U$ and $V$ vectors are orthonormal, +namely, + +- $U^{\top}U = I$, $\mathbf{u}_{k}^{\top}\mathbf{u}_{k}=1$ for all $k$, + +- $V^{\top}V = I$, $\mathbf{v}_{k}^{\top}\mathbf{v}_{k} = 1$ for all $k$. +::: + +## Run SVD to find principal components + +```{r echo=TRUE} +svd.out <- rsvd::rsvd(x.sorted, k = 7) +U <- svd.out$u; D <- diag(svd.out$d); V <- svd.out$v +``` + +:::::: {.columns} +::: {.column width=.25} + +```{r fig.width=2, fig.height=2.3} +.matshow(x.sorted, .scale=F, .lab=0, .lw=0) + + ggtitle("data matrix") +``` + +::: +::: {.column width=.65} + +```{r fig.width=3.2, fig.height=2.5} +p0 <- ggplot() + theme_void() + +p1 <- .matshow(U, .lab=0, .lw=.0, .scale=TRUE) + + ggtitle("U") + +p.D <- .matshow(D, .lab=0, .lw=.0, .scale=TRUE) + + ggtitle("D") + +p.V <- .matshow(t(V), .lab=0, .lw=.0, .scale=TRUE) + + ggtitle("transpose(V)") + +p.eq <- ggplot() + theme_void() + geom_text(aes(0,0,label="="), size=6) +p.prod <- ggplot() + theme_void() + geom_text(aes(0,0,label="x"), size=5) +p.prod <- wrap_plots(p.prod, p0, ncol=1, heights=c(1,8)) + +p2 <- wrap_plots(p.D, p0, ncol=1, heights=c(1,8)) +p3 <- wrap_plots(p.V, p0, ncol=1, heights=c(1,8)) + +wrap_plots(p.eq, p0, p1, p.prod, p2, p.prod, p3, + nrow=1, + widths=c(1, 1, 1, 1, 1, 1, 8)) +``` + +::: +:::::: + +## SVD decomposes covariance calculation + +:::::: {.columns} +::: {.column width=.45} + +::: {.block} +### Singular Value Decomposition + +SVD identifies three matrices of $X$: + +$$X = U D V^{\top}$$ + +where both $U$ and $V$ vectors are orthonormal, +namely, + +- $U^{\top}U = I$, $\mathbf{u}_{k}^{\top}\mathbf{u}_{k}=1$ for all $k$, + +- $V^{\top}V = I$, $\mathbf{v}_{k}^{\top}\mathbf{v}_{k} = 1$ for all $k$. + +::: + +::: +::: {.column width=.45} + +::: {.block} + +### Covariance by SVD + +Covariance across the columns (samples) + +$$X^{\top}X/m = V D^{2} V^{\top}/m$$ + +Covariance across the rows (genes) + +$$XX^{\top}/n = U D^{2} U^{\top}/n$$ + +::: + +*Remark*: standardized matrix + +::: +:::::: + +## Review: SVD finds eigenvectors of covariance + +\small + +\begin{eqnarray*} +\onslide<1->{ + \underbrace{\left( \frac{1}{m} X^{\top}X \right)}_{\textsf{\color{blue} sample covariance}} \mathbf{v}_{1} + &=& + \frac{1}{m} \left(\mathbf{v}_{1}, \mathbf{v}_{2},\ldots, \mathbf{v}_{k}\right) + \left( + \begin{array}{l l l l} + D_{1}^{2} & 0 & \ldots & \ldots \\ + 0 & D_{2}^{2} & 0 & \ldots \\ + 0 & \ldots & \ddots & 0 \\ + 0 & \ldots & 0 & D_{k}^{2} \\ + \end{array} \right) + \left( + \begin{array}{l} + \mathbf{v}_{1}^{\top}\\ + \mathbf{v}_{2}^{\top}\\ + \vdots\\ + \mathbf{v}_{k}^{\top} + \end{array} + \right) + \mathbf{v}_{1} \\ + } + \onslide<2->{ + &=& + \frac{1}{m} \left(\mathbf{v}_{1}, \mathbf{v}_{2},\ldots, \mathbf{v}_{k}\right) + \left( + \begin{array}{l l l l} + D_{1}^{2} & 0 & \ldots & \ldots \\ + 0 & D_{2}^{2} & 0 & \ldots \\ + 0 & \ldots & \ddots & 0 \\ + 0 & \ldots & 0 & D_{k}^{2} \\ + \end{array} \right) + \left( + \begin{array}{l} + 1 \\ + 0 \\ + \vdots\\ + 0 + \end{array} + \right) \\ + } +\onslide<4>{ +\boldsymbol{\hat{\Sigma}} \, \mathbf{v}_{1} +} +\onslide<3->{ + &=& + \frac{1}{m} \left(\mathbf{v}_{1}, \mathbf{v}_{2},\ldots, \mathbf{v}_{k}\right) + \left( + \begin{array}{l} + D_{1}^{2} \\ + 0 \\ + 0 \\ + 0 \\ + \end{array} \right)} + \onslide<4>{ + = + \underbrace{\frac{D_{1}^{2}}{m}}_{\textsf{\color{red}eigenvalue}} + \underbrace{\mathbf{v}_{1}}_{\textsf{\color{red}eigenvector}} + } +\end{eqnarray*} + + +## Eigen values $\lambda$'s quantify the proportion of variance + +Given this SVD, $X = U D V^{\top}$, where $V = [\mathbf{v}_{1},\ldots,\mathbf{v}_{k}]$, + +:::::: {.columns} +::: {.column width=.55} + +```{r fig.width=3.2, fig.height=2.5} +v.plots <- lapply(1:ncol(V), + function(k) { + list(.matshow(V[,k,drop=F]) + ggtitle("V[," %&% k %&% "]"), + ggplot() + theme_void()) + }) + +wrap_plots(unlist(v.plots, recursive = F), + widths = rep(c(2,1),7), nrow = 1) +``` +::: +::: {.column width=.45} + +$V$ is orthonormal (orthogonal and normalized): + +```{r fig.width=1.8, fig.height=2} +.matshow(t(V) %*% V, .scale=F, .lw=.1, .lab=2) + ggtitle("transpose(V) * V") +``` + +::: +:::::: + +## Eigen values $\lambda$'s quantify the proportion of variance + +Let's consider an element $(i,j)$ of the covariance + +$$\frac{1}{m} \left( X^{\top}X \right)_{ij} + = + \frac{1}{m} \left(V_{i1}, V_{i2},\ldots, V_{ik}\right) + \left( + \begin{array}{l l l l} + D_{1}^{2} & 0 & \ldots & \ldots \\ + 0 & D_{2}^{2} & 0 & \ldots \\ + 0 & \ldots & \ddots & 0 \\ + 0 & \ldots & 0 & D_{k}^{2} \\ + \end{array} \right) + \left( + \begin{array}{l} + V_{j1}\\ + V_{j2}\\ + \vdots\\ + V_{jk} + \end{array} + \right)$$ + + +:::::: {.columns} +::: {.column width=.2} +::: +::: {.column width=.3} + +```{r fig.width=1.5, fig.height=.4} +.matshow(V[1,,drop = F], .lab=2) +``` + +::: +::: {.column width=.05} +$${}$$ +$$\times$$ +::: +::: {.column width=.3} + +```{r fig.width=1.5, fig.height=1.5} +.matshow(D, .lab=2) +``` + +::: +::: {.column width=.05} +$${}$$ +$$\times$$ +::: +::: {.column width=.1} + +```{r fig.width=.4, fig.height=1.5} +.matshow(t(V[1,,drop = F]), .lab=2) +``` + +::: +:::::: + + +## Eigen values $\lambda$'s quantify the proportion of variance + +Let's consider an element $(i,j)$ of the covariance + +$$\frac{1}{m} \left( X^{\top}X \right)_{ij} + = + \frac{1}{m} \left(V_{i1}, V_{i2},\ldots, V_{ik}\right) + \left( + \begin{array}{l l l l} + D_{1}^{2} & 0 & \ldots & \ldots \\ + 0 & D_{2}^{2} & 0 & \ldots \\ + 0 & \ldots & \ddots & 0 \\ + 0 & \ldots & 0 & D_{k}^{2} \\ + \end{array} \right) + \left( + \begin{array}{l} + V_{j1}\\ + V_{j2}\\ + \vdots\\ + V_{jk} + \end{array} + \right)$$ + +Therefore, + +$$\hat{\Sigma}_{ij} = \frac{D_{1}^{2}}{m} V_{i1} V_{j1} + \frac{D_{2}^{2}}{m} V_{i2} V_{j2} + \ldots + +\frac{D_{k}^{2}}{m} V_{ik} V_{jk}$$ + +We have $k$ terms for the covariance between $i$ and $j$. + +## Sample covariance matrix + +$$\hat{\Sigma} = X^{\top}X/m$$ + +```{r fig.width=2.5, fig.height=2.5} +.matshow(cov(x.sorted), .lab = 0) +``` + +## PCs (SVs) decompose the covariance matrix + +```{r} +p.eq <- ggplot() + theme_void() + geom_text(aes(0,0,label="="), size=10) + +eigen.values <- svd.out$d^2 / (nrow(x.sorted) - 1) +.show.cov.pc <- function(j) { + p0 <- ggplot() + theme_void() + geom_text(aes(0,0,label="+"),size=5) + .title <- round(eigen.values[j], 2) %&% "%" + p <- .matshow(V[,j,drop=FALSE] %*% t(V[,j,drop=FALSE]), .lab = 0) + + ggtitle(.title) + theme(title = element_text(size=8)) + wrap_plots(p, p0, nrow=1, widths=c(8,2)) +} + +v.prod <- lapply(1:ncol(V), .show.cov.pc) +``` + +$\hat{\Sigma} = X^{\top}X/m +\onslide<2->{ += V D^{2} V^{\top} / m +} +\onslide<3>{ += \sum_{k=1} \lambda_{k} \mathbf{v}_{k} \mathbf{v}_{k}^{\top},\quad \lambda_{k} +} +\onslide<3>{ +=D_{k}^{2}/m +}$ + +:::::: {.columns} +::: {.column width=.25} + +```{r fig.width=1.25, fig.height=1.65} +.matshow(cov(x.sorted), .lab = 0) + ggtitle("n x n") +``` + +::: + +::: {.column width=.02} +$${\large\boldsymbol{=}}$$ +::: + +::: {.column width=.75} + +```{r fig.width=4.3, fig.height=1.65} +wrap_plots(v.prod[1:3], nrow = 1) +``` + +* How much variance is explained by each component? + +::: +:::::: + +## How much variance is explained by each principal component? + +```{r fig.width=5, fig.height=2.5, only.plot="1"} +.dt <- data.table(eigen.values) %>% mutate(k=1:n()) %>% as.data.table +vtot <- sum(.dt$eigen.values) +.dt[, pr := eigen.values / vtot] +.dt[, cpr := cumsum(pr)] +ggplot(.dt, aes(k, eigen.values, fill=eigen.values)) + + geom_line() + + ylab("eigenvalues (lambda)") + + xlab("principal components") + + geom_point(pch=21) + + scale_fill_distiller(palette = "RdPu", direction=1, guide="none") +``` + +```{r fig.width=5, fig.height=2.5, only.plot="2"} +ggplot(.dt, aes(x=k, fill=eigen.values)) + + geom_line(aes(y=pr)) + + geom_text_repel(aes(y=pr, label=round(100*pr) %&% "%"), + data = .dt[1:10, ], + size = 3, nudge_x = 10, nudge_y = .1, + segment.lw = .2, + segment.color = 2, + vjust=0, hjust=0, colour=2) + + geom_text_repel(aes(y=cpr, label=round(100*cpr) %&% "%"), + data = .dt[2:5,], + size = 3, nudge_x = 10, nudge_y = 0, + segment.lw = .2, + segment.color = "gray20", + vjust=0, hjust=0, colour="gray40") + + geom_line(aes(y=cpr), lty=2) + + geom_point(aes(y=pr), pch=21) + + ylab("% var. explained\n(lambda/sum(lambda))") + + xlab("principal components") + + scale_fill_distiller(palette = "RdPu", direction=1, guide="none") +``` + +# Non-negative matrix factorization + +## + +\large + +SVD works great. Why do we need another factorization method? + +\normalsize + +## Why another factorization method? + +:::::: {.columns} +::: {.column width=.45} + +* The count matrix $X$ is non-negative. + +```{r echo = T, size = "large"} +sum(X < 0) +``` + +$${}$$ + +* Singular vectors are signed. + + - The "positive" and "negative" values can cancel each other and become "zero." + +* "Negative" contributions are difficult to interpret. + +::: +::: {.column width=.45} + +```{r fig.width=3, fig.height=1.75} +par(cex=.5) +hist(U[,1], 20, main="U", xlab="") +``` + +```{r fig.width=3, fig.height=1.75} +par(cex=.5) +hist(V[,1], 20, main="V", xlab="") +``` + +::: +:::::: + + +## Non-negative factorization + +```{r} +xx <- .sort.matrix(as.matrix(X[msv, msw])) +rownames(xx) <- rownames(X)[msv] + +rescale.nmf <- function(.beta, .theta) { + uu <- apply(.beta, 2, sum) + beta <- sweep(.beta, 2, uu, `/`) + prop <- sweep(.theta, 2, uu, `*`) + zz <- apply(t(prop), 2, sum) + prop <- sweep(prop, 1, zz, `/`) + list(beta = beta, prop = prop, depth = zz) +} +``` + +```{r echo = T, size="large"} +out <- NMF::nmf(xx, 7) +U <- NMF::basis(out) +V <- t(NMF::coef(out)) +``` + +## NMF naturally induces (1) gene sets (2) cellular topics + +```{r include = FALSE} +nmf.out <- rescale.nmf(NMF::basis(out), t(NMF::coef(out))) +``` + +:::::: {.columns} +::: {.column width=.4} + +```{r fig.width=2, fig.height=2.5} +.matshow(xx, .scale=FALSE) + ggtitle("data (gene X cell)") +``` + +::: +::: {.column width=.05} +$${}$$ +$${\Huge\boldsymbol{\sim}}$$ +::: +::: {.column width=.1} + +```{r fig.width=.7, fig.height=2.5} +.matshow(nmf.out$beta, .scale=FALSE) + ggtitle("U") +``` + +::: +::: {.column width=.05} +$${}$$ +$${\Huge\boldsymbol{\times}}$$ +::: +::: {.column width=.4} + +```{r fig.width=2, fig.height=1} +.matshow(t(nmf.out$prop), .scale=FALSE) + ggtitle("transpose(V)") +``` + +::: +:::::: + +## Several notable `R` packages to fit NMF for single-cell data analysis + +\large + +* `fastTopics`: estimates Poisson NMF and provides plotting and differential expression utility functions + +* `rliger`: provides integrative analysis routines and fast online learning methods; but, it will select features arbitrarily + +## `fastTopics` relates NMF results to topic models + +```{r echo = T} +library(fastTopics) +out <- fit_topic_model(t(xx), k=7) +``` + +\vfill +\flushright +\tiny +Carbonetto .. Stephens, Genome Biology (2023) + +## A structure plot in `fastTopics` package + +```{r echo = T, fig.width=5.5, fig.height=1.8} +structure_plot(out, verbose=F) + xlab("cells") +``` + +\vfill +\flushright +\tiny +Carbonetto .. Stephens, Genome Biology (2023) + +## Poisson NMF by `fastTopics` + +:::::: {.columns} +::: {.column width=.4} + +```{r fig.width=2, fig.height=2.5} +.matshow(xx, .scale=FALSE) + ggtitle("data (gene X cell)") +``` + +::: +::: {.column width=.05} +$${}$$ +$${\Huge\boldsymbol{\sim}}$$ +::: +::: {.column width=.1} + +```{r fig.width=.7, fig.height=2.5} +.matshow(out$F, .scale=FALSE) + ggtitle("gene x factor") +``` + +::: +::: {.column width=.05} +$${}$$ +$${\Huge\boldsymbol{\times}}$$ +::: +::: {.column width=.4} + +```{r fig.width=2, fig.height=1} +.matshow(t(out$L), .scale=FALSE) + ggtitle("factor x cell") +``` + +::: +:::::: + +## `fastTopics`' differential expression analysis to find topic-specific genes + +```{r echo = T} +de.out <- de_analysis(out, t(xx)) +``` + +\vfill +\flushright +\tiny +Carbonetto .. Stephens, Genome Biology (2023) + +## `fastTopics`' differential expression analysis to find topic-specific genes + +```{r fig.width=5.8, fig.height=2.2} +volcano_plot(de.out, 4) | volcano_plot(de.out, 5) +``` + +## `rliger` provides a more scalable option for NMF estimation + +```{r echo = T} +library(rliger) +.liger <- createLiger(list(rna=X)) +.liger <- normalize(.liger) +.liger <- selectGenes(.liger, var.thresh=0.2, do.plot=F) +.liger <- scaleNotCenter(.liger) +.liger <- online_iNMF(.liger, k=7, miniBatch_size=100) +.liger <- rliger::quantile_norm(.liger) +.liger <- rliger::runUMAP(.liger) +.liger <- louvainCluster(.liger, resolution = 0.25) +``` + +\vfill +\flushright +\tiny +Gao .. Welch, Nature Biotech. (2021) + +## `rliger`'s NMF result + +:::::: {.columns} +::: {.column width=.5} + +gene $\times$ factor matrix + +```{r fig.width=1.5, fig.height=2.5} +pheatmap::pheatmap(.liger@B[[1]], show_rownames=F, cluster_cols = F) +``` + +::: +::: {.column width=.5} + +cell $\times$ factor matrix + +```{r fig.width=1.5, fig.height=2.5} +pheatmap::pheatmap(.liger@H.norm, show_rownames=F, cluster_cols = F) +``` + +::: +:::::: + +## `rliger`: clustering cells by the topic/factor loading + +```{r fig.width=6, fig.height=2.7} +.plts <- plotByDatasetAndCluster(.liger, return.plots=T) +p1 <- .plts[[2]] + + scale_color_brewer(palette = "Paired") + + theme_classic() + + ggtitle("Liger NMF factor loading") + +.df <- data.frame(cbind(.tsne$umap.pca, k=.liger@clusters)) +p2 <- ggplot(.df, aes(V1, V2, color=as.factor(k))) + + theme_classic() + + ggrastr::rasterise(geom_point(stroke=0, alpha=.7, size=.7), dpi=300) + + scale_color_brewer(palette = "Paired") + + theme(legend.position = "none") + + xlab("UMAP1") + ylab("UMAP2") + + ggtitle("umap on the top 10 PC") + +p1 | p2 +``` + +## Once we have gene $\times$ factor dictionary + +```{r} +.bulk <- asap_random_bulk(raw.data$mtx, + raw.data$row, + raw.data$col, + raw.data$idx, + 10, + NUM_THREADS=16) + +.nmf <- asap_fit_nmf(.bulk$PB, 7, NUM_THREADS=16) + +.asap <- asap_topic_stat(raw.data$mtx, + raw.data$row, + raw.data$col, + raw.data$idx, + .nmf$log.beta, + .bulk$rownames) + +.asap.nmf <- asap_topic_pmf(.asap$beta, + .asap$corr, + .asap$colsum, + NUM_THREADS=16) + +.asap.topic <- pmf2topic(.asap.nmf$beta, .asap.nmf$theta) +``` + +```{r run_gsea_beta} +.beta <- .asap.topic$beta +.rows <- data.table(.bulk$rownames) +.rows[, c("gene","chr") := tstrsplit(`V1`, split="[_]+", keep=1:2)] +rownames(.beta) <- .rows$gene + +.db <- read.panglao("Pancreas") +.genes <- unique(unlist(.db)) +.dt <- sort.beta(.beta, genes.selected = .genes) +.gsea <- run.beta.gsea(.dt, .db) +.show <- select.gsea.beta(.gsea, .dt, sd.cutoff = 1e-4) +``` + +```{r} +p1 <- + .gg.plot(.show$beta, aes(row, variable, fill=`value`)) + + theme(axis.text.x = element_blank()) + + theme(axis.ticks.x = element_blank()) + + theme(legend.position = "none") + + ggrastr::rasterise(geom_tile(), dpi=300) + + ylab("NMF factors") + xlab("genes") + + scale_fill_distiller(direction=1, trans="sqrt", palette="RdPu") + +p2 <- + .gg.plot(.show$leading.edges, aes(row, ct)) + + theme(axis.text.x = element_blank()) + + theme(axis.ticks.x = element_blank()) + + theme(panel.grid = element_blank()) + + geom_tile(linewidth=0) + + ylab("cell types") + +p3 <- + .gg.plot(.show$gsea, aes(variable, ct, fill=-log10(pmax(padj, 1e-4)))) + + xlab("NMF factors") + ylab("") + + scale_y_discrete(position = "right") + + geom_tile(linewidth=.1, color="black") + + theme(legend.key.width=unit(.2, "lines")) + + theme(legend.text = element_text(size=4)) + + scale_fill_distiller("Padj", palette = "YlGnBu", direction=1, labels=function(x) num.sci(10^(-x))) +``` + +```{r fig.width=4, fig.height=1.5} +print(p1) +``` + +## Annotate topics to cell types by enrichment (`fgsea`) + +```{r fig.width=6, fig.height=3, only.plot="1"} +print(p1/p2) +``` + +```{r fig.width=6, fig.height=3, only.plot="2"} +C <- length(unique(.show$gsea$ct)) +K <- length(unique(.show$beta$variable)) +p0 <- ggplot() + theme_void() +wrap_plots(p1, p0, p2, p3, nrow = 2, heights=c(K,C), widths=c(4,1)) +``` + +# Differential gene expression analysis + +