diff --git a/lect17-scRNA-topic-model/Setup.R b/lect17-scRNA-topic-model/Setup.R
new file mode 100644
index 0000000..ebb6081
--- /dev/null
+++ b/lect17-scRNA-topic-model/Setup.R
@@ -0,0 +1,231 @@
+options(stringsAsFactors = FALSE)
+
+`%&%` <- function(a,b) paste0(a, b)
+`%r%` <- function(mat,rr) mat[rr, , drop = FALSE]
+`%c%` <- function(mat,cc) mat[, cc, drop = FALSE]
+
+library(tidyverse)
+library(data.table)
+library(ggrepel)
+
+num.int <- function(...) format(..., justify="none", big.mark=",", drop0trailing = TRUE)
+
+num.sci <- function(...) format(..., justify="none", digits=2, scientific = TRUE)
+
+row.order <- function(mat) {
+    require(cba)
+    require(proxy)
+
+    if(nrow(mat) < 3) {
+        return(1:nrow(mat))
+    }
+
+    D = proxy::dist(mat, method <- function(a,b) 1 - cor(a,b, method = 'spearman'))
+    D[!is.finite(D)] = 0
+    h.out = hclust(D)
+    o.out = cba::order.optimal(D, h.out$merge)
+    return(o.out$order)
+}
+
+col.order <- function(pair.tab, .ro, ret.tab = FALSE) {
+
+    M = pair.tab %>%
+        dplyr::select(row, col, weight) %>%
+        dplyr::mutate(row = factor(row, .ro)) %>%
+        tidyr::spread(key = col, value = weight, fill = 0)
+
+    co = order(apply(M[, -1], 2, which.max), decreasing = TRUE)
+    .co = colnames(M)[-1][co]
+    if(ret.tab) {
+        ret = pair.tab %>%
+            dplyr::mutate(row = factor(row, .ro)) %>%
+            dplyr::mutate(col = factor(col, .co))
+    } else {
+        ret = .co
+    }
+    return(ret)
+}
+
+order.pair <- function(pair.tab, ret.tab=FALSE) {
+
+    require(tidyr)
+    require(dplyr)
+
+    .tab = pair.tab %>% dplyr::select(row, col, weight)
+
+    M = .tab %>% tidyr::spread(key = col, value = weight, fill = 0)
+    rr = M[, 1] %>% unlist(use.names = FALSE)
+    cc = colnames(M)[-1] %>% unlist(use.names = FALSE)
+
+    ## log.msg('Built the Mat: %d x %d', nrow(M), ncol(M))
+    ro = row.order(M %>% dplyr::select(-row) %>% as.matrix())
+
+    ## log.msg('Sort the rows: %d', length(ro))
+    co = order(apply(M[ro, -1], 2, which.max), decreasing = TRUE)
+
+    ## co = row.order(t(M %>% dplyr::select(-row) %>% as.matrix()))
+    ## log.msg('Sort the columns: %d', length(co))
+
+    if(ret.tab){
+        ret = pair.tab %>%
+            dplyr::mutate(row = factor(row, rr[ro])) %>%
+            dplyr::mutate(col = factor(col, cc[co]))
+    } else {
+        ret = list(rows = rr[ro], cols = cc[co], M = M)
+    }
+
+    return(ret)
+}
+
+
+.sort.matrix <- function(.X) {
+    as.matrix(.X) %>%
+        reshape2::melt() %>%
+        rename(row = Var1, col = Var2, weight = value) %>%
+        order.pair(ret.tab=TRUE) %>%
+        as.data.table %>%
+        dcast(row ~ col, value.var = "weight") %>%
+        dplyr::select(-row) %>%
+        as.matrix
+}
+
+.rnorm <- function(d1, d2) {
+    matrix(rnorm(d1 * d2), d1, d2)
+}
+
+###############################################################
+.matshow <- function(.mat, .lab=0, .axis.lab=0, .lw=0, .scale=TRUE) {
+
+    library(ggrastr)
+
+    .mat <- as.matrix(.mat)
+    .cols <- colnames(.mat)
+    .rows <- rownames(.mat)
+    if(length(.cols) < ncol(.mat)){
+        colnames(.mat) <- str_c("c", 1:ncol(.mat))
+    }
+    if(length(.rows) < nrow(.mat)){
+        rownames(.mat) <- str_c("r", 1:nrow(.mat))
+    }
+    .cols <- colnames(.mat)
+    .rows <- rownames(.mat)
+
+    .dt <-
+        as.data.frame(.mat) %>%
+        rownames_to_column("row") %>% 
+        as.data.table %>%
+        melt(id.vars = "row", variable.name = "col") %>%
+        dplyr::mutate(row = factor(as.character(row), rev(.rows))) %>%
+        dplyr::mutate(col = factor(as.character(col), .cols))
+
+    ret <-
+        ggplot(.dt, aes(y = row, x = col, fill = value)) +
+        theme(legend.position = "none")
+    if(.axis.lab > 0){
+        ret <-
+            ret +
+            theme(axis.text.x = element_text(size=.axis.lab, angle=90, vjust=1, hjust=1)) +
+            theme(axis.text.y = element_text(size=.axis.lab))
+    } else {
+        ret <-
+            ret +
+            theme(axis.text = element_blank()) +
+            theme(axis.ticks = element_blank())
+    }
+    ret <- ret +
+        theme(axis.title = element_blank())
+
+    if(.lw > 0){
+        ret <- ret +
+            ggrastr::rasterise(geom_tile(linewidth = .lw, colour = "black"), dpi=300)
+    } else {
+        ret <- ret + ggrastr::rasterise(geom_tile(), dpi=300)
+    }
+
+    if(.scale){
+        ret <- ret +
+            scale_fill_distiller(palette = "RdBu", direction = -1)
+    } else {
+        ret <- ret +
+            scale_fill_distiller(palette = "Greys", direction = 1)
+    }
+
+    if(.lab > 0) {
+        ret <- ret +
+            geom_text(aes(label = round(value,1)), size = .lab)
+    }
+
+    return(ret)
+}
+
+################################################################
+.gg.plot <- function(...) {
+    ggplot(...) +
+        theme_classic() +
+        theme(axis.title = element_text(size=8)) +
+        theme(axis.text = element_text(size=6)) +
+        theme(legend.spacing = unit(.1, "lines"),
+              legend.key.size = unit(.5, "lines"),
+              legend.text = element_text(size=5),
+              legend.title = element_text(size=6),
+              panel.background = element_rect(fill='transparent'),
+              plot.background = element_rect(fill='transparent', color=NA),
+              legend.background = element_rect(fill='transparent', linewidth=0.05),
+              legend.box.background = element_rect(fill='transparent'))
+}
+
+as.dt <- function(x, col.names=NULL) {
+    .mat <- as.matrix(x)
+    if(is.null(col.names)) col.names <- str_c("V",1:ncol(.mat))
+    colnames(.mat) <- col.names
+    as.data.table(.mat)
+}
+
+################################################################
+if.needed <- function(.file, .code) {
+    if(!all(file.exists(unlist(.file)))){ .code }
+    stopifnot(all(file.exists(unlist(.file))))
+}
+
+################################################################
+setup.env <- function(fig.dir) {
+
+    ## save figures here
+    dir.create(fig.dir, showWarnings = FALSE, recursive = TRUE)
+    knitr::opts_chunk$set(warning = FALSE, message = FALSE, fig.path = fig.dir)
+    knitr::opts_chunk$set(echo=FALSE, fig.align="center")
+
+    ## allow the code to chunk set size="tiny" ##
+    hook.chunk  <- knitr::knit_hooks$get("chunk")
+
+    ## Default: normalsize -> scriptsize
+    ## This will redefine normalsize
+    knitr::knit_hooks$set(chunk = function(x, options) {
+        x <- hook.chunk(x, options)
+        ifelse(options$size != "normalsize",
+               paste0("\n \\", options$size, "\n\n", x, "\n\n \\normalsize"),
+               paste0("\n \\", "scriptsize", "\n\n", x, "\n\n \\normalsize"))
+    })
+
+    ## show plot one by one in beamer ##
+    hook.plot <- knitr::knit_hooks$get("plot")
+
+    knitr::knit_hooks$set(plot = function(x, options) {
+        if (!is.null(options$onslide.plot)) {
+            bf <- paste0("\\onslide<", options$onslide.plot, ">{")
+            ret <- paste(c(bf, knitr::hook_plot_tex(x, options), "}"),
+                         collapse = "\n")
+            return(ret)
+        } else if (!is.null(options$only.plot)) {
+            bf <- paste0("\\only<", options$only.plot, ">{")
+            ret <- paste(c(bf, knitr::hook_plot_tex(x, options), "}"),
+                         collapse = "\n")
+            return(ret)
+        }
+        return(hook.plot(x, options))
+    })
+
+    knitr::opts_chunk$set(tidy.opts = list(width.cutoff = 60), tidy = TRUE)
+    knitr::opts_chunk$set(dev = "cairo_pdf")
+}
+
diff --git a/lect17-scRNA-topic-model/Util.R b/lect17-scRNA-topic-model/Util.R
new file mode 100644
index 0000000..0726bb2
--- /dev/null
+++ b/lect17-scRNA-topic-model/Util.R
@@ -0,0 +1,169 @@
+sort.theta <- function(.dt){
+
+    .co <- order(apply(as.matrix(.dt[, -1]), 1, which.max))
+    .to <- colnames(.dt)[-1]
+
+    .melt <- .dt %>%
+        mutate(col = factor(1:n(), rev(.co))) %>%
+        melt(id.vars = c("col", "celltype")) %>%
+        mutate(variable = factor(variable, .to))
+
+    .cto <-
+        .melt[,
+              .(value = mean(value)),
+              by = .(celltype, variable)] %>%
+        dcast(celltype ~ variable, value.var = "value") %>%
+        (function(x){
+            .mat <- as.matrix(x[,-1])
+            .o <- order(apply(.mat, 1, which.max), decreasing = T)
+            rev(x$celltype[.o])
+        })
+
+    .melt[, celltype := factor(celltype, .cto)]
+    return(.melt)
+}
+
+read.theta <- function(.file){
+    return(sort.theta(fread(.file)))
+}
+
+plot.theta <- function(.file, with.ct = F){
+    .dt <- read.theta(.file)
+    .thm <-
+        theme(axis.text.y = element_blank()) +
+        theme(axis.ticks.y = element_blank()) +
+        theme(legend.title = element_blank()) +
+        theme(legend.position = c(1,1)) +
+        theme(legend.justification = c(1,1))
+
+    ret <-
+        ggplot(.dt, aes(variable, col, fill = value)) +
+        ggrastr::rasterise(geom_tile(), dpi=300) +
+        scale_fill_distiller(direction=1) +
+        .thm
+    
+    if(with.ct){
+        ret <- ret +
+            theme(strip.text.y = element_text(angle=0, hjust=0)) +
+            facet_grid(celltype ~ ., scales="free", space="free")
+    }
+    return(ret)
+}
+
+################################################################
+
+sort.beta <- function(.dt, genes.selected){
+
+    .genes <- rownames(.dt)
+    .mat <- as.matrix(.dt)
+    colnames(.mat) <- 1:ncol(.mat)
+    rownames(.mat) <- .genes
+
+    .mat <- .mat[rownames(.mat) %in% genes.selected, , drop = F]
+    .melt <- reshape2::melt(.mat) %>% as.data.table()
+    .melt <- .melt[order(`value`, decreasing = T), head(.SD, 1),
+                   by = .(Var1, Var2)] # remove redundancy
+
+    .mat <- dcast(.melt, Var1 ~ Var2, value.var="value", fun.aggregate = max)
+    .rows <- unlist(.mat[,1], use.names = F)
+    .mat <- .mat[,-1]
+
+    .ro <- apply(t(.mat), 2, which.max) %>%
+        order(decreasing = F) %>%
+        (function(x) .rows[x])
+    .to <- colnames(.mat)
+
+    .melt[, row := factor(`Var1`, .ro)]
+    .melt[, variable := factor(`Var2`, .to)]
+
+    return(.melt)
+}
+
+read.beta <- function(.file, ...){
+    return(sort.beta(fread(.file), ...))
+}
+
+################################################################
+make.gs.lol <- function(.dt) {
+    .dt <- as.data.table(.dt) %>% unique()
+    .list <-
+        .dt[, .(gene = .(gene_symbol)), by = .(gs_name)] %>%
+        as.list()
+    .names <- .list$gs_name
+    .ret <- .list$gene
+    names(.ret) <- .names
+    return(.ret)
+}
+
+run.beta.gsea <- function(.dt, .db){
+
+    vars <- unique(.dt$Var2)
+    ret <- data.table()
+    for(v in vars){
+
+        .scores <- as.vector(scale(.dt[Var2 == v]$value))
+        names(.scores) <- .dt[Var2 == v]$Var1
+
+        .gsea <- fgsea::fgsea(pathways = .db,
+                              stats = .scores,
+                              scoreType = "pos") %>%
+            as.data.table()
+        .gsea[, Var2 := v]
+        ret <- rbind(ret, .gsea)
+    }
+    return(ret)
+}
+
+read.panglao <- function(.organs = NULL, .file = "../data/PanglaoDB_markers_27_Mar_2020.tsv.gz"){
+
+    if(is.null(.organs)){
+        .dt <- fread(.file)
+    } else {
+        .dt <- fread(.file)[organ %in% .organs]
+    }
+
+    ret <- .dt %>% 
+        rename(gene_symbol = `official gene symbol`, gs_name = `cell type`) %>% 
+        mutate(gs_name = gsub(pattern="[ ]*cells", replacement = "", gs_name)) %>% 
+        make.gs.lol()
+}
+
+select.gsea.beta <- function(.gsea, .beta, ntop = 3, nmax = 30, sd.cutoff = 1e-4, padj.cutoff = .1){
+
+    ## cell types (pathways) to show
+    .cts <- unlist(.gsea[order(pval),
+                         head(.SD, ntop),
+                         by = .(Var2)][padj < padj.cutoff,
+                                       .(pathway)])
+
+    .gsea.show <- .gsea[pathway %in% .cts]
+    .gsea.show[, variable := factor(`Var2`, 0:100)] # 100 should be enough
+
+    .temp <- dcast(.gsea.show, variable ~ pathway,
+                   value.var = "pval",
+                   fun.aggregate = min)
+
+    .po <- order(apply(-log10(1e-20 + .temp[,-1]), 2, which.max))
+    .po <- colnames(.temp)[-1][.po]
+    .gsea.show[, ct := factor(pathway, .po)]
+
+    ## genes to show
+    .sd <- .beta[, .(s = sd(`value`)), by = .(Var1)]
+
+    .genes.show <- intersect(unlist(.gsea.show$leadingEdge),
+                             .sd[s > sd.cutoff]$Var1)
+    .beta.show <- .beta[Var1 %in% .genes.show]
+
+    ## leading edges to show
+    .temp <- copy(.gsea.show)
+    .temp[, r := as.character(.I)]
+    .leading.edges <- .temp[, leadingEdge[[1]], by=r] %>%
+        left_join(.temp[, .(r, ct)]) %>%
+        mutate(row = factor(V1, sort(unique(.beta.show$row))))
+
+    .leading.genes <- .beta.show[row %in% .leading.edges$row][order(value, decreasing = T), head(.SD, 10), by = .(variable)][order(value, decreasing = T)]$row
+    .leading.genes <- head(unique(.leading.genes), nmax)
+
+    list(gsea = .gsea.show, beta = .beta.show, leading.edges = .leading.edges,
+         leading.genes = .leading.genes)
+}
diff --git a/lect17-scRNA-topic-model/lect17-scRNA-topic-model-combined.pdf b/lect17-scRNA-topic-model/lect17-scRNA-topic-model-combined.pdf
new file mode 100644
index 0000000..d210f0e
Binary files /dev/null and b/lect17-scRNA-topic-model/lect17-scRNA-topic-model-combined.pdf differ
diff --git a/lect17-scRNA-topic-model/lect17-scRNA-topic-model.Rmd b/lect17-scRNA-topic-model/lect17-scRNA-topic-model.Rmd
new file mode 100644
index 0000000..3fe8bda
--- /dev/null
+++ b/lect17-scRNA-topic-model/lect17-scRNA-topic-model.Rmd
@@ -0,0 +1,1272 @@
+---
+title: "Statistical Methods for single cell data analysis 2"
+author: "Yongjin Park, UBC Path + Stat, BC Cancer"
+date: "`r format(Sys.time(), '%d %B %Y')`"
+mainfont: Roboto
+classoption: "aspectratio=169"
+fontsize: 12pt
+output:
+    powerpoint_presentation:
+        reference_doc: "_template.pptx"
+    html_document:
+        self_contained: true
+    beamer_presentation:
+        theme: Madrid
+        keep_md: true
+        keep_tex: true
+        latex_engine: xelatex
+        slide_level: 2
+header-includes:
+  - \usepackage{cancel}
+  - \definecolor{UBCblue}{rgb}{0.04706, 0.13725, 0.26667}
+  - \usecolortheme[named=UBCblue]{structure}
+  - \setbeamertemplate{frametitle}{\color{UBCblue}\bfseries\insertframetitle\par\vskip-6pt\hrulefill}
+  - \setbeamercolor{title in head/foot}{bg=white,fg=gray}
+  - \setbeamercolor{section in head/foot}{bg=white,fg=gray}
+  - \setbeamercolor{author in head/foot}{bg=white,fg=gray}
+  - \setbeamercolor{date in head/foot}{bg=white,fg=gray}
+  - \setbeamertemplate{page number in head/foot}{}
+  - \setbeamertemplate{frame numbering}[none]
+  - \setbeamercolor{alerted text}{bg=yellow}
+  - \AtBeginSection[]{\begin{frame}\frametitle{Today's lecture}{\Large\tableofcontents[currentsection]}\end{frame}}
+  - |
+    \makeatletter
+    \def\ps@titlepage{%
+      \setbeamertemplate{footline}{}
+    }
+    \addtobeamertemplate{title page}{\thispagestyle{titlepage}}{}
+    \makeatother
+    \include{toc}
+---
+
+
+```{r setup, include=FALSE}
+setwd("~/work/teaching/stat540_lectures/lect17-scRNA-topic-model/")
+library(data.table)
+library(tidyverse)
+library(patchwork)
+library(matrixTests)
+library(mmutilR)
+library(asapR)
+source("Setup.R")
+source("Util.R")
+fig.dir <- "../Fig/scRNA_topic/"
+dir.create(fig.dir, showWarnings=FALSE)
+setup.env(fig.dir)
+dir.create("../data", showWarnings=FALSE)
+library(extrafont)
+library(xkcd)
+extrafont::font_import(pattern="[X/x]kcd", prompt=F)
+extrafont::loadfonts()
+theme_set(theme_xkcd() +
+          theme(title = element_text(size=10),
+                legend.background = element_blank())
+          )
+```
+
+##
+
+\large
+
+Source code available:
+
+`https://github.com/stat540-UBC/lectures`
+
+## Overview of single-cell data analysis
+
+\centerline{\includegraphics[width=\textwidth]{img/scRNA_overview/Karchenko_1.pdf}}
+
+\vfill
+
+\flushright
+\tiny
+Karchenko, \emph{Nature Methods} (2021)
+
+## Overview of single-cell data analysis cont'd
+
+\centerline{\includegraphics[width=\textwidth]{img/scRNA_overview/Karchenko_2.pdf}}
+
+\vfill
+
+\flushright
+\tiny
+Karchenko, \emph{Nature Methods} (2021)
+
+# (review) Principal Component Analysis
+
+## The goal is to find hidden patterns
+
+:::::: {.columns}
+::: {.column width=.5}
+
+
+```{r echo=FALSE}
+U <- .rnorm(100, 2); V <- .rnorm(100, 2)
+X <- U %*% t(V) + .rnorm(100, 100)
+```
+
+A data matrix was generated by the following:
+
+$$\mathbf{x}_{i} = \mathbf{u}_{i} V^{\top} + \epsilon_{i}$$
+
+for all $i$ with $\epsilon_{i} \sim \mathcal{N}\!\left(\mathbf{0},I\right)$
+
+```{r echo=FALSE}
+X <- .sort.matrix(X)
+```
+
+```{r echo=FALSE, fig.width=2.5, fig.height=2, onslide.plot="1-"}
+.matshow(X, .lab=0, .lw=0)
+```
+
+:::
+::: {.column width=.5}
+
+```{r echo=FALSE}
+.svd <- rsvd::rsvd(X, k=5)
+u.hat <- sweep(.svd$u, 2, .svd$d, `*`)
+```
+
+Can we reverse engineer the $U$ and $V$ matrices from the data $X$?
+
+$$X \to U V^{\top}$$
+
+```{r echo=FALSE, fig.width=2.7, fig.height=2.3, only.plot="2"}
+p1 <- .matshow(u.hat, .lab=0, .lw=0) +
+    ggtitle("features")
+p.times <- ggplot() + theme_void() +
+    geom_text(aes(0,0,label="X"), size=5)
+p2 <- wrap_plots(.matshow(t(.svd$v), .lab=0, .lw=0),
+                 ggplot(),
+                 ncol=1, heights=c(1,9)) +
+    ggtitle("feature loading")
+wrap_plots(list(p1, p.times, p2), nrow=1, widths=c(1,2,8))
+```
+
+:::
+::::::
+
+
+## Working Example: Pancreatic cells (Muraro _et al._ 2016)
+
+:::::: {.columns}
+::: {.column width=.4}
+
+\centerline{\includegraphics[width=.9\linewidth]{img/Muraro_GSE85241.png}}
+
+:::
+::: {.column width=.6}
+
+```{r read_raw_data_gse85241}
+raw.data <- fileset.list("../data/GSE85241/GSE85241")
+.info <- rcpp_mmutil_info(raw.data$mtx)
+cell.index <- rcpp_mmutil_read_index(raw.data$idx)
+X <- round(read.sparse(raw.data$mtx))
+.rows <- fread(raw.data$row, header=F)
+.rows[, c("gene","chr") := tstrsplit(`V1`, split="[_]+", keep=1:2)]
+rownames(X) <- .rows$gene
+colnames(X) <- readLines(raw.data$col)
+## remove ERCC spike in
+.ercc <- which(str_detect(rownames(X), "ERCC"))
+X <- X[-.ercc, , drop = F]
+```
+
+It's a sparse matrix...
+
+${}$
+
+```{r echo = T}
+X[1:10, 1:5]
+```
+
+* rows: `r num.int(nrow(X))` genes
+* columns: `r num.int(ncol(X))` cells
+
+:::
+::::::
+
+## Why do we need to reduce dimensionality?
+
+\large
+
+\begin{enumerate}
+\item<2-> High dimensionality, robust downstream analysis
+\item<3-> Pattern recognition: meta genes, meta cells, etc.
+\item<4-> Reduce unwanted variation of outliers
+\item<5> Aggregate weak contributions of many features
+\end{enumerate}
+
+## Visualization with or without dimensionality reduction
+
+```{r}
+.file <- fig.dir %&% "/lect17_tSNE_raw_pca.rds"
+
+if.needed(.file, {
+    set.seed(1)
+    xx <- apply(log1p(t(as.matrix(X))), 2, scale)
+    .tsne.raw <- Rtsne::Rtsne(xx, num_threads=16, pca=F, check_duplicates=F, verbose=T)
+    .umap.raw <- uwot::umap(xx, n_threads = 16, init = "random", verbose=T)
+
+    set.seed(1)
+    .rsvd <- rsvd::rsvd(xx, k = 10)
+    .tsne.pca <- Rtsne::Rtsne(.rsvd$u, num_threads=16, pca=F, check_duplicates=F, verbose=T)
+    .umap.pca <- uwot::umap(.rsvd$u, n_threads = 16, init = "random", verbose=T)
+
+    saveRDS(list(tsne.raw = .tsne.raw$Y,
+                 tsne.pca = .tsne.pca$Y,
+                 umap.raw = .umap.raw,
+                 umap.pca = .umap.pca),
+            .file)
+})
+.tsne <- readRDS(.file)
+```
+
+```{r}
+nnz <- apply(X > 0, 2, sum)
+```
+
+```{r fig.width=6, fig.height=3, only.plot="1"}
+p1 <- ggplot(as.data.frame(cbind(.tsne$tsne.raw, nnz)), aes(V1, V2, color=nnz)) +
+    ggrastr::rasterise(geom_point(stroke=0, alpha=.7, size=.7), dpi=300) +
+    scale_color_distiller(palette = "Spectral", direction=-1, trans="sqrt", guide="none") +
+    ggtitle("tSNE on the raw data")
+
+p2 <- ggplot(as.data.frame(cbind(.tsne$tsne.pca, nnz)), aes(V1, V2, color=nnz)) +
+    ggrastr::rasterise(geom_point(stroke=0, alpha=.7, size=.7), dpi=300) +
+    scale_color_distiller(palette = "Spectral", direction=-1, trans="sqrt", guide="none") +
+    ggtitle("tSNE on the top 10 PC")
+
+p1 | p2
+```
+
+```{r fig.width=6, fig.height=3, only.plot="2"}
+p1 <- ggplot(as.data.frame(cbind(.tsne$umap.raw, nnz)), aes(V1, V2, color=nnz)) +
+    ggrastr::rasterise(geom_point(stroke=0, alpha=.7, size=.7), dpi=300) +
+    scale_color_distiller(palette = "Spectral", direction=-1, trans="sqrt", guide="none") +
+    ggtitle("umap on the raw data")
+
+p2 <- ggplot(as.data.frame(cbind(.tsne$umap.pca, nnz)), aes(V1, V2, color=nnz)) +
+    ggrastr::rasterise(geom_point(stroke=0, alpha=.7, size=.7), dpi=300) +
+    scale_color_distiller(palette = "Spectral", direction=-1, trans="sqrt", guide="none") +
+    ggtitle("umap on the top 10 PC")
+
+p1 | p2
+```
+
+## Let's just use top 200 most variable genes and cells
+
+```{r}
+.sd <- apply(X, 1, sd, na.rm=TRUE)
+msv <- order(.sd, decreasing = TRUE)[1:200]
+.sd <- apply(X, 2, sd, na.rm=TRUE)
+msw <- order(.sd, decreasing = TRUE)[1:200]
+```
+
+Top 200 genes and cells
+
+:::::: {.columns}
+::: {.column width=.25}
+
+```{r fig.width=2, fig.height=2, only.plot="1"}
+x.sub <- as.matrix(X)[msv, msw, drop = FALSE]
+.matshow(x.sub, .scale=FALSE, .lab=0, .lw=0)
+```
+
+```{r fig.width=2, fig.height=2, only.plot="2"}
+x.sub <- t(scale(t(log1p(x.sub))))
+.matshow(x.sub, .scale=TRUE, .lab=0, .lw=0)
+```
+
+:::
+::: {.column width=.65}
+
+* We will call such a high-dimensional matrix $X$ ($m \times n$)
+
+* X has $m$=`r num.int(nrow(X))` rows (transcripts/genes/features)
+
+* X has $n$=`r num.int(ncol(X))` columns (cells/#data points)
+
+* The rows were log-transformed and scaled by `scale()` for visualization
+
+* Each cell is a `r num.int(nrow(X))`-dimensional vector!
+
+* Each gene is a `r num.int(ncol(X))`-dimensional vector...
+
+:::
+::::::
+
+
+## 1-dimensional representations/summary of data matrix
+
+:::::: {.columns}
+::: {.column width=.25}
+
+Top 200 genes and cells:
+
+```{r fig.width=2, fig.height=2}
+.matshow(x.sub, .scale=FALSE, .lab=0, .lw=0)
+```
+
+:::
+::: {.column width=.65}
+
+```{r fig.width=4, fig.height=1.25, onslide.plot="1-"}
+ggplot(as.data.table(X[msv[1],]),
+         aes(x=colnames(X), y=V1)) +
+    ggtitle("Most variable cell (column):") +
+    ylab("expression") +
+    xlab("cells") +
+    theme(axis.text.y = element_text(size=4)) +
+    theme(axis.text.x = element_blank()) +
+    theme(axis.ticks.x = element_blank()) +
+    geom_point(pch=21)
+```
+
+```{r fig.width=4, fig.height=1.25, onslide.plot="2"}
+ggplot(as.data.table(X[,msw[1]]),
+         aes(x=1:nrow(X), y=V1)) +
+    ggtitle("Most variable gene (row):") +
+    ylab("expression") +
+    xlab("genes") +
+    theme(axis.text.y = element_text(size=4)) +
+    theme(axis.text.x = element_blank()) +
+    theme(axis.ticks.x = element_blank()) +
+    geom_point(pch=21, size=.2, alpha=.5)
+```
+
+:::
+::::::
+
+
+## Can you see some common patterns?
+
+:::::: {.columns}
+::: {.column width=.45}
+
+```{r fig.width=2, fig.height=2.5, onslide.plot="1-"}
+.matshow(x.sub, .scale=F, .lab=0, .lw=0) +
+    ggtitle("200 genes and cells")
+```
+
+:::
+::: {.column width=.45}
+
+```{r fig.width=2, fig.height=2.5, onslide.plot="2"}
+x.sorted <- .sort.matrix(x.sub)
+.matshow(x.sorted, .scale=F, .lab=0, .lw=0) +
+    ggtitle("Rearrange them...")
+```
+
+:::
+::::::
+
+## PCA: How do we recover "common" patterns from data?
+
+Principal Component Analysis
+
+:::::: {.columns}
+::: {.column width=.45}
+
+::: {.block}
+
+### (Pearson 1901)
+
+* Projection [of the original data] that minimizes the projection cost between the original and projected
+
+* The cost = mean squared distance
+
+:::
+
+:::
+::: {.column width=.45}
+
+::: {.block}
+
+### (Hotelling 1933)
+
+* Orthogonal projection of data into a lower-dimensional **[principal]** sub-space,
+
+* such that the total **variation of the projected** is maximized
+
+:::
+
+:::
+::::::
+
+## Let's see how much the first eigenvector can explain
+
+:::::: {.columns}
+::: {.column width=.35}
+
+```{r fig.width=2, fig.height=2.5, only.plot="1"}
+.matshow(x.sorted, .scale=F, .lab=0, .lw=0) +
+    ggtitle("Top 200 genes and cells")
+```
+
+```{r}
+.eigen <- eigen(x.sorted %*% t(x.sorted))
+U <- .eigen$vector[,1,drop=FALSE]
+.lm <- lm(x.sorted ~ U -1)
+```
+
+```{r fig.width=2, fig.height=2.5, only.plot="2"}
+.matshow(.lm$fitted.values, .lab=0, .lw=0) +
+    ggtitle("eigen proj.")
+```
+
+```{r fig.width=2, fig.height=2.5, only.plot="3"}
+.matshow(x.sorted, .scale=F, .lab=0, .lw=0) +
+    ggtitle("Top 200 genes and 200 cells")
+```
+
+:::
+::: {.column width=.65}
+
+```{r fig.width=3.2, fig.height=2.7, only.plot="2"}
+p0 <- ggplot() + theme_void()
+p.eq <- ggplot() + theme_void() + geom_text(aes(0,0,label="="), size=10)
+p.prod <- ggplot() + theme_void() + geom_text(aes(0,0,label="x"), size=10)
+p1 <- .matshow(U, .lab=0, .lw=0, .scale=TRUE) +
+    ggtitle("e-vec")
+p2 <- .matshow(matrix(.lm$coefficients, nrow=1),
+               .lab=0, .lw=0, .scale=TRUE) +
+    ggtitle("loading")
+p2 <- wrap_plots(p2, p0, ncol=1, heights = c(1,13))
+wrap_plots(p.eq, p0, p1, p0, p.prod, p2, nrow=1, widths=c(1, 1, 1, 1, 1, 11))
+```
+
+```{r fig.width=2, fig.height=2.5, only.plot="3"}
+var.exp <- mean(apply(.lm$fitted, 2, var) / apply(x.sorted, 2, var))
+.matshow(.lm$fitted.values, .lab=0, .lw=0) +
+    ggtitle(round(var.exp*100) %&% "%")
+```
+
+:::
+::::::
+
+
+## Top two eigen-vectors
+
+:::::: {.columns}
+::: {.column width=.35}
+
+```{r fig.width=2, fig.height=2.5, only.plot="1"}
+.matshow(x.sorted, .scale=F, .lab=0, .lw=0) +
+    ggtitle("Top 200 genes and 200 cells")
+```
+
+```{r}
+U <- .eigen$vector[,1:2,drop=FALSE]
+.lm <- lm(x.sorted ~ U -1)
+```
+
+```{r fig.width=2, fig.height=2.5, only.plot="2"}
+.matshow(.lm$fitted.values, .lab=0, .lw=0) +
+    ggtitle("eigen proj.")
+```
+
+```{r fig.width=2, fig.height=2.5, only.plot="3"}
+.matshow(x.sorted, .scale=F, .lab=0, .lw=0) +
+    ggtitle("Top 200 genes and 200 cells")
+```
+
+:::
+::: {.column width=.65}
+
+```{r fig.width=3.2, fig.height=2.7, only.plot="2"}
+p0 <- ggplot() + theme_void()
+p.eq <- ggplot() + theme_void() + geom_text(aes(0,0,label="="), size=10)
+p.prod <- ggplot() + theme_void() + geom_text(aes(0,0,label="x"), size=10)
+p1 <- .matshow(U, .lab=0, .lw=0, .scale=TRUE) +
+    ggtitle("e-vec")
+p2 <- .matshow(as.matrix(.lm$coefficients),
+               .lab=0, .lw=0, .scale=TRUE) +
+    ggtitle("loading")
+p2 <- wrap_plots(p2, p0, ncol=1, heights = c(1,13))
+wrap_plots(p.eq, p0, p1, p0, p.prod, p2, nrow=1, widths=c(1, 1, 1, 1, 1, 11))
+```
+
+```{r fig.width=2, fig.height=2.5, only.plot="3"}
+var.exp <- mean(apply(.lm$fitted, 2, var) / apply(x.sorted, 2, var))
+.matshow(.lm$fitted.values, .lab=0, .lw=0) +
+    ggtitle(round(var.exp*100) %&% "%")
+```
+
+:::
+::::::
+
+## Top three eigen-vectors
+
+:::::: {.columns}
+::: {.column width=.35}
+
+```{r fig.width=2, fig.height=2.5, only.plot="1"}
+.matshow(x.sorted, .scale=F, .lab=0, .lw=0) +
+    ggtitle("Top 200 genes and 200 cells")
+```
+
+```{r}
+U <- .eigen$vector[,1:3,drop=FALSE]
+.lm <- lm(x.sorted ~ U -1)
+```
+
+```{r fig.width=2, fig.height=2.5, only.plot="2"}
+.matshow(.lm$fitted.values, .lab=0, .lw=0) +
+    ggtitle("eigen proj.")
+```
+
+```{r fig.width=2, fig.height=2.5, only.plot="3"}
+.matshow(x.sorted, .scale=F, .lab=0, .lw=0) +
+    ggtitle("Top 200 genes and 200 cells")
+```
+
+:::
+::: {.column width=.65}
+
+```{r fig.width=3.2, fig.height=2.7, only.plot="2"}
+p0 <- ggplot() + theme_void()
+p.eq <- ggplot() + theme_void() + geom_text(aes(0,0,label="="), size=10)
+p.prod <- ggplot() + theme_void() + geom_text(aes(0,0,label="x"), size=10)
+p1 <- .matshow(U, .lab=0, .lw=0, .scale=TRUE) +
+    ggtitle("e-vec")
+p2 <- .matshow(as.matrix(.lm$coefficients),
+               .lab=0, .lw=0, .scale=TRUE) +
+    ggtitle("loading")
+p2 <- wrap_plots(p2, p0, ncol=1, heights = c(1,13))
+wrap_plots(p.eq, p0, p1, p0, p.prod, p2, nrow=1, widths=c(1, 1, 1, 1, 1, 11))
+```
+
+```{r fig.width=2, fig.height=2.5, only.plot="3"}
+var.exp <- mean(apply(.lm$fitted, 2, var) / apply(x.sorted, 2, var))
+.matshow(.lm$fitted.values, .lab=0, .lw=0) +
+    ggtitle(round(var.exp*100) %&% "%")
+```
+
+:::
+::::::
+
+## Singular Value Decomposition can find a PCA solution
+
+::: {.block}
+### Singular Value Decomposition
+
+SVD identifies three matrices of $X$:
+
+$$X = U D V^{\top}$$
+
+where both $U$ and $V$ vectors are orthonormal,
+namely, 
+
+- $U^{\top}U = I$, $\mathbf{u}_{k}^{\top}\mathbf{u}_{k}=1$ for all $k$,
+
+- $V^{\top}V = I$, $\mathbf{v}_{k}^{\top}\mathbf{v}_{k} = 1$ for all $k$.
+:::
+
+## Run SVD to find principal components
+
+```{r echo=TRUE}
+svd.out <- rsvd::rsvd(x.sorted, k = 7)
+U <- svd.out$u; D <- diag(svd.out$d); V <- svd.out$v
+```
+
+:::::: {.columns}
+::: {.column width=.25}
+
+```{r fig.width=2, fig.height=2.3}
+.matshow(x.sorted, .scale=F, .lab=0, .lw=0) +
+    ggtitle("data matrix")
+```
+
+:::
+::: {.column width=.65}
+
+```{r fig.width=3.2, fig.height=2.5}
+p0 <- ggplot() + theme_void()
+
+p1 <- .matshow(U, .lab=0, .lw=.0, .scale=TRUE) +
+    ggtitle("U")
+
+p.D <- .matshow(D, .lab=0, .lw=.0, .scale=TRUE) +
+    ggtitle("D")
+
+p.V <- .matshow(t(V), .lab=0, .lw=.0, .scale=TRUE) +
+    ggtitle("transpose(V)")
+
+p.eq <- ggplot() + theme_void() + geom_text(aes(0,0,label="="), size=6)
+p.prod <- ggplot() + theme_void() + geom_text(aes(0,0,label="x"), size=5)
+p.prod <- wrap_plots(p.prod, p0, ncol=1, heights=c(1,8))
+
+p2 <- wrap_plots(p.D, p0, ncol=1, heights=c(1,8))
+p3 <- wrap_plots(p.V, p0, ncol=1, heights=c(1,8))
+
+wrap_plots(p.eq, p0, p1, p.prod, p2, p.prod, p3,
+     nrow=1,
+     widths=c(1, 1, 1, 1, 1, 1, 8))
+```
+
+:::
+::::::
+
+## SVD decomposes covariance calculation
+
+:::::: {.columns}
+::: {.column width=.45}
+
+::: {.block}
+### Singular Value Decomposition
+
+SVD identifies three matrices of $X$:
+
+$$X = U D V^{\top}$$
+
+where both $U$ and $V$ vectors are orthonormal,
+namely, 
+
+- $U^{\top}U = I$, $\mathbf{u}_{k}^{\top}\mathbf{u}_{k}=1$ for all $k$,
+
+- $V^{\top}V = I$, $\mathbf{v}_{k}^{\top}\mathbf{v}_{k} = 1$ for all $k$.
+
+:::
+
+:::
+::: {.column width=.45}
+
+::: {.block}
+
+### Covariance by SVD
+
+Covariance across the columns (samples)
+
+$$X^{\top}X/m = V D^{2} V^{\top}/m$$
+
+Covariance across the rows (genes)
+
+$$XX^{\top}/n = U D^{2} U^{\top}/n$$
+
+:::
+
+*Remark*: standardized matrix
+
+:::
+::::::
+
+## Review: SVD finds eigenvectors of covariance
+
+\small
+
+\begin{eqnarray*}
+\onslide<1->{
+	\underbrace{\left( \frac{1}{m} X^{\top}X \right)}_{\textsf{\color{blue} sample covariance}} \mathbf{v}_{1}
+	&=& 
+	\frac{1}{m} \left(\mathbf{v}_{1}, \mathbf{v}_{2},\ldots, \mathbf{v}_{k}\right)
+      \left(
+      \begin{array}{l l l l}
+        D_{1}^{2} & 0 & \ldots & \ldots \\
+        0 & D_{2}^{2} & 0 & \ldots \\
+        0 & \ldots & \ddots & 0 \\
+        0 & \ldots & 0 & D_{k}^{2} \\
+      \end{array} \right)
+  \left(
+  \begin{array}{l}
+    \mathbf{v}_{1}^{\top}\\
+    \mathbf{v}_{2}^{\top}\\
+    \vdots\\
+    \mathbf{v}_{k}^{\top}
+  \end{array}
+  \right)
+  \mathbf{v}_{1} \\
+	}
+	\onslide<2->{
+  &=&
+      \frac{1}{m} \left(\mathbf{v}_{1}, \mathbf{v}_{2},\ldots, \mathbf{v}_{k}\right)
+      \left(
+      \begin{array}{l l l l}
+        D_{1}^{2} & 0 & \ldots & \ldots \\
+        0 & D_{2}^{2} & 0 & \ldots \\
+        0 & \ldots & \ddots & 0 \\
+        0 & \ldots & 0 & D_{k}^{2} \\
+      \end{array} \right)
+  \left(
+  \begin{array}{l}
+    1 \\
+    0 \\
+    \vdots\\
+    0
+  \end{array}
+  \right) \\
+  }
+\onslide<4>{
+\boldsymbol{\hat{\Sigma}} \, \mathbf{v}_{1}
+}
+\onslide<3->{
+  &=&
+      \frac{1}{m} \left(\mathbf{v}_{1}, \mathbf{v}_{2},\ldots, \mathbf{v}_{k}\right)
+      \left(
+      \begin{array}{l}
+        D_{1}^{2} \\
+        0 \\
+        0 \\
+        0 \\
+      \end{array} \right)}
+	  \onslide<4>{
+	  =
+  \underbrace{\frac{D_{1}^{2}}{m}}_{\textsf{\color{red}eigenvalue}}
+  \underbrace{\mathbf{v}_{1}}_{\textsf{\color{red}eigenvector}}
+  }
+\end{eqnarray*}
+
+
+## Eigen values $\lambda$'s quantify the proportion of variance
+
+Given this SVD, $X = U D V^{\top}$, where $V = [\mathbf{v}_{1},\ldots,\mathbf{v}_{k}]$,
+
+:::::: {.columns}
+::: {.column width=.55}
+
+```{r fig.width=3.2, fig.height=2.5}
+v.plots <- lapply(1:ncol(V),
+                  function(k) {
+                      list(.matshow(V[,k,drop=F]) + ggtitle("V[," %&% k %&% "]"),
+                           ggplot() + theme_void())
+                  })
+
+wrap_plots(unlist(v.plots, recursive = F),
+           widths = rep(c(2,1),7), nrow = 1)
+```
+:::
+::: {.column width=.45}
+
+$V$ is orthonormal (orthogonal and normalized):
+
+```{r fig.width=1.8, fig.height=2}
+.matshow(t(V) %*% V, .scale=F, .lw=.1, .lab=2) + ggtitle("transpose(V) * V")
+```
+
+:::
+::::::
+
+## Eigen values $\lambda$'s quantify the proportion of variance
+
+Let's consider an element $(i,j)$ of the covariance
+
+$$\frac{1}{m} \left( X^{\top}X \right)_{ij}
+	= 
+	\frac{1}{m} \left(V_{i1}, V_{i2},\ldots, V_{ik}\right)
+      \left(
+      \begin{array}{l l l l}
+        D_{1}^{2} & 0 & \ldots & \ldots \\
+        0 & D_{2}^{2} & 0 & \ldots \\
+        0 & \ldots & \ddots & 0 \\
+        0 & \ldots & 0 & D_{k}^{2} \\
+      \end{array} \right)
+  \left(
+  \begin{array}{l}
+    V_{j1}\\
+    V_{j2}\\
+    \vdots\\
+    V_{jk}
+  \end{array}
+  \right)$$
+
+
+:::::: {.columns}
+::: {.column width=.2}
+:::
+::: {.column width=.3}
+
+```{r fig.width=1.5, fig.height=.4}
+.matshow(V[1,,drop = F], .lab=2)
+```
+
+:::
+::: {.column width=.05}
+$${}$$
+$$\times$$
+:::
+::: {.column width=.3}
+
+```{r fig.width=1.5, fig.height=1.5}
+.matshow(D, .lab=2)
+```
+
+:::
+::: {.column width=.05}
+$${}$$
+$$\times$$
+:::
+::: {.column width=.1}
+
+```{r fig.width=.4, fig.height=1.5}
+.matshow(t(V[1,,drop = F]), .lab=2)
+```
+
+:::
+::::::
+
+
+## Eigen values $\lambda$'s quantify the proportion of variance
+
+Let's consider an element $(i,j)$ of the covariance
+
+$$\frac{1}{m} \left( X^{\top}X \right)_{ij}
+	= 
+	\frac{1}{m} \left(V_{i1}, V_{i2},\ldots, V_{ik}\right)
+      \left(
+      \begin{array}{l l l l}
+        D_{1}^{2} & 0 & \ldots & \ldots \\
+        0 & D_{2}^{2} & 0 & \ldots \\
+        0 & \ldots & \ddots & 0 \\
+        0 & \ldots & 0 & D_{k}^{2} \\
+      \end{array} \right)
+  \left(
+  \begin{array}{l}
+    V_{j1}\\
+    V_{j2}\\
+    \vdots\\
+    V_{jk}
+  \end{array}
+  \right)$$
+
+Therefore,
+
+$$\hat{\Sigma}_{ij} = \frac{D_{1}^{2}}{m} V_{i1} V_{j1} + \frac{D_{2}^{2}}{m} V_{i2} V_{j2} + \ldots +
+\frac{D_{k}^{2}}{m} V_{ik} V_{jk}$$
+
+We have $k$ terms for the covariance between $i$ and $j$.
+
+## Sample covariance matrix
+
+$$\hat{\Sigma} = X^{\top}X/m$$
+
+```{r fig.width=2.5, fig.height=2.5}
+.matshow(cov(x.sorted), .lab = 0)
+```
+
+## PCs (SVs) decompose the covariance matrix
+
+```{r}
+p.eq <- ggplot() + theme_void() + geom_text(aes(0,0,label="="), size=10)
+
+eigen.values <- svd.out$d^2 / (nrow(x.sorted) - 1)
+.show.cov.pc <- function(j) {
+    p0 <- ggplot() + theme_void() + geom_text(aes(0,0,label="+"),size=5)
+    .title <- round(eigen.values[j], 2) %&% "%"
+    p <- .matshow(V[,j,drop=FALSE] %*% t(V[,j,drop=FALSE]), .lab = 0) +
+        ggtitle(.title) + theme(title = element_text(size=8))
+    wrap_plots(p, p0, nrow=1, widths=c(8,2))
+}
+
+v.prod <- lapply(1:ncol(V), .show.cov.pc)
+```
+
+$\hat{\Sigma} = X^{\top}X/m
+\onslide<2->{
+= V D^{2} V^{\top} / m
+}
+\onslide<3>{
+= \sum_{k=1} \lambda_{k} \mathbf{v}_{k} \mathbf{v}_{k}^{\top},\quad \lambda_{k}
+}
+\onslide<3>{
+=D_{k}^{2}/m
+}$
+
+:::::: {.columns}
+::: {.column width=.25}
+
+```{r fig.width=1.25, fig.height=1.65}
+.matshow(cov(x.sorted), .lab = 0) + ggtitle("n x n")
+```
+
+:::
+
+::: {.column width=.02}
+$${\large\boldsymbol{=}}$$
+:::
+
+::: {.column width=.75}
+
+```{r fig.width=4.3, fig.height=1.65}
+wrap_plots(v.prod[1:3], nrow = 1)
+```
+
+* How much variance is explained by each component?
+
+:::
+::::::
+
+## How much variance is explained by each principal component?
+
+```{r fig.width=5, fig.height=2.5, only.plot="1"}
+.dt <- data.table(eigen.values) %>% mutate(k=1:n()) %>% as.data.table
+vtot <- sum(.dt$eigen.values)
+.dt[, pr := eigen.values / vtot]
+.dt[, cpr := cumsum(pr)]
+ggplot(.dt, aes(k, eigen.values, fill=eigen.values)) +
+    geom_line() +
+    ylab("eigenvalues (lambda)") +
+    xlab("principal components") +
+    geom_point(pch=21) +
+    scale_fill_distiller(palette = "RdPu", direction=1, guide="none")
+```
+
+```{r fig.width=5, fig.height=2.5, only.plot="2"}
+ggplot(.dt, aes(x=k, fill=eigen.values)) +
+    geom_line(aes(y=pr)) +
+    geom_text_repel(aes(y=pr, label=round(100*pr) %&% "%"),
+                    data = .dt[1:10, ],
+                    size = 3, nudge_x = 10, nudge_y = .1,
+                    segment.lw = .2,
+                    segment.color = 2,
+                    vjust=0, hjust=0, colour=2) +
+    geom_text_repel(aes(y=cpr, label=round(100*cpr) %&% "%"),
+                    data = .dt[2:5,],
+                    size = 3, nudge_x = 10, nudge_y = 0,
+                    segment.lw = .2,
+                    segment.color = "gray20",
+                    vjust=0, hjust=0, colour="gray40") +
+    geom_line(aes(y=cpr), lty=2) +
+    geom_point(aes(y=pr), pch=21) +
+    ylab("% var. explained\n(lambda/sum(lambda))") +
+    xlab("principal components") +
+    scale_fill_distiller(palette = "RdPu", direction=1, guide="none")
+```
+
+# Non-negative matrix factorization
+
+## 
+
+\large
+
+SVD works great. Why do we need another factorization method?
+
+\normalsize
+
+## Why another factorization method?
+
+:::::: {.columns}
+::: {.column width=.45}
+
+* The count matrix $X$ is non-negative.
+
+```{r echo = T, size = "large"}
+sum(X < 0)
+```
+
+$${}$$
+
+* Singular vectors are signed.
+
+    - The "positive" and "negative" values can cancel each other and become "zero."
+
+* "Negative" contributions are difficult to interpret.
+
+:::
+::: {.column width=.45}
+
+```{r fig.width=3, fig.height=1.75}
+par(cex=.5)
+hist(U[,1], 20, main="U", xlab="")
+```
+
+```{r fig.width=3, fig.height=1.75}
+par(cex=.5)
+hist(V[,1], 20, main="V", xlab="")
+```
+
+:::
+::::::
+
+
+## Non-negative factorization
+
+```{r}
+xx <- .sort.matrix(as.matrix(X[msv, msw]))
+rownames(xx) <- rownames(X)[msv]
+
+rescale.nmf <- function(.beta, .theta) {
+    uu <- apply(.beta, 2, sum)
+    beta <- sweep(.beta, 2, uu, `/`)
+    prop <- sweep(.theta, 2, uu, `*`)
+    zz <- apply(t(prop), 2, sum)
+    prop <- sweep(prop, 1, zz, `/`)
+    list(beta = beta, prop = prop, depth = zz)
+}
+```
+
+```{r echo = T, size="large"}
+out <- NMF::nmf(xx, 7)
+U <- NMF::basis(out)
+V <- t(NMF::coef(out))
+```
+
+## NMF naturally induces (1) gene sets (2) cellular topics
+
+```{r include = FALSE}
+nmf.out <- rescale.nmf(NMF::basis(out), t(NMF::coef(out)))
+```
+
+:::::: {.columns}
+::: {.column width=.4}
+
+```{r fig.width=2, fig.height=2.5}
+.matshow(xx, .scale=FALSE) + ggtitle("data (gene X cell)")
+```
+
+:::
+::: {.column width=.05}
+$${}$$
+$${\Huge\boldsymbol{\sim}}$$
+:::
+::: {.column width=.1}
+
+```{r fig.width=.7, fig.height=2.5}
+.matshow(nmf.out$beta, .scale=FALSE) + ggtitle("U")
+```
+
+:::
+::: {.column width=.05}
+$${}$$
+$${\Huge\boldsymbol{\times}}$$
+:::
+::: {.column width=.4}
+
+```{r fig.width=2, fig.height=1}
+.matshow(t(nmf.out$prop), .scale=FALSE) + ggtitle("transpose(V)")
+```
+
+:::
+::::::
+
+## Several notable `R` packages to fit NMF for single-cell data analysis
+
+\large
+
+* `fastTopics`: estimates Poisson NMF and provides plotting and differential expression utility functions
+
+* `rliger`: provides integrative analysis routines and fast online learning methods; but, it will select features arbitrarily
+
+## `fastTopics` relates NMF results to topic models
+
+```{r echo = T}
+library(fastTopics)
+out <- fit_topic_model(t(xx), k=7)
+```
+
+\vfill
+\flushright
+\tiny
+Carbonetto .. Stephens, Genome Biology (2023)
+
+## A structure plot in `fastTopics` package
+
+```{r echo = T, fig.width=5.5, fig.height=1.8}
+structure_plot(out, verbose=F) + xlab("cells")
+```
+
+\vfill
+\flushright
+\tiny
+Carbonetto .. Stephens, Genome Biology (2023)
+
+## Poisson NMF by `fastTopics`
+
+:::::: {.columns}
+::: {.column width=.4}
+
+```{r fig.width=2, fig.height=2.5}
+.matshow(xx, .scale=FALSE) + ggtitle("data (gene X cell)")
+```
+
+:::
+::: {.column width=.05}
+$${}$$
+$${\Huge\boldsymbol{\sim}}$$
+:::
+::: {.column width=.1}
+
+```{r fig.width=.7, fig.height=2.5}
+.matshow(out$F, .scale=FALSE) + ggtitle("gene x factor")
+```
+
+:::
+::: {.column width=.05}
+$${}$$
+$${\Huge\boldsymbol{\times}}$$
+:::
+::: {.column width=.4}
+
+```{r fig.width=2, fig.height=1}
+.matshow(t(out$L), .scale=FALSE) + ggtitle("factor x cell")
+```
+
+:::
+::::::
+
+## `fastTopics`' differential expression analysis to find topic-specific genes
+
+```{r echo = T}
+de.out <- de_analysis(out, t(xx))
+```
+
+\vfill
+\flushright
+\tiny
+Carbonetto .. Stephens, Genome Biology (2023)
+
+## `fastTopics`' differential expression analysis to find topic-specific genes
+
+```{r fig.width=5.8, fig.height=2.2}
+volcano_plot(de.out, 4) | volcano_plot(de.out, 5)
+```
+
+## `rliger` provides a more scalable option for NMF estimation
+
+```{r echo = T}
+library(rliger)
+.liger <- createLiger(list(rna=X))
+.liger <- normalize(.liger)
+.liger <- selectGenes(.liger, var.thresh=0.2, do.plot=F)
+.liger <- scaleNotCenter(.liger)
+.liger <- online_iNMF(.liger, k=7, miniBatch_size=100)
+.liger <- rliger::quantile_norm(.liger)
+.liger <- rliger::runUMAP(.liger)
+.liger <- louvainCluster(.liger, resolution = 0.25)
+```
+
+\vfill
+\flushright
+\tiny
+Gao .. Welch, Nature Biotech. (2021)
+
+## `rliger`'s NMF result
+
+:::::: {.columns}
+::: {.column width=.5}
+
+gene $\times$ factor matrix
+
+```{r fig.width=1.5, fig.height=2.5}
+pheatmap::pheatmap(.liger@B[[1]], show_rownames=F, cluster_cols = F)
+```
+
+:::
+::: {.column width=.5}
+
+cell $\times$ factor matrix
+
+```{r fig.width=1.5, fig.height=2.5}
+pheatmap::pheatmap(.liger@H.norm, show_rownames=F, cluster_cols = F)
+```
+
+:::
+::::::
+
+## `rliger`: clustering cells by the topic/factor loading
+
+```{r fig.width=6, fig.height=2.7}
+.plts <- plotByDatasetAndCluster(.liger, return.plots=T)
+p1 <- .plts[[2]] +
+    scale_color_brewer(palette = "Paired") +
+    theme_classic() +
+    ggtitle("Liger NMF factor loading")
+
+.df <- data.frame(cbind(.tsne$umap.pca, k=.liger@clusters))
+p2 <- ggplot(.df, aes(V1, V2, color=as.factor(k))) +
+    theme_classic() +
+    ggrastr::rasterise(geom_point(stroke=0, alpha=.7, size=.7), dpi=300) +
+    scale_color_brewer(palette = "Paired") +
+    theme(legend.position = "none") +
+    xlab("UMAP1") + ylab("UMAP2") +
+    ggtitle("umap on the top 10 PC")
+
+p1 | p2
+```
+
+## Once we have gene $\times$ factor dictionary
+
+```{r}
+.bulk <- asap_random_bulk(raw.data$mtx,
+                          raw.data$row,
+                          raw.data$col,
+                          raw.data$idx,
+                          10,
+                          NUM_THREADS=16)
+
+.nmf <- asap_fit_nmf(.bulk$PB, 7, NUM_THREADS=16)
+
+.asap <- asap_topic_stat(raw.data$mtx,
+                         raw.data$row,
+                         raw.data$col,
+                         raw.data$idx,
+                         .nmf$log.beta,
+                         .bulk$rownames)
+
+.asap.nmf <- asap_topic_pmf(.asap$beta,
+                            .asap$corr,
+                            .asap$colsum,
+                            NUM_THREADS=16)
+
+.asap.topic <- pmf2topic(.asap.nmf$beta, .asap.nmf$theta)
+```
+
+```{r run_gsea_beta}
+.beta <- .asap.topic$beta
+.rows <- data.table(.bulk$rownames)
+.rows[, c("gene","chr") := tstrsplit(`V1`, split="[_]+", keep=1:2)]
+rownames(.beta) <- .rows$gene
+
+.db <- read.panglao("Pancreas")
+.genes <- unique(unlist(.db))
+.dt <- sort.beta(.beta, genes.selected = .genes)
+.gsea <- run.beta.gsea(.dt, .db)
+.show <- select.gsea.beta(.gsea, .dt, sd.cutoff = 1e-4)
+```
+
+```{r}
+p1 <-
+    .gg.plot(.show$beta, aes(row, variable, fill=`value`)) +
+    theme(axis.text.x = element_blank()) +
+    theme(axis.ticks.x = element_blank()) +
+    theme(legend.position = "none") +
+    ggrastr::rasterise(geom_tile(), dpi=300) +
+    ylab("NMF factors") + xlab("genes") +
+    scale_fill_distiller(direction=1, trans="sqrt", palette="RdPu")
+
+p2 <- 
+    .gg.plot(.show$leading.edges, aes(row, ct)) +
+    theme(axis.text.x = element_blank()) +
+    theme(axis.ticks.x = element_blank()) +
+    theme(panel.grid = element_blank()) +
+    geom_tile(linewidth=0) +
+    ylab("cell types")
+
+p3 <- 
+    .gg.plot(.show$gsea, aes(variable, ct, fill=-log10(pmax(padj, 1e-4)))) +
+    xlab("NMF factors") + ylab("") +
+    scale_y_discrete(position = "right") +
+    geom_tile(linewidth=.1, color="black") +
+    theme(legend.key.width=unit(.2, "lines")) +
+    theme(legend.text = element_text(size=4)) +
+    scale_fill_distiller("Padj", palette = "YlGnBu", direction=1, labels=function(x) num.sci(10^(-x)))
+```
+
+```{r fig.width=4, fig.height=1.5}
+print(p1)
+```
+
+## Annotate topics to cell types by enrichment (`fgsea`)
+
+```{r fig.width=6, fig.height=3, only.plot="1"}
+print(p1/p2)
+```
+
+```{r fig.width=6, fig.height=3, only.plot="2"}
+C <- length(unique(.show$gsea$ct))
+K <- length(unique(.show$beta$variable))
+p0 <- ggplot() + theme_void()
+wrap_plots(p1, p0, p2, p3, nrow = 2, heights=c(K,C), widths=c(4,1))
+```
+
+# Differential gene expression analysis
+
+