From 3f7d8d9954b3bd810be753d7e9e4d41143bccfa8 Mon Sep 17 00:00:00 2001 From: Andrew Ghazi <6763470+andrewGhazi@users.noreply.github.com> Date: Tue, 6 Aug 2024 11:35:08 -0400 Subject: [PATCH 1/5] intermediate questions --- episodes/eda_qc.Rmd | 149 +++++++++++++++++++++++++++++++++-------- episodes/intro-sce.Rmd | 64 +++++++++++++----- 2 files changed, 167 insertions(+), 46 deletions(-) diff --git a/episodes/eda_qc.Rmd b/episodes/eda_qc.Rmd index c601e46..0647b3c 100644 --- a/episodes/eda_qc.Rmd +++ b/episodes/eda_qc.Rmd @@ -59,24 +59,46 @@ From the experiment, we expect to have only a few thousand cells, while we can s ```{r} library(DropletUtils) +library(ggplot2) + bcrank <- barcodeRanks(counts(sce)) # Only showing unique points for plotting speed. uniq <- !duplicated(bcrank$rank) -plot(bcrank$rank[uniq], bcrank$total[uniq], log="xy", - xlab="Rank", ylab="Total UMI count", cex.lab=1.2) - -abline(h=metadata(bcrank)$inflection, col="darkgreen", lty=2) -abline(h=metadata(bcrank)$knee, col="dodgerblue", lty=2) -legend("bottomleft", legend=c("Inflection", "Knee"), - col=c("darkgreen", "dodgerblue"), lty=2, cex=1.2) +line_df = data.frame(cutoff = names(metadata(bcrank)), + value = unlist(metadata(bcrank))) + +ggplot(bcrank[uniq,], aes(rank, total)) + + geom_point() + + geom_hline(data = line_df, + aes(color = cutoff, + yintercept = value), + lty = 2) + + scale_x_log10() + + scale_y_log10() + + labs(y = "Total UMI count") ``` The distribution of total counts (called the unique molecular identifier or UMI count) exhibits a sharp transition between barcodes with large and small total counts, probably corresponding to cell-containing and empty droplets respectively. A simple approach would be to apply a threshold on the total count to only retain those barcodes with large totals. However, this may unnecessarily discard libraries derived from cell types with low RNA content. +:::: challenge + +What is the median number of total counts in the raw data? + +::: solution + +```{r} +median(bcrank$total) +``` + +Just 2! Clearly many barcodes produce practically no output. +::: + +:::: + ### Testing for empty droplets A better approach is to test whether the expression profile for each cell barcode is significantly different from the ambient RNA pool[^1]. Any significant deviation indicates that the barcode corresponds to a cell-containing droplet. This allows us to discriminate between well-sequenced empty droplets and droplets derived from cells with little RNA, both of which would have similar total counts. @@ -141,8 +163,11 @@ library(EnsDb.Mmusculus.v79) ``` ```{r} -chr.loc <- mapIds(EnsDb.Mmusculus.v79, keys=rownames(sce), - keytype="GENEID", column="SEQNAME") +chr.loc <- mapIds(EnsDb.Mmusculus.v79, + keys = rownames(sce), + keytype = "GENEID", + column = "SEQNAME") + is.mito <- which(chr.loc=="MT") ``` @@ -171,17 +196,29 @@ summary(df$detected) summary(df$subsets_Mito_percent) ``` -This can be achieved with the `perCellQCFilters` function. By default, we consider a value to be an outlier if it is more than 3 median absolute deviations (MADs) from the median in the "problematic" direction. This is loosely motivated by the fact that such a filter will retain 99% of non-outlier values that follow a normal distribution. +We can use the `perCellQCFilters` function to apply a set of common adaptive filters to identify low-quality cells. By default, we consider a value to be an outlier if it is more than 3 median absolute deviations (MADs) from the median in the "problematic" direction. 
This is loosely motivated by the fact that such a filter will retain 99% of non-outlier values that follow a normal distribution. ```{r} reasons <- perCellQCFilters(df, sub.fields="subsets_Mito_percent") -colSums(as.matrix(reasons)) -summary(reasons$discard) +reasons -# include in object sce$discard <- reasons$discard ``` +:::: challenge + +We've removed empty cells and low-quality cells to be discarded. How many cells are we left with at this point? + +::: solution +```{r} +table(sce$discard) +``` + +There are `r unname(table(sce$discard)[1])` cells that *haven't* been flagged to be discarded, so that's how many we have left. +::: + +:::: + ### Diagnostic plots It is always a good idea to check the distribution of the QC metrics and to visualize the cells that were removed, to identify possible problems with the procedure. @@ -224,13 +261,18 @@ The _library size factor_ for each cell is directly proportional to its library ```{r} lib.sf <- librarySizeFactors(sce) summary(lib.sf) -hist(log10(lib.sf), xlab="Log10[Size factor]", col='grey80', breaks = 30) + +sf_df = data.frame(size_factor = lib.sf) + +ggplot(sf_df, aes(size_factor)) + + geom_histogram() + + scale_x_log10() ``` ### Normalization by deconvolution -Library size normalization is not optimal, as it assumes that the total sum of UMI counts differ between cells only for technical and not biological reason. This can be a problem if a highly-expressed subset of genes is differentially expressed between cells or cell types. +Library size normalization is not optimal, as it assumes that the total sum of UMI counts differ between cells only for technical and not biological reasons. This can be a problem if a highly-expressed subset of genes is differentially expressed between cells or cell types. Several robust normalization methods have been proposed for bulk RNA-seq. However, these methods may underperform in single-cell data due to the dominance of low and zero counts. To overcome this, one solution is to _pool_ counts from many cells to increase the size of the counts for accurate size factor estimation[^3]. Pool-based size factors are then _deconvolved_ into cell-based factors for normalization of each cell's expression profile. @@ -250,10 +292,15 @@ table(clust) deconv.sf <- calculateSumFactors(sce, cluster=clust) summary(deconv.sf) -plot(lib.sf, deconv.sf, xlab="Library size factor", - ylab="Deconvolution size factor", log='xy', pch=16, - col=as.integer(clust)) -abline(a=0, b=1, col="red") +sf_df$deconv_sf = deconv.sf +sf_df$clust = clust + +ggplot(sf_df, aes(size_factor, deconv_sf)) + + geom_abline() + + geom_point(aes(color = clust)) + + scale_x_log10() + + scale_y_log10() + ``` Once we have computed the size factors, we compute the normalized expression values for each cell by dividing the count for each gene with the appropriate size factor for that cell. Since we are typically going to work with log-transformed counts, the function `logNormCounts` also log-transforms the normalized values, creating a new assay called `logcounts`. @@ -264,6 +311,18 @@ sce <- logNormCounts(sce) sce ``` +:::: challenge + +Some sophisticated experiments perform additional steps so that they can estimate size factors from so-called "spike-ins". Judging by the name, what do you think "spike-ins" are, and what additional steps are required to use them? + +::: solution + +Spike-ins are deliberately introduced exogeneous RNA from an exotic or synthetic source at a known concentration. This provides a known signal to normalize to. 
Exotic or synthetic RNA (e.g. soil bacteria RNA in a study of human cells) is used in order to avoid confusing spike-in RNA with sample RNA. This has the obvious advantage of accounting for cell-wise variation, but adds additional sample-preparation work. + +::: + +:::: + ## Feature Selection The typical next steps in the analysis of single-cell data are dimensionality reduction and clustering, which involve measuring the similarity between cells. @@ -280,9 +339,16 @@ Calculation of the per-gene variance is simple but feature selection requires mo dec.sce <- modelGeneVar(sce) fit.sce <- metadata(dec.sce) -plot(fit.sce$mean, fit.sce$var, xlab = "Mean of log-expression", - ylab = "Variance of log-expression") -curve(fit.sce$trend(x), col = "dodgerblue", add = TRUE, lwd = 2) +mean_var_df = data.frame(mean = fit.sce$mean, + var = fit.sce$var) + +ggplot(mean_var_df, aes(mean, var)) + + geom_point() + + geom_function(fun = fit.sce$trend, + color = "dodgerblue") + + labs(x = "Mean of log-expression", + y = "Variance of log-expression") + ``` The blue line represents the uninteresting "technical" variance for any given gene abundance. The genes with a lot of additional variance exhibit interesting "biological" variation. @@ -296,6 +362,11 @@ hvg.sce.var <- getTopHVGs(dec.sce, n=1000) head(hvg.sce.var) ``` +:::: challenge + +Run an internet search for some of the most highly variable genes we've identified here. See if you can identify the type of protein they produce or what sort of process they're involved in. Do they make biological sense to you? + +:::: ## Dimensionality Reduction @@ -319,14 +390,26 @@ sce By default, `runPCA` computes the first 50 principal components. We can check how much original variability they explain. ```{r} -percent.var <- attr(reducedDim(sce), "percentVar") -plot(percent.var, log="y", xlab="PC", ylab="Variance explained (%)") +pct_var_df = data.frame(PC = 1:50, + pct_var = attr(reducedDim(sce), "percentVar")) + +ggplot(pct_var_df, + aes(PC, pct_var)) + + geom_point() + + labs(y = "Variance explained (%)") ``` +You can see the first two PCs capture the largest amount of variation, but in this case you have to take the first 8 PCs before you've captured 50% of the total. + And we can of course visualize the first 2-3 components, perhaps color-coding each point by an interesting feature, in this case the total number of UMIs per cell. ```{r} plotPCA(sce, colour_by="sum") +``` + +It can be helpful to compare pairs of PCs. This can be done with the `ncomponents` argument to `plotReducedDim()`. For example if one batch or cell type splits off on a particular PC, this can help visualize the effect of that. + +```{r} plotReducedDim(sce, dimred="PCA", ncomponents=3) ``` @@ -334,7 +417,7 @@ plotReducedDim(sce, dimred="PCA", ncomponents=3) While PCA is a simple and effective way to visualize (and interpret!) scRNA-seq data, non-linear methods such as t-SNE (_t-stochastic neighbor embedding_) and UMAP (_uniform manifold approximation and projection_) have gained much popularity in the literature. -These methods attempt to find a low-dimensional representation of the data that preserves the distances between each point and its neighbors in the high-dimensional space. +These methods attempt to find a low-dimensional representation of the data that attempt to preserve pair-wise distance and structure in high-dimensional gene space as best as possible. ```{r} set.seed(100) @@ -352,15 +435,25 @@ It is easy to over-interpret t-SNE and UMAP plots. 
We note that the relative siz In addition, these methods are not guaranteed to preserve the global structure of the data (e.g., the relative locations of non-neighboring clusters), such that we cannot use their positions to determine relationships between distant clusters. -Note that the `sce` object now includes all the computed dimensionality reduced representations of the data for ease of reusing and replotting without the need for recomputing. +Note that the `sce` object now includes all the computed dimensionality reduced representations of the data for ease of reusing and replotting without the need for recomputing. Note the added `reducedDimNames` row when printing `sce` here: ```{r} sce ``` -Despite their shortcomings, t-SNE and UMAP may be useful visualization techniques. +Despite their shortcomings, t-SNE and UMAP can be useful visualization techniques. When using them, it is important to consider that they are stochastic methods that involve a random component (each run will lead to different plots) and that there are key parameters to be set that change the results substantially (e.g., the "perplexity" parameter of t-SNE). +:::: challenge + +Can dimensionality reduction techniques provide a perfectly accurate representation of the data? + +::: solution +Mathematically, this would require the data to fall on a two-dimensional plane (for linear methods like PCA) or a smooth 2D manifold (for methods like UMAP). You can be confident that this will never happen in real-world data, so the reduction from ~2500-dimensional gene space to two-dimensional plot space always involves some degree of information loss. +::: + +:::: + ## Doublet identification _Doublets_ are artifactual libraries generated from two cells. They typically arise due to errors in cell sorting or capture. Specifically, in droplet-based protocols, it may happen that two cells are captured in the same droplet. @@ -371,7 +464,7 @@ It is not easy to computationally identify doublets as they can be hard to disti There are several computational methods to identify doublets; we describe only one here based on in-silico simulation of doublets. -### Computing doublet desities +### Computing doublet densities At a high level, the algorithm can be defined by the following steps: diff --git a/episodes/intro-sce.Rmd b/episodes/intro-sce.Rmd index 37a33d8..5d6e87b 100644 --- a/episodes/intro-sce.Rmd +++ b/episodes/intro-sce.Rmd @@ -25,6 +25,7 @@ exercises: 10 # Minutes of exercises in the lesson ```{r chunk-opts, include=FALSE} knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE) library(BiocStyle) +options(digits = 3) ``` ```{r setup} @@ -32,15 +33,15 @@ library(SingleCellExperiment) library(MouseGastrulationData) ``` +It's normal to see lot of startup messages when loading these packages. + ## Bioconductor ### Overview Within the R ecosystem, the Bioconductor project provides tools for the analysis and comprehension of high-throughput genomics data. The scope of the project covers microarray data, various forms of sequencing (RNA-seq, ChIP-seq, bisulfite, genotyping, etc.), proteomics, flow cytometry and more. -One of Bioconductor's main selling points is the use of common data structures to promote interoperability between packages, -allowing code written by different people (from different organizations, in different countries) to work together seamlessly in complex analyses. -By extending R to genomics, Bioconductor serves as a powerful addition to the computational biologist's toolkit. 
+One of Bioconductor's main selling points is the use of common data structures to promote interoperability between packages, allowing code written by different people (from different organizations, in different countries) to work together seamlessly in complex analyses. ### Installing Bioconductor Packages @@ -62,9 +63,6 @@ After that, we can use `r CRANpkg("BiocManager")`'s `install()` function to inst For example, the code chunk below uses this approach to install the `r Biocpkg("SingleCellExperiment")` package. ```{r, eval=FALSE} -## The command below is a one-line shortcut for: -## library(BiocManager) -## install("SingleCellExperiment") BiocManager::install("SingleCellExperiment") ``` @@ -99,13 +97,15 @@ This will check for more recent versions of each package (within a Bioconductor BiocManager::install() ``` +Be careful: if you have a lot of packages to update, this can take a long time. + ## The `SingleCellExperiment` class One of the main strengths of the Bioconductor project lies in the use of a common data infrastructure that powers interoperability across packages. Users should be able to analyze their data using functions from different Bioconductor packages without the need to convert between formats. To this end, the `SingleCellExperiment` class (from the _SingleCellExperiment_ package) serves as the common currency for data exchange across 70+ single-cell-related Bioconductor packages. -This class implements a data structure that stores all aspects of our single-cell data - gene-by-cell expression data, per-cell metadata and per-gene annotation - and manipulate them in a synchronized manner. +This class implements a data structure that stores all aspects of our single-cell data - gene-by-cell expression data, cell-wise metadata, and gene-wise annotation - and lets us manipulate them in an organized manner. ```{r, echo=FALSE} knitr::include_graphics("http://bioconductor.org/books/release/OSCA.intro/images/SingleCellExperiment.png") @@ -118,42 +118,70 @@ sce <- WTChimeraData(samples=5) sce ``` -We can think of this (and other) class as a _container_, that contains several different pieces of data in so-called _slots_. +We can think of this (and other) class as a _container_, that contains several different pieces of data in so-called _slots_. SingleCellExperiment objects come with dedicated methods for _getting_ and _setting_ the data in their slots. + +Depending on the object, slots can contain different types of data (e.g., numeric matrices, lists, etc.). Here we'll review the main slots of the SingleCellExperiment class as well as their getter/setter methods. + + +:::: challenge + +Before SingleCellExperiments, coders working with single cell data would sometimes keep all of these components in separate objects e.g. a matrix of counts, a data.frame of sample metadata, a data.frame of gene annotations and so on. What are the main disadvantage of this sort of "from scratch" approach? + +::: solution + +1. You have to do tons of book-keeping! If you perform a QC step that removes dead cells, now you also have to remember to remove that same set of cells from the cell-wise metadata. Dropped un-expressed genes? Don't forget to filter the gene metadata table too. + +2. All the downstream steps have to be "from scratch" as well! If your tables have some slight format difference from those of your lab-mate, suddenly the plotting code you're trying to re-use doesn't work! Agh! 
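+
+To make the contrast concrete, here is a minimal sketch. The `counts_mat` and `cell_info` objects are hypothetical stand-ins for a "from scratch" workflow, not objects created in this lesson:
+
+```{r, eval=FALSE}
+# "from scratch" (hypothetical counts_mat / cell_info objects): every
+# component must be filtered separately and kept in sync by hand
+keep <- colSums(counts_mat) > 1000
+counts_mat <- counts_mat[, keep]
+cell_info  <- cell_info[keep, ]
+
+# with a SingleCellExperiment, one subsetting step keeps the assays and the
+# colData aligned automatically
+sce <- sce[, colSums(counts(sce)) > 1000]
+```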
-The _getter_ methods are used to extract information from the slots and the _setter_ methods are used to add information into the slots. These are the only ways to interact with the objects (rather than directly accessing the slots). +::: -Depending on the object, slots can contain different types of data (e.g., numeric matrices, lists, etc.). We will here review the main slots of the SingleCellExperiment class as well as their getter/setter methods. +:::: -### The `assays` +### `assays` This is arguably the most fundamental part of the object that contains the count matrix, and potentially other matrices with transformed data. We can access the _list_ of matrices with the `assays` function and individual matrices with the `assay` function. If one of these matrices is called "counts", we can use the special `counts` getter (and the analogous `logcounts`). ```{r} -identical(assay(sce), counts(sce)) +names(assays(sce)) counts(sce)[1:3, 1:3] ``` You will notice that in this case we have a sparse matrix of class "dgTMatrix" inside the object. More generally, any "matrix-like" object can be used, e.g., dense matrices or HDF5-backed matrices (see "Working with large data"). -### The `colData` and `rowData` +### `colData` and `rowData` Conceptually, these are two data frames that annotate the columns and the rows of your assay, respectively. One can interact with them as usual, e.g., by extracting columns or adding additional variables as columns. ```{r} -colData(sce) -rowData(sce) +colData(sce)[1:3, 1:4] +rowData(sce)[1:3, 1:2] ``` -Note the `$` short cut. +You can access columns of the colData with the `$` accessor to quickly add cell-wise metadata to the colData ```{r} -identical(colData(sce)$sum, sce$sum) sce$my_sum <- colSums(counts(sce)) -colData(sce) +colData(sce)[1:3,] ``` +:::: challenge + +Try to add a column of gene-wise metadata to the rowData. + +::: solution + +Here we add a column called "conservation" that is just an integer sequence from 1 to the number of genes. + +```{r} +rowData(sce)$conservation = 1:nrow(sce) +``` + +::: + +:::: + ### The `reducedDims` Everything that we have described so far (except for the `counts` getter) is part of the `SummarizedExperiment` class that SingleCellExperiment extends. You can find a complete lesson on the `SummarizedExperiment` class in [Introduction to data analysis with R and Bioconductor](https://carpentries-incubator.github.io/bioc-intro/60-next-steps.html) course. From 4451b42f8c6f1e76fddfc99b2c2747d6e01f73b5 Mon Sep 17 00:00:00 2001 From: Andrew Ghazi <6763470+andrewGhazi@users.noreply.github.com> Date: Tue, 6 Aug 2024 16:48:18 -0400 Subject: [PATCH 2/5] k exercise --- episodes/cell_type_annotation.Rmd | 35 +++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 9 deletions(-) diff --git a/episodes/cell_type_annotation.Rmd b/episodes/cell_type_annotation.Rmd index d9e7e8c..6a538bb 100644 --- a/episodes/cell_type_annotation.Rmd +++ b/episodes/cell_type_annotation.Rmd @@ -8,8 +8,8 @@ editor_options: --- ::: questions -- How to identify groups of cells with similar expression profiles? -- How to identify genes that drive separation between these groups of cells? +- How can we identify groups of cells with similar expression profiles? +- How can we identify genes that drive separation between these groups of cells? - How to leverage reference datasets and known marker genes for the cell type annotation of new datasets? 
::: @@ -38,12 +38,14 @@ library(scran) ## Data retrieval +We'll be using the same set of WT chimeric mouse embryo data: + ```{r data, message = FALSE} sce <- WTChimeraData(samples = 5, type = "processed") sce ``` -To speed up the computations, we subsample the dataset to 1,000 cells. +To speed up the computations, we take a random subset of 1,000 cells. ```{r} set.seed(123) @@ -53,6 +55,8 @@ sce <- sce[,ind] ## Preprocessing +The SCE object needs to contain log-normalized expression counts as well as PCA coordinates in the reduced dimensions, so we compute those here: + ```{r preproc, warning = FALSE} sce <- logNormCounts(sce) sce <- runPCA(sce) @@ -89,24 +93,37 @@ algorithm](https://doi.org/10.1088/1742-5468/2008/10/P10008) for community detection. All calculations are performed using the top PCs to take advantage of data compression and denoising. This function returns a vector containing cluster assignments for each cell in our -`SingleCellExperiment` object. +`SingleCellExperiment` object. We use the `colLabels()` function to assign the +cluster labels as a factor in the column data. ```{r cluster} colLabels(sce) <- clusterCells(sce, use.dimred = "PCA", - BLUSPARAM = NNGraphParam(cluster.fun = "louvain")) + BLUSPARAM = NNGraphParam(cluster.fun = "louvain", k = 20)) + table(colLabels(sce)) ``` -We assign the cluster assignments back into our `SingleCellExperiment` -object as a `factor` in the column metadata. This allows us to -conveniently visualize the distribution of clusters in eg. a *t*-SNE or -a UMAP. +We can now overlay the cluster labels as color on a UMAP plot: ```{r cluster-viz} sce <- runUMAP(sce, dimred = "PCA") + plotReducedDim(sce, "UMAP", color_by = "label") ``` +:::: challenge + +Our clusters look semi-reasonable, but what if we wanted to make them less granular? Look at the help documentation for `?clusterCells` and `?NNGraphParam` to find out what we'd need to change to get fewer, larger clusters. + +::: solution + +We see in the help documentation for `?clusterCells` that all of the clustering algorithm details are handled through the `BLUSPARAM` argument, which needs to provide a `BlusterParam` object (of which `NNGraphParam` is a sub-class). Each type of clustering algorithm will have some sort of hyper-parameter that controls the granularity of the output clusters. Looking at `?NNGraphParam` specifically, we see an argument called `k` which is described as "An integer scalar specifying the number of nearest neighbors to consider during graph construction." If the clustering process has to connect larger sets of neighbors, the graph will tend to be cut into larger groups, resulting in less granular clusters. Try the two code blocks again with `k = 20`. Given their visual differences, do you think one set of clusters is "right" and the other "wrong"? 
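+
+For reference, a sketch of that re-run: the same calls as above, just with `k` set explicitly. It is left unevaluated here so that the cluster labels used in the rest of this episode stay unchanged:
+
+```{r, eval=FALSE}
+# rebuild the shared nearest-neighbor graph with more neighbors (k = 20),
+# which tends to merge fine-grained clusters into larger ones
+colLabels(sce) <- clusterCells(sce, use.dimred = "PCA",
+                               BLUSPARAM = NNGraphParam(cluster.fun = "louvain",
+                                                        k = 20))
+
+table(colLabels(sce))
+
+plotReducedDim(sce, "UMAP", color_by = "label")
+```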
+::: + +:::: + + + ## Marker gene detection To interpret clustering results as obtained in the previous section, we From e04574bbd758ae953e83883309e8733dd0d3e5c8 Mon Sep 17 00:00:00 2001 From: Andrew Ghazi <6763470+andrewGhazi@users.noreply.github.com> Date: Tue, 6 Aug 2024 16:49:32 -0400 Subject: [PATCH 3/5] k exercise --- episodes/cell_type_annotation.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/episodes/cell_type_annotation.Rmd b/episodes/cell_type_annotation.Rmd index 6a538bb..f50ceac 100644 --- a/episodes/cell_type_annotation.Rmd +++ b/episodes/cell_type_annotation.Rmd @@ -117,7 +117,7 @@ Our clusters look semi-reasonable, but what if we wanted to make them less granu ::: solution -We see in the help documentation for `?clusterCells` that all of the clustering algorithm details are handled through the `BLUSPARAM` argument, which needs to provide a `BlusterParam` object (of which `NNGraphParam` is a sub-class). Each type of clustering algorithm will have some sort of hyper-parameter that controls the granularity of the output clusters. Looking at `?NNGraphParam` specifically, we see an argument called `k` which is described as "An integer scalar specifying the number of nearest neighbors to consider during graph construction." If the clustering process has to connect larger sets of neighbors, the graph will tend to be cut into larger groups, resulting in less granular clusters. Try the two code blocks again with `k = 20`. Given their visual differences, do you think one set of clusters is "right" and the other "wrong"? +We see in the help documentation for `?clusterCells` that all of the clustering algorithm details are handled through the `BLUSPARAM` argument, which needs to provide a `BlusterParam` object (of which `NNGraphParam` is a sub-class). Each type of clustering algorithm will have some sort of hyper-parameter that controls the granularity of the output clusters. Looking at `?NNGraphParam` specifically, we see an argument called `k` which is described as "An integer scalar specifying the number of nearest neighbors to consider during graph construction." If the clustering process has to connect larger sets of neighbors, the graph will tend to be cut into larger groups, resulting in less granular clusters. Try the two code blocks above once more with `k = 20`. Given their visual differences, do you think one set of clusters is "right" and the other is "wrong"? ::: :::: From 80ac2772b1650083af3460f3eeec5a707138464b Mon Sep 17 00:00:00 2001 From: Andrew Ghazi <6763470+andrewGhazi@users.noreply.github.com> Date: Wed, 7 Aug 2024 14:56:00 -0400 Subject: [PATCH 4/5] marker question --- episodes/cell_type_annotation.Rmd | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/episodes/cell_type_annotation.Rmd b/episodes/cell_type_annotation.Rmd index f50ceac..8ab1110 100644 --- a/episodes/cell_type_annotation.Rmd +++ b/episodes/cell_type_annotation.Rmd @@ -98,7 +98,7 @@ cluster labels as a factor in the column data. ```{r cluster} colLabels(sce) <- clusterCells(sce, use.dimred = "PCA", - BLUSPARAM = NNGraphParam(cluster.fun = "louvain", k = 20)) + BLUSPARAM = NNGraphParam(cluster.fun = "louvain")) table(colLabels(sce)) ``` @@ -123,7 +123,6 @@ We see in the help documentation for `?clusterCells` that all of the clustering :::: - ## Marker gene detection To interpret clustering results as obtained in the previous section, we @@ -138,7 +137,7 @@ testing for differential expression between clusters. 
If a gene is strongly DE between clusters, it is likely to have driven the separation of cells in the clustering algorithm. -Here, we perform a Wilcoxon rank sum test against a log2 fold change +Here, we use `findMarkers()` to perform a Wilcoxon rank sum test against a log2 fold change threshold of 1, focusing on up-regulated (positive) markers in one cluster when compared to another cluster. @@ -148,6 +147,8 @@ markers <- findMarkers(sce, test.type = "wilcox", direction = "up", lfc = 1) markers ``` + + The resulting object contains a sorted marker gene list for each cluster, in which the top genes are those that contribute the most to the separation of that cluster from mall other clusters. @@ -176,6 +177,22 @@ top.markers <- head(rownames(markers[[1]])) plotExpression(sce, features = top.markers, x = "label", color_by = "label") ``` +Clearly, not every marker gene distinguishes cluster 1 from every other cluster. However, with a combination of multiple marker genes it's possible to clearly identify gene patterns that are unique to cluster 1. It's sort of like the 20 questions game - with answers to the right questions about a cell, you can clearly identify what cluster it falls in. + +:::: challenge + +Why do you think marker genes are found by aggregating pairwise comparisons rather than iteratively comparing each cluster to all other clusters? + +::: solution + +One important reason why is because averages over all other clusters can be sensitive to the cell type composition. If a rare cell type shows up in one sample, the most discriminative marker genes found in this way could be very different from those found in another sample where the rare cell type is absent. + +Generally, it's good to keep in mind that the concept of "everything else" is not a stable basis for comparison. + +::: +:::: + + ## Cell type annotation The most challenging task in scRNA-seq data analysis is arguably the From 9822ad662b605d14b7bec5abf4e8c44bad856272 Mon Sep 17 00:00:00 2001 From: Andrew Ghazi <6763470+andrewGhazi@users.noreply.github.com> Date: Mon, 12 Aug 2024 14:10:24 -0400 Subject: [PATCH 5/5] type annotation exercises --- episodes/cell_type_annotation.Rmd | 73 +++++++++++++++++++++++++------ 1 file changed, 60 insertions(+), 13 deletions(-) diff --git a/episodes/cell_type_annotation.Rmd b/episodes/cell_type_annotation.Rmd index 8ab1110..a3f731a 100644 --- a/episodes/cell_type_annotation.Rmd +++ b/episodes/cell_type_annotation.Rmd @@ -177,7 +177,7 @@ top.markers <- head(rownames(markers[[1]])) plotExpression(sce, features = top.markers, x = "label", color_by = "label") ``` -Clearly, not every marker gene distinguishes cluster 1 from every other cluster. However, with a combination of multiple marker genes it's possible to clearly identify gene patterns that are unique to cluster 1. It's sort of like the 20 questions game - with answers to the right questions about a cell, you can clearly identify what cluster it falls in. +Clearly, not every marker gene distinguishes cluster 1 from every other cluster. However, with a combination of multiple marker genes it's possible to clearly identify gene patterns that are unique to cluster 1. It's sort of like the 20 questions game - with answers to the right questions about a cell (e.g. "Do you highly express Ptn?"), you can clearly identify what cluster it falls in. 
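+
+One way to pull out such a combination programmatically is the `Top` field of the `findMarkers()` output, which records each gene's best rank across all of the pairwise comparisons. A short sketch (the cutoff of 3 is arbitrary):
+
+```{r, eval=FALSE}
+# genes ranked in the top 3 of at least one pairwise comparison
+# involving cluster 1 (rank cutoff chosen arbitrarily)
+c1_markers <- markers[[1]]
+
+top_set <- rownames(c1_markers)[c1_markers$Top <= 3]
+
+length(top_set)
+```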
:::: challenge @@ -187,7 +187,7 @@ Why do you think marker genes are found by aggregating pairwise comparisons rath One important reason why is because averages over all other clusters can be sensitive to the cell type composition. If a rare cell type shows up in one sample, the most discriminative marker genes found in this way could be very different from those found in another sample where the rare cell type is absent. -Generally, it's good to keep in mind that the concept of "everything else" is not a stable basis for comparison. +Generally, it's good to keep in mind that the concept of "everything else" is not a stable basis for comparison. Read that sentence again, because its a subtle but broadly applicable point. Think about it and you can probably identify analogous issues in fields outside of single-cell analysis. It frequently comes up when comparisons between multiple categories are involved. ::: :::: @@ -261,6 +261,8 @@ ind <- sample(ncol(ref), 1000) ref <- ref[,ind] ``` +You can see we have an assortment of different cell types in the reference (with varying frequency): + ```{r ref-celltypes} tab <- sort(table(ref$celltype), decreasing = TRUE) tab @@ -304,29 +306,35 @@ sce.mat <- as.matrix(assay(sce, "logcounts")) ref.mat <- as.matrix(assay(ref, "logcounts")) ``` +Finally, run SingleR with the query and reference datasets: + ```{r singler} -res <- SingleR(test = sce.mat, ref = ref.mat, labels = ref$celltype) +res <- SingleR(test = sce.mat, + ref = ref.mat, + labels = ref$celltype) res ``` We inspect the results using a heatmap of the per-cell and label scores. Ideally, each cell should exhibit a high score in one label relative to all of the others, indicating that the assignment to that label was -unambiguous. This is largely the case for mesenchyme and endothelial -cells, whereas we see expectedly more ambiguity between the two -erythroid cell populations. +unambiguous. ```{r score-heat, fig.width = 10, fig.height = 10} plotScoreHeatmap(res) ``` -We also compare the cell type assignments with the clustering results to -determine the identity of each cluster. Here, several cell type classes -are nested within the same cluster, indicating that these clusters are -composed of several transcriptomically similar cell populations (such as -cluster 4 and 6). On the other hand, there are also instances where we -have several clusters for the same cell type, indicating that the -clustering represents finer subdivisions within these cell types. +We obtained fairly unambiguous predictions for mesenchyme and endothelial +cells, whereas we see expectedly more ambiguity between the two +erythroid cell populations. + +We can also compare the cell type assignments with the unsupervised clustering +results to determine the identity of each cluster. Here, several cell type +classes are nested within the same cluster, indicating that these clusters are +composed of several transcriptomically similar cell populations. On the other +hand, there are also instances where we have several clusters for the same cell +type, indicating that the clustering represents finer subdivisions within these +cell types. ```{r, fig.width = 10, fig.height = 10} library(pheatmap) @@ -345,6 +353,32 @@ tab <- table(res$pruned.labels, sce$celltype.mapped) pheatmap(log2(tab + 10), color = colorRampPalette(c("white", "blue"))(101)) ``` +:::: challenge + +SingleR can be computationally expensive. How do you set it to run in parallel? + +::: solution + +Use `BiocParallel` and the `BPPARAM` argument! 
This example will set it to use four cores on your laptop, but you can also configure BiocParallel to use cluster jobs. + +```{r eval=FALSE, echo = TRUE} + +library(BiocParallel) + +my_bpparam = MulticoreParam(workers = 4) + +res <- SingleR(test = sce.mat, + ref = ref.mat, + labels = ref$celltype, + BPPARAM = my_bpparam) +``` + +`BiocParallel` is the most common way to enable parallel computation in Bioconductor packages, so you can expect to see it elsewhere outside of SingleR. + +::: + +:::: + ### Assigning cell labels from gene sets A related strategy is to explicitly identify sets of marker genes that @@ -437,6 +471,19 @@ mixture, and the grey curve represents a fitted normal distribution. Vertical lines represent threshold estimates corresponding to each estimate of the distribution. +:::: challenge + +The diagnostics don't look so good for some of the examples here. Which ones? Why? + +::: solution + +The example that jumps out most strongly to the eye is ExE endoderm, which doesn't show clear separate modes. Simultaneously, Endothelium seems to have three or four modes. + +Remember, this is an exploratory diagnostic, not the final word! At this point it'd be good to engage in some critical inspection of the results. Maybe we don't have enough / the best marker genes. In this particular case, the fact that we subsetted the reference set to 1000 cells probably didn't help. +::: + +:::: + ## Session Info ```{r sessionInfo}