Skip to content

Commit

Permalink
Add 'batch' parameter to the CellQC function
Browse files Browse the repository at this point in the history
  • Loading branch information
zhanghao-njmu committed Aug 29, 2023
1 parent a087c8f commit 2b03eff
Show file tree
Hide file tree
Showing 2 changed files with 173 additions and 112 deletions.
261 changes: 150 additions & 111 deletions R/SCP-cellqc.R
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,6 @@ RunDoubletCalling <- function(srt, assay = "RNA", db_method = "scDblFinder", db_
stop("Data type is not raw counts!")
}
if (db_method %in% c("scDblFinder", "Scrublet", "DoubletDetection", "scds_cxds", "scds_bcds", "scds_hybrid")) {
message("Run doublet-calling with ", db_method)
methods <- unlist(strsplit(db_method, "_"))
method1 <- methods[1]
method2 <- methods[2]
Expand Down Expand Up @@ -271,15 +270,36 @@ isOutlier <- function(x, nmads = 2.5, constant = 1.4826, type = c("both", "lower
#' @examples
#' data("pancreas_sub")
#' pancreas_sub <- RunCellQC(pancreas_sub)
#' @importFrom Seurat Assays as.SingleCellExperiment PercentageFeatureSet WhichCells
#' CellStatPlot(
#' srt = pancreas_sub,
#' stat.by = c(
#' "db_qc", "outlier_qc", "umi_qc", "gene_qc",
#' "mito_qc", "ribo_qc", "ribo_mito_ratio_qc", "species_qc"
#' ),
#' plot_type = "upset", stat_level = "Fail"
#' )
#' table(pancreas_sub$CellQC)
#'
#' data("ifnb_sub")
#' ifnb_sub <- RunCellQC(ifnb_sub, batch = "stim", UMI_threshold = 1000, gene_threshold = 550)
#' CellStatPlot(
#' srt = ifnb_sub,
#' stat.by = c(
#' "db_qc", "outlier_qc", "umi_qc", "gene_qc",
#' "mito_qc", "ribo_qc", "ribo_mito_ratio_qc", "species_qc"
#' ),
#' plot_type = "upset", stat_level = "Fail"
#' )
#' table(ifnb_sub$CellQC)
#' @importFrom Seurat Assays as.SingleCellExperiment PercentageFeatureSet WhichCells SplitObject
#' @importFrom stats loess predict aggregate
#' @importFrom Matrix colSums t
#' @export
#'
RunCellQC <- function(srt, assay = "RNA",
RunCellQC <- function(srt, assay = "RNA", batch = NULL,
qc_metrics = c("doublets", "outlier", "umi", "gene", "mito", "ribo", "ribo_mito_ratio", "species"),
return_filtered = FALSE,
db_method = "scDblFinder", db_rate = ncol(srt) / 1000 * 0.01,
db_method = "scDblFinder", db_rate = NULL,
outlier_cutoff = c(
"log10_nCount:lower:2.5",
"log10_nCount:higher:5",
Expand Down Expand Up @@ -317,128 +337,147 @@ RunCellQC <- function(srt, assay = "RNA",
if (!paste0("nFeature_", assay) %in% colnames(srt@meta.data)) {
srt@meta.data[[paste0("nFeature_", assay)]] <- colSums(srt[[assay]]@counts > 0)
}
if (!is.null(batch)) {
srtList <- SplitObject(srt, split.by = batch)
} else {
srtList <- list(srt)
}

ntotal <- ncol(srt)

db_qc <- c()
if ("doublets" %in% qc_metrics) {
if (!is.null(db_method)) {
for (dbm in db_method) {
srt <- RunDoubletCalling(srt = srt, db_method = dbm, db_rate = db_rate)
db_qc <- unique(c(db_qc, colnames(srt)[srt[[paste0("db.", dbm, "_class"), drop = TRUE]] == "doublet"]))
}
for (i in seq_along(srtList)) {
srt <- srtList[[i]]
if (!is.null(batch)) {
cat("===", srt@meta.data[[batch]][1], "===\n")
}
}
ntotal <- ncol(srt)

outlier_qc <- c()
for (n in 1:length(species)) {
if (n == 0) {
break
db_qc <- c()
if ("doublets" %in% qc_metrics) {
if (!is.null(db_method)) {
if (is.null(db_rate)) {
db_rate <- ncol(srt) / 1000 * 0.01
}
if (db_rate >= 1) {
stop("The db_rate is equal to or greater than 1!")
}
for (dbm in db_method) {
srt <- RunDoubletCalling(srt = srt, db_method = dbm, db_rate = db_rate)
db_qc <- unique(c(db_qc, colnames(srt)[srt[[paste0("db.", dbm, "_class"), drop = TRUE]] == "doublet"]))
}
}
}
sp <- species[n]
prefix <- species_gene_prefix[n]
sp_genes <- rownames(srt[[assay]])[grep(pattern = paste0("^", prefix), x = rownames(srt[[assay]]))]
nCount <- srt[[paste0(c(paste0("nCount_", assay), sp), collapse = ".")]] <- colSums(srt[[assay]]@counts[sp_genes, ])
nFeature <- srt[[paste0(c(paste0("nFeature_", assay), sp), collapse = ".")]] <- colSums(srt[[assay]]@counts[sp_genes, ] > 0)
percent.mito <- srt[[paste0(c("percent.mito", sp), collapse = ".")]] <- PercentageFeatureSet(object = srt, assay = assay, pattern = paste0("(", paste0("^", prefix, "-*", mito_pattern), ")", collapse = "|"), features = mito_gene)[[1]]
percent.ribo <- srt[[paste0(c("percent.ribo", sp), collapse = ".")]] <- PercentageFeatureSet(object = srt, assay = assay, pattern = paste0("(", paste0("^", prefix, "-*", ribo_pattern), ")", collapse = "|"), features = ribo_gene)[[1]]
percent.ribo <- srt[[paste0(c("ribo.mito.ratio", sp), collapse = ".")]] <- srt[[paste0(c("percent.ribo", sp), collapse = "."), drop = TRUE]] / srt[[paste0(c("percent.mito", sp), collapse = "."), drop = TRUE]]
percent.genome <- srt[[paste0(c("percent.genome", sp), collapse = ".")]] <- PercentageFeatureSet(object = srt, assay = assay, pattern = paste0("^", prefix))[[1]]

if (n == 1) {
if ("outlier" %in% qc_metrics) {
message("Calculate outlier cells")
outlier_qc <- c()
for (n in 1:length(species)) {
if (n == 0) {
break
}
sp <- species[n]
prefix <- species_gene_prefix[n]
sp_genes <- rownames(srt[[assay]])[grep(pattern = paste0("^", prefix), x = rownames(srt[[assay]]))]
nCount <- srt[[paste0(c(paste0("nCount_", assay), sp), collapse = ".")]] <- colSums(srt[[assay]]@counts[sp_genes, ])
nFeature <- srt[[paste0(c(paste0("nFeature_", assay), sp), collapse = ".")]] <- colSums(srt[[assay]]@counts[sp_genes, ] > 0)
percent.mito <- srt[[paste0(c("percent.mito", sp), collapse = ".")]] <- PercentageFeatureSet(object = srt, assay = assay, pattern = paste0("(", paste0("^", prefix, "-*", mito_pattern), ")", collapse = "|"), features = mito_gene)[[1]]
percent.ribo <- srt[[paste0(c("percent.ribo", sp), collapse = ".")]] <- PercentageFeatureSet(object = srt, assay = assay, pattern = paste0("(", paste0("^", prefix, "-*", ribo_pattern), ")", collapse = "|"), features = ribo_gene)[[1]]
percent.ribo <- srt[[paste0(c("ribo.mito.ratio", sp), collapse = ".")]] <- srt[[paste0(c("percent.ribo", sp), collapse = "."), drop = TRUE]] / srt[[paste0(c("percent.mito", sp), collapse = "."), drop = TRUE]]
percent.genome <- srt[[paste0(c("percent.genome", sp), collapse = ".")]] <- PercentageFeatureSet(object = srt, assay = assay, pattern = paste0("^", prefix))[[1]]

# "percent.top_20:higher:5"
# countx <- as(srt[[assay]]@counts[sp_genes, ], "sparseMatrix")
# agg <- aggregate(x = countx@x, by = list(rep(colnames(countx), diff(countx@p))), FUN = function(x) {
# sum(head(x, 20))
# })
# rownames(agg) <- agg[[1]]
# percent.top_20 <- srt[[paste0(c("percent.top_20", sp), collapse = ".")]] <- agg[colnames(srt), "x"]
if (n == 1) {
if ("outlier" %in% qc_metrics) {
# "percent.top_20:higher:5"
# countx <- as(srt[[assay]]@counts[sp_genes, ], "sparseMatrix")
# agg <- aggregate(x = countx@x, by = list(rep(colnames(countx), diff(countx@p))), FUN = function(x) {
# sum(head(x, 20))
# })
# rownames(agg) <- agg[[1]]
# percent.top_20 <- srt[[paste0(c("percent.top_20", sp), collapse = ".")]] <- agg[colnames(srt), "x"]

log10_nFeature <- srt[[paste0(c(paste0("log10_nFeature_", assay), sp), collapse = ".")]] <- log10(nFeature)
log10_nCount <- srt[[paste0(c(paste0("log10_nCount_", assay), sp), collapse = ".")]] <- log10(nCount)
log10_nCount[is.infinite(log10_nCount)] <- NA
log10_nFeature[is.infinite(log10_nFeature)] <- NA
mod <- loess(log10_nFeature ~ log10_nCount, span = 1)
pred <- predict(mod, newdata = data.frame(log10_nCount = log10_nCount))
featurecount_dist <- srt[[paste0(c("featurecount_dist", sp), collapse = ".")]] <- log10_nFeature - pred
log10_nFeature <- srt[[paste0(c(paste0("log10_nFeature_", assay), sp), collapse = ".")]] <- log10(nFeature)
log10_nCount <- srt[[paste0(c(paste0("log10_nCount_", assay), sp), collapse = ".")]] <- log10(nCount)
log10_nCount[is.infinite(log10_nCount)] <- NA
log10_nFeature[is.infinite(log10_nFeature)] <- NA
mod <- loess(log10_nFeature ~ log10_nCount)
pred <- predict(mod, newdata = data.frame(log10_nCount = log10_nCount))
featurecount_dist <- srt[[paste0(c("featurecount_dist", sp), collapse = ".")]] <- log10_nFeature - pred

# df <- data.frame(cell = colnames(srt), ribo = srt$percent.ribo.Homo_sapiens, y = log10_nFeature, x = log10_nCount, pred = pred, featurecount_dist = featurecount_dist)
# lower_df <- subset(df, featurecount_dist < median(df$featurecount_dist) - 2.5 * mad(df$featurecount_dist))
# higher_df <- subset(df, featurecount_dist > median(df$featurecount_dist) + 2.5 * mad(df$featurecount_dist))
# ggplot(df) +
# geom_point(aes(x = x, y = y, color = featurecount_dist)) +
# scale_color_gradientn(colors = c("green", "white", "orange"), values = scales::rescale(c(min(df$featurecount_dist), 0, max(df$featurecount_dist)))) +
# geom_point(data = lower_df, aes(x = x, y = y), shape = 21, fill = "transparent", color = "blue") +
# geom_point(data = higher_df, aes(x = x, y = y), shape = 21, fill = "transparent", color = "red") +
# geom_line(aes(x = x, y = pred), color = "black")+
# theme(panel.background = element_rect(fill = "grey"))
# nrow(lower_df)
# df <- data.frame(cell = colnames(srt), ribo = srt$percent.ribo.Homo_sapiens, y = log10_nFeature, x = log10_nCount, pred = pred, featurecount_dist = featurecount_dist)
# lower_df <- subset(df, featurecount_dist < median(df$featurecount_dist) - 2.5 * mad(df$featurecount_dist))
# higher_df <- subset(df, featurecount_dist > median(df$featurecount_dist) + 2.5 * mad(df$featurecount_dist))
# ggplot(df) +
# geom_point(aes(x = x, y = y, color = featurecount_dist)) +
# scale_color_gradientn(colors = c("green", "white", "orange"), values = scales::rescale(c(min(df$featurecount_dist), 0, max(df$featurecount_dist)))) +
# geom_point(data = lower_df, aes(x = x, y = y), shape = 21, fill = "transparent", color = "blue") +
# geom_point(data = higher_df, aes(x = x, y = y), shape = 21, fill = "transparent", color = "red") +
# geom_line(aes(x = x, y = pred), color = "black")+
# theme(panel.background = element_rect(fill = "grey"))
# nrow(lower_df)

var <- sapply(strsplit(outlier_cutoff, ":"), function(x) x[[1]])
var_valid <- var %in% colnames(srt@meta.data) | sapply(var, FUN = function(x) exists(x, where = environment()))
if (any(!var_valid)) {
stop("Variable ", paste0(names(var_valid)[!var_valid], collapse = ","), " is not found in the srt object.")
}
outlier <- lapply(strsplit(outlier_cutoff, ":"), function(m) {
colnames(srt)[isOutlier(get(m[1]), nmads = as.numeric(m[3]), type = m[2])]
})
names(outlier) <- outlier_cutoff
print(unlist(lapply(outlier, length)))
outlier_tb <- table(unlist(outlier))
outlier_qc <- c(outlier_qc, names(outlier_tb)[outlier_tb >= outlier_n])
for (nm in names(outlier)) {
srt[[make.names(nm)]] <- colnames(srt) %in% outlier[[nm]]
var <- sapply(strsplit(outlier_cutoff, ":"), function(x) x[[1]])
var_valid <- var %in% colnames(srt@meta.data) | sapply(var, FUN = function(x) exists(x, where = environment()))
if (any(!var_valid)) {
stop("Variable ", paste0(names(var_valid)[!var_valid], collapse = ","), " is not found in the srt object.")
}
outlier <- lapply(strsplit(outlier_cutoff, ":"), function(m) {
colnames(srt)[isOutlier(get(m[1]), nmads = as.numeric(m[3]), type = m[2])]
})
names(outlier) <- outlier_cutoff
# print(unlist(lapply(outlier, length)))
outlier_tb <- table(unlist(outlier))
outlier_qc <- c(outlier_qc, names(outlier_tb)[outlier_tb >= outlier_n])
for (nm in names(outlier)) {
srt[[make.names(nm)]] <- colnames(srt) %in% outlier[[nm]]
}
}
}
}
}

umi_qc <- gene_qc <- mito_qc <- ribo_qc <- ribo_mito_ratio_qc <- species_qc <- c()
if ("umi" %in% qc_metrics) {
umi_qc <- colnames(srt)[srt[[paste0(c(paste0("nCount_", assay), species[1]), collapse = "."), drop = TRUE]] < UMI_threshold]
}
if ("gene" %in% qc_metrics) {
gene_qc <- colnames(srt)[srt[[paste0(c(paste0("nFeature_", assay), species[1]), collapse = "."), drop = TRUE]] < gene_threshold]
}
if ("mito" %in% qc_metrics) {
mito_qc <- colnames(srt)[srt[[paste0(c("percent.mito", species[1]), collapse = "."), drop = TRUE]] > mito_threshold]
}
if ("ribo" %in% qc_metrics) {
ribo_qc <- colnames(srt)[srt[[paste0(c("percent.ribo", species[1]), collapse = "."), drop = TRUE]] > ribo_threshold]
}
if ("ribo_mito_ratio" %in% qc_metrics) {
ribo_mito_ratio_qc <- colnames(srt)[srt[[paste0(c("ribo.mito.ratio", species[1]), collapse = "."), drop = TRUE]] < ribo_mito_ratio_range[1] | srt[[paste0(c("ribo.mito.ratio", species[1]), collapse = "."), drop = TRUE]] > ribo_mito_ratio_range[2]]
}
if ("species" %in% qc_metrics) {
species_qc <- colnames(srt)[srt[[paste0(c("percent.genome", species[1]), collapse = "."), drop = TRUE]] < species_percent]
}
umi_qc <- gene_qc <- mito_qc <- ribo_qc <- ribo_mito_ratio_qc <- species_qc <- c()
if ("umi" %in% qc_metrics) {
umi_qc <- colnames(srt)[srt[[paste0(c(paste0("nCount_", assay), species[1]), collapse = "."), drop = TRUE]] < UMI_threshold]
}
if ("gene" %in% qc_metrics) {
gene_qc <- colnames(srt)[srt[[paste0(c(paste0("nFeature_", assay), species[1]), collapse = "."), drop = TRUE]] < gene_threshold]
}
if ("mito" %in% qc_metrics) {
mito_qc <- colnames(srt)[srt[[paste0(c("percent.mito", species[1]), collapse = "."), drop = TRUE]] > mito_threshold]
}
if ("ribo" %in% qc_metrics) {
ribo_qc <- colnames(srt)[srt[[paste0(c("percent.ribo", species[1]), collapse = "."), drop = TRUE]] > ribo_threshold]
}
if ("ribo_mito_ratio" %in% qc_metrics) {
ribo_mito_ratio_qc <- colnames(srt)[srt[[paste0(c("ribo.mito.ratio", species[1]), collapse = "."), drop = TRUE]] < ribo_mito_ratio_range[1] | srt[[paste0(c("ribo.mito.ratio", species[1]), collapse = "."), drop = TRUE]] > ribo_mito_ratio_range[2]]
}
if ("species" %in% qc_metrics) {
species_qc <- colnames(srt)[srt[[paste0(c("percent.genome", species[1]), collapse = "."), drop = TRUE]] < species_percent]
}

CellQC <- unique(c(db_qc, outlier_qc, umi_qc, gene_qc, mito_qc, ribo_qc, ribo_mito_ratio_qc, species_qc))
cat(">>>", "Total cells:", ntotal, "\n")
cat(">>>", "Cells which are filtered out:", length(CellQC), "\n")
cat("...", length(db_qc), "potential doublets", "\n")
cat("...", length(outlier_qc), "outlier cells", "\n")
cat("...", length(umi_qc), "low-UMI cells", "\n")
cat("...", length(gene_qc), "low-gene cells", "\n")
cat("...", length(mito_qc), "high-mito cells", "\n")
cat("...", length(ribo_qc), "high-ribo cells", "\n")
cat("...", length(ribo_mito_ratio_qc), " ribo_mito_ratio outlier cells", "\n")
cat("...", length(species_qc), "species-contaminated cells", "\n")
cat(">>>", "Remained cells after filtering:", ntotal - length(CellQC), "\n")
CellQC <- unique(c(db_qc, outlier_qc, umi_qc, gene_qc, mito_qc, ribo_qc, ribo_mito_ratio_qc, species_qc))
cat(">>>", "Total cells:", ntotal, "\n")
cat(">>>", "Cells which are filtered out:", length(CellQC), "\n")
cat("...", length(db_qc), "potential doublets", "\n")
cat("...", length(outlier_qc), "outlier cells", "\n")
cat("...", length(umi_qc), "low-UMI cells", "\n")
cat("...", length(gene_qc), "low-gene cells", "\n")
cat("...", length(mito_qc), "high-mito cells", "\n")
cat("...", length(ribo_qc), "high-ribo cells", "\n")
cat("...", length(ribo_mito_ratio_qc), "ribo_mito_ratio outlier cells", "\n")
cat("...", length(species_qc), "species-contaminated cells", "\n")
cat(">>>", "Remained cells after filtering:", ntotal - length(CellQC), "\n")

qc_nm <- c("db_qc", "outlier_qc", "umi_qc", "gene_qc", "mito_qc", "ribo_qc", "ribo_mito_ratio_qc", "species_qc", "CellQC")
for (qc in qc_nm) {
srt[[qc]] <- ifelse(colnames(srt) %in% get(qc), "Fail", "Pass")
srt[[qc]] <- factor(srt[[qc, drop = TRUE]], levels = c("Pass", "Fail"))
}
qc_nm <- c("db_qc", "outlier_qc", "umi_qc", "gene_qc", "mito_qc", "ribo_qc", "ribo_mito_ratio_qc", "species_qc", "CellQC")
for (qc in qc_nm) {
srt[[qc]] <- ifelse(colnames(srt) %in% get(qc), "Fail", "Pass")
srt[[qc]] <- factor(srt[[qc, drop = TRUE]], levels = c("Pass", "Fail"))
}

if (return_filtered) {
srt <- srt[, srt$CellQC == "Pass"]
srt@meta.data[, intersect(qc_nm, colnames(srt@meta.data))] <- NULL
if (return_filtered) {
srt <- srt[, srt$CellQC == "Pass"]
srt@meta.data[, intersect(qc_nm, colnames(srt@meta.data))] <- NULL
}
srtList[[i]] <- srt
}
if (length(srtList) > 1) {
return(Reduce(merge, srtList))
} else {
return(srtList[[1]])
}

return(srt)
}
24 changes: 23 additions & 1 deletion man/RunCellQC.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 2b03eff

Please sign in to comment.