diff --git a/R/SCP-cellqc.R b/R/SCP-cellqc.R index d0c56253..53939fec 100644 --- a/R/SCP-cellqc.R +++ b/R/SCP-cellqc.R @@ -177,7 +177,6 @@ RunDoubletCalling <- function(srt, assay = "RNA", db_method = "scDblFinder", db_ stop("Data type is not raw counts!") } if (db_method %in% c("scDblFinder", "Scrublet", "DoubletDetection", "scds_cxds", "scds_bcds", "scds_hybrid")) { - message("Run doublet-calling with ", db_method) methods <- unlist(strsplit(db_method, "_")) method1 <- methods[1] method2 <- methods[2] @@ -271,15 +270,36 @@ isOutlier <- function(x, nmads = 2.5, constant = 1.4826, type = c("both", "lower #' @examples #' data("pancreas_sub") #' pancreas_sub <- RunCellQC(pancreas_sub) -#' @importFrom Seurat Assays as.SingleCellExperiment PercentageFeatureSet WhichCells +#' CellStatPlot( +#' srt = pancreas_sub, +#' stat.by = c( +#' "db_qc", "outlier_qc", "umi_qc", "gene_qc", +#' "mito_qc", "ribo_qc", "ribo_mito_ratio_qc", "species_qc" +#' ), +#' plot_type = "upset", stat_level = "Fail" +#' ) +#' table(pancreas_sub$CellQC) +#' +#' data("ifnb_sub") +#' ifnb_sub <- RunCellQC(ifnb_sub, batch = "stim", UMI_threshold = 1000, gene_threshold = 550) +#' CellStatPlot( +#' srt = ifnb_sub, +#' stat.by = c( +#' "db_qc", "outlier_qc", "umi_qc", "gene_qc", +#' "mito_qc", "ribo_qc", "ribo_mito_ratio_qc", "species_qc" +#' ), +#' plot_type = "upset", stat_level = "Fail" +#' ) +#' table(ifnb_sub$CellQC) +#' @importFrom Seurat Assays as.SingleCellExperiment PercentageFeatureSet WhichCells SplitObject #' @importFrom stats loess predict aggregate #' @importFrom Matrix colSums t #' @export #' -RunCellQC <- function(srt, assay = "RNA", +RunCellQC <- function(srt, assay = "RNA", batch = NULL, qc_metrics = c("doublets", "outlier", "umi", "gene", "mito", "ribo", "ribo_mito_ratio", "species"), return_filtered = FALSE, - db_method = "scDblFinder", db_rate = ncol(srt) / 1000 * 0.01, + db_method = "scDblFinder", db_rate = NULL, outlier_cutoff = c( "log10_nCount:lower:2.5", "log10_nCount:higher:5", @@ -317,128 +337,147 @@ RunCellQC <- function(srt, assay = "RNA", if (!paste0("nFeature_", assay) %in% colnames(srt@meta.data)) { srt@meta.data[[paste0("nFeature_", assay)]] <- colSums(srt[[assay]]@counts > 0) } + if (!is.null(batch)) { + srtList <- SplitObject(srt, split.by = batch) + } else { + srtList <- list(srt) + } - ntotal <- ncol(srt) - - db_qc <- c() - if ("doublets" %in% qc_metrics) { - if (!is.null(db_method)) { - for (dbm in db_method) { - srt <- RunDoubletCalling(srt = srt, db_method = dbm, db_rate = db_rate) - db_qc <- unique(c(db_qc, colnames(srt)[srt[[paste0("db.", dbm, "_class"), drop = TRUE]] == "doublet"])) - } + for (i in seq_along(srtList)) { + srt <- srtList[[i]] + if (!is.null(batch)) { + cat("===", srt@meta.data[[batch]][1], "===\n") } - } + ntotal <- ncol(srt) - outlier_qc <- c() - for (n in 1:length(species)) { - if (n == 0) { - break + db_qc <- c() + if ("doublets" %in% qc_metrics) { + if (!is.null(db_method)) { + if (is.null(db_rate)) { + db_rate <- ncol(srt) / 1000 * 0.01 + } + if (db_rate >= 1) { + stop("The db_rate is equal to or greater than 1!") + } + for (dbm in db_method) { + srt <- RunDoubletCalling(srt = srt, db_method = dbm, db_rate = db_rate) + db_qc <- unique(c(db_qc, colnames(srt)[srt[[paste0("db.", dbm, "_class"), drop = TRUE]] == "doublet"])) + } + } } - sp <- species[n] - prefix <- species_gene_prefix[n] - sp_genes <- rownames(srt[[assay]])[grep(pattern = paste0("^", prefix), x = rownames(srt[[assay]]))] - nCount <- srt[[paste0(c(paste0("nCount_", assay), sp), collapse = ".")]] <- colSums(srt[[assay]]@counts[sp_genes, ]) - nFeature <- srt[[paste0(c(paste0("nFeature_", assay), sp), collapse = ".")]] <- colSums(srt[[assay]]@counts[sp_genes, ] > 0) - percent.mito <- srt[[paste0(c("percent.mito", sp), collapse = ".")]] <- PercentageFeatureSet(object = srt, assay = assay, pattern = paste0("(", paste0("^", prefix, "-*", mito_pattern), ")", collapse = "|"), features = mito_gene)[[1]] - percent.ribo <- srt[[paste0(c("percent.ribo", sp), collapse = ".")]] <- PercentageFeatureSet(object = srt, assay = assay, pattern = paste0("(", paste0("^", prefix, "-*", ribo_pattern), ")", collapse = "|"), features = ribo_gene)[[1]] - percent.ribo <- srt[[paste0(c("ribo.mito.ratio", sp), collapse = ".")]] <- srt[[paste0(c("percent.ribo", sp), collapse = "."), drop = TRUE]] / srt[[paste0(c("percent.mito", sp), collapse = "."), drop = TRUE]] - percent.genome <- srt[[paste0(c("percent.genome", sp), collapse = ".")]] <- PercentageFeatureSet(object = srt, assay = assay, pattern = paste0("^", prefix))[[1]] - if (n == 1) { - if ("outlier" %in% qc_metrics) { - message("Calculate outlier cells") + outlier_qc <- c() + for (n in 1:length(species)) { + if (n == 0) { + break + } + sp <- species[n] + prefix <- species_gene_prefix[n] + sp_genes <- rownames(srt[[assay]])[grep(pattern = paste0("^", prefix), x = rownames(srt[[assay]]))] + nCount <- srt[[paste0(c(paste0("nCount_", assay), sp), collapse = ".")]] <- colSums(srt[[assay]]@counts[sp_genes, ]) + nFeature <- srt[[paste0(c(paste0("nFeature_", assay), sp), collapse = ".")]] <- colSums(srt[[assay]]@counts[sp_genes, ] > 0) + percent.mito <- srt[[paste0(c("percent.mito", sp), collapse = ".")]] <- PercentageFeatureSet(object = srt, assay = assay, pattern = paste0("(", paste0("^", prefix, "-*", mito_pattern), ")", collapse = "|"), features = mito_gene)[[1]] + percent.ribo <- srt[[paste0(c("percent.ribo", sp), collapse = ".")]] <- PercentageFeatureSet(object = srt, assay = assay, pattern = paste0("(", paste0("^", prefix, "-*", ribo_pattern), ")", collapse = "|"), features = ribo_gene)[[1]] + percent.ribo <- srt[[paste0(c("ribo.mito.ratio", sp), collapse = ".")]] <- srt[[paste0(c("percent.ribo", sp), collapse = "."), drop = TRUE]] / srt[[paste0(c("percent.mito", sp), collapse = "."), drop = TRUE]] + percent.genome <- srt[[paste0(c("percent.genome", sp), collapse = ".")]] <- PercentageFeatureSet(object = srt, assay = assay, pattern = paste0("^", prefix))[[1]] - # "percent.top_20:higher:5" - # countx <- as(srt[[assay]]@counts[sp_genes, ], "sparseMatrix") - # agg <- aggregate(x = countx@x, by = list(rep(colnames(countx), diff(countx@p))), FUN = function(x) { - # sum(head(x, 20)) - # }) - # rownames(agg) <- agg[[1]] - # percent.top_20 <- srt[[paste0(c("percent.top_20", sp), collapse = ".")]] <- agg[colnames(srt), "x"] + if (n == 1) { + if ("outlier" %in% qc_metrics) { + # "percent.top_20:higher:5" + # countx <- as(srt[[assay]]@counts[sp_genes, ], "sparseMatrix") + # agg <- aggregate(x = countx@x, by = list(rep(colnames(countx), diff(countx@p))), FUN = function(x) { + # sum(head(x, 20)) + # }) + # rownames(agg) <- agg[[1]] + # percent.top_20 <- srt[[paste0(c("percent.top_20", sp), collapse = ".")]] <- agg[colnames(srt), "x"] - log10_nFeature <- srt[[paste0(c(paste0("log10_nFeature_", assay), sp), collapse = ".")]] <- log10(nFeature) - log10_nCount <- srt[[paste0(c(paste0("log10_nCount_", assay), sp), collapse = ".")]] <- log10(nCount) - log10_nCount[is.infinite(log10_nCount)] <- NA - log10_nFeature[is.infinite(log10_nFeature)] <- NA - mod <- loess(log10_nFeature ~ log10_nCount, span = 1) - pred <- predict(mod, newdata = data.frame(log10_nCount = log10_nCount)) - featurecount_dist <- srt[[paste0(c("featurecount_dist", sp), collapse = ".")]] <- log10_nFeature - pred + log10_nFeature <- srt[[paste0(c(paste0("log10_nFeature_", assay), sp), collapse = ".")]] <- log10(nFeature) + log10_nCount <- srt[[paste0(c(paste0("log10_nCount_", assay), sp), collapse = ".")]] <- log10(nCount) + log10_nCount[is.infinite(log10_nCount)] <- NA + log10_nFeature[is.infinite(log10_nFeature)] <- NA + mod <- loess(log10_nFeature ~ log10_nCount) + pred <- predict(mod, newdata = data.frame(log10_nCount = log10_nCount)) + featurecount_dist <- srt[[paste0(c("featurecount_dist", sp), collapse = ".")]] <- log10_nFeature - pred - # df <- data.frame(cell = colnames(srt), ribo = srt$percent.ribo.Homo_sapiens, y = log10_nFeature, x = log10_nCount, pred = pred, featurecount_dist = featurecount_dist) - # lower_df <- subset(df, featurecount_dist < median(df$featurecount_dist) - 2.5 * mad(df$featurecount_dist)) - # higher_df <- subset(df, featurecount_dist > median(df$featurecount_dist) + 2.5 * mad(df$featurecount_dist)) - # ggplot(df) + - # geom_point(aes(x = x, y = y, color = featurecount_dist)) + - # scale_color_gradientn(colors = c("green", "white", "orange"), values = scales::rescale(c(min(df$featurecount_dist), 0, max(df$featurecount_dist)))) + - # geom_point(data = lower_df, aes(x = x, y = y), shape = 21, fill = "transparent", color = "blue") + - # geom_point(data = higher_df, aes(x = x, y = y), shape = 21, fill = "transparent", color = "red") + - # geom_line(aes(x = x, y = pred), color = "black")+ - # theme(panel.background = element_rect(fill = "grey")) - # nrow(lower_df) + # df <- data.frame(cell = colnames(srt), ribo = srt$percent.ribo.Homo_sapiens, y = log10_nFeature, x = log10_nCount, pred = pred, featurecount_dist = featurecount_dist) + # lower_df <- subset(df, featurecount_dist < median(df$featurecount_dist) - 2.5 * mad(df$featurecount_dist)) + # higher_df <- subset(df, featurecount_dist > median(df$featurecount_dist) + 2.5 * mad(df$featurecount_dist)) + # ggplot(df) + + # geom_point(aes(x = x, y = y, color = featurecount_dist)) + + # scale_color_gradientn(colors = c("green", "white", "orange"), values = scales::rescale(c(min(df$featurecount_dist), 0, max(df$featurecount_dist)))) + + # geom_point(data = lower_df, aes(x = x, y = y), shape = 21, fill = "transparent", color = "blue") + + # geom_point(data = higher_df, aes(x = x, y = y), shape = 21, fill = "transparent", color = "red") + + # geom_line(aes(x = x, y = pred), color = "black")+ + # theme(panel.background = element_rect(fill = "grey")) + # nrow(lower_df) - var <- sapply(strsplit(outlier_cutoff, ":"), function(x) x[[1]]) - var_valid <- var %in% colnames(srt@meta.data) | sapply(var, FUN = function(x) exists(x, where = environment())) - if (any(!var_valid)) { - stop("Variable ", paste0(names(var_valid)[!var_valid], collapse = ","), " is not found in the srt object.") - } - outlier <- lapply(strsplit(outlier_cutoff, ":"), function(m) { - colnames(srt)[isOutlier(get(m[1]), nmads = as.numeric(m[3]), type = m[2])] - }) - names(outlier) <- outlier_cutoff - print(unlist(lapply(outlier, length))) - outlier_tb <- table(unlist(outlier)) - outlier_qc <- c(outlier_qc, names(outlier_tb)[outlier_tb >= outlier_n]) - for (nm in names(outlier)) { - srt[[make.names(nm)]] <- colnames(srt) %in% outlier[[nm]] + var <- sapply(strsplit(outlier_cutoff, ":"), function(x) x[[1]]) + var_valid <- var %in% colnames(srt@meta.data) | sapply(var, FUN = function(x) exists(x, where = environment())) + if (any(!var_valid)) { + stop("Variable ", paste0(names(var_valid)[!var_valid], collapse = ","), " is not found in the srt object.") + } + outlier <- lapply(strsplit(outlier_cutoff, ":"), function(m) { + colnames(srt)[isOutlier(get(m[1]), nmads = as.numeric(m[3]), type = m[2])] + }) + names(outlier) <- outlier_cutoff + # print(unlist(lapply(outlier, length))) + outlier_tb <- table(unlist(outlier)) + outlier_qc <- c(outlier_qc, names(outlier_tb)[outlier_tb >= outlier_n]) + for (nm in names(outlier)) { + srt[[make.names(nm)]] <- colnames(srt) %in% outlier[[nm]] + } } } } - } - umi_qc <- gene_qc <- mito_qc <- ribo_qc <- ribo_mito_ratio_qc <- species_qc <- c() - if ("umi" %in% qc_metrics) { - umi_qc <- colnames(srt)[srt[[paste0(c(paste0("nCount_", assay), species[1]), collapse = "."), drop = TRUE]] < UMI_threshold] - } - if ("gene" %in% qc_metrics) { - gene_qc <- colnames(srt)[srt[[paste0(c(paste0("nFeature_", assay), species[1]), collapse = "."), drop = TRUE]] < gene_threshold] - } - if ("mito" %in% qc_metrics) { - mito_qc <- colnames(srt)[srt[[paste0(c("percent.mito", species[1]), collapse = "."), drop = TRUE]] > mito_threshold] - } - if ("ribo" %in% qc_metrics) { - ribo_qc <- colnames(srt)[srt[[paste0(c("percent.ribo", species[1]), collapse = "."), drop = TRUE]] > ribo_threshold] - } - if ("ribo_mito_ratio" %in% qc_metrics) { - ribo_mito_ratio_qc <- colnames(srt)[srt[[paste0(c("ribo.mito.ratio", species[1]), collapse = "."), drop = TRUE]] < ribo_mito_ratio_range[1] | srt[[paste0(c("ribo.mito.ratio", species[1]), collapse = "."), drop = TRUE]] > ribo_mito_ratio_range[2]] - } - if ("species" %in% qc_metrics) { - species_qc <- colnames(srt)[srt[[paste0(c("percent.genome", species[1]), collapse = "."), drop = TRUE]] < species_percent] - } + umi_qc <- gene_qc <- mito_qc <- ribo_qc <- ribo_mito_ratio_qc <- species_qc <- c() + if ("umi" %in% qc_metrics) { + umi_qc <- colnames(srt)[srt[[paste0(c(paste0("nCount_", assay), species[1]), collapse = "."), drop = TRUE]] < UMI_threshold] + } + if ("gene" %in% qc_metrics) { + gene_qc <- colnames(srt)[srt[[paste0(c(paste0("nFeature_", assay), species[1]), collapse = "."), drop = TRUE]] < gene_threshold] + } + if ("mito" %in% qc_metrics) { + mito_qc <- colnames(srt)[srt[[paste0(c("percent.mito", species[1]), collapse = "."), drop = TRUE]] > mito_threshold] + } + if ("ribo" %in% qc_metrics) { + ribo_qc <- colnames(srt)[srt[[paste0(c("percent.ribo", species[1]), collapse = "."), drop = TRUE]] > ribo_threshold] + } + if ("ribo_mito_ratio" %in% qc_metrics) { + ribo_mito_ratio_qc <- colnames(srt)[srt[[paste0(c("ribo.mito.ratio", species[1]), collapse = "."), drop = TRUE]] < ribo_mito_ratio_range[1] | srt[[paste0(c("ribo.mito.ratio", species[1]), collapse = "."), drop = TRUE]] > ribo_mito_ratio_range[2]] + } + if ("species" %in% qc_metrics) { + species_qc <- colnames(srt)[srt[[paste0(c("percent.genome", species[1]), collapse = "."), drop = TRUE]] < species_percent] + } - CellQC <- unique(c(db_qc, outlier_qc, umi_qc, gene_qc, mito_qc, ribo_qc, ribo_mito_ratio_qc, species_qc)) - cat(">>>", "Total cells:", ntotal, "\n") - cat(">>>", "Cells which are filtered out:", length(CellQC), "\n") - cat("...", length(db_qc), "potential doublets", "\n") - cat("...", length(outlier_qc), "outlier cells", "\n") - cat("...", length(umi_qc), "low-UMI cells", "\n") - cat("...", length(gene_qc), "low-gene cells", "\n") - cat("...", length(mito_qc), "high-mito cells", "\n") - cat("...", length(ribo_qc), "high-ribo cells", "\n") - cat("...", length(ribo_mito_ratio_qc), " ribo_mito_ratio outlier cells", "\n") - cat("...", length(species_qc), "species-contaminated cells", "\n") - cat(">>>", "Remained cells after filtering:", ntotal - length(CellQC), "\n") + CellQC <- unique(c(db_qc, outlier_qc, umi_qc, gene_qc, mito_qc, ribo_qc, ribo_mito_ratio_qc, species_qc)) + cat(">>>", "Total cells:", ntotal, "\n") + cat(">>>", "Cells which are filtered out:", length(CellQC), "\n") + cat("...", length(db_qc), "potential doublets", "\n") + cat("...", length(outlier_qc), "outlier cells", "\n") + cat("...", length(umi_qc), "low-UMI cells", "\n") + cat("...", length(gene_qc), "low-gene cells", "\n") + cat("...", length(mito_qc), "high-mito cells", "\n") + cat("...", length(ribo_qc), "high-ribo cells", "\n") + cat("...", length(ribo_mito_ratio_qc), "ribo_mito_ratio outlier cells", "\n") + cat("...", length(species_qc), "species-contaminated cells", "\n") + cat(">>>", "Remained cells after filtering:", ntotal - length(CellQC), "\n") - qc_nm <- c("db_qc", "outlier_qc", "umi_qc", "gene_qc", "mito_qc", "ribo_qc", "ribo_mito_ratio_qc", "species_qc", "CellQC") - for (qc in qc_nm) { - srt[[qc]] <- ifelse(colnames(srt) %in% get(qc), "Fail", "Pass") - srt[[qc]] <- factor(srt[[qc, drop = TRUE]], levels = c("Pass", "Fail")) - } + qc_nm <- c("db_qc", "outlier_qc", "umi_qc", "gene_qc", "mito_qc", "ribo_qc", "ribo_mito_ratio_qc", "species_qc", "CellQC") + for (qc in qc_nm) { + srt[[qc]] <- ifelse(colnames(srt) %in% get(qc), "Fail", "Pass") + srt[[qc]] <- factor(srt[[qc, drop = TRUE]], levels = c("Pass", "Fail")) + } - if (return_filtered) { - srt <- srt[, srt$CellQC == "Pass"] - srt@meta.data[, intersect(qc_nm, colnames(srt@meta.data))] <- NULL + if (return_filtered) { + srt <- srt[, srt$CellQC == "Pass"] + srt@meta.data[, intersect(qc_nm, colnames(srt@meta.data))] <- NULL + } + srtList[[i]] <- srt + } + if (length(srtList) > 1) { + return(Reduce(merge, srtList)) + } else { + return(srtList[[1]]) } - - return(srt) } diff --git a/man/RunCellQC.Rd b/man/RunCellQC.Rd index 598e5c5e..1b5e2061 100644 --- a/man/RunCellQC.Rd +++ b/man/RunCellQC.Rd @@ -7,11 +7,12 @@ RunCellQC( srt, assay = "RNA", + batch = NULL, qc_metrics = c("doublets", "outlier", "umi", "gene", "mito", "ribo", "ribo_mito_ratio", "species"), return_filtered = FALSE, db_method = "scDblFinder", - db_rate = ncol(srt)/1000 * 0.01, + db_rate = NULL, outlier_cutoff = c("log10_nCount:lower:2.5", "log10_nCount:higher:5", "log10_nFeature:lower:2.5", "log10_nFeature:higher:5", "featurecount_dist:lower:2.5"), outlier_n = 1, @@ -92,4 +93,25 @@ and the ribo content in empty droplets is usually high and the mito content is l \examples{ data("pancreas_sub") pancreas_sub <- RunCellQC(pancreas_sub) +CellStatPlot( + srt = pancreas_sub, + stat.by = c( + "db_qc", "outlier_qc", "umi_qc", "gene_qc", + "mito_qc", "ribo_qc", "ribo_mito_ratio_qc", "species_qc" + ), + plot_type = "upset", stat_level = "Fail" +) +table(pancreas_sub$CellQC) + +data("ifnb_sub") +ifnb_sub <- RunCellQC(ifnb_sub, batch = "stim", UMI_threshold = 1000, gene_threshold = 550) +CellStatPlot( + srt = ifnb_sub, + stat.by = c( + "db_qc", "outlier_qc", "umi_qc", "gene_qc", + "mito_qc", "ribo_qc", "ribo_mito_ratio_qc", "species_qc" + ), + plot_type = "upset", stat_level = "Fail" +) +table(ifnb_sub$CellQC) }