Change the default n.epochs from 200 to 500 in RunUMAP2 function and RunLargeVis function
zhanghao-njmu · Nov 28, 2023 · 1215407 · 1215407
1 parent f4b9432
commit 1215407
Show file tree

Hide file tree

Showing 6 changed files with 37 additions and 36 deletions.
diff --git a/R/SCP-feature_annotation.R b/R/SCP-feature_annotation.R
@@ -10,9 +10,10 @@
 #' @param Ensembl_version Version of the Ensembl database to use. Default is 103.
 #' @param mirror URL of the mirror to use for Ensembl database. Default is NULL.
 #' @param gtf Path to the GTF file to be used for annotation. Default is NULL.
-#' @param merge_gtf_by Column name to merge the GTF file by. Default is "gene_name".
-#' @param columns Vector of column names to be used from the GTF file. Default is
-#'     "seqname", "feature", "start", "end", "strand", "gene_id", "gene_name", "gene_type".
+#' @param gtf_field Feature types in the GTF file to use for annotation. Only the first type that is present in the GTF file, searched in the given order, is used. Default is c("gene", "transcript", "exon", "CDS").
+#' @param gtf_columns Vector of column names to be used from the GTF file. Default is
+#'     c("seqname", "feature", "start", "end", "strand", "gene_id", "gene_name", "gene_type", "gene_biotype").
+#' @param gtf_merge_by Column name to merge the GTF file by. Default is "gene_name".
 #' @param assays Character vector of assay names to be annotated. Default is "RNA".
 #' @param overwrite Logical value indicating whether to overwrite existing metadata. Default is FALSE.
 #'
@@ -32,10 +33,11 @@
 #' @export
 AnnotateFeatures <- function(srt, species = "Homo_sapiens", IDtype = c("symbol", "ensembl_id", "entrez_id"),
                              db = NULL, db_update = FALSE, db_version = "latest", convert_species = TRUE, Ensembl_version = 103, mirror = NULL,
-                             gtf = NULL, merge_gtf_by = "gene_name", columns = c(
+                             gtf = NULL, gtf_field = c("gene", "transcript", "exon", "CDS"),
+                             gtf_columns = c(
                                "seqname", "feature", "start", "end", "strand",
-                               "gene_id", "gene_name", "gene_type"
-                             ),
+                               "gene_id", "gene_name", "gene_type", "gene_biotype"
+                             ), gtf_merge_by = "gene_name",
                              assays = "RNA", overwrite = FALSE) {
   IDtype <- match.arg(IDtype)
   if (is.null(db) && is.null(gtf)) {
@@ -82,13 +84,12 @@ AnnotateFeatures <- function(srt, species = "Homo_sapiens", IDtype = c("symbol",
     gtf_all <- suppressWarnings(fread(gtf, sep = "\t"))
     gtf_all <- gtf_all[, 1:9]
     colnames(gtf_all) <- c("seqname", "source", "feature", "start", "end", "score", "strand", "frame", "attribute")
-    for (type in c("gene", "transcript", "exon", "CDS")) {
+    for (type in gtf_field) {
       if (type %in% gtf_all[["feature"]]) {
         gtf_all <- gtf_all[gtf_all[["feature"]] == type, ]
         break
       }
     }
-    columns1 <- intersect(colnames(gtf_all), columns)
 
     gtf_attribute <- gtf_all[["attribute"]]
     gtf_attribute <- gsub(pattern = "\"", replacement = "", x = gtf_attribute)
@@ -97,13 +98,13 @@ AnnotateFeatures <- function(srt, species = "Homo_sapiens", IDtype = c("symbol",
       detail <- strsplit(x, " ")
       out <- lapply(detail, function(x) x[2:length(x)])
       names(out) <- sapply(detail, function(x) x[1])
-      out <- out[intersect(columns, names(out))]
+      out <- out[intersect(gtf_columns, names(out))]
       return(out)
     })
     gene_attr_df <- rbindlist(gene_attr, fill = TRUE)
-    gtf_columns <- cbind(gtf_all[, intersect(colnames(gtf_all), columns), with = FALSE], gene_attr_df)
+    gtf_columns <- cbind(gtf_all[, intersect(colnames(gtf_all), gtf_columns), with = FALSE], gene_attr_df)
     colnames(gtf_columns) <- make.unique(colnames(gtf_columns))
-    gtf_columns_collapse <- aggregate(gtf_columns, by = list(rowid = gtf_columns[[merge_gtf_by]]), FUN = function(x) {
+    gtf_columns_collapse <- aggregate(gtf_columns, by = list(rowid = gtf_columns[[gtf_merge_by]]), FUN = function(x) {
       paste0(unique(x), collapse = ";")
     })
     rownames(gtf_columns_collapse) <- gtf_columns_collapse[["rowid"]]

diff --git a/R/SCP-plot.R b/R/SCP-plot.R
@@ -3825,6 +3825,7 @@ ExpressionStatPlot <- function(exp.data, meta.data, stat.by, group.by = NULL, sp
   }
   bg_map <- NULL
   if (!is.null(bg.by)) {
+    meta.data[[bg.by]] <- factor(meta.data[[bg.by]], levels = intersect(levels(meta.data[[bg.by]]), meta.data[[bg.by]]))
     for (g in group.by) {
       df_table <- table(meta.data[[g]], meta.data[[bg.by]])
       if (max(rowSums(df_table > 0), na.rm = TRUE) > 1) {
@@ -8684,7 +8685,7 @@ GroupHeatmap <- function(srt, features = NULL, group.by = NULL, split.by = NULL,
     index <- which(features_ordered %in% features_label)
     drop <- setdiff(features_label, features_ordered)
     if (length(drop) > 0) {
-      warning(paste0(paste0(drop, collapse = ","), "was not found in the features"), immediate. = TRUE)
+      warning(paste0(paste0(drop, collapse = ","), " was not found in the features"), immediate. = TRUE)
     }
   }
   if (length(index) > 0) {
@@ -9754,7 +9755,7 @@ FeatureHeatmap <- function(srt, features = NULL, cells = NULL, group.by = NULL,
     index <- which(features_ordered %in% features_label)
     drop <- setdiff(features_label, features_ordered)
     if (length(drop) > 0) {
-      warning(paste0(paste0(drop, collapse = ","), "was not found in the features"), immediate. = TRUE)
+      warning(paste0(paste0(drop, collapse = ","), " was not found in the features"), immediate. = TRUE)
     }
   }
   if (length(index) > 0) {
@@ -11743,7 +11744,7 @@ DynamicHeatmap <- function(srt, lineages, features = NULL, use_fitted = FALSE, b
     index <- which(features_ordered %in% features_label)
     drop <- setdiff(features_label, features_ordered)
     if (length(drop) > 0) {
-      warning(paste0(paste0(drop, collapse = ","), "was not found in the features"), immediate. = TRUE)
+      warning(paste0(paste0(drop, collapse = ","), " was not found in the features"), immediate. = TRUE)
     }
   }
   if (length(index) > 0) {

diff --git a/R/Seurat-function.R b/R/Seurat-function.R
@@ -553,7 +553,7 @@ RunDM.default <- function(object, assay = NULL, slot = "data",
 #' @param n.neighbors An integer specifying the number of nearest neighbors to be used. Default is 30.
 #' @param n.components An integer specifying the number of UMAP components. Default is 2.
 #' @param metric A character string specifying the metric or a function to be used for distance calculations. When using a string, available metrics are: euclidean, manhattan. Other available generalized metrics are: cosine, pearson, pearson2. Note the triangle inequality may not be satisfied by some generalized metrics, hence knn search may not be optimal. When using metric.function as a function, the signature must be function(matrix, origin, target) and should compute a distance between the origin column and the target columns.  Default is "cosine".
-#' @param n.epochs An integer specifying the number of iterations performed during layout optimization for UMAP. Default is 200.
+#' @param n.epochs An integer specifying the number of iterations performed during layout optimization for UMAP. Default is 500.
 #' @param spread A numeric value specifying the spread parameter for UMAP, used during automatic estimation of a/b parameters. Default is 1.
 #' @param min.dist A numeric value specifying the minimum distance between UMAP embeddings, determines how close points appear in the final layout. Default is 0.3.
 #' @param set.op.mix.ratio Interpolate between (fuzzy) union and intersection as the set operation used to combine local fuzzy simplicial sets to obtain a global fuzzy simplicial sets. Both fuzzy set operations use the product t-norm. The value of this parameter should be between 0.0 and 1.0; a value of 1.0 will use a pure fuzzy union, while 0.0 will use a pure fuzzy intersection.
@@ -589,7 +589,7 @@ RunUMAP2.Seurat <- function(object,
                             assay = NULL, slot = "data",
                             umap.method = "uwot", reduction.model = NULL, n_threads = NULL,
                             return.model = FALSE, n.neighbors = 30L, n.components = 2L,
-                            metric = "cosine", n.epochs = 200L, spread = 1, min.dist = 0.3,
+                            metric = "cosine", n.epochs = 500L, spread = 1, min.dist = 0.3,
                             set.op.mix.ratio = 1, local.connectivity = 1L, negative.sample.rate = 5L,
                             a = NULL, b = NULL, learning.rate = 1, repulsion.strength = 1,
                             reduction.name = "umap", reduction.key = "UMAP_",
@@ -669,7 +669,7 @@ RunUMAP2.Seurat <- function(object,
 RunUMAP2.default <- function(object, assay = NULL,
                              umap.method = "uwot", reduction.model = NULL, n_threads = NULL,
                              return.model = FALSE, n.neighbors = 30L, n.components = 2L,
-                             metric = "cosine", n.epochs = 200L, spread = 1, min.dist = 0.3,
+                             metric = "cosine", n.epochs = 500L, spread = 1, min.dist = 0.3,
                              set.op.mix.ratio = 1, local.connectivity = 1L, negative.sample.rate = 5L,
                              a = NULL, b = NULL, learning.rate = 1, repulsion.strength = 1,
                              reduction.key = "UMAP_", verbose = TRUE, seed.use = 11L, ...) {
@@ -1538,6 +1538,7 @@ RunTriMap.default <- function(object, assay = NULL,
 #' @param verbose A logical value indicating whether to print verbose output. Default is TRUE.
 #' @param seed.use An integer specifying the random seed to be used. Default is 11.
 #' @param ... Additional arguments to be passed to the \link[uwot]{lvish} function.
+#' @param n_epochs Number of epochs to use during the optimization of the embedded coordinates. Default is 500.
 #'
 #' @examples
 #' pancreas_sub <- Seurat::FindVariableFeatures(pancreas_sub)
@@ -1558,7 +1559,7 @@ RunLargeVis <- function(object, ...) {
 RunLargeVis.Seurat <- function(object, reduction = "pca", dims = NULL, features = NULL,
                                assay = NULL, slot = "data",
                                perplexity = 50, n_neighbors = perplexity * 3, n_components = 2, metric = "euclidean",
-                               n_epochs = -1, learning_rate = 1, scale = "maxabs", init = "lvrandom", init_sdev = NULL,
+                               n_epochs = 500, learning_rate = 1, scale = "maxabs", init = "lvrandom", init_sdev = NULL,
                                repulsion_strength = 7, negative_sample_rate = 5, nn_method = NULL, n_trees = 50,
                                search_k = 2 * n_neighbors * n_trees, n_threads = NULL, n_sgd_threads = 0, grain_size = 1,
                                kernel = "gauss", pca = NULL, pca_center = TRUE, pcg_rand = TRUE, fast_sgd = FALSE,
@@ -1617,7 +1618,7 @@ RunLargeVis.Seurat <- function(object, reduction = "pca", dims = NULL, features
 #' @export
 RunLargeVis.default <- function(object, assay = NULL,
                                 perplexity = 50, n_neighbors = perplexity * 3, n_components = 2, metric = "euclidean",
-                                n_epochs = -1, learning_rate = 1, scale = "maxabs", init = "lvrandom", init_sdev = NULL,
+                                n_epochs = 500, learning_rate = 1, scale = "maxabs", init = "lvrandom", init_sdev = NULL,
                                 repulsion_strength = 7, negative_sample_rate = 5, nn_method = NULL, n_trees = 50,
                                 search_k = 2 * n_neighbors * n_trees, n_threads = NULL, n_sgd_threads = 0, grain_size = 1,
                                 kernel = "gauss", pca = NULL, pca_center = TRUE, pcg_rand = TRUE, fast_sgd = FALSE,

diff --git a/man/AnnotateFeatures.Rd b/man/AnnotateFeatures.Rd
diff --git a/man/RunLargeVis.Rd b/man/RunLargeVis.Rd
diff --git a/man/RunUMAP2.Rd b/man/RunUMAP2.Rd