From 12154071dc3d4b39e70934351aedf7f0a0110a32 Mon Sep 17 00:00:00 2001
From: zhanghao-njmu <542370159@qq.com>
Date: Tue, 28 Nov 2023 15:03:46 +0800
Subject: [PATCH] Change the default n.epochs from 200 to 500 in RunUMAP2
 and the default n_epochs from -1 to 500 in RunLargeVis

---
 R/SCP-feature_annotation.R | 23 ++++++++++++-----------
 R/SCP-plot.R               |  7 ++++---
 R/Seurat-function.R        | 11 ++++++-----
 man/AnnotateFeatures.Rd    | 15 +++++++++------
 man/RunLargeVis.Rd         | 11 +++--------
 man/RunUMAP2.Rd            |  6 +++---
 6 files changed, 37 insertions(+), 36 deletions(-)

diff --git a/R/SCP-feature_annotation.R b/R/SCP-feature_annotation.R
index 95d5336d..9bee3088 100644
--- a/R/SCP-feature_annotation.R
+++ b/R/SCP-feature_annotation.R
@@ -10,9 +10,10 @@
 #' @param Ensembl_version Version of the Ensembl database to use. Default is 103.
 #' @param mirror URL of the mirror to use for Ensembl database. Default is NULL.
 #' @param gtf Path to the GTF file to be used for annotation. Default is NULL.
-#' @param merge_gtf_by Column name to merge the GTF file by. Default is "gene_name".
-#' @param columns Vector of column names to be used from the GTF file. Default is
-#'     "seqname", "feature", "start", "end", "strand", "gene_id", "gene_name", "gene_type".
+#' @param gtf_field Candidate feature types (values of the "feature" column in the GTF file) to use for annotation. Only records of the first type found in the GTF file are kept; by default the types are tried in the order "gene", "transcript", "exon", "CDS".
+#' @param gtf_columns Vector of column names to be used from the GTF file. Default is
+#'     c("seqname", "feature", "start", "end", "strand", "gene_id", "gene_name", "gene_type", "gene_biotype").
+#' @param gtf_merge_by Column name to merge the GTF file by. Default is "gene_name".
 #' @param assays Character vector of assay names to be annotated. Default is "RNA".
 #' @param overwrite Logical value indicating whether to overwrite existing metadata. Default is FALSE.
 #'
@@ -32,10 +33,11 @@
 #' @export
 AnnotateFeatures <- function(srt, species = "Homo_sapiens", IDtype = c("symbol", "ensembl_id", "entrez_id"),
                              db = NULL, db_update = FALSE, db_version = "latest", convert_species = TRUE, Ensembl_version = 103, mirror = NULL,
-                             gtf = NULL, merge_gtf_by = "gene_name", columns = c(
+                             gtf = NULL, gtf_field = c("gene", "transcript", "exon", "CDS"),
+                             gtf_columns = c(
                                "seqname", "feature", "start", "end", "strand",
-                               "gene_id", "gene_name", "gene_type"
-                             ),
+                               "gene_id", "gene_name", "gene_type", "gene_biotype"
+                             ), gtf_merge_by = "gene_name",
                              assays = "RNA", overwrite = FALSE) {
   IDtype <- match.arg(IDtype)
   if (is.null(db) && is.null(gtf)) {
@@ -82,13 +84,12 @@ AnnotateFeatures <- function(srt, species = "Homo_sapiens", IDtype = c("symbol",
     gtf_all <- suppressWarnings(fread(gtf, sep = "\t"))
     gtf_all <- gtf_all[, 1:9]
     colnames(gtf_all) <- c("seqname", "source", "feature", "start", "end", "score", "strand", "frame", "attribute")
-    for (type in c("gene", "transcript", "exon", "CDS")) {
+    for (type in gtf_field) {
       if (type %in% gtf_all[["feature"]]) {
         gtf_all <- gtf_all[gtf_all[["feature"]] == type, ]
         break
       }
     }
-    columns1 <- intersect(colnames(gtf_all), columns)
 
     gtf_attribute <- gtf_all[["attribute"]]
     gtf_attribute <- gsub(pattern = "\"", replacement = "", x = gtf_attribute)
@@ -97,13 +98,13 @@ AnnotateFeatures <- function(srt, species = "Homo_sapiens", IDtype = c("symbol",
       detail <- strsplit(x, " ")
       out <- lapply(detail, function(x) x[2:length(x)])
       names(out) <- sapply(detail, function(x) x[1])
-      out <- out[intersect(columns, names(out))]
+      out <- out[intersect(gtf_columns, names(out))]
       return(out)
     })
     gene_attr_df <- rbindlist(gene_attr, fill = TRUE)
-    gtf_columns <- cbind(gtf_all[, intersect(colnames(gtf_all), columns), with = FALSE], gene_attr_df)
+    gtf_columns <- cbind(gtf_all[, intersect(colnames(gtf_all), gtf_columns), with = FALSE], gene_attr_df)
     colnames(gtf_columns) <- make.unique(colnames(gtf_columns))
-    gtf_columns_collapse <- aggregate(gtf_columns, by = list(rowid = gtf_columns[[merge_gtf_by]]), FUN = function(x) {
+    gtf_columns_collapse <- aggregate(gtf_columns, by = list(rowid = gtf_columns[[gtf_merge_by]]), FUN = function(x) {
       paste0(unique(x), collapse = ";")
     })
     rownames(gtf_columns_collapse) <- gtf_columns_collapse[["rowid"]]
diff --git a/R/SCP-plot.R b/R/SCP-plot.R
index 0695e8d2..f4fde1ea 100644
--- a/R/SCP-plot.R
+++ b/R/SCP-plot.R
@@ -3825,6 +3825,7 @@ ExpressionStatPlot <- function(exp.data, meta.data, stat.by, group.by = NULL, sp
   }
   bg_map <- NULL
   if (!is.null(bg.by)) {
+    meta.data[[bg.by]] <- factor(meta.data[[bg.by]], levels = intersect(levels(meta.data[[bg.by]]), meta.data[[bg.by]]))
     for (g in group.by) {
       df_table <- table(meta.data[[g]], meta.data[[bg.by]])
       if (max(rowSums(df_table > 0), na.rm = TRUE) > 1) {
@@ -8684,7 +8685,7 @@ GroupHeatmap <- function(srt, features = NULL, group.by = NULL, split.by = NULL,
     index <- which(features_ordered %in% features_label)
     drop <- setdiff(features_label, features_ordered)
     if (length(drop) > 0) {
-      warning(paste0(paste0(drop, collapse = ","), "was not found in the features"), immediate. = TRUE)
+      warning(paste0(paste0(drop, collapse = ","), " was not found in the features"), immediate. = TRUE)
     }
   }
   if (length(index) > 0) {
@@ -9754,7 +9755,7 @@ FeatureHeatmap <- function(srt, features = NULL, cells = NULL, group.by = NULL,
     index <- which(features_ordered %in% features_label)
     drop <- setdiff(features_label, features_ordered)
     if (length(drop) > 0) {
-      warning(paste0(paste0(drop, collapse = ","), "was not found in the features"), immediate. = TRUE)
+      warning(paste0(paste0(drop, collapse = ","), " was not found in the features"), immediate. = TRUE)
     }
   }
   if (length(index) > 0) {
@@ -11743,7 +11744,7 @@ DynamicHeatmap <- function(srt, lineages, features = NULL, use_fitted = FALSE, b
     index <- which(features_ordered %in% features_label)
     drop <- setdiff(features_label, features_ordered)
     if (length(drop) > 0) {
-      warning(paste0(paste0(drop, collapse = ","), "was not found in the features"), immediate. = TRUE)
+      warning(paste0(paste0(drop, collapse = ","), " was not found in the features"), immediate. = TRUE)
     }
   }
   if (length(index) > 0) {
diff --git a/R/Seurat-function.R b/R/Seurat-function.R
index be5f448d..2e7e97f4 100644
--- a/R/Seurat-function.R
+++ b/R/Seurat-function.R
@@ -553,7 +553,7 @@ RunDM.default <- function(object, assay = NULL, slot = "data",
 #' @param n.neighbors An integer specifying the number of nearest neighbors to be used. Default is 30.
 #' @param n.components An integer specifying the number of UMAP components. Default is 2.
 #' @param metric A character string specifying the metric or a function to be used for distance calculations. When using a string, available metrics are: euclidean, manhattan. Other available generalized metrics are: cosine, pearson, pearson2. Note the triangle inequality may not be satisfied by some generalized metrics, hence knn search may not be optimal. When using metric.function as a function, the signature must be function(matrix, origin, target) and should compute a distance between the origin column and the target columns.  Default is "cosine".
-#' @param n.epochs An integer specifying the number of iterations performed during layout optimization for UMAP. Default is 200.
+#' @param n.epochs An integer specifying the number of iterations performed during layout optimization for UMAP. Default is 500.
 #' @param spread A numeric value specifying the spread parameter for UMAP, used during automatic estimation of a/b parameters. Default is 1.
 #' @param min.dist A numeric value specifying the minimum distance between UMAP embeddings, determines how close points appear in the final layout. Default is 0.3.
 #' @param set.op.mix.ratio Interpolate between (fuzzy) union and intersection as the set operation used to combine local fuzzy simplicial sets to obtain a global fuzzy simplicial sets. Both fuzzy set operations use the product t-norm. The value of this parameter should be between 0.0 and 1.0; a value of 1.0 will use a pure fuzzy union, while 0.0 will use a pure fuzzy intersection.
@@ -589,7 +589,7 @@ RunUMAP2.Seurat <- function(object,
                             assay = NULL, slot = "data",
                             umap.method = "uwot", reduction.model = NULL, n_threads = NULL,
                             return.model = FALSE, n.neighbors = 30L, n.components = 2L,
-                            metric = "cosine", n.epochs = 200L, spread = 1, min.dist = 0.3,
+                            metric = "cosine", n.epochs = 500L, spread = 1, min.dist = 0.3,
                             set.op.mix.ratio = 1, local.connectivity = 1L, negative.sample.rate = 5L,
                             a = NULL, b = NULL, learning.rate = 1, repulsion.strength = 1,
                             reduction.name = "umap", reduction.key = "UMAP_",
@@ -669,7 +669,7 @@ RunUMAP2.Seurat <- function(object,
 RunUMAP2.default <- function(object, assay = NULL,
                              umap.method = "uwot", reduction.model = NULL, n_threads = NULL,
                              return.model = FALSE, n.neighbors = 30L, n.components = 2L,
-                             metric = "cosine", n.epochs = 200L, spread = 1, min.dist = 0.3,
+                             metric = "cosine", n.epochs = 500L, spread = 1, min.dist = 0.3,
                              set.op.mix.ratio = 1, local.connectivity = 1L, negative.sample.rate = 5L,
                              a = NULL, b = NULL, learning.rate = 1, repulsion.strength = 1,
                              reduction.key = "UMAP_", verbose = TRUE, seed.use = 11L, ...) {
@@ -1538,6 +1538,7 @@ RunTriMap.default <- function(object, assay = NULL,
 #' @param verbose A logical value indicating whether to print verbose output. Default is TRUE.
 #' @param seed.use An integer specifying the random seed to be used. Default is 11.
 #' @param ... Additional arguments to be passed to the \link[uwot]{lvish} function.
+#' @param n_epochs Number of epochs to use during the optimization of the embedded coordinates. Default is 500.
 #'
 #' @examples
 #' pancreas_sub <- Seurat::FindVariableFeatures(pancreas_sub)
@@ -1558,7 +1559,7 @@ RunLargeVis <- function(object, ...) {
 RunLargeVis.Seurat <- function(object, reduction = "pca", dims = NULL, features = NULL,
                                assay = NULL, slot = "data",
                                perplexity = 50, n_neighbors = perplexity * 3, n_components = 2, metric = "euclidean",
-                               n_epochs = -1, learning_rate = 1, scale = "maxabs", init = "lvrandom", init_sdev = NULL,
+                               n_epochs = 500, learning_rate = 1, scale = "maxabs", init = "lvrandom", init_sdev = NULL,
                                repulsion_strength = 7, negative_sample_rate = 5, nn_method = NULL, n_trees = 50,
                                search_k = 2 * n_neighbors * n_trees, n_threads = NULL, n_sgd_threads = 0, grain_size = 1,
                                kernel = "gauss", pca = NULL, pca_center = TRUE, pcg_rand = TRUE, fast_sgd = FALSE,
@@ -1617,7 +1618,7 @@ RunLargeVis.Seurat <- function(object, reduction = "pca", dims = NULL, features
 #' @export
 RunLargeVis.default <- function(object, assay = NULL,
                                 perplexity = 50, n_neighbors = perplexity * 3, n_components = 2, metric = "euclidean",
-                                n_epochs = -1, learning_rate = 1, scale = "maxabs", init = "lvrandom", init_sdev = NULL,
+                                n_epochs = 500, learning_rate = 1, scale = "maxabs", init = "lvrandom", init_sdev = NULL,
                                 repulsion_strength = 7, negative_sample_rate = 5, nn_method = NULL, n_trees = 50,
                                 search_k = 2 * n_neighbors * n_trees, n_threads = NULL, n_sgd_threads = 0, grain_size = 1,
                                 kernel = "gauss", pca = NULL, pca_center = TRUE, pcg_rand = TRUE, fast_sgd = FALSE,
diff --git a/man/AnnotateFeatures.Rd b/man/AnnotateFeatures.Rd
index e6d8d96f..bc9d363d 100644
--- a/man/AnnotateFeatures.Rd
+++ b/man/AnnotateFeatures.Rd
@@ -16,9 +16,10 @@ AnnotateFeatures(
   Ensembl_version = 103,
   mirror = NULL,
   gtf = NULL,
-  merge_gtf_by = "gene_name",
-  columns = c("seqname", "feature", "start", "end", "strand", "gene_id", "gene_name",
-    "gene_type"),
+  gtf_field = c("gene", "transcript", "exon", "CDS"),
+  gtf_columns = c("seqname", "feature", "start", "end", "strand", "gene_id", "gene_name",
+    "gene_type", "gene_biotype"),
+  gtf_merge_by = "gene_name",
   assays = "RNA",
   overwrite = FALSE
 )
@@ -44,10 +45,12 @@ AnnotateFeatures(
 
 \item{gtf}{Path to the GTF file to be used for annotation. Default is NULL.}
 
-\item{merge_gtf_by}{Column name to merge the GTF file by. Default is "gene_name".}
+\item{gtf_field}{Candidate feature types (values of the "feature" column in the GTF file) to use for annotation. Only records of the first type found in the GTF file are kept; by default the types are tried in the order "gene", "transcript", "exon", "CDS".}
 
-\item{columns}{Vector of column names to be used from the GTF file. Default is
-"seqname", "feature", "start", "end", "strand", "gene_id", "gene_name", "gene_type".}
+\item{gtf_columns}{Vector of column names to be used from the GTF file. Default is
+c("seqname", "feature", "start", "end", "strand", "gene_id", "gene_name", "gene_type", "gene_biotype").}
+
+\item{gtf_merge_by}{Column name to merge the GTF file by. Default is "gene_name".}
 
 \item{assays}{Character vector of assay names to be annotated. Default is "RNA".}
 
diff --git a/man/RunLargeVis.Rd b/man/RunLargeVis.Rd
index d95ce53f..f7a18f72 100644
--- a/man/RunLargeVis.Rd
+++ b/man/RunLargeVis.Rd
@@ -19,7 +19,7 @@ RunLargeVis(object, ...)
   n_neighbors = perplexity * 3,
   n_components = 2,
   metric = "euclidean",
-  n_epochs = -1,
+  n_epochs = 500,
   learning_rate = 1,
   scale = "maxabs",
   init = "lvrandom",
@@ -55,7 +55,7 @@ RunLargeVis(object, ...)
   n_neighbors = perplexity * 3,
   n_components = 2,
   metric = "euclidean",
-  n_epochs = -1,
+  n_epochs = 500,
   learning_rate = 1,
   scale = "maxabs",
   init = "lvrandom",
@@ -147,12 +147,7 @@ exists to allow mixed binary and real-valued data to be included and to have
 PCA applied to both, but with centering applied only to the real-valued data
 (it is typical not to apply centering to binary data before PCA is applied).}
 
-\item{n_epochs}{Number of epochs to use during the optimization of the
-embedded coordinates. The default is calculate the number of epochs
-dynamically based on dataset size, to give the same number of edge samples
-as the LargeVis defaults. This is usually substantially larger than the
-UMAP defaults. If \code{n_epochs = 0}, then coordinates determined by
-\code{"init"} will be returned.}
+\item{n_epochs}{Number of epochs to use during the optimization of the embedded coordinates. Default is 500.}
 
 \item{learning_rate}{Initial learning rate used in optimization of the
 coordinates.}
diff --git a/man/RunUMAP2.Rd b/man/RunUMAP2.Rd
index fb32bb47..f770e9c2 100644
--- a/man/RunUMAP2.Rd
+++ b/man/RunUMAP2.Rd
@@ -24,7 +24,7 @@ RunUMAP2(object, ...)
   n.neighbors = 30L,
   n.components = 2L,
   metric = "cosine",
-  n.epochs = 200L,
+  n.epochs = 500L,
   spread = 1,
   min.dist = 0.3,
   set.op.mix.ratio = 1,
@@ -51,7 +51,7 @@ RunUMAP2(object, ...)
   n.neighbors = 30L,
   n.components = 2L,
   metric = "cosine",
-  n.epochs = 200L,
+  n.epochs = 500L,
   spread = 1,
   min.dist = 0.3,
   set.op.mix.ratio = 1,
@@ -98,7 +98,7 @@ RunUMAP2(object, ...)
 
 \item{metric}{A character string specifying the metric or a function to be used for distance calculations. When using a string, available metrics are: euclidean, manhattan. Other available generalized metrics are: cosine, pearson, pearson2. Note the triangle inequality may not be satisfied by some generalized metrics, hence knn search may not be optimal. When using metric.function as a function, the signature must be function(matrix, origin, target) and should compute a distance between the origin column and the target columns.  Default is "cosine".}
 
-\item{n.epochs}{An integer specifying the number of iterations performed during layout optimization for UMAP. Default is 200.}
+\item{n.epochs}{An integer specifying the number of iterations performed during layout optimization for UMAP. Default is 500.}
 
 \item{spread}{A numeric value specifying the spread parameter for UMAP, used during automatic estimation of a/b parameters. Default is 1.}