diff --git a/NEWS.md b/NEWS.md index b0d256e3..3dc64ba2 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,4 @@ -# uwot 0.0.0.9008 +# uwot 0.0.0.9008 (December 23 2018) ## New features @@ -11,6 +11,10 @@ the old, less consistent, but faster settings, set `n_sgd_threads = "auto"`. * `gamma` is now `repulsion_strength`. * Default spectral initialization now looks for disconnected components and initializes them separately (also applies to `laplacian` and `normlaplacian`). +* New `init` options: `sspectral`, `snormlaplacian` and `slaplacian`. These are +like `spectral`, `normlaplacian`, `laplacian` respectively, but scaled so that +each dimension has a standard deviation of 1e-4. This is like the difference +between the `pca` and `spca` options. ## Bug fixes and minor improvements @@ -19,7 +23,7 @@ initializes them separately (also applies to `laplacian` and `normlaplacian`). number of threads used. * Anomalously long spectral intialization times should now be reduced. * Internal changes and fixes thanks to a code review by Aaron Lun -(https://github.com/ltla) +(https://github.com/ltla). # uwot 0.0.0.9007 (December 9 2018) diff --git a/R/uwot.R b/R/uwot.R index f1ca1784..c89fe675 100644 --- a/R/uwot.R +++ b/R/uwot.R @@ -81,8 +81,35 @@ #' \item \code{"spca"}. Like \code{"pca"}, but each dimension is then scaled #' so the standard deviation is 1e-4, to give a distribution similar to #' that used in t-SNE. +#' \item \code{"sspectral"} Like \code{"spectral"}, but each dimension is +#' then scaled so the standard deviation is 1e-4. +#' \item \code{"snormlaplacian"} Like \code{"normlaplacian"}, but each +#' dimension is then scaled so the standard deviation is 1e-4. +#' \item \code{"slaplacian"} Like \code{"laplacian"}, but each dimension is +#' then scaled so the standard deviation is 1e-4. #' \item A matrix of initial coordinates. 
#' } +#' For spectral initializations (\code{"spectral"}, \code{"normlaplacian"}, +#' \code{"laplacian"}, \code{"sspectral"}, \code{"snormlaplacian"}, +#' \code{"slaplacian"}), if more than one connected component is identified, +#' each connected component is initialized separately and the results are +#' merged. If \code{verbose = TRUE} the number of connected components is +#' logged to the console. The existence of multiple connected components +#' implies that a global view of the data cannot be attained with this +#' initialization. Either a PCA-based initialization or increasing the value of +#' \code{n_neighbors} may be more appropriate. +#' +#' The scaled initializations (\code{"spca"}, \code{"sspectral"}, +#' \code{"snormlaplacian"}, \code{"slaplacian"}) might be useful as +#' alternatives to the non-scaled equivalents if these result in initial +#' coordinates with large inter-point distances or outliers. This usually +#' results in small gradients during optimization and very little progress +#' being made to the layout. Shrinking the initial embedding by rescaling can +#' help under these circumstances. \code{"spca"} is usually recommended over +#' \code{"pca"}, but for the spectral initializations the scaled versions +#' usually aren't necessary unless you are using a large value of +#' \code{n_neighbors} (e.g. \code{n_neighbors = 150} or higher). +#' #' @param spread The effective scale of embedded points. In combination with #' \code{min_dist}, this determines how clustered/clumped the embedded points #' are. @@ -431,8 +458,34 @@ umap <- function(X, n_neighbors = 15, n_components = 2, metric = "euclidean", #' \item \code{"spca"}. Like \code{"pca"}, but each dimension is then scaled #' so the standard deviation is 1e-4, to give a distribution similar to #' that used in t-SNE. +#' \item \code{"sspectral"} Like \code{"spectral"}, but each dimension is +#' then scaled so the standard deviation is 1e-4. 
+#' \item \code{"snormlaplacian"} Like \code{"normlaplacian"}, but each +#' dimension is then scaled so the standard deviation is 1e-4. +#' \item \code{"slaplacian"} Like \code{"laplacian"}, but each dimension is +#' then scaled so the standard deviation is 1e-4. #' \item A matrix of initial coordinates. #' } +#' For spectral initializations (\code{"spectral"}, \code{"normlaplacian"}, +#' \code{"laplacian"}, \code{"sspectral"}, \code{"snormlaplacian"}, +#' \code{"slaplacian"}), if more than one connected component is identified, +#' each connected component is initialized separately and the results are +#' merged. If \code{verbose = TRUE} the number of connected components is +#' logged to the console. The existence of multiple connected components +#' implies that a global view of the data cannot be attained with this +#' initialization. Either a PCA-based initialization or increasing the value of +#' \code{n_neighbors} may be more appropriate. +#' +#' The scaled initializations (\code{"spca"}, \code{"sspectral"}, +#' \code{"snormlaplacian"}, \code{"slaplacian"}) might be useful as +#' alternatives to the non-scaled equivalents if these result in initial +#' coordinates with large inter-point distances or outliers. This usually +#' results in small gradients during optimization and very little progress +#' being made to the layout. Shrinking the initial embedding by rescaling can +#' help under these circumstances. \code{"spca"} is usually recommended over +#' \code{"pca"}, but for the spectral initializations the scaled versions +#' usually aren't necessary unless you are using a large value of +#' \code{n_neighbors} (e.g. \code{n_neighbors = 150} or higher). #' @param set_op_mix_ratio Interpolate between (fuzzy) union and intersection as #' the set operation used to combine local fuzzy simplicial sets to obtain a #' global fuzzy simplicial sets. 
Both fuzzy set operations use the product @@ -723,8 +776,35 @@ tumap <- function(X, n_neighbors = 15, n_components = 2, metric = "euclidean", #' \item \code{"spca"}. Like \code{"pca"}, but each dimension is then scaled #' so the standard deviation is 1e-4, to give a distribution similar to #' that used in t-SNE and LargeVis. +#' \item \code{"sspectral"} Like \code{"spectral"}, but each dimension is +#' then scaled so the standard deviation is 1e-4. +#' \item \code{"snormlaplacian"} Like \code{"normlaplacian"}, but each +#' dimension is then scaled so the standard deviation is 1e-4. +#' \item \code{"slaplacian"} Like \code{"laplacian"}, but each dimension is +#' then scaled so the standard deviation is 1e-4. #' \item A matrix of initial coordinates. #' } +#' For spectral initializations (\code{"spectral"}, \code{"normlaplacian"}, +#' \code{"laplacian"}, \code{"sspectral"}, \code{"snormlaplacian"}, +#' \code{"slaplacian"}), if more than one connected component is identified, +#' each connected component is initialized separately and the results are +#' merged. If \code{verbose = TRUE} the number of connected components is +#' logged to the console. The existence of multiple connected components +#' implies that a global view of the data cannot be attained with this +#' initialization. Either a PCA-based initialization or increasing the value of +#' \code{n_neighbors} may be more appropriate. +#' +#' The scaled initializations (\code{"spca"}, \code{"sspectral"}, +#' \code{"snormlaplacian"}, \code{"slaplacian"}) might be useful as +#' alternatives to the non-scaled equivalents if these result in initial +#' coordinates with large inter-point distances or outliers. This usually +#' results in small gradients during optimization and very little progress +#' being made to the layout. Shrinking the initial embedding by rescaling can +#' help under these circumstances. 
\code{"spca"} is usually recommended over +#' \code{"pca"}, but for the spectral initializations the scaled versions +#' usually aren't necessary unless you are using a large value of +#' \code{n_neighbors} (e.g. \code{n_neighbors = 150} or higher). +#' #' @param repulsion_strength Weighting applied to negative samples in low #' dimensional embedding optimization. Values higher than one will result in #' greater weight being given to negative samples. @@ -1128,15 +1208,20 @@ uwot <- function(X, n_neighbors = 15, n_components = 2, metric = "euclidean", else { init <- match.arg(tolower(init), c( "spectral", "random", "lvrandom", "normlaplacian", - "laplacian", "spca", "pca" + "laplacian", "spca", "pca", "sspectral", "snormlaplacian", "slaplacian" )) + do_shrink <- init %in% + c("spca", "sspectral", "snormlaplacian", "slaplacian") + if (do_shrink) { + init <- substring(init, 2) + } + # Don't repeat PCA initialization if we've already done it once if (pca_shortcut && init %in% c("spca", "pca") && pca >= n_components) { embedding <- X[, 1:n_components] if (init == "spca") { tsmessage("Initializing from scaled PCA") - embedding <- scale(embedding, scale = apply(embedding, 2, stats::sd) / 1e-4) } else { tsmessage("Initializing from PCA") @@ -1154,14 +1239,15 @@ uwot <- function(X, n_neighbors = 15, n_components = 2, metric = "euclidean", laplacian = laplacian_eigenmap(V, ndim = n_components, verbose = verbose), spca = scaled_pca(X, ndim = n_components, verbose = verbose), pca = pca_init(X, ndim = n_components, verbose = verbose), - sspectral = shrink_coords(spectral_init(V, ndim = n_components, - verbose = verbose)), stop("Unknown initialization method: '", init, "'") ) } + + if (do_shrink) { + embedding <- shrink_coords(embedding) + } } - - + if (is.null(n_epochs) || n_epochs <= 0) { if (method == "largevis") { n_epochs <- lvish_epochs(n_vertices, V) diff --git a/man/lvish.Rd b/man/lvish.Rd index b0cf423d..7518e339 100644 --- a/man/lvish.Rd +++ b/man/lvish.Rd @@ -83,26 
+83,52 @@ coordinates.} For lvish, the default is \code{"maxabs"}, for consistency with LargeVis.} \item{init}{Type of initialization for the coordinates. Options are: -\itemize{ - \item \code{"spectral"} Spectral embedding using the normalized Laplacian - of the fuzzy 1-skeleton, with Gaussian noise added. - \item \code{"normlaplacian"}. Spectral embedding using the normalized - Laplacian of the fuzzy 1-skeleton, without noise. - \item \code{"random"}. Coordinates assigned using a uniform random - distribution between -10 and 10. - \item \code{"lvrandom"}. Coordinates assigned using a Gaussian - distribution with standard deviation 1e-4, as used in LargeVis - (Tang et al., 2016) and t-SNE. - \item \code{"laplacian"}. Spectral embedding using the Laplacian Eigenmap - (Belkin and Niyogi, 2002). - \item \code{"pca"}. The first two principal components from PCA of - \code{X} if \code{X} is a data frame, and from a 2-dimensional classical - MDS if \code{X} is of class \code{"dist"}. - \item \code{"spca"}. Like \code{"pca"}, but each dimension is then scaled - so the standard deviation is 1e-4, to give a distribution similar to - that used in t-SNE and LargeVis. - \item A matrix of initial coordinates. -}} + \itemize{ + \item \code{"spectral"} Spectral embedding using the normalized Laplacian + of the fuzzy 1-skeleton, with Gaussian noise added. + \item \code{"normlaplacian"}. Spectral embedding using the normalized + Laplacian of the fuzzy 1-skeleton, without noise. + \item \code{"random"}. Coordinates assigned using a uniform random + distribution between -10 and 10. + \item \code{"lvrandom"}. Coordinates assigned using a Gaussian + distribution with standard deviation 1e-4, as used in LargeVis + (Tang et al., 2016) and t-SNE. + \item \code{"laplacian"}. Spectral embedding using the Laplacian Eigenmap + (Belkin and Niyogi, 2002). + \item \code{"pca"}. 
The first two principal components from PCA of + \code{X} if \code{X} is a data frame, and from a 2-dimensional classical + MDS if \code{X} is of class \code{"dist"}. + \item \code{"spca"}. Like \code{"pca"}, but each dimension is then scaled + so the standard deviation is 1e-4, to give a distribution similar to + that used in t-SNE and LargeVis. + \item \code{"sspectral"} Like \code{"spectral"}, but each dimension is + then scaled so the standard deviation is 1e-4. + \item \code{"snormlaplacian"} Like \code{"normlaplacian"}, but each + dimension is then scaled so the standard deviation is 1e-4. + \item \code{"slaplacian"} Like \code{"laplacian"}, but each dimension is + then scaled so the standard deviation is 1e-4. + \item A matrix of initial coordinates. + } +For spectral initializations (\code{"spectral"}, \code{"normlaplacian"}, +\code{"laplacian"}, \code{"sspectral"}, \code{"snormlaplacian"}, +\code{"slaplacian"}), if more than one connected component is identified, +each connected component is initialized separately and the results are +merged. If \code{verbose = TRUE} the number of connected components is +logged to the console. The existence of multiple connected components +implies that a global view of the data cannot be attained with this +initialization. Either a PCA-based initialization or increasing the value of +\code{n_neighbors} may be more appropriate. + +The scaled initializations (\code{"spca"}, \code{"sspectral"}, +\code{"snormlaplacian"}, \code{"slaplacian"}) might be useful as +alternatives to the non-scaled equivalents if these result in initial +coordinates with large inter-point distances or outliers. This usually +results in small gradients during optimization and very little progress +being made to the layout. Shrinking the initial embedding by rescaling can +help under these circumstances. 
\code{"spca"} is usually recommended over +\code{"pca"}, but for the spectral initializations the scaled versions +usually aren't necessary unless you are using a large value of +\code{n_neighbors} (e.g. \code{n_neighbors = 150} or higher).} \item{repulsion_strength}{Weighting applied to negative samples in low dimensional embedding optimization. Values higher than one will result in diff --git a/man/tumap.Rd b/man/tumap.Rd index df3ad8f4..971e0648 100644 --- a/man/tumap.Rd +++ b/man/tumap.Rd @@ -81,26 +81,52 @@ coordinates.} For t-UMAP, the default is \code{"none"}.} \item{init}{Type of initialization for the coordinates. Options are: -\itemize{ - \item \code{"spectral"} Spectral embedding using the normalized Laplacian - of the fuzzy 1-skeleton, with Gaussian noise added. - \item \code{"normlaplacian"}. Spectral embedding using the normalized - Laplacian of the fuzzy 1-skeleton, without noise. - \item \code{"random"}. Coordinates assigned using a uniform random - distribution between -10 and 10. - \item \code{"lvrandom"}. Coordinates assigned using a Gaussian - distribution with standard deviation 1e-4, as used in LargeVis - (Tang et al., 2016) and t-SNE. - \item \code{"laplacian"}. Spectral embedding using the Laplacian Eigenmap - (Belkin and Niyogi, 2002). - \item \code{"pca"}. The first two principal components from PCA of - \code{X} if \code{X} is a data frame, and from a 2-dimensional classical - MDS if \code{X} is of class \code{"dist"}. - \item \code{"spca"}. Like \code{"pca"}, but each dimension is then scaled - so the standard deviation is 1e-4, to give a distribution similar to - that used in t-SNE. - \item A matrix of initial coordinates. -}} + \itemize{ + \item \code{"spectral"} Spectral embedding using the normalized Laplacian + of the fuzzy 1-skeleton, with Gaussian noise added. + \item \code{"normlaplacian"}. Spectral embedding using the normalized + Laplacian of the fuzzy 1-skeleton, without noise. + \item \code{"random"}. 
Coordinates assigned using a uniform random + distribution between -10 and 10. + \item \code{"lvrandom"}. Coordinates assigned using a Gaussian + distribution with standard deviation 1e-4, as used in LargeVis + (Tang et al., 2016) and t-SNE. + \item \code{"laplacian"}. Spectral embedding using the Laplacian Eigenmap + (Belkin and Niyogi, 2002). + \item \code{"pca"}. The first two principal components from PCA of + \code{X} if \code{X} is a data frame, and from a 2-dimensional classical + MDS if \code{X} is of class \code{"dist"}. + \item \code{"spca"}. Like \code{"pca"}, but each dimension is then scaled + so the standard deviation is 1e-4, to give a distribution similar to + that used in t-SNE. + \item \code{"sspectral"} Like \code{"spectral"}, but each dimension is + then scaled so the standard deviation is 1e-4. + \item \code{"snormlaplacian"} Like \code{"normlaplacian"}, but each + dimension is then scaled so the standard deviation is 1e-4. + \item \code{"slaplacian"} Like \code{"laplacian"}, but each dimension is + then scaled so the standard deviation is 1e-4. + \item A matrix of initial coordinates. + } +For spectral initializations (\code{"spectral"}, \code{"normlaplacian"}, +\code{"laplacian"}, \code{"sspectral"}, \code{"snormlaplacian"}, +\code{"slaplacian"}), if more than one connected component is identified, +each connected component is initialized separately and the results are +merged. If \code{verbose = TRUE} the number of connected components is +logged to the console. The existence of multiple connected components +implies that a global view of the data cannot be attained with this +initialization. Either a PCA-based initialization or increasing the value of +\code{n_neighbors} may be more appropriate. 
+ +The scaled initializations (\code{"spca"}, \code{"sspectral"}, +\code{"snormlaplacian"}, \code{"slaplacian"}) might be useful as +alternatives to the non-scaled equivalents if these result in initial +coordinates with large inter-point distances or outliers. This usually +results in small gradients during optimization and very little progress +being made to the layout. Shrinking the initial embedding by rescaling can +help under these circumstances. \code{"spca"} is usually recommended over +\code{"pca"}, but for the spectral initializations the scaled versions +usually aren't necessary unless you are using a large value of +\code{n_neighbors} (e.g. \code{n_neighbors = 150} or higher).} \item{set_op_mix_ratio}{Interpolate between (fuzzy) union and intersection as the set operation used to combine local fuzzy simplicial sets to obtain a diff --git a/man/umap.Rd b/man/umap.Rd index 1afaea1a..8528f1d6 100644 --- a/man/umap.Rd +++ b/man/umap.Rd @@ -83,26 +83,52 @@ coordinates.} For UMAP, the default is \code{"none"}.} \item{init}{Type of initialization for the coordinates. Options are: -\itemize{ - \item \code{"spectral"} Spectral embedding using the normalized Laplacian - of the fuzzy 1-skeleton, with Gaussian noise added. - \item \code{"normlaplacian"}. Spectral embedding using the normalized - Laplacian of the fuzzy 1-skeleton, without noise. - \item \code{"random"}. Coordinates assigned using a uniform random - distribution between -10 and 10. - \item \code{"lvrandom"}. Coordinates assigned using a Gaussian - distribution with standard deviation 1e-4, as used in LargeVis - (Tang et al., 2016) and t-SNE. - \item \code{"laplacian"}. Spectral embedding using the Laplacian Eigenmap - (Belkin and Niyogi, 2002). - \item \code{"pca"}. The first two principal components from PCA of - \code{X} if \code{X} is a data frame, and from a 2-dimensional classical - MDS if \code{X} is of class \code{"dist"}. - \item \code{"spca"}. 
Like \code{"pca"}, but each dimension is then scaled - so the standard deviation is 1e-4, to give a distribution similar to - that used in t-SNE. - \item A matrix of initial coordinates. -}} + \itemize{ + \item \code{"spectral"} Spectral embedding using the normalized Laplacian + of the fuzzy 1-skeleton, with Gaussian noise added. + \item \code{"normlaplacian"}. Spectral embedding using the normalized + Laplacian of the fuzzy 1-skeleton, without noise. + \item \code{"random"}. Coordinates assigned using a uniform random + distribution between -10 and 10. + \item \code{"lvrandom"}. Coordinates assigned using a Gaussian + distribution with standard deviation 1e-4, as used in LargeVis + (Tang et al., 2016) and t-SNE. + \item \code{"laplacian"}. Spectral embedding using the Laplacian Eigenmap + (Belkin and Niyogi, 2002). + \item \code{"pca"}. The first two principal components from PCA of + \code{X} if \code{X} is a data frame, and from a 2-dimensional classical + MDS if \code{X} is of class \code{"dist"}. + \item \code{"spca"}. Like \code{"pca"}, but each dimension is then scaled + so the standard deviation is 1e-4, to give a distribution similar to + that used in t-SNE. + \item \code{"sspectral"} Like \code{"spectral"}, but each dimension is + then scaled so the standard deviation is 1e-4. + \item \code{"snormlaplacian"} Like \code{"normlaplacian"}, but each + dimension is then scaled so the standard deviation is 1e-4. + \item \code{"slaplacian"} Like \code{"laplacian"}, but each dimension is + then scaled so the standard deviation is 1e-4. + \item A matrix of initial coordinates. + } +For spectral initializations (\code{"spectral"}, \code{"normlaplacian"}, +\code{"laplacian"}, \code{"sspectral"}, \code{"snormlaplacian"}, +\code{"slaplacian"}), if more than one connected component is identified, +each connected component is initialized separately and the results are +merged. If \code{verbose = TRUE} the number of connected components is +logged to the console. 
The existence of multiple connected components +implies that a global view of the data cannot be attained with this +initialization. Either a PCA-based initialization or increasing the value of +\code{n_neighbors} may be more appropriate. + +The scaled initializations (\code{"spca"}, \code{"sspectral"}, +\code{"snormlaplacian"}, \code{"slaplacian"}) might be useful as +alternatives to the non-scaled equivalents if these result in initial +coordinates with large inter-point distances or outliers. This usually +results in small gradients during optimization and very little progress +being made to the layout. Shrinking the initial embedding by rescaling can +help under these circumstances. \code{"spca"} is usually recommended over +\code{"pca"}, but for the spectral initializations the scaled versions +usually aren't necessary unless you are using a large value of +\code{n_neighbors} (e.g. \code{n_neighbors = 150} or higher).} \item{spread}{The effective scale of embedded points. In combination with \code{min_dist}, this determines how clustered/clumped the embedded points diff --git a/tests/testthat/test_output.R b/tests/testthat/test_output.R index 21af6fbe..67b69e8f 100644 --- a/tests/testthat/test_output.R +++ b/tests/testthat/test_output.R @@ -264,4 +264,24 @@ expect_equal(res$pca_models[["3"]]$center, c(1.45, 0.22), res_trans <- umap_transform(iris10, model = res, verbose = FALSE, n_threads = 0, n_epochs = 2) -expect_ok_matrix(res_trans) \ No newline at end of file +expect_ok_matrix(res_trans) + + +# shrunk spectral initialization +res <- umap(iris10, + n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, + init = "snormlaplacian", verbose = FALSE, n_threads = 0 +) +expect_ok_matrix(res) + +res <- umap(iris10, + n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, + init = "slaplacian", verbose = FALSE, n_threads = 0 +) +expect_ok_matrix(res) + +res <- umap(iris10, + n_neighbors = 4, n_epochs = 2, learning_rate = 0.5, + init = "sspectral", verbose = FALSE, 
n_threads = 0 +) +expect_ok_matrix(res) \ No newline at end of file