From 218dd1f2045e78ce3fff37e609d2c4912b54c546 Mon Sep 17 00:00:00 2001 From: James Melville Date: Sun, 4 Nov 2018 09:42:53 -0800 Subject: [PATCH] Add Hamming distance support. --- NEWS.md | 6 ++++ R/uwot.R | 6 +++- README.md | 59 ++++++++++++++++++++++-------------- man/lvish.Rd | 1 + man/tumap.Rd | 1 + man/umap.Rd | 1 + tests/testthat/test_output.R | 8 +++++ 7 files changed, 59 insertions(+), 23 deletions(-) diff --git a/NEWS.md b/NEWS.md index 22951f09..dcf87632 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,9 @@ +# uwot 0.0.0.9005 (November 4 2018) + +## New features + +* Hamming distance is now supported, due to upgrade to RcppAnnoy 0.0.11. + # uwot 0.0.0.9004 (October 21 2018) ## New features diff --git a/R/uwot.R b/R/uwot.R index 98f11751..e8c5055c 100644 --- a/R/uwot.R +++ b/R/uwot.R @@ -26,6 +26,7 @@ #' \item \code{"euclidean"} (the default) #' \item \code{"cosine"} #' \item \code{"manhattan"} +#' \item \code{"hamming"} #' } #' Only applies if \code{nn_method = "annoy"} (for \code{nn_method = "fnn"}, the #' distance metric is always "euclidean"). @@ -304,6 +305,7 @@ umap <- function(X, n_neighbors = 15, n_components = 2, metric = "euclidean", #' \item \code{"euclidean"} (the default) #' \item \code{"cosine"} #' \item \code{"manhattan"} +#' \item \code{"hamming"} #' } #' Only applies if \code{nn_method = "annoy"} (for \code{nn_method = "fnn"}, the #' distance metric is always "euclidean"). @@ -534,6 +536,7 @@ tumap <- function(X, n_neighbors = 15, n_components = 2, metric = "euclidean", #' \item \code{"euclidean"} (the default) #' \item \code{"cosine"} #' \item \code{"manhattan"} +#' \item \code{"hamming"} #' } #' Only applies if \code{nn_method = "annoy"} (for \code{nn_method = "fnn"}, the #' distance metric is always "euclidean"). 
@@ -779,7 +782,8 @@ uwot <- function(X, n_neighbors = 15, n_components = 2, metric = "euclidean", } } - metric <- match.arg(tolower(metric), c("euclidean", "cosine", "manhattan")) + metric <- match.arg(tolower(metric), c("euclidean", "cosine", "manhattan", + "hamming")) if (is.null(nn_method)) { if (n_vertices < 4096 && metric == "euclidean" && !ret_model) { diff --git a/README.md b/README.md index e18540dd..eb2b8b07 100644 --- a/README.md +++ b/README.md @@ -11,12 +11,8 @@ the basic method. Translated from the ## News -*October 20 2018*. Supervised UMAP with numeric `y` now supports passing nearest -neighbor data directly. This might be useful if you don't want to use Euclidean -distances with `y` or if you have missing data, but you do have a way of -assigning neighbors to these points. See the -[Nearest Neighbor Data Format section](https://github.com/jlmelville/uwot#nearest-neighbor-data-format) -for more details. +*November 4 2018*. Thanks to RcppAnnoy 0.0.11, Hamming distance is now supported +(`metric = "hamming"`). Note: I recently upgraded to [devtools](https://cran.r-project.org/package=devtools) 2.0.0, and noticed that @@ -103,10 +99,22 @@ output of UMAP on some datasets, compared to t-SNE. ## Implementation Details -For small (N < 4096), exact nearest neighbors are found using the -[FNN](https://cran.r-project.org/package=FNN) package. Otherwise, approximate -nearest neighbors are found using -[RcppAnnoy](https://cran.r-project.org/package=RcppAnnoy). +For small (N < 4096) and Euclidean distance, exact nearest neighbors are found +using the [FNN](https://cran.r-project.org/package=FNN) package. Otherwise, +approximate nearest neighbors are found using +[RcppAnnoy](https://cran.r-project.org/package=RcppAnnoy). 
The supported +distance metrics (set by the `metric` parameter) are: + +* Euclidean +* Cosine +* Manhattan +* Hamming + +If you need other metrics, and can generate the nearest neighbor info +externally, you can pass the data directly to `uwot` via the `nn_method` +parameter. See the +[Nearest Neighbor Data Format section](https://github.com/jlmelville/uwot#nearest-neighbor-data-format) +for more details. Coordinate initialization uses [RSpectra](https://cran.r-project.org/package=RSpectra) to do the @@ -215,16 +223,25 @@ issue. ## Limitations and Other Issues -* Only Euclidean, cosine, and Manhattan distances are supported for finding -nearest neighbors from data frame and dense matrix input. But if you can -calculate a distance matrix for your data, you can pass it in as `dist` object. -For larger distance matrices, you can pass in a `sparseMatrix` (from the -[Matrix](https://cran.r-project.org/package=Matrix) package). Neither approach -is supremely efficient at the moment. Proper sparse matrix support is limited -by the nearest neighbor search routine: Annoy is intended for dense vectors. -Adding a library for sparse nearest neighbor search would be a good extension. -* I haven't tried this on anything much larger than MNIST and Fashion MNIST (so -at least around 100,000 rows with 500-1,000 columns works fine). Bear in mind +* As noted in the +[Implementation Details](https://github.com/jlmelville/uwot#implementation-details), +only Euclidean, Cosine, Hamming, and Manhattan distances are supported for finding +nearest neighbors from data frame and dense matrix input. For other metrics, +you can pass nearest neighbor data directly: see the +[Nearest Neighbor Data Format](https://github.com/jlmelville/uwot#nearest-neighbor-data-format) +section. Or if you can calculate a distance matrix for your data, you can pass +it in as a `dist` object. 
For larger distance matrices, you can pass in a +`sparseMatrix` (from the [Matrix](https://cran.r-project.org/package=Matrix) +package). Neither approach is supremely efficient at the moment. Proper sparse +matrix support is limited by the nearest neighbor search routine: Annoy is +intended for dense vectors. Adding a library for sparse nearest neighbor search +would be a good extension. +* For supervised dimensionality reduction using a numeric vector, only the +Euclidean distance is supported for building the target graph. Again, see the +[Nearest Neighbor Data Format](https://github.com/jlmelville/uwot#nearest-neighbor-data-format) +section for a possible alternative. +* I haven't applied `uwot` to anything much larger than MNIST and Fashion MNIST +(so at least around 100,000 rows with 500-1,000 columns works fine). Bear in mind that Annoy itself says it works best with dimensions < 100, but still works "surprisingly well" up to 1000. * The spectral initialization default for `umap` (and the Laplacian eigenmap @@ -233,8 +250,6 @@ fails to converge it will fall back to random initialization, but on occasion I've seen it take an extremely long time (a couple of hours) to converge. If initialization is taking more than a few minutes, I suggest stopping the calculation and using the scaled PCA (`init = "spca"`) instead. -* For supervised dimensionality reduction using a numeric vector, only the -Euclidean distance is supported for building the target graph. * `R CMD check` currently reports the following note: `GNU make is a SystemRequirements.`, which is expected and due to using RcppParallel. 
On Linux, it sometimes notes that the `libs` sub-directory is over diff --git a/man/lvish.Rd b/man/lvish.Rd index 756fa5ea..f5d26e01 100644 --- a/man/lvish.Rd +++ b/man/lvish.Rd @@ -40,6 +40,7 @@ integer value in the range \code{2} to \code{100}.} \item \code{"euclidean"} (the default) \item \code{"cosine"} \item \code{"manhattan"} + \item \code{"hamming"} } Only applies if \code{nn_method = "annoy"} (for \code{nn_method = "fnn"}, the distance metric is always "euclidean").} diff --git a/man/tumap.Rd b/man/tumap.Rd index 1c92f143..ded24504 100644 --- a/man/tumap.Rd +++ b/man/tumap.Rd @@ -39,6 +39,7 @@ integer value in the range \code{2} to \code{100}.} \item \code{"euclidean"} (the default) \item \code{"cosine"} \item \code{"manhattan"} + \item \code{"hamming"} } Only applies if \code{nn_method = "annoy"} (for \code{nn_method = "fnn"}, the distance metric is always "euclidean").} diff --git a/man/umap.Rd b/man/umap.Rd index d3d52377..c99b92dc 100644 --- a/man/umap.Rd +++ b/man/umap.Rd @@ -40,6 +40,7 @@ integer value in the range \code{2} to \code{100}.} \item \code{"euclidean"} (the default) \item \code{"cosine"} \item \code{"manhattan"} + \item \code{"hamming"} } Only applies if \code{nn_method = "annoy"} (for \code{nn_method = "fnn"}, the distance metric is always "euclidean").} diff --git a/tests/testthat/test_output.R b/tests/testthat/test_output.R index 0ea9b246..7cc19edf 100644 --- a/tests/testthat/test_output.R +++ b/tests/testthat/test_output.R @@ -168,3 +168,11 @@ res_ynn <- umap(iris10, n_neighbors = 4, n_epochs = 2, alpha = 0.5, expect_ok_matrix(res_ynn) # Should be the same result expect_equal(res_ynn, res_y) + +hamm <- structure(c(0L, 0L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, + 0L, 1L, 1L, 1L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, + 0L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 1L, 1L, 0L), .Dim = c(10L, 4L + )) +res <- umap(hamm, n_neighbors = 4, metric = "hamming", verbose = FALSE, + n_threads = 1) +expect_ok_matrix(res)