diff --git a/.Rproj.user/shared/notebooks/paths b/.Rproj.user/shared/notebooks/paths index 93c4658..4fb4c7c 100644 --- a/.Rproj.user/shared/notebooks/paths +++ b/.Rproj.user/shared/notebooks/paths @@ -1,8 +1,10 @@ /Users/berenz/Downloads/Template of Abstract in Latex.tex="A4C7846D" +/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/.gitignore="C912F95E" /Users/berenz/mac/nauka/ncn-foreigners/software/blocking/DESCRIPTION="019D16E4" /Users/berenz/mac/nauka/ncn-foreigners/software/blocking/R/controls.R="5BC637B7" /Users/berenz/mac/nauka/ncn-foreigners/software/blocking/R/method_annoy.R="684202BA" /Users/berenz/mac/nauka/ncn-foreigners/software/blocking/R/method_hnsw.R="A4FAA5A3" +/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/R/method_mlpack.R="B6A90565" /Users/berenz/mac/nauka/ncn-foreigners/software/blocking/R/method_nnd.R="87049873" /Users/berenz/mac/nauka/ncn-foreigners/software/blocking/R/methods.R="B7F84C4B" /Users/berenz/mac/nauka/ncn-foreigners/software/blocking/R/reclin2_pair_ann.R="1D89EE3E" @@ -15,8 +17,10 @@ /Users/berenz/mac/nauka/ncn-foreigners/software/blocking/inst/tinytest/test_mlpack.R="51D2EAA1" /Users/berenz/mac/nauka/ncn-foreigners/software/blocking/inst/tinytest/test_print.R="AA7835F7" /Users/berenz/mac/nauka/ncn-foreigners/software/blocking/inst/tinytest/test_reclin2.R="E3E08D07" +/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/misc/hnsw-nndesc.Rmd="F39A0093" /Users/berenz/mac/nauka/ncn-foreigners/software/blocking/tests/tinytest.R="D6BBCDC1" /Users/berenz/mac/nauka/ncn-foreigners/software/blocking/vignettes/v1-deduplication.Rmd="9D34DD44" /Users/berenz/mac/nauka/ncn-foreigners/software/blocking/vignettes/v2-reclin.Rmd="289A4D2F" /Users/berenz/mac/nauka/ncn-foreigners/software/blocking/vignettes/v3-evaluation.Rmd="E778A54F" /Users/berenz/mac/nauka/ncn-foreigners/software/blocking/vignettes/v4-integration.Rmd="E3EFC8F1" +/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/vignettes/v5-bigdata.Rmd="335CBF49" diff 
--git a/.gitignore b/.gitignore index b15e3fc..a83549c 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,4 @@ playground docs inst/doc +misc diff --git a/R/blocking.R b/R/blocking.R index efcec47..6ba6609 100644 --- a/R/blocking.R +++ b/R/blocking.R @@ -219,6 +219,7 @@ blocking <- function(x, y = y_dtm[, colnames_xy], k = k, distance = distance, + deduplication = deduplication, verbose = if (verbose == 2) TRUE else FALSE, n_threads = n_threads, control = control_ann), diff --git a/R/controls.R b/R/controls.R index bc5dc28..0e862f2 100644 --- a/R/controls.R +++ b/R/controls.R @@ -6,17 +6,19 @@ #' Controls for ANN algorithms used in the package #' #' @param sparse whether sparse data should be used as an input for algorithms, -#' @param nnd parameters for [rnndescent::rnnd_build()] and [rnndescent::rnnd_query()], -#' @param hnsw parameters for [RcppHNSW::hnsw_build()] and [RcppHNSW::hnsw_search()], -#' @param lsh parameters for [mlpack::lsh()], -#' @param annoy parameters for [RcppAnnoy] package, -#' @param kd parameters for [mlpack::knn()] function. +#' @param k_search number of neighbours to search, +#' @param nnd list of parameters for [rnndescent::rnnd_build()] and [rnndescent::rnnd_query()], +#' @param hnsw list of parameters for [RcppHNSW::hnsw_build()] and [RcppHNSW::hnsw_search()], +#' @param lsh list of parameters for [mlpack::lsh()], +#' @param kd list of parameters for [mlpack::knn()] function, +#' @param annoy list of parameters for [RcppAnnoy] package. 
#' #' @returns Returns a list with parameters #' #' @export controls_ann <- function( sparse = FALSE, + k_search = 30, nnd = list(k_build = 30, use_alt_metric = TRUE, init = "tree", @@ -45,18 +47,19 @@ controls_ann <- function( num_probes = 0, projections = 10, tables = 30), - annoy = list(n_trees = 250, - build_on_disk = FALSE), kd = list(algorithm = "dual_tree", epsilon = 0, leaf_size = 20, random_basis = FALSE, rho = 0.7, tau = 0, - tree_type = "kd") + tree_type = "kd"), + annoy = list(n_trees = 250, + build_on_disk = FALSE) ) { list(sparse = sparse, + k_search = k_search, nnd = nnd, hnsw = hnsw, lsh = lsh, @@ -73,8 +76,8 @@ controls_ann <- function( #' #' @param n_shingles length of shingles (default `2L`), #' @param n_chunks passed to (default `10L`), -#' @param lowercase should the caracters be made lowercase? (default `TRUE`) -#' @param strip_non_alphanum should punctuation and white space be stripped? (default `TRUE`) +#' @param lowercase should the characters be made lowercase? (default `TRUE`), +#' @param strip_non_alphanum should punctuation and white space be stripped? (default `TRUE`). #' #' @returns Returns a list with parameters. #' diff --git a/R/method_annoy.R b/R/method_annoy.R index db064b4..316c495 100644 --- a/R/method_annoy.R +++ b/R/method_annoy.R @@ -53,7 +53,7 @@ method_annoy <- function(x, } if (verbose) l_ind$setVerbose(1) - ## index - this does not require dense matrix (sparse can be used?) 
+ ## index - this does not require dense matrix for (i in 1:nrow(x)) l_ind$addItem(i - 1, x[i,]) l_ind$build(control$annoy$n_trees) l_ind_nns <- numeric(length = nrow(y)) @@ -61,7 +61,10 @@ method_annoy <- function(x, ## query for (i in 1:nrow(y)) { - annoy_res <- l_ind$getNNsByVectorList(y[i, ], k, -1, TRUE) + annoy_res <- l_ind$getNNsByVectorList(y[i, ], + if (nrow(x) < control$k_search) nrow(x) else control$k_search, + -1, + TRUE) l_ind_nns[i] <- annoy_res$item[k] l_ind_dist[i] <- annoy_res$distance[k] } diff --git a/R/method_hnsw.R b/R/method_hnsw.R index 5af47ea..3c4bb3b 100644 --- a/R/method_hnsw.R +++ b/R/method_hnsw.R @@ -54,9 +54,12 @@ method_hnsw <- function(x, l_ind$setEf(control$hnsw$ef_s) + ## this does not handle the control$k_search parameter l_1nn_m <- list() for (i in 1:nrow(y)) { - l_1nn_m[[i]] <- l_ind$getNNsList(y[i,], k, TRUE) + l_1nn_m[[i]] <- l_ind$getNNsList(y[i,], + k, + TRUE) } l_1nn <- list(idx = do.call("rbind",lapply(l_1nn_m, "[[", "item")), @@ -76,7 +79,7 @@ method_hnsw <- function(x, ## query l_1nn <- RcppHNSW::hnsw_search(X = y, ann = l_ind, - k = k, + k = if (nrow(x) < control$k_search) nrow(x) else control$k_search, ef = control$hnsw$ef_s, verbose = verbose, n_threads = n_threads) diff --git a/R/method_mlpack.R b/R/method_mlpack.R index d76cf55..0859a63 100644 --- a/R/method_mlpack.R +++ b/R/method_mlpack.R @@ -38,7 +38,7 @@ method_mlpack <- function(x, y <- as.matrix(y) result <- switch(algo, - "lsh" = mlpack::lsh(k = k, + "lsh" = mlpack::lsh(k = if (nrow(x) < control$k_search) nrow(x) else control$k_search, query = y, reference = x, verbose = verbose, @@ -48,7 +48,7 @@ method_mlpack <- function(x, num_probes = control$lsh$num_probes, projections = control$lsh$projections, tables = control$lsh$tables), - "kd" = mlpack::knn(k = k, + "kd" = mlpack::knn(k = if (nrow(x) < control$k_search) nrow(x) else control$k_search, query = y, reference = x, verbose = verbose, diff --git a/R/method_nnd.R b/R/method_nnd.R index 341d3f7..e2e0b4a 
100644 --- a/R/method_nnd.R +++ b/R/method_nnd.R @@ -10,6 +10,7 @@ #' @param y query data, #' @param k number of neighbours to return, #' @param distance type of distance to calculate, +#' @param deduplication whether the deduplication is applied, #' @param verbose if TRUE, log messages to the console, #' @param n_threads maximum number of threads to use, #' @param control controls for the NN descent algorithm. @@ -22,12 +23,13 @@ method_nnd <- function(x, y, k, distance, + deduplication, verbose, n_threads, control) { l_ind <- rnndescent::rnnd_build(data = x, - k = if (nrow(x) < control$nnd$k_build) nrow(x) else control$nnd$k_build, + k = if (nrow(x) < control$nnd$k_build) nrow(x)-1 else control$nnd$k_build, metric = distance, verbose = verbose, n_threads = n_threads, @@ -49,10 +51,22 @@ method_nnd <- function(x, progress = control$nnd$progress, obs = control$nnd$obs) - ## query + ## query k dependent on the study + ## there is a problem when dataset is small + + if (deduplication == T) { + k_nnd_query <- k + } else if (nrow(x) < 10) { + k_nnd_query <- k + } else if (nrow(x) < control$k_search) { + k_nnd_query <- nrow(x) + } else { + k_nnd_query <- control$k_search + } + l_1nn <- rnndescent::rnnd_query(index = l_ind, query = y, - k = k, + k = k_nnd_query, epsilon = 0.1, max_search_fraction = 1, init = NULL, diff --git a/R/methods.R b/R/methods.R index c18763e..0447fb4 100644 --- a/R/methods.R +++ b/R/methods.R @@ -20,7 +20,9 @@ print.blocking <- function(x,...) { if (!is.null(x$metrics)) { cat("========================================================\n") cat("Evaluation metrics (standard):\n" ) - sprintf("%.4f", x$metrics*100) + metrics <- as.numeric(sprintf("%.4f", x$metrics*100)) + names(metrics) <- names(result2$metrics) + print(metrics) } invisible(x) diff --git a/README.Rmd b/README.Rmd index 60898c5..cc1aff0 100644 --- a/README.Rmd +++ b/README.Rmd @@ -18,6 +18,10 @@ knitr::opts_chunk$set( # Overview +## Warning! 
+ +The package is under heavy development so the API as well as functionalities may change. + ## Description This R package is designed to block records for data deduplication and record linkage (also known as entity resolution) using [approximate nearest neighbours algorithms (ANN)](https://en.wikipedia.org/wiki/Nearest_neighbor_search) and graphs (via the `igraph` package). diff --git a/README.md b/README.md index 38757de..f8f9642 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,11 @@ coverage](https://codecov.io/gh/ncn-foreigners/blocking/branch/main/graph/badge. # Overview +## Warning! + +The package is under heavy development so the API as well as +functionalities may change. + ## Description This R package is designed to block records for data deduplication and diff --git a/inst/tinytest/test_annoy.R b/inst/tinytest/test_annoy.R index 34c44d3..3877aea 100644 --- a/inst/tinytest/test_annoy.R +++ b/inst/tinytest/test_annoy.R @@ -44,14 +44,14 @@ expect_equal( list(x = c(1, 1, 1, 2, 2, 2, 2, 3), y = c(5L, 6L, 7L, 1L, 2L, 3L, 4L, 8L), block = c(2, 2, 2, 1, 1, 1, 1, 3), - dist = c(0, 1, 0, 1, 0, 1, 4, 5)), + dist = c(0, 1, 0, 1, 0, 1, 4, 4)), row.names = c(NA, -8L), class = c("data.table", "data.frame")), method = "annoy", deduplication = FALSE, metrics = NULL, - colnames = c("al", "an", "ho", "ij", "ja", "ki", "ko", "ls", "mo", "nt", - "ow", "py", "sk", "ty", "wa", "yp", "yt", "on", "th"), + colnames = c("al", "an", "ho", "ij", "ja", "ki", "ko", "ls", "mo", "ow", + "py", "sk", "ty", "wa", "yp", "yt", "nt", "on", "th"), graph = NULL), class = "blocking") ) diff --git a/inst/tinytest/test_blocking.R b/inst/tinytest/test_blocking.R index ac1fba9..71e75d7 100644 --- a/inst/tinytest/test_blocking.R +++ b/inst/tinytest/test_blocking.R @@ -5,6 +5,23 @@ expect_silent( ) +expect_equal( + blocking(x = df_example$txt)$result$block, + c(1, 1, 1, 2, 2, 2) +) + + +expect_equal( + blocking(x = df_example$txt, ann = "hnsw")$result$block, + c(1, 1, 1, 2, 2, 2) +) + 
+expect_equal( + blocking(x = df_example$txt, ann = "annoy")$result$block, + c(1, 1, 1, 2, 2, 2) +) + + expect_equal( blocking(x = df_example$txt, ann = "lsh")$result$block, c(1, 1, 1, 2, 2, 2) @@ -15,11 +32,29 @@ expect_equal( c(1, 1, 1, 2, 2, 2) ) - expect_silent( blocking(x = df_base$txt, y = df_example$txt) ) +expect_equal( + blocking(x = df_base$txt, y = df_example$txt)$result$block, + c(rep(2,4),rep(1,4)) +) + +expect_equal( + blocking(x = df_base$txt, y = df_example$txt, ann = "hnsw")$result$block, + c(rep(2,4),rep(1,4)) +) + +expect_equal( + blocking(x = df_base$txt, y = df_example$txt, ann = "annoy")$result$block, + c(rep(2,4),rep(1,4)) +) + +expect_equal( + blocking(x = df_base$txt, y = df_example$txt, ann = "lsh")$result$block, + c(rep(2,3),rep(1,4), 3) +) expect_silent( blocking(x = mat_y) diff --git a/inst/tinytest/test_data.R b/inst/tinytest/test_data.R index b760f2b..1a7e7de 100644 --- a/inst/tinytest/test_data.R +++ b/inst/tinytest/test_data.R @@ -11,7 +11,7 @@ df_example <- data.frame(txt = c( "cyrkmontypython", "monty" )) -df_base <- data.frame(txt = c("montypython", "kowalskijan", "other")) +df_base <- data.frame(txt = c("montypython", "kowalskijan", "somethingcompletelydifferent")) diff --git a/inst/tinytest/test_hnsw.R b/inst/tinytest/test_hnsw.R index 5cbebc9..f4ac9e6 100644 --- a/inst/tinytest/test_hnsw.R +++ b/inst/tinytest/test_hnsw.R @@ -25,8 +25,8 @@ expect_equal( method = "hnsw", deduplication = FALSE, metrics = NULL, - colnames = c("al", "an", "ho", "ij", "ja", "ki", "ko", "ls", "mo", "nt", "ow", - "py", "sk", "ty", "wa", "yp", "yt", "on", "th"), + colnames = c("al", "an", "ho", "ij", "ja", "ki", "ko", "ls", "mo", "ow", + "py", "sk", "ty", "wa", "yp", "yt", "nt", "on", "th"), graph = NULL), class = "blocking") ) @@ -56,8 +56,8 @@ expect_equal( method = "hnsw", deduplication = FALSE, metrics = NULL, - colnames = c("al", "an", "ho", "ij", "ja", "ki", "ko", "ls", "mo", "nt", "ow", - "py", "sk", "ty", "wa", "yp", "yt", "on", "th"), + 
colnames = c("al", "an", "ho", "ij", "ja", "ki", "ko", "ls", "mo", "nt", + "ow", "py", "sk", "ty", "wa", "yp", "yt", "on", "th"), graph = NULL), class = "blocking") ) diff --git a/inst/tinytest/test_mlpack.R b/inst/tinytest/test_mlpack.R index 27172f0..45703ea 100644 --- a/inst/tinytest/test_mlpack.R +++ b/inst/tinytest/test_mlpack.R @@ -4,18 +4,18 @@ expect_equal( blocking(x = df_base$txt, y = df_example$txt, ann = "lsh"), - structure(list(result = structure( - list(x = c(1, 1, 1, 2, 2, 2, 2, 3), - y = c(5L, 6L, 7L, 1L, 2L, 3L, 4L, 8L), - block = c(2, 2, 2, 1, 1, 1, 1, 3), - dist = c(0, 1, 0, 1, 0, 1, 2, 2.23606797749979)), + structure(list(result = structure(list( + x = c(1, 1, 1, 2, 2, 2, 2, 3), + y = c(5L, 6L, 7L, 1L, 2L, 3L, 4L, 8L), + block = c(2, 2, 2, 1, 1, 1, 1, 3), + dist = c(0, 1, 0, 1, 0, 1, 2, 2)), row.names = c(NA, -8L), class = c("data.table", "data.frame")), method = "lsh", deduplication = FALSE, metrics = NULL, - colnames = c("al", "an", "ho", "ij", "ja", "ki", "ko", "ls", "mo", "nt", "ow", - "py", "sk", "ty", "wa", "yp", "yt", "on", "th"), + colnames = c("al", "an", "ho", "ij", "ja", "ki", "ko", "ls", "mo", "ow", + "py", "sk", "ty", "wa", "yp", "yt", "nt", "on", "th"), graph = NULL), class = "blocking") ) @@ -28,14 +28,14 @@ expect_equal( list(x = c(1, 1, 1, 2, 2, 2, 2, 3), y = c(5L, 6L, 7L, 1L, 2L, 3L, 4L, 8L), block = c(2, 2, 2, 1, 1, 1, 1, 3), - dist = c(0, 1, 0, 1, 0, 1, 2, 2.23606797749979)), + dist = c(0, 1, 0, 1, 0, 1, 2, 2)), row.names = c(NA, -8L), class = c("data.table", "data.frame")), method = "kd", deduplication = FALSE, metrics = NULL, - colnames = c("al", "an", "ho", "ij", "ja", "ki", "ko", "ls", "mo", "nt", "ow", - "py", "sk", "ty", "wa", "yp", "yt", "on", "th"), + colnames =c("al", "an", "ho", "ij", "ja", "ki", "ko", "ls", "mo", "ow", + "py", "sk", "ty", "wa", "yp", "yt", "nt", "on", "th"), graph = NULL), class = "blocking") ) @@ -61,7 +61,7 @@ expect_equal( list(x = c(1, 1, 1, 2, 2, 2, 2, 3), y = c(5L, 6L, 7L, 1L, 2L, 3L, 4L, 
8L), block = c(2, 2, 2, 1, 1, 1, 1, 3), - dist = c(0, 1, 0, 1, 0, 1, 2, 2.23606797749979)), + dist = c(0, 1, 0, 1, 0, 1, 2, 2.236068)), row.names = c(NA, -8L), class = c("data.table", "data.frame")), method = "lsh", @@ -82,7 +82,7 @@ expect_equal( list(x = c(1, 1, 1, 2, 2, 2, 2, 3), y = c(5L, 6L, 7L, 1L, 2L, 3L, 4L, 8L), block = c(2, 2, 2, 1, 1, 1, 1, 3), - dist = c(0, 1, 0, 1, 0, 1, 2, 2.23606797749979)), + dist = c(0, 1, 0, 1, 0, 1, 2, 2.236068)), row.names = c(NA, -8L), class = c("data.table", "data.frame")), method = "kd", diff --git a/man/controls_ann.Rd b/man/controls_ann.Rd index 03496fa..614d0ae 100644 --- a/man/controls_ann.Rd +++ b/man/controls_ann.Rd @@ -6,6 +6,7 @@ \usage{ controls_ann( sparse = FALSE, + k_search = 30, nnd = list(k_build = 30, use_alt_metric = TRUE, init = "tree", n_trees = NULL, leaf_size = NULL, max_tree_depth = 200, margin = "auto", n_iters = NULL, delta = 0.001, max_candidates = NULL, low_memory = TRUE, n_search_trees = 1, @@ -14,23 +15,25 @@ controls_ann( hnsw = list(M = 25, ef_c = 200, ef_s = 200, grain_size = 1, byrow = TRUE), lsh = list(bucket_size = 500, hash_width = 10, num_probes = 0, projections = 10, tables = 30), - annoy = list(n_trees = 250, build_on_disk = FALSE), kd = list(algorithm = "dual_tree", epsilon = 0, leaf_size = 20, random_basis = FALSE, - rho = 0.7, tau = 0, tree_type = "kd") + rho = 0.7, tau = 0, tree_type = "kd"), + annoy = list(n_trees = 250, build_on_disk = FALSE) ) } \arguments{ \item{sparse}{whether sparse data should be used as an input for algorithms,} -\item{nnd}{parameters for \code{\link[rnndescent:rnnd_build]{rnndescent::rnnd_build()}} and \code{\link[rnndescent:rnnd_query]{rnndescent::rnnd_query()}},} +\item{k_search}{number of neighbours to search,} -\item{hnsw}{parameters for \code{\link[RcppHNSW:hnsw_build]{RcppHNSW::hnsw_build()}} and \code{\link[RcppHNSW:hnsw_search]{RcppHNSW::hnsw_search()}},} +\item{nnd}{list of parameters for \code{\link[rnndescent:rnnd_build]{rnndescent::rnnd_build()}} 
and \code{\link[rnndescent:rnnd_query]{rnndescent::rnnd_query()}},} -\item{lsh}{parameters for \code{\link[mlpack:lsh]{mlpack::lsh()}},} +\item{hnsw}{list of parameters for \code{\link[RcppHNSW:hnsw_build]{RcppHNSW::hnsw_build()}} and \code{\link[RcppHNSW:hnsw_search]{RcppHNSW::hnsw_search()}},} -\item{annoy}{parameters for \link{RcppAnnoy} package,} +\item{lsh}{list of parameters for \code{\link[mlpack:lsh]{mlpack::lsh()}},} -\item{kd}{parameters for \code{\link[mlpack:knn]{mlpack::knn()}} function.} +\item{annoy}{list of parameters for \link{RcppAnnoy} package.} + +\item{kd}{list of parameters for \code{\link[mlpack:knn]{mlpack::knn()}} function,} } \value{ Returns a list with parameters diff --git a/man/controls_txt.Rd b/man/controls_txt.Rd index 5425f65..0e71a13 100644 --- a/man/controls_txt.Rd +++ b/man/controls_txt.Rd @@ -16,9 +16,9 @@ controls_txt( \item{n_chunks}{passed to (default \code{10L}),} -\item{lowercase}{should the caracters be made lowercase? (default \code{TRUE})} +\item{lowercase}{should the characters be made lowercase? (default \code{TRUE}),} -\item{strip_non_alphanum}{should punctuation and white space be stripped? (default \code{TRUE})} +\item{strip_non_alphanum}{should punctuation and white space be stripped? (default \code{TRUE}).} } \value{ Returns a list with parameters. 
diff --git a/man/method_nnd.Rd b/man/method_nnd.Rd index c73bd0e..aadcae4 100644 --- a/man/method_nnd.Rd +++ b/man/method_nnd.Rd @@ -4,7 +4,7 @@ \alias{method_nnd} \title{An internal function to use the NN descent algorithm via the \link{rnndescent} package.} \usage{ -method_nnd(x, y, k, distance, verbose, n_threads, control) +method_nnd(x, y, k, distance, deduplication, verbose, n_threads, control) } \arguments{ \item{x}{deduplication or reference data,} @@ -15,6 +15,8 @@ method_nnd(x, y, k, distance, verbose, n_threads, control) \item{distance}{type of distance to calculate,} +\item{deduplication}{whether the deduplication is applied,} + \item{verbose}{if TRUE, log messages to the console,} \item{n_threads}{maximum number of threads to use,} diff --git a/vignettes/v2-reclin.Rmd b/vignettes/v2-reclin.Rmd index 329261a..16d312f 100644 --- a/vignettes/v2-reclin.Rmd +++ b/vignettes/v2-reclin.Rmd @@ -122,12 +122,6 @@ Let's take a look at the first pair. Obviously there is a typo in the `pername1` cbind(t(census[1, 1:9]), t(cis[8152, 1:9])) ``` -Now, let's look at the 7th pair with the largest distance from the first 10 rows. This seems to be a non-match because only `pername2` and `sex` are the same. - -```{r} -cbind( t(census[8, 1:9]), t(cis[3901, 1:9])) -``` - ## Assessing the quality @@ -155,13 +149,14 @@ Let's see how our approach handled this problem. result2 ``` -It seems that the default parameters of the NND method result in an FNR of `r sprintf("%.1f",result2$metrics["fnr"]*100)`%, which is quite large. We can see if increasing the number of `k` (and thus `max_candidates`) as suggested in the [Nearest Neighbor Descent +It seems that the default parameters of the NND method result in an FNR of `r sprintf("%.2f",result2$metrics["fnr"]*100)`%, which is quite large. 
We can see if increasing the number of `k` (and thus `max_candidates`) as suggested in the [Nearest Neighbor Descent ](https://jlmelville.github.io/rnndescent/articles/nearest-neighbor-descent.html) vignette will help. ```{r} set.seed(2024) ann_control_pars <- controls_ann() +ann_control_pars$k_search <- 60 ann_control_pars$nnd$k_build <- 60 result3 <- blocking(x = census$txt, y = cis$txt, verbose = 1, @@ -201,6 +196,7 @@ Computation times are: 16 seconds for NND and about 60 for HNSW (on M2 MacBook A Finally, we can compare the results of two ANN algorithms. The overlap between neighbours is given by ```{r} -mean(result3$result[order(y)]$x == result4$result[order(y)]$x)*100 +c("no tuning" = mean(result2$result[order(y)]$x == result4$result[order(y)]$x)*100, + "with tuning" = mean(result3$result[order(y)]$x == result4$result[order(y)]$x)*100) ```