diff --git a/R/controls.R b/R/controls.R index 5261de0..f02d5b9 100644 --- a/R/controls.R +++ b/R/controls.R @@ -36,7 +36,10 @@ controls_ann <- function( weight_by_degree = FALSE, prune_reverse = FALSE, progress = "bar", - obs = "R"), + obs = "R", + ## query-time search parameters passed to rnndescent::rnnd_query() + max_search_fraction = 1, + epsilon = 0.1), hnsw = list(M = 25, ef_c = 200, ef_s = 200, diff --git a/R/method_nnd.R b/R/method_nnd.R index 401af32..11424ab 100644 --- a/R/method_nnd.R +++ b/R/method_nnd.R @@ -67,12 +67,12 @@ method_nnd <- function(x, l_1nn <- rnndescent::rnnd_query(index = l_ind, query = y, k = if (nrow(x) < control$k_search) nrow(x) else control$k_search, - epsilon = 0.1, - max_search_fraction = 1, + epsilon = control$nnd$epsilon, + max_search_fraction = control$nnd$max_search_fraction, init = NULL, verbose = verbose, n_threads = n_threads, - obs = "R") + obs = control$nnd$obs) # if (!is.null(path)) { # if (grepl("(/|\\\\)$", path)) { diff --git a/man/controls_ann.Rd b/man/controls_ann.Rd index 03daae2..e321a63 100644 @@ -11,7 +11,8 @@ controls_ann( leaf_size = NULL, max_tree_depth = 200, margin = "auto", n_iters = NULL, delta = 0.001, max_candidates = NULL, low_memory = TRUE, n_search_trees = 1, pruning_degree_multiplier = 1.5, diversify_prob = 1, weight_by_degree = FALSE, - prune_reverse = FALSE, progress = "bar", obs = "R"), + prune_reverse = FALSE, progress = "bar", obs = "R", max_search_fraction = 1, epsilon + = 0.1), hnsw = list(M = 25, ef_c = 200, ef_s = 200, grain_size = 1, byrow = TRUE), lsh = list(bucket_size = 500, hash_width = 10, num_probes = 0, projections = 10, tables = 30), diff --git a/vignettes/v2-reclin.Rmd b/vignettes/v2-reclin.Rmd index 4ecf612..d61a39f 100644 --- a/vignettes/v2-reclin.Rmd +++ b/vignettes/v2-reclin.Rmd @@ -157,14 +157,14 @@ It seems that the default parameters of the NND method result in an FNR of `r sp ```{r} set.seed(2024) ann_control_pars <- controls_ann() -ann_control_pars$k_search <- 60 
+ann_control_pars$nnd$epsilon <- 0.2 result3 <- blocking(x = census$txt, y = cis$txt, verbose = 1, true_blocks = matches[, .(x, y, block)], n_threads = 8, control_ann = ann_control_pars) ``` -Changing the `k_search` parameter from 30 to 60 decreased the FDR to `r sprintf("%.1f",result3$metrics["fnr"]*100)`%. +Changing the `epsilon` search parameter from 0.1 to 0.2 decreased the FNR to `r sprintf("%.1f",result3$metrics["fnr"]*100)`%. ```{r} result3 ``` @@ -184,12 +184,6 @@ It seems that the HNSW algorithm performed better with `r sprintf("%.2f",result4 result4 ``` -However, this comes at a cost, especially in terms of computation: - -1. the HNSW does not handle sparse matrices, so a sparse matrix of tokens must be converted to dense or provided line by line. -2. The HNSW algorithm is slower than NND. - -Computation times are: 16 seconds for NND and about 60 for HNSW (on M2 MacBook AIR). We can improve the time by changing the parameters `M` and `ef_s` in the `controls_ann()` function (e.g. setting `M=16` and `ef_s=15` leads to about 16 seconds with 1\% FNR). ## Compare results