Skip to content

Commit

Permalink
initial fixing for nnd
Browse files Browse the repository at this point in the history
  • Loading branch information
BERENZ committed May 6, 2024
1 parent 5326b6e commit 8741892
Show file tree
Hide file tree
Showing 20 changed files with 135 additions and 59 deletions.
4 changes: 4 additions & 0 deletions .Rproj.user/shared/notebooks/paths
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
/Users/berenz/Downloads/Template of Abstract in Latex.tex="A4C7846D"
/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/.gitignore="C912F95E"
/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/DESCRIPTION="019D16E4"
/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/R/controls.R="5BC637B7"
/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/R/method_annoy.R="684202BA"
/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/R/method_hnsw.R="A4FAA5A3"
/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/R/method_mlpack.R="B6A90565"
/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/R/method_nnd.R="87049873"
/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/R/methods.R="B7F84C4B"
/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/R/reclin2_pair_ann.R="1D89EE3E"
Expand All @@ -15,8 +17,10 @@
/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/inst/tinytest/test_mlpack.R="51D2EAA1"
/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/inst/tinytest/test_print.R="AA7835F7"
/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/inst/tinytest/test_reclin2.R="E3E08D07"
/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/misc/hnsw-nndesc.Rmd="F39A0093"
/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/tests/tinytest.R="D6BBCDC1"
/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/vignettes/v1-deduplication.Rmd="9D34DD44"
/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/vignettes/v2-reclin.Rmd="289A4D2F"
/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/vignettes/v3-evaluation.Rmd="E778A54F"
/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/vignettes/v4-integration.Rmd="E3EFC8F1"
/Users/berenz/mac/nauka/ncn-foreigners/software/blocking/vignettes/v5-bigdata.Rmd="335CBF49"
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@
playground
docs
inst/doc
misc
1 change: 1 addition & 0 deletions R/blocking.R
Original file line number Diff line number Diff line change
Expand Up @@ -219,6 +219,7 @@ blocking <- function(x,
y = y_dtm[, colnames_xy],
k = k,
distance = distance,
deduplication = deduplication,
verbose = if (verbose == 2) TRUE else FALSE,
n_threads = n_threads,
control = control_ann),
Expand Down
23 changes: 13 additions & 10 deletions R/controls.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,17 +6,19 @@
#' Controls for ANN algorithms used in the package
#'
#' @param sparse whether sparse data should be used as an input for algorithms,
#' @param nnd parameters for [rnndescent::rnnd_build()] and [rnndescent::rnnd_query()],
#' @param hnsw parameters for [RcppHNSW::hnsw_build()] and [RcppHNSW::hnsw_search()],
#' @param lsh parameters for [mlpack::lsh()],
#' @param annoy parameters for [RcppAnnoy] package,
#' @param kd parameters for [mlpack::knn()] function.
#' @param k_search number of neighbours to search,
#' @param nnd list of parameters for [rnndescent::rnnd_build()] and [rnndescent::rnnd_query()],
#' @param hnsw list of parameters for [RcppHNSW::hnsw_build()] and [RcppHNSW::hnsw_search()],
#' @param lsh list of parameters for [mlpack::lsh()],
#' @param kd list of parameters for [mlpack::knn()] function,
#' @param annoy list of parameters for [RcppAnnoy] package.
#'
#' @returns Returns a list with parameters
#'
#' @export
controls_ann <- function(
sparse = FALSE,
k_search = 30,
nnd = list(k_build = 30,
use_alt_metric = TRUE,
init = "tree",
Expand Down Expand Up @@ -45,18 +47,19 @@ controls_ann <- function(
num_probes = 0,
projections = 10,
tables = 30),
annoy = list(n_trees = 250,
build_on_disk = FALSE),
kd = list(algorithm = "dual_tree",
epsilon = 0,
leaf_size = 20,
random_basis = FALSE,
rho = 0.7,
tau = 0,
tree_type = "kd")
tree_type = "kd"),
annoy = list(n_trees = 250,
build_on_disk = FALSE)
) {

list(sparse = sparse,
k_search = k_search,
nnd = nnd,
hnsw = hnsw,
lsh = lsh,
Expand All @@ -73,8 +76,8 @@ controls_ann <- function(
#'
#' @param n_shingles length of shingles (default `2L`),
#' @param n_chunks passed to (default `10L`),
#' @param lowercase should the caracters be made lowercase? (default `TRUE`)
#' @param strip_non_alphanum should punctuation and white space be stripped? (default `TRUE`)
#' @param lowercase should the characters be made lowercase? (default `TRUE`),
#' @param strip_non_alphanum should punctuation and white space be stripped? (default `TRUE`).
#'
#' @returns Returns a list with parameters.
#'
Expand Down
7 changes: 5 additions & 2 deletions R/method_annoy.R
Original file line number Diff line number Diff line change
Expand Up @@ -53,15 +53,18 @@ method_annoy <- function(x,
}
if (verbose) l_ind$setVerbose(1)

## index - this does not require dense matrix (sparse can be used?)
## index - this does not require dense matrix
for (i in 1:nrow(x)) l_ind$addItem(i - 1, x[i,])
l_ind$build(control$annoy$n_trees)
l_ind_nns <- numeric(length = nrow(y))
l_ind_dist <- numeric(length = nrow(y))

## query
for (i in 1:nrow(y)) {
annoy_res <- l_ind$getNNsByVectorList(y[i, ], k, -1, TRUE)
annoy_res <- l_ind$getNNsByVectorList(y[i, ],
if (nrow(x) < control$k_search) nrow(x) else control$k_search,
-1,
TRUE)
l_ind_nns[i] <- annoy_res$item[k]
l_ind_dist[i] <- annoy_res$distance[k]
}
Expand Down
7 changes: 5 additions & 2 deletions R/method_hnsw.R
Original file line number Diff line number Diff line change
Expand Up @@ -54,9 +54,12 @@ method_hnsw <- function(x,

l_ind$setEf(control$hnsw$ef_s)

## this does not handle the control$k_search parameter
l_1nn_m <- list()
for (i in 1:nrow(y)) {
l_1nn_m[[i]] <- l_ind$getNNsList(y[i,], k, TRUE)
l_1nn_m[[i]] <- l_ind$getNNsList(y[i,],
k,
TRUE)
}

l_1nn <- list(idx = do.call("rbind",lapply(l_1nn_m, "[[", "item")),
Expand All @@ -76,7 +79,7 @@ method_hnsw <- function(x,
## query
l_1nn <- RcppHNSW::hnsw_search(X = y,
ann = l_ind,
k = k,
k = if (nrow(x) < control$k_search) nrow(x) else control$k_search,
ef = control$hnsw$ef_s,
verbose = verbose,
n_threads = n_threads)
Expand Down
4 changes: 2 additions & 2 deletions R/method_mlpack.R
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ method_mlpack <- function(x,
y <- as.matrix(y)

result <- switch(algo,
"lsh" = mlpack::lsh(k = k,
"lsh" = mlpack::lsh(k = if (nrow(x) < control$k_search) nrow(x) else control$k_search,
query = y,
reference = x,
verbose = verbose,
Expand All @@ -48,7 +48,7 @@ method_mlpack <- function(x,
num_probes = control$lsh$num_probes,
projections = control$lsh$projections,
tables = control$lsh$tables),
"kd" = mlpack::knn(k = k,
"kd" = mlpack::knn(k = if (nrow(x) < control$k_search) nrow(x) else control$k_search,
query = y,
reference = x,
verbose = verbose,
Expand Down
20 changes: 17 additions & 3 deletions R/method_nnd.R
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#' @param y query data,
#' @param k number of neighbours to return,
#' @param distance type of distance to calculate,
#' @param deduplication whether the deduplication is applied,
#' @param verbose if TRUE, log messages to the console,
#' @param n_threads maximum number of threads to use,
#' @param control controls for the NN descent algorithm.
Expand All @@ -22,12 +23,13 @@ method_nnd <- function(x,
y,
k,
distance,
deduplication,
verbose,
n_threads,
control) {

l_ind <- rnndescent::rnnd_build(data = x,
k = if (nrow(x) < control$nnd$k_build) nrow(x) else control$nnd$k_build,
k = if (nrow(x) < control$nnd$k_build) nrow(x)-1 else control$nnd$k_build,
metric = distance,
verbose = verbose,
n_threads = n_threads,
Expand All @@ -49,10 +51,22 @@ method_nnd <- function(x,
progress = control$nnd$progress,
obs = control$nnd$obs)

## query
## query k dependent on the study
## there is a problem when dataset is small

## Choose how many neighbours the query step should request:
## - deduplication, or a very small reference set (< 10 rows): use `k` as given,
## - reference set smaller than `control$k_search`: cap at the number of rows,
## - otherwise: use `control$k_search`.
## `TRUE` is spelled out because `T` is an ordinary variable that can be reassigned.
if (deduplication == TRUE) {
k_nnd_query <- k
} else if (nrow(x) < 10) {
k_nnd_query <- k
} else if (nrow(x) < control$k_search) {
k_nnd_query <- nrow(x)
} else {
k_nnd_query <- control$k_search
}

l_1nn <- rnndescent::rnnd_query(index = l_ind,
query = y,
k = k,
k = k_nnd_query,
epsilon = 0.1,
max_search_fraction = 1,
init = NULL,
Expand Down
4 changes: 3 additions & 1 deletion R/methods.R
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,9 @@ print.blocking <- function(x,...) {
if (!is.null(x$metrics)) {
cat("========================================================\n")
cat("Evaluation metrics (standard):\n" )
sprintf("%.4f", x$metrics*100)
## Express the metrics as percentages rounded to four decimals.
## as.numeric() drops names, so restore them from the printed object `x`
## (not from a global object such as `result2`, which need not exist).
metrics <- as.numeric(sprintf("%.4f", x$metrics*100))
names(metrics) <- names(x$metrics)
print(metrics)

}
invisible(x)
Expand Down
4 changes: 4 additions & 0 deletions README.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@ knitr::opts_chunk$set(

# Overview

## Warning!

The package is under heavy development, so the API as well as functionalities may change.

## Description

This R package is designed to block records for data deduplication and record linkage (also known as entity resolution) using [approximate nearest neighbours algorithms (ANN)](https://en.wikipedia.org/wiki/Nearest_neighbor_search) and graphs (via the `igraph` package).
Expand Down
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,11 @@ coverage](https://codecov.io/gh/ncn-foreigners/blocking/branch/main/graph/badge.

# Overview

## Warning!

The package is under heavy development, so the API as well as
functionalities may change.

## Description

This R package is designed to block records for data deduplication and
Expand Down
6 changes: 3 additions & 3 deletions inst/tinytest/test_annoy.R
Original file line number Diff line number Diff line change
Expand Up @@ -44,14 +44,14 @@ expect_equal(
list(x = c(1, 1, 1, 2, 2, 2, 2, 3),
y = c(5L, 6L, 7L, 1L, 2L, 3L, 4L, 8L),
block = c(2, 2, 2, 1, 1, 1, 1, 3),
dist = c(0, 1, 0, 1, 0, 1, 4, 5)),
dist = c(0, 1, 0, 1, 0, 1, 4, 4)),
row.names = c(NA, -8L),
class = c("data.table", "data.frame")),
method = "annoy",
deduplication = FALSE,
metrics = NULL,
colnames = c("al", "an", "ho", "ij", "ja", "ki", "ko", "ls", "mo", "nt",
"ow", "py", "sk", "ty", "wa", "yp", "yt", "on", "th"),
colnames = c("al", "an", "ho", "ij", "ja", "ki", "ko", "ls", "mo", "ow",
"py", "sk", "ty", "wa", "yp", "yt", "nt", "on", "th"),
graph = NULL),
class = "blocking")
)
Expand Down
37 changes: 36 additions & 1 deletion inst/tinytest/test_blocking.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,23 @@ expect_silent(
)


expect_equal(
blocking(x = df_example$txt)$result$block,
c(1, 1, 1, 2, 2, 2)
)


expect_equal(
blocking(x = df_example$txt, ann = "hnsw")$result$block,
c(1, 1, 1, 2, 2, 2)
)

expect_equal(
blocking(x = df_example$txt, ann = "annoy")$result$block,
c(1, 1, 1, 2, 2, 2)
)


expect_equal(
blocking(x = df_example$txt, ann = "lsh")$result$block,
c(1, 1, 1, 2, 2, 2)
Expand All @@ -15,11 +32,29 @@ expect_equal(
c(1, 1, 1, 2, 2, 2)
)


expect_silent(
blocking(x = df_base$txt, y = df_example$txt)
)

expect_equal(
blocking(x = df_base$txt, y = df_example$txt)$result$block,
c(rep(2,4),rep(1,4))
)

expect_equal(
blocking(x = df_base$txt, y = df_example$txt, ann = "hnsw")$result$block,
c(rep(2,4),rep(1,4))
)

expect_equal(
blocking(x = df_base$txt, y = df_example$txt, ann = "annoy")$result$block,
c(rep(2,4),rep(1,4))
)

expect_equal(
blocking(x = df_base$txt, y = df_example$txt, ann = "lsh")$result$block,
c(rep(2,3),rep(1,4), 3)
)

expect_silent(
blocking(x = mat_y)
Expand Down
2 changes: 1 addition & 1 deletion inst/tinytest/test_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ df_example <- data.frame(txt = c(
"cyrkmontypython",
"monty"
))
df_base <- data.frame(txt = c("montypython", "kowalskijan", "other"))
df_base <- data.frame(txt = c("montypython", "kowalskijan", "somethingcompletelydifferent"))



Expand Down
8 changes: 4 additions & 4 deletions inst/tinytest/test_hnsw.R
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,8 @@ expect_equal(
method = "hnsw",
deduplication = FALSE,
metrics = NULL,
colnames = c("al", "an", "ho", "ij", "ja", "ki", "ko", "ls", "mo", "nt", "ow",
"py", "sk", "ty", "wa", "yp", "yt", "on", "th"),
colnames = c("al", "an", "ho", "ij", "ja", "ki", "ko", "ls", "mo", "ow",
"py", "sk", "ty", "wa", "yp", "yt", "nt", "on", "th"),
graph = NULL),
class = "blocking")
)
Expand Down Expand Up @@ -56,8 +56,8 @@ expect_equal(
method = "hnsw",
deduplication = FALSE,
metrics = NULL,
colnames = c("al", "an", "ho", "ij", "ja", "ki", "ko", "ls", "mo", "nt", "ow",
"py", "sk", "ty", "wa", "yp", "yt", "on", "th"),
colnames = c("al", "an", "ho", "ij", "ja", "ki", "ko", "ls", "mo", "nt",
"ow", "py", "sk", "ty", "wa", "yp", "yt", "on", "th"),
graph = NULL),
class = "blocking")
)
Expand Down
Loading

0 comments on commit 8741892

Please sign in to comment.