diff --git a/.Rbuildignore b/.Rbuildignore
index 91114bf..a32bacb 100644
--- a/.Rbuildignore
+++ b/.Rbuildignore
@@ -1,2 +1,4 @@
 ^.*\.Rproj$
 ^\.Rproj\.user$
+^doc$
+^Meta$
diff --git a/.gitignore b/.gitignore
index 8552ded..59271f7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -18,3 +18,5 @@ vignettes/*.html
 vignettes/*.pdf
 # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
 .Rproj.user
+/doc/
+/Meta/
diff --git a/DESCRIPTION b/DESCRIPTION
index a894b11..2999b18 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,22 +1,26 @@
 Package: microclustr
 Type: Package
 Title: Entity Resolution with Random Partition Priors for Microclustering
-Version: 0.1.0
-Date: 2020-09-15
+Version: 0.1.1
+Date: 2022-08-14
 Authors@R: c(
-    person(given=c("Rebecca", "C"), family="Steorts", role = c("aut","cre"), email="beka@stat.duke.edu"),
+    person(given=c("Rebecca", "C"), family="Steorts", role = c("aut"), email="beka@stat.duke.edu"),
     person("Brenda", "Betancourt", email = "bbetancourt@ufl.edu", role = c("aut")),
-    person("Giacomo", "Zanella", email = "zanella.gcm@gmail.com", role = c("aut"))
+    person("Giacomo", "Zanella", email = "zanella.gcm@gmail.com", role = c("aut")),
+    person("Changwoo", "Lee", email = "c.lee@stat.tamu.edu", role = c("aut","cre")),
+    person("Huiyan", "Sang", email = "huyian@stat.tamu.edu", role = c("aut"))
     )
+Maintainer: Changwoo Lee <c.lee@stat.tamu.edu>
 Depends: R (>= 3.2.4)
 Imports: Rcpp (>= 1.0.1), stats
+Encoding: UTF-8
 Suggests: knitr
 Description: An implementation of the model in Betancourt, Zanella, Steorts (2020), which performs microclustering models for categorical data. The package provides a vignette for two proposed methods in the paper as well as two standard Bayesian non-parametric clustering approaches for entity resolution. The experiments are reproducible and illustrated using a simple vignette. LICENSE: GPL-3 + file license.
 VignetteBuilder: knitr
 License: GPL-3
 LinkingTo: Rcpp
-RoxygenNote: 7.1.1.9000
+RoxygenNote: 7.2.0
diff --git a/R/error_rates.R b/R/error_rates.R
index 934e8b0..ef2d562 100644
--- a/R/error_rates.R
+++ b/R/error_rates.R
@@ -1,97 +1,158 @@
 ## functions to compute FNR and FDR ##
-#' Calculates FDR when ground truth is available
-#'
+#' @title Calculates false discovery rate (FDR) when the ground truth is available
+#'
+#' @description False discovery rate (FDR) of the estimated record linkage (partition) based on the ground truth is defined as (Steorts, 2015)
+#' \deqn{FDR = \frac{FP}{CL + FP}}
+#' where FP is the number of false positives (not linked under the truth but linked under the estimate) and CL is the number of correct links (true positives).
+#' If both FP=0 and CL=0, define FDR = 0.
+#'
+#' FDR can also be defined as \eqn{FDR = 1 - Precision}, where \eqn{Precision = CL/(CL+FP)}.
+#'
+#' \code{fdr_fun} calculates the FDR for an estimated partition, and \code{mean_fdr} calculates the average FDR over posterior samples of the partition.
+#'
 #' @param z Vector of cluster assignments
 #' @param id Vector of true cluster assignments (ground truth)
-#' @return FDR
+#' @param use_apply Logical (default \code{FALSE}), whether to use \code{apply()} to calculate the rate.
+#' Setting \code{use_apply = TRUE} is slower but more memory-efficient when \code{length(z)} is large.
+#' @return False discovery rate (FDR)
+#'
+#' @references Steorts, R. C. (2015). Entity resolution with empirically motivated priors. Bayesian Analysis, 10(4), 849-875.
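+#' @details As a toy illustration (not taken from the papers): if the estimate \code{z = c(1,1,2)} links records 1 and 2 while the truth \code{id = c(1,2,1)} links records 1 and 3, then CL = 0 and FP = 1, so FDR = 1.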
+#' @seealso \code{\link{fnr_fun}}, \code{\link{mean_fnr}}
 #' @export
 #' @examples
-#' truePartition <- c(50,50,50,50)
-#' maxPartitionSize<- length(truePartition)
-#' uniqueNumberRecords <- sum(truePartition)
-#' id <- rep(1:uniqueNumberRecords, times=rep(1:maxPartitionSize, times=truePartition))
-#' fdr_fun(z = truePartition, id)
-fdr_fun <- function(z, id) {
-    if (n_matches_fun(z) == 0) {
-        return(0)
-    } else {
-        return(sum(vapply(X = c(1:length(z)), FUN = i_false_det_fun,
-            FUN.VALUE = 1, z. = z, id. = id))/(2 *
-            n_matches_fun(z)))
-    }
+#' nclusters_per_size <- c(50,50,50,50)
+#' numberFields <- 5
+#' numberCategories <- rep(10,5)
+#' trueBeta <- 0.01
+#' # generate simulated data
+#' simulatedData <- SimData(nclusters_per_size, numberFields, numberCategories, trueBeta)
+#' # Fit ESCD model
+#' posteriorESCD <- SampleCluster(data=simulatedData, Prior="ESCD", burn=0, nsamples=10)
+#' # true number of clusters
+#' trueK = sum(nclusters_per_size)
+#' # true cluster membership vector
+#' trueid = rep(1:trueK, times=rep(1:length(nclusters_per_size), times=nclusters_per_size))
+#' # FDR calculation for a single estimate
+#' fdr_fun(posteriorESCD$Z[10,], trueid)
+#' # average FDR calculation
+#' mean_fdr(posteriorESCD$Z, trueid)
+#'
+fdr_fun <- function(z, id, use_apply = FALSE) {
+    if(!use_apply){ # use_apply = FALSE: faster, but memory-intensive
+        estimatelinks = outer_equal(z)
+        diag(estimatelinks) = FALSE
+        truelinks = outer_equal(id)
+        diag(truelinks) = FALSE
+        denom = sum(estimatelinks)
+        if(denom == 0){
+            return(0)
+        }else{
+            return(sum(estimatelinks & !truelinks)/denom)
+        }
+    }else{ # use_apply = TRUE: slower, but not memory-intensive
+        if (n_matches_fun(z) == 0) {
+            return(0)
+        } else {
+            return(sum(vapply(X = c(1:length(z)), FUN = i_false_det_fun,
+                FUN.VALUE = 1, z. = z, id. = id))/(2 * n_matches_fun(z)))
+        }
+    }
 }
-#' Calculates FNR when ground truth is available
-#'
-#' @param z Vector of cluster assignments
-#' @param id Vector of true cluster assignments (ground truth)
-#' @return FNR
-#' @export
-#' @examples
-#' truePartition <- c(50,50,50,50)
-#' maxPartitionSize<- length(truePartition)
-#' uniqueNumberRecords <- sum(truePartition)
-#' id <- rep(1:uniqueNumberRecords, times=rep(1:maxPartitionSize, times=truePartition))
-#' fnr_fun(z = truePartition, id)
-fnr_fun <- function(z, id) {
-    if (n_matches_fun(id) == 0) {
-        return(0)
-    } else {
-        return(sum(vapply(X = c(1:length(z)), FUN = i_false_neg_fun,
-            FUN.VALUE = 1, z. = z, id. = id))/(2 *
-            n_matches_fun(id)))
-    }
-}
-#' Calculates average FDR when ground truth is available
+#' @title Calculates false negative rate (FNR) when the ground truth is available
+#'
+#' @description False negative rate (FNR) of the estimated record linkage (partition) based on the ground truth is defined as (Steorts, 2015)
+#' \deqn{FNR = \frac{FN}{CL + FN}}
+#' where FN is the number of false negatives (linked under the truth but not linked under the estimate) and CL is the number of correct links (true positives).
+#' If both FN=0 and CL=0, define FNR = 0.
+#'
+#' FNR can also be defined as \eqn{FNR = 1 - Recall}, where \eqn{Recall = CL/(CL+FN)}.
 #'
-#' @param zm Matrix with posterior samples of cluster assignments
-#' @param id Vector of true cluster assignments (ground truth)
-#' @return Average FDR over posterior samples
+#' \code{fnr_fun} calculates the FNR for an estimated partition, and \code{mean_fnr} calculates the average FNR over posterior samples of the partition.
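+#' For example (a toy illustration), with \code{z = c(1,1,2)} and ground truth \code{id = c(1,2,1)}, the only true link, between records 1 and 3, is missed: CL = 0 and FN = 1, so FNR = 1.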
+#'
+#' @param z Integer vector of cluster assignments
+#' @param id Integer vector of true cluster assignments (ground truth)
+#' @param use_apply Logical (default \code{FALSE}), whether to use \code{apply()} to calculate the rate.
+#' Setting \code{use_apply = TRUE} is slower but more memory-efficient when \code{length(z)} is large.
+#' @return False negative rate (FNR)
+#'
+#' @references Steorts, R. C. (2015). Entity resolution with empirically motivated priors. Bayesian Analysis, 10(4), 849-875.
+#' @seealso \code{\link{fdr_fun}}, \code{\link{mean_fdr}}
 #' @export
 #' @examples
-#' truePartition <- c(50,50,50,50)
-#' maxPartitionSize<- length(truePartition)
-#' uniqueNumberRecords <- sum(truePartition)
-#' id <- rep(1:uniqueNumberRecords, times=rep(1:maxPartitionSize, times=truePartition))
+#' nclusters_per_size <- c(50,50,50,50)
 #' numberFields <- 5
 #' numberCategories <- rep(10,5)
 #' trueBeta <- 0.01
-#' simulatedData <- SimData(truePartition, numberFields, numberCategories, trueBeta)
+#' # generate simulated data
+#' simulatedData <- SimData(nclusters_per_size, numberFields, numberCategories, trueBeta)
+#' # Fit ESCD model
 #' posteriorESCD <- SampleCluster(data=simulatedData, Prior="ESCD", burn=0, nsamples=10)
-#' mean_fdr(zm = posteriorESCD$Z, id)
-mean_fdr <- function(zm, id) {
+#' # true number of clusters
+#' trueK = sum(nclusters_per_size)
+#' # true cluster membership vector
+#' trueid = rep(1:trueK, times=rep(1:length(nclusters_per_size), times=nclusters_per_size))
+#' # FNR calculation for a single estimate
+#' fnr_fun(posteriorESCD$Z[10,], trueid)
+#' # average FNR calculation
+#' mean_fnr(posteriorESCD$Z, trueid)
+#'
+fnr_fun <- function(z, id, use_apply = FALSE) {
+    if(!use_apply){ # use_apply = FALSE: faster, but memory-intensive
+        estimatelinks = outer_equal(z)
+        diag(estimatelinks) = FALSE
+        truelinks = outer_equal(id)
+        diag(truelinks) = FALSE
+        denom = sum(truelinks)
+        if(denom == 0){
+            return(0)
+        }else{
+            return(sum(!estimatelinks & truelinks)/denom)
+        }
+    }else{ # use_apply = TRUE: slower, but not memory-intensive
+        if (n_matches_fun(id) == 0) {
+            return(0)
+        } else {
+            return(sum(vapply(X = c(1:length(z)), FUN = i_false_neg_fun,
+                FUN.VALUE = 1, z. = z, id. = id))/(2 * n_matches_fun(id)))
+        }
+    }
 }
+
+
+#' @rdname fdr_fun
+#' @param zm Matrix with posterior samples of cluster assignments, where each row corresponds to one sample from the posterior
+#' @export
+mean_fdr <- function(zm, id, use_apply = FALSE) {
     fdr_vec <- apply(X = zm, MARGIN = 1, FUN = fdr_fun,
-        id = id)
+        id = id, use_apply = use_apply)
     return(mean(fdr_vec))
 }
-#' Calculates average FNR when ground truth is available
-#'
-#' @param zm Matrix with posterior samples of cluster assignments
-#' @param id Vector of true cluster assignments (ground truth)
-#' @return Average FNR over posterior samples
+#' @rdname fnr_fun
+#' @param zm Matrix with posterior samples of cluster assignments, where each row corresponds to one sample from the posterior
 #' @export
-#' @examples
-#' truePartition <- c(50,50,50,50)
-#' maxPartitionSize<- length(truePartition)
-#' uniqueNumberRecords <- sum(truePartition)
-#' id <- rep(1:uniqueNumberRecords, times=rep(1:maxPartitionSize, times=truePartition))
-#' numberFields <- 5
-#' numberCategories <- rep(10,5)
-#' trueBeta <- 0.01
-#' simulatedData <- SimData(truePartition, numberFields, numberCategories, trueBeta)
-#' posteriorESCD <- SampleCluster(data=simulatedData, Prior="ESCD", burn=0, nsamples=10)
-#' mean_fnr(zm = posteriorESCD$Z, id)
-mean_fnr <- function(zm, id) {
-    fnr_vec <- apply(X = zm, MARGIN = 1, FUN = fnr_fun,
-        id = id)
-    return(mean(fnr_vec))
+mean_fnr <- function(zm, id, use_apply = FALSE) {
+    fnr_vec <- apply(X = zm, MARGIN = 1, FUN = fnr_fun,
+        id = id, use_apply = use_apply)
+    return(mean(fnr_vec))
 }
 # auxiliary functions to compute fnr and fdr
+# faster version of base::outer(z, z, "==")
+outer_equal<- function(z){
+    z = as.integer(z)
+    n = length(z)
+    Y = rep.int(z, rep.int(n, n))
+    robj <- z==Y
+    dim(robj) = c(n,n)
+    robj
+}
+
+
 i_false_det_fun <- function(i, z., id.) {
     i_match <- which(z.[-i] == z.[i])
     i_match_true <- which(id.[-i] == id.[i])
@@ -108,10 +169,11 @@ i_false_neg_fun <- function(i, z., id.) {
     return(i_false_neg)
 }
+# counts the number of (undirected) links implied by the given partition.
 n_matches_fun <- function(z) {
     sizes <- table(z)
     n_match <- sum(vapply(X = sizes, FUN = function(s) {
         return(choose(s, 2))
     }, FUN.VALUE = 1))
     return(sum(n_match))
-}
\ No newline at end of file
+}
diff --git a/R/package_description.R b/R/package_description.R
new file mode 100644
index 0000000..80e2904
--- /dev/null
+++ b/R/package_description.R
@@ -0,0 +1,33 @@
+#' @name microclustr-package
+#' @docType package
+#' @aliases microclustr
+#'
+#' @title Entity Resolution with Random Partition Priors for Microclustering
+#'
+#' @description An implementation of the model in Betancourt, Zanella, and Steorts (2020), which performs microclustering models for categorical data.
+#' The package provides a vignette for two proposed methods in the paper as well as two standard Bayesian non-parametric clustering approaches for entity resolution.
+#' The experiments are reproducible and illustrated using a simple vignette.
+#'
+#' (Update) An implementation of the ESC-Binom and ESC-Poisson models in Lee and Sang (2022) has been added.
+#'
+#' LICENSE: GPL-3 + file license.
+#'
+#' @author Rebecca C. Steorts, Brenda Betancourt, Giacomo Zanella, Changwoo J. Lee, Huiyan Sang
+#'
+#' @references
+#' Steorts, R. C., Hall, R., & Fienberg, S. E. (2016). A Bayesian approach to graphical record linkage and deduplication. Journal of the American Statistical Association, 111(516), 1660-1672.
+#'
+#' Zanella, G., Betancourt, B., Miller, J. W., Wallach, H., Zaidi, A., & Steorts, R. C. (2016). Flexible models for microclustering with application to entity resolution. Advances in neural information processing systems, 29.
+#'
+#' Betancourt, B., Zanella, G., & Steorts, R. C. (2020). Random partition models for microclustering tasks. Journal of the American Statistical Association, 1-13.
+#'
+#' Lee, C. J., & Sang, H. (2022). Why the Rich Get Richer? On the Balancedness of Random Partition Models. Proceedings of the 39th International Conference on Machine Learning (ICML), PMLR 162:12521 - 12541.
+#'
+#' @examples
+#'
+#' library(microclustr)
+#' ?SimData
+#' ?SampleCluster
+#' ?mean_fnr
+#' ?mean_fdr
+NULL
\ No newline at end of file
diff --git a/R/samplesESC.R b/R/samplesESC.R
index ca26dbd..a019aa7 100644
--- a/R/samplesESC.R
+++ b/R/samplesESC.R
@@ -6,7 +6,7 @@ NULL
 ## Wrapper function ##
 # Inputs: N - the sample size based on the dataset #
-# Prior - Specify Prior Type: ESCNB, DP,PY #
+# Prior - Specify Prior Type: ESCNB, DP, PY, ESCP, ESCB, ESCBshift #
 # param - Parameter for prior type specified #
 # Output: Parameters for probability of existing cluster (A) #
 # and new cluster (B) for reseating algorithms #
@@ -29,11 +29,31 @@ SetParam <- function(Prior, param, N) {
         A <- (seq(N) + rnb)
         B <- (seq(N) + 1) * beta * rnb
     }
+    if (Prior == "ESCP"){
+        lambda <- param[1]
+        A <- rep(lambda,N)
+        B <- (seq(N) + 1) * lambda*exp(-lambda)/(1-exp(-lambda)) # can replace lambda with 1, for both A and B
+    }
+    if (Prior == "ESCB"){
+        Nbinom <- param[1]
+        pbinom <- param[2]
+        temp = (1-pbinom)^Nbinom
+        A <- c(Nbinom - seq(Nbinom), rep(0,N-Nbinom))
+        B <- (seq(N) + 1) * Nbinom * temp/(1-temp)
+    }
+    if (Prior == "ESCBshift"){
+        Nbinom <- param[1]
+        pbinom <- param[2]
+        temp = (1-pbinom)^Nbinom
+        temp2 = (seq(Nbinom)+1)/(seq(Nbinom))*(Nbinom - seq(Nbinom)+1)*pbinom/(1-pbinom)
+        A <- c(temp2, rep(0,N-Nbinom))
+        B <- (seq(N) + 1) * temp
+    }
     return(list(A = A, B = B))
 }
 SetParamESCD <- function(mus, N, M) {
-    A <- (seq(N) + 1) * (c((mus[-1])/mus[-(M + 1)],
+    A <- (seq(N) + 1) * (c((mus[-1])/mus[-(M + 1)],
         rep(0, N - M)))
     B <- (seq(N) + 1) * mus[1]
     return(list(A = A, B = B))
@@ -41,7 +61,7 @@ SetParamESCD <- function(mus, N, M) {
 ## Wrapper function ##
-# Inputs: Prior - Specify Prior Type: DP, PY, ESCNB, ESCD #
+# Inputs: Prior - Specify Prior Type: DP, PY, ESCNB, ESCD, ESCP, ESCB, ESCBshift #
 # Output: Lower and Upper bound on support of the distribution #
 # of Prior parameters #
@@ -62,43 +82,61 @@ lowupSlice <- function(Prior) {
         lo <- c(0, 0, 0)
         up <- c(Inf, Inf, 1)
     }
+    if (Prior == "ESCP"){
+        lo <- 0
+        up <- Inf
+    }
+    if (Prior == "ESCB"||Prior == "ESCBshift"){
+        lo <- c(0,0)
+        up <- c(Inf,1)
+    }
     return(list(lo = lo, up = up))
 }
 ## Wrapper function ##
-# Inputs: Prior - Specify Prior Type: DP, PY, ESCNB, ESCD #
+# Inputs: Prior - Specify Prior Type: DP, PY, ESCNB, ESCD, ESCP, ESCB, ESCBshift #
 # N - number of record in data #
+# Nbinom: for ESCB, the maximum cluster size; for ESCBshift, the initial value of Nbinom.
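+# (illustrative call) e.g., SetInit("ESCB", N = 100, Nbinom = 5) returns c(5, 0.1), since pbinom is initialized at 0.1.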
 # Output: Initial values for hyperpameters #
-SetInit <- function(Prior, N) {
-    if (Prior == "DP") {
-        #concentration parameter
-        theta <- N/2
-        x1 <- as.vector(theta)
-    }
-    if (Prior == "PY") {
-        #concentration parameter
-        theta <- N/2
-        # discount parameter
-        beta <- 0.5
-        x1 <- c(theta, beta)
-    }
-    if (Prior == "ESCNB") {
-        rnb <- 1
-        pnb <- 0.5
-        x1 <- c(rnb, pnb)
-    }
-    if (Prior == "ESCD") {
-        alpha <- 1
-        rnb <- 1
-        pnb <- 0.5
-        x1 <- c(alpha, rnb, pnb)
-    }
-    return(x1)
+SetInit <- function(Prior, N, Nbinom = NULL) {
+    if (Prior == "DP") {
+        #concentration parameter
+        theta <- N/2
+        x1 <- as.vector(theta)
+    }
+    if (Prior == "PY") {
+        #concentration parameter
+        theta <- N/2
+        # discount parameter
+        beta <- 0.5
+        x1 <- c(theta, beta)
+    }
+    if (Prior == "ESCNB") {
+        rnb <- 1
+        pnb <- 0.5
+        x1 <- c(rnb, pnb)
+    }
+    if (Prior == "ESCD") {
+        alpha <- 1
+        rnb <- 1
+        pnb <- 0.5
+        x1 <- c(alpha, rnb, pnb)
+    }
+    if (Prior == "ESCP"){
+        lambda <- 1
+        x1 <- lambda
+    }
+    if (Prior == "ESCB"||Prior == "ESCBshift"){
+        Nbinom <- Nbinom
+        pbinom <- 0.1
+        x1 <- c(Nbinom,pbinom)
+    }
+    return(x1)
 }
 ## Wrapper function ##
-# Inputs: Prior - Specify Prior Type: DP, PY, , ESCNB, ESCD #
+# Inputs: Prior - Specify Prior Type: DP, PY, ESCNB, ESCD, ESCP, ESCB, ESCBshift #
 # Output: Indicator for hyperparameter sampling #
 SamInd <- function(Prior) {
@@ -109,7 +147,7 @@
 if (Prior == "PY") {
 # concentration parameter
 # discount parameter
-samind <- c(1, 1)
+    samind <- c(1, 1)
 }
 if (Prior == "ESCNB") {
 # NB parameters r and p
@@ -119,12 +157,20 @@
 samind <- c(1, 1)
 # NB parameters r and p, alpha fixed
 samind <- c(0, 1, 1)
 }
+    if (Prior == "ESCP"){
+        # poisson
+        samind <- c(1)
+    }
+    if (Prior == "ESCB"||Prior == "ESCBshift"){
+        # binomial
+        samind <- c(0,1) # Nbinom fixed (ESCBshift will not use slice sampling; ignore)
+    }
 return(samind)
 }
 ## Wrapper function ##
-# Inputs: Prior - Specify Prior Type: DP, PY, ESCNB, ESCD #
+# Inputs: Prior - Specify Prior Type: DP, PY, ESCNB, ESCD, ESCP, ESCB, ESCBshift #
 # N - number of record in data #
 # Output: Vector of hyperpameter values for prior on parameters #
 # of specified partition prior #
@@ -154,42 +200,66 @@ hpriorpar <- function(Prior, N) {
         bb <- 2
         hpriorpar <- c(ag, bg, ab, bb)
     }
+    if (Prior == "ESCP" ){
+        # Gamma(ag, bg) prior over lambda
+        ag <- 1
+        bg <- 1
+        hpriorpar <- c(ag, bg)
+    }
+    if (Prior == "ESCB"||Prior == "ESCBshift"){
+        # Beta(0.5,0.5) prior over pbinom
+        ab <- 0.5
+        bb <- 0.5
+        hpriorpar <- c(ab, bb)
+    }
     return(hpriorpar)
 }
 ## Wrapper function ##
-# Inputs: Prior - Specify Prior Type: DP, PY, NBNB, NBD #
+# Inputs: Prior - Specify Prior Type: DP, PY, ESCNB, ESCD, ESCP, ESCB, ESCBshift #
 # Output: Parameter names used to save posterior samples #
 NameParam <- function(Prior, nfields) {
     if (Prior == "DP") {
         cnames <- c("theta")
         for (i in 1:nfields) {
-            cnames <- c(cnames, sprintf("beta_%d",
+            cnames <- c(cnames, sprintf("beta_%d",
                 i))
         }
     }
     if (Prior == "PY") {
         cnames <- c("theta", "delta")
         for (i in 1:nfields) {
-            cnames <- c(cnames, sprintf("beta_%d",
+            cnames <- c(cnames, sprintf("beta_%d",
                 i))
         }
     }
     if (Prior == "ESCNB") {
         cnames <- c("r", "p")
         for (i in 1:nfields) {
-            cnames <- c(cnames, sprintf("beta_%d",
+            cnames <- c(cnames, sprintf("beta_%d",
                 i))
         }
     }
     if (Prior == "ESCD") {
         cnames <- c("alpha", "r", "p")
         for (i in 1:nfields) {
-            cnames <- c(cnames, sprintf("beta_%d",
+            cnames <- c(cnames, sprintf("beta_%d",
                 i))
         }
     }
+    if (Prior=="ESCP"){
+        cnames <- c("lambda")
+        for(i in 1:nfields){
+            cnames <- c(cnames, sprintf("beta_%d",i))
sprintf("beta_%d",i)) + } + } + if(Prior == "ESCB"||Prior == "ESCBshift"){ + cnames <- c("N","p") + for(i in 1:nfields){ + cnames <- c(cnames, sprintf("beta_%d",i)) + } + } return(cnames) } @@ -199,10 +269,10 @@ calcProp <- function(i, data) { return(table(data[, i])/sum(table(data[, i]))) } -# For each field, remap values to a consecutive +# For each field, remap values to a consecutive # list of integers starting with 1 remap <- function(i, data) { - return(as.numeric(factor(data[, i], labels = seq(1, + return(as.numeric(factor(data[, i], labels = seq(1, length(unique(data[, i])))))) } @@ -215,13 +285,13 @@ rdirichletf <- function(params) { # Find Beta distribution parameters for distortion # probailities, given the mean and sd abeta <- function(bmean, bsd) { - return(bmean * (1 - bmean) * (bmean/bsd^2) - + return(bmean * (1 - bmean) * (bmean/bsd^2) - bmean) } # selects a subset of points (called "block") that agree on some -# randomly choosen features. Chaperones are chosen within the block +# randomly choosen features. Chaperones are chosen within the block # for a few consecutive iterations. select_block <- function(x, size_block) { N <- dim(x)[1] # number of datapoints @@ -229,12 +299,12 @@ select_block <- function(x, size_block) { n_fields <- rbinom(1, L, prob = 0.75) # select how many fields the chaperones will agree on block <- c(1:N) if (n_fields > 0) { - fields_to_agree <- sample(c(1:L), size = n_fields, + fields_to_agree <- sample(c(1:L), size = n_fields, replace = FALSE) # select which fields the chaperones will agree on x0 <- x[sample(c(1:N), 1), ] #select a point at random (to be used as \pivot\" point)" for (l in 1:n_fields) { if (length(block) > size_block) { - block <- block[which(x[block, fields_to_agree[l]] == + block <- block[which(x[block, fields_to_agree[l]] == x0[fields_to_agree[l]])] } } @@ -244,16 +314,16 @@ select_block <- function(x, size_block) { #' Remap data to a list of consecutive integers per field #' -#' @param data Data frame containing only categorical variables -#' @return Data frame of remapped values to consecutive +#' @param data Data frame containing only categorical variables +#' @return Data frame of remapped values to consecutive #' list of integers #' @export #' @examples -#' truePartition <- c(10,10,10,10) +#' nclusters_per_size <- c(10,10,10,10) #' numberFields <- 5 #' numberCategories <- rep(10,5) #' trueBeta <- 0.01 -#' data <- SimData(truePartition, numberFields, numberCategories, trueBeta) +#' data <- SimData(nclusters_per_size, numberFields, numberCategories, trueBeta) #' DataRemap(data) DataRemap <- function(data) { nfields <- ncol(data) @@ -267,7 +337,7 @@ DataRemap <- function(data) { # distortion probablities of the fields. # # # # Inputs: Data - e.g "Italy" # -# Prior - specify Prior Type: DP,PY,ESCNB, ESCD # +# Prior - specify Prior Type: DP,PY,ESCNB, ESCD, ESCP, ESCB, ESCBshift # # burn - MCMC burn-in period # # nsamples - MCMC iterations # # truncds - truncation value for field distortions # @@ -284,30 +354,120 @@ DataRemap <- function(data) { # and returns posterior samples of cluster # # assignments and hyperparameters. # -#' Posterior samples of cluster assignments and Prior parameters +#' @title Perform entity resolution with random partition priors. +#' +#' @description \code{SampleCluster} runs Bayesian entity resolution model for categorical dataset under various possible random partition priors. 
+#' The possible choices of random partition prior for the \code{Prior} argument are:
+#' \itemize{
+#' \item \code{"DP"}: Random partition induced by the Dirichlet process (DP), also known as the Chinese restaurant process prior.
+#' \item \code{"PY"}: Random partition induced by the Pitman-Yor process (PY).
+#' \item \code{"ESCNB"}: Exchangeable sequence of clusters (ESC) model with zero-truncated negative binomial distribution.
+#' \item \code{"ESCD"}: ESC model with Dirichlet distribution.
+#' \item \code{"ESCP"}: ESC model with zero-truncated Poisson distribution.
+#' \item \code{"ESCB"}: ESC model with zero-truncated binomial distribution with fixed maximum cluster size (given by \code{Nbinom}).
+#' \item \code{"ESCBshift"}: ESC model with shifted binomial distribution, non-fixed maximum cluster size.
+#' }
+#' The \strong{Details} section below describes the properties of these priors. For the details of the ESC models in particular, see Betancourt, Zanella and Steorts (2020) and Lee and Sang (2022).
+#'
+#' The independent fields model of Steorts, Hall, and Fienberg (2016) is used to model the noisy categorical data.
+#' Specifically, the density vector \eqn{\theta_l} within each field is fixed as the empirical distribution of the data; see Section 4 of Betancourt et al. (2020) for the detailed specification.
+#'
+#'
+#' @details The choice of random partition prior significantly affects the Bayesian entity resolution results.
+#' There are two major properties of random partition priors that affect the entity resolution result: \strong{microclustering} and \strong{balancedness}.
+#'
+#' \strong{Microclustering.} Traditional exchangeable random partition models, such as those induced by the Dirichlet process (DP) or the Pitman-Yor process (PY), assume that
+#' the number of data points in each cluster grows linearly with the total number of data points.
+#' This growth assumption is not desirable in entity resolution tasks, where a cluster represents duplicate records of one entity, so that
+#' cluster sizes should remain small, even for large data sets.
+#' The \emph{microclustering} property (Zanella, Betancourt and others, 2016) holds when cluster sizes grow sublinearly with the total number of data points,
+#' and the exchangeable sequence of clusters (ESC) model (Betancourt et al., 2020) is a very general class of random partition models that possesses the microclustering property.
+#' ESC models can directly control the prior distribution of the cluster sizes (\eqn{\mu}), which can be, for example, a zero-truncated negative binomial distribution, a zero-truncated Poisson distribution, and many others.
+#'
+#' \strong{Balancedness.} Traditional exchangeable random partition models, such as the Chinese restaurant process induced by the DP, often possess the "rich-get-richer" property.
+#' This creates a tendency for a few clusters to become large and dominate the whole dataset a priori, leading to an unbalanced partition.
+#' Lee and Sang (2022) studied the balancedness of random partition models and characterized the \emph{balance-averse} and \emph{balance-seeking} properties,
+#' which correspond to favoring unbalanced and balanced partitions in terms of probability, respectively.
+#' While the microclustering property has a similar rationale, limiting the growth rate of the largest cluster,
+#' the balancedness property describes how a model assigns probabilities to partitions with different levels of balancedness from a non-asymptotic point of view; the two properties complement each other.
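+#'
+#' As an illustration of these definitions (not an excerpt from the papers): among ways to split four records into two clusters, a balance-seeking prior places more prior probability on cluster sizes (2,2) than on (1,3), while a balance-averse prior does the opposite.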
+#'
+#' The table below summarizes the properties under different choices of the random partition prior:
+#'
+#' | \code{Prior} | Microclustering | Balancedness |
+#' | ----- | ----- | ----- |
+#' | \code{"DP"} | No | balance-averse |
+#' | \code{"PY"} | No | balance-averse |
+#' | \code{"ESCNB"} | Yes | balance-averse |
+#' | \code{"ESCD"} | Yes | N/A |
+#' | \code{"ESCP"} | Yes | balance-neutral |
+#' | \code{"ESCB"} | Yes, bounded microclustering | balance-seeking |
+#' | \code{"ESCBshift"} | Yes | balance-seeking |
+#'
+#' Here \code{Prior = "ESCD"} is neither balance-averse nor balance-seeking,
+#' and \code{Prior = "ESCB"} has the \emph{bounded microclustering} property (Betancourt et al., 2022), where the maximum cluster size is bounded above by the fixed hyperparameter \code{Nbinom}.
+#' To let the binomial parameter (number of trials) also be random, use \code{Prior = "ESCBshift"}.
+#' Using a balance-seeking prior assigns less prior probability to partitions with many singleton clusters, thus regularizing the emergence of singleton clusters.
 #'
-#' @param data Data frame containing only categorical variables
-#' @param Prior Specify partition prior: "DP", "PY", "ESCNB"
+#' For the posterior sampling of cluster assignments (partition), a tailored MCMC scheme named the \emph{chaperones algorithm} (Zanella, Betancourt and others, 2016) is used.
+#'
+#' @md
+#'
+#' @param data Integer matrix of noisy categorical data, where rows correspond to records and columns correspond to fields.
+#' @param Prior Specify partition prior: "DP", "PY", "ESCNB", "ESCD", "ESCP", "ESCB", "ESCBshift". See the \strong{Details} section for their properties and differences.
 #' @param burn MCMC burn-in period
 #' @param nsamples MCMC iterations after burn-in
 #' @param spacing Thinning for chaperones algorithm (default 1000)
 #' @param block_flag TRUE for non-uniform chaperones (default)
+#' @param beta_known Default NULL; specify if the true distortion probabilities (with length equal to \code{ncol(data)}) are assumed to be known.
+#' @param Nbinom Default NULL; must be specified when \code{Prior} is "ESCB", where it gives the (fixed) maximum cluster size.
+#'
 #' @return List with posterior samples for cluster assignments (Z),
 #' Prior parameters and distortion probabilities (Params)
+#' @references
+#' Steorts, R. C., Hall, R., & Fienberg, S. E. (2016). A Bayesian approach to graphical record linkage and deduplication. Journal of the American Statistical Association, 111(516), 1660-1672.
+#'
+#' Zanella, G., Betancourt, B., Miller, J. W., Wallach, H., Zaidi, A., & Steorts, R. C. (2016). Flexible models for microclustering with application to entity resolution. Advances in neural information processing systems, 29.
+#'
+#' Betancourt, B., Zanella, G., & Steorts, R. C. (2020). Random partition models for microclustering tasks. Journal of the American Statistical Association, 1-13.
+#'
+#' Betancourt, B., Sosa, J., & Rodríguez, A. (2022). A prior for record linkage based on allelic partitions. Computational Statistics & Data Analysis, 172, 107474.
+#'
+#' Lee, C. J., & Sang, H. (2022). Why the Rich Get Richer? On the Balancedness of Random Partition Models. Proceedings of the 39th International Conference on Machine Learning (ICML), PMLR 162:12521 - 12541.
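+#' @seealso \code{\link{SimData}} for generating synthetic categorical data; \code{\link{fdr_fun}} and \code{\link{fnr_fun}} for evaluating the sampled partitions against a ground truth.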
 #' @export
-SampleCluster <- function(data, Prior, burn, nsamples,
-    spacing = 1000, block_flag = TRUE) {
+#' @examples
+#' nclusters_per_size <- c(50,50,50,50)
+#' numberFields <- 5
+#' numberCategories <- rep(10,5)
+#' trueBeta <- 0.01
+#' # generate simulated data
+#' simulatedData <- SimData(nclusters_per_size, numberFields, numberCategories, trueBeta)
+#' # Fit ESCD model
+#' posteriorESCD <- SampleCluster(data=simulatedData, Prior="ESCD", burn=0, nsamples=10)
+#'
+SampleCluster <- function(data, Prior, burn, nsamples,
+    spacing = 1000, block_flag = TRUE, beta_known = NULL, Nbinom = NULL){
+    # sanity check
+    if( !(Prior == "DP" || Prior == "PY" || Prior == "ESCNB" || Prior == "ESCD" || Prior == "ESCP" || Prior == "ESCB" || Prior == "ESCBshift")){
+        stop("invalid Prior argument, see ?SampleCluster for possible random partition priors")
+    }
+    if( burn < 0 || nsamples < 1) stop("burn should be a nonnegative integer and nsamples should be a positive integer")
+    if(is.null(Nbinom) & (Prior == "ESCB")) stop("specify Nbinom value for Prior = ESCB")
+
     x <- DataRemap(data)
     N <- nrow(x)
     nfields <- ncol(x)
     proportions <- lapply(1:nfields, calcProp, data = x)
     # Initial clustering
-    z0 <- sample(c(1:N), N, replace = TRUE)
+    z0 <- sample(c(1:N), N, replace = TRUE)
+    # For ESCB, there is an upper bound on cluster size. Start with the singleton partition
+    if(Prior == "ESCB") z0 <- 1:N
     # No gaps
-    z <- as.numeric(factor(z0,
+    z <- as.numeric(factor(z0,
         labels = seq(1, length(unique(z0)))))
+    # For ESCBshift, specify the initial value of Nbinom
+    if(Prior == "ESCBshift") Nbinom = max(tabulate(z))
     # Initial Prior parameter values
-    x1 <- SetInit(Prior, N)
+    x1 <- SetInit(Prior, N, Nbinom)
     samind <- SamInd(Prior)
     # Beta prior for distortion parameters
     bmean <- 0.005
@@ -319,7 +479,11 @@ SampleCluster <- function(data, Prior, burn, nsamples,
     # Initial value for distortion parameters
     betas <- rep(pdist, nfields)
     truncds <- 0.1
-
+    # If beta_known is provided, fix the distortion probabilities at the known values
+    if(!is.null(beta_known)){
+        if(length(beta_known)!=nfields) stop("wrong beta_known length")
+        betas <- beta_known
+    }
     # hyperparameter values for priors on Prior parameters
     hpriorpar <- hpriorpar(Prior, N)
@@ -337,8 +501,8 @@ SampleCluster <- function(data, Prior, burn, nsamples,
     # number of iterations for each random block of chaperons
     itbl <- 50
-    if (Prior == "DP" | Prior == "PY" | Prior ==
-        "ESCNB") {
+    if (Prior == "DP" | Prior == "PY" | Prior ==
+        "ESCNB"|Prior == "ESCP"|Prior == "ESCB"||Prior == "ESCBshift") {
         for (i in 1:(burn + nsamples)) {
             if ((i%%10) == 0) {
                 print(paste("iter=", i))
             }
@@ -348,14 +512,32 @@
             # univariate slice sampler for all parameters
             w <- 1 # step size for slice sampler
             m <- 10 # limit on steps for slice sampler
-            x1 <- unislicem(x1, N, Khat, lx, Nk,
-                hpriorpar, w, m, lo, up, Prior, samind)
+            if(Prior!="ESCBshift"){
+                x1 <- unislicem(x1, N, Khat, lx, Nk, hpriorpar, w, m, lo, up, Prior, samind)
+            }else{ # if prior is ESCBshift
+
+                # 1. first sample Nbinom (x1[1])
+                # In this implementation we truncate the support of Nbinom; TODO: full draw
+                lengthNbinom = 20 # size of the truncated support of Nbinom
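+                # The draw below uses the Gumbel-max trick: for i.i.d. G_k ~ Gumbel(0,1),
+                # argmax_k (logprobs_k + G_k) is an exact draw from the categorical
+                # distribution with probabilities proportional to exp(logprobs_k).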
+                # support of Nbinom (assuming unimodal)
+                Ncandidates = (max(Nk)-1):(max(Nk)-1+lengthNbinom-1)
+                tempmat = matrix(sequence(from = Ncandidates[1]-Nk +1, by= 1, nvec = rep(lengthNbinom,Khat)), ncol = Khat)
+                temp = rowSums(lfactorial(tempmat)) # product of denominator term
+                logprobs = lbeta(N-Khat + hpriorpar[1], Ncandidates*Khat - N + Khat + hpriorpar[2]) - log(Ncandidates) + Khat*lfactorial(Ncandidates) - temp
+
+                # draw Nbinom from this log-prob. Gumbel trick
+                gumbels = -log(-log(runif(lengthNbinom)))
+                x1[1] = Ncandidates[which.max(logprobs + gumbels)]
+                # 2. then sample p.
+                x1[2] = stats::rbeta(1, N-Khat + hpriorpar[1], x1[1]*Khat - N + Khat + hpriorpar[2])
+            }
             lods <- 0
             upds <- truncds
-            betas <- unislicespb1(betas, x, z, proportions,
-                hpriords, w, m, lods, upds, x1, N,
-                Khat, Nk, hpriorpar, Prior)
-
+            if(is.null(beta_known)){
+                betas <- unislicespb1(betas, x, z, proportions,
+                    hpriords, w, m, lods, upds, x1, N,
+                    Khat, Nk, hpriorpar, Prior)
+            }
             # parameters of reallocation algortihm #
             AB <- SetParam(Prior, x1, N)
             A <- AB$A
@@ -363,17 +545,17 @@
             if (block_flag) {
                 for (ss in 1:(spacing/itbl)) {
                     bl <- select_block(x, 200)
-                    z1 <- Web_SamplerSP_fbl(x, z, bl,
+                    z1 <- Web_SamplerSP_fbl(x, z, bl,
                         A, B, betas, proportions, 1, itbl)
                     # no gaps in cluster assigment #
-                    z <- as.numeric(factor(z1[1, ], labels = seq(1,
+                    z <- as.numeric(factor(z1[1, ], labels = seq(1,
                         length(unique(z1[1, ])))))
                 }
             } else {
-                z1 <- Web_SamplerSP(x, z, A, B, betas,
+                z1 <- Web_SamplerSP(x, z, A, B, betas,
                     proportions, 1, spacing)
                 # no gaps in cluster assigment #
-                z <- as.numeric(factor(z1[1, ], labels = seq(1,
+                z <- as.numeric(factor(z1[1, ], labels = seq(1,
                     length(unique(z1[1, ])))))
             }
             lx <- loglikxSP(betas, x, z, proportions)
@@ -398,8 +580,8 @@ SampleCluster <- function(data, Prior, burn, nsamples,
         alpha0 = x1[1]
         r0 = x1[2]
         p0 = x1[3]
-        lmu0 = lgamma(seq(M) + r0) + (seq(M)) * log(p0) +
-            r0 * log(1 - p0) - log(1 - (1 - p0)^r0) -
+        lmu0 = lgamma(seq(M) + r0) + (seq(M)) * log(p0) +
+            r0 * log(1 - p0) - log(1 - (1 - p0)^r0) -
             lgamma(r0) - lgamma(seq(M) + 1)
         mu0 = exp(lmu0)
         for (i in 1:(burn + nsamples)) {
@@ -409,26 +591,26 @@
            # univariate slice sampler for all parameters
            w <- 1 # step size for slice sampler
            m <- 10 # limit on steps for slice sampler
-            x1 <- unislicemESCD(x1, lx, Lm, mu0,
+            x1 <- unislicemESCD(x1, lx, Lm, mu0,
                hpriorpar, w, m, lo, up, samind)
            lods <- 0
            upds <- truncds
-            betas <- unislicespb1(betas, x, z, proportions,
-                hpriords, w, m, lods, upds, x1, N,
+            betas <- unislicespb1(betas, x, z, proportions,
+                hpriords, w, m, lods, upds, x1, N,
                Khat, Nk, hpriorpar, Prior)
-            # parameters of reseating algortihm
+            # parameters of reseating algorithm
            alpha0 <- x1[1]
            r0 <- x1[2]
            p0 <- x1[3]
            # Sampling mu from dirichlet of size max cluster size
-            lmu0 <- lgamma(seq(M) + r0) + (seq(M)) *
-                log(p0) + r0 * log(1 - p0) - log(1 -
-                (1 - p0)^r0) - lgamma(r0) - lgamma(seq(M) +
+            lmu0 <- lgamma(seq(M) + r0) + (seq(M)) *
+                log(p0) + r0 * log(1 - p0) - log(1 -
+                (1 - p0)^r0) - lgamma(r0) - lgamma(seq(M) +
                1)
            mu0 <- exp(lmu0)
-            alphasDM <- c(alpha0 * mu0 + Lm, abs(alpha0 *
+            alphasDM <- c(alpha0 * mu0 + Lm, abs(alpha0 *
                (1 - sum(mu0))))
            mus <- rdirichletf(alphasDM)
            AB <- SetParamESCD(mus, N, M)
@@ -437,17 +619,17 @@
            if (block_flag) {
                for (ss in 1:(spacing/itbl)) {
                    bl <- select_block(x, 200)
-                    z1 <- Web_SamplerSP_fbl(x, z, bl,
+                    z1 <- Web_SamplerSP_fbl(x, z, bl,
                        A, B, betas, proportions, 1, itbl)
                    # no gaps in cluster assigment #
-                    z <- as.numeric(factor(z1[1, ],
+                    z <- as.numeric(factor(z1[1, ],
                        labels = seq(1, length(unique(z1[1, ])))))
                }
            } else {
-                z1 <- Web_SamplerSP(x, z, A, B, betas,
+                z1 <- Web_SamplerSP(x, z, A, B, betas,
                    proportions, 1, spacing)
                # no gaps in cluster assigment #
-                z <- as.numeric(factor(z1[1, ], labels = seq(1,
+                z <- as.numeric(factor(z1[1, ], labels = seq(1,
                    length(unique(z1[1, ])))))
            }
            Khat <- length(unique(z))
diff --git a/R/simulation.R b/R/simulation.R
index d04028b..f184ad1 100644
--- a/R/simulation.R
+++ b/R/simulation.R
@@ -1,41 +1,86 @@
 ## function to simulate data ##
-#' Generates a simulated dataset based on a true partition
+#' @title Generating a simulated dataset using the independent fields model
+#'
+#' @description \code{SimData} generates a simulated dataset using the independent fields model (Steorts et al., 2016).
+#' This data generation scheme has been used in Section 5.1 of Betancourt et al. (2020), Section 5 of Betancourt et al. (2022) and Section D.2 of Lee and Sang (2022).
+#'
+#' Let \eqn{m_i} be the number of clusters of size \eqn{i} provided by the \code{nclusters_per_size} argument.
+#' Then there are \eqn{n = \sum_{i} i \times m_i} entities in total, and \code{SimData} generates an \eqn{n \times }\code{nfields} tabular dataset
+#' whose \eqn{\ell}th field (column) contains \eqn{D_\ell} categories, for \eqn{\ell = 1,\dots,}\code{nfields}, with \eqn{D_\ell} provided by the \code{ncat} argument.
+#'
+#' \strong{Generative process}: First, each cluster- and field-specific "true" latent feature \eqn{y_{j\ell}} is generated independently from the discrete uniform distribution on \eqn{1,\dots,D_\ell}.
+#' Then, given a common distortion probability \eqn{\beta} and cluster membership \eqn{z_i}, the record \eqn{x_{i\ell}} is set as
+#' \deqn{x_{i\ell} = y_{z_i\ell}}
+#' with probability \eqn{1-\beta} (non-distorted) and
+#' \deqn{x_{i\ell} \sim DiscreteUnif(1,\dots,D_\ell)}
+#' with probability \eqn{\beta} (distorted), independently.
+#' The true cluster membership \eqn{z_1,\dots,z_n} is by default \code{rep(1:K, times=rep(1:length(nclusters_per_size), times=nclusters_per_size))}, where \code{K = sum(nclusters_per_size)} is the number of clusters.
+#'
+#'
 #'
-#' @param true_L Vector of size max cluster size with number of clusters of each size
-#' @param nfields Number of fields
-#' @param ncat Vector with number of categories per field
+#' @param nclusters_per_size Integer vector, the number of clusters of each size.
+#' For example, \code{nclusters_per_size = c(3,0,2)} corresponds to three clusters of size 1 (singletons) and two clusters of size 3.
+#' This integer vector is also called an \emph{allelic partition}; see also Crane (2016) and Betancourt et al. (2022).
+#' @param nfields Integer, number of fields \eqn{L}
+#' @param ncat Integer vector of \eqn{D_\ell}, the number of categories per field
 #' @param true_beta Distortion probability for the fields
-#' @return Simulated data set
+#' @return Simulated data set, an integer (categorical value) matrix where each column corresponds to a field.
+#' @references
+#' Crane, H. (2016). The ubiquitous Ewens sampling formula. Statistical science, 31(1), 1-19.
+#'
+#' Steorts, R. C., Hall, R., & Fienberg, S. E. (2016). A Bayesian approach to graphical record linkage and deduplication. Journal of the American Statistical Association, 111(516), 1660-1672.
+#'
+#' Betancourt, B., Zanella, G., & Steorts, R. C. (2020). Random partition models for microclustering tasks. Journal of the American Statistical Association, 1-13.
+#'
+#' Betancourt, B., Sosa, J., & Rodríguez, A. (2022). A prior for record linkage based on allelic partitions. Computational Statistics & Data Analysis, 172, 107474.
+#'
+#' Lee, C. J., & Sang, H. (2022). Why the Rich Get Richer? On the Balancedness of Random Partition Models. Proceedings of the 39th International Conference on Machine Learning (ICML), PMLR 162:12521 - 12541.
+#'
 #' @export
 #' @examples
-#' truePartition <- c(2,2,2,2)
+#' nclusters_per_size <- c(2,2,2,2)
 #' numberFields <- 2
 #' numberCategories <- rep(5,2)
 #' trueBeta <- 0.01
-#' SimData(truePartition, numberFields, numberCategories, trueBeta)
-SimData <- function(true_L, nfields, ncat, true_beta) {
-    true_M <- length(true_L)
-    true_K <- sum(true_L)
-    N <- sum(c(1:true_M) * true_L) #100
-    id <- rep(1:true_K, times = rep(1:true_M, times = true_L))
-    ## generate true entities
-    y <- cbind()
-    for (l in 1:nfields) {
-        y <- cbind(y, sample.int(n = ncat[l], size = true_K,
-            replace = TRUE))
-    }
-    data <- matrix(NA, nrow = N, ncol = nfields)
-    for (i in 1:N) {
-        for (l in 1:nfields) {
-            if (runif(1) < true_beta) {
-                data[i, l] <- sample.int(n = ncat[l],
-                    size = 1)
-            } else {
-                data[i, l] <- y[id[i], l]
-            }
-        }
-    }
-    x <- DataRemap(data)
-    return(x)
+#' # generate data
+#' simulatedData <- SimData(nclusters_per_size, numberFields, numberCategories, trueBeta)
+#' # true number of clusters
+#' trueK = sum(nclusters_per_size)
+#' # true cluster membership vector
+#' trueid = rep(1:trueK, times=rep(1:length(nclusters_per_size), times=nclusters_per_size))
+SimData <- function(nclusters_per_size, nfields, ncat, true_beta) {
+    # sanity check
+    if( any(nclusters_per_size %% 1 != 0) || any(nclusters_per_size < 0) ){
+        stop("nclusters_per_size must be a vector of nonnegative integers")
+    }
+    if( (nfields %% 1 != 0) || nfields < 1) stop("nfields must be a positive integer")
+    if( any(ncat %% 1 != 0) || any(ncat < 1) || length(ncat) != nfields){
+        stop("ncat must be a vector of positive integers with length nfields")
+    }
+    if(true_beta < 0 || true_beta > 1 ) stop("true_beta must be between 0 and 1")
+
+    true_M <- length(nclusters_per_size)
+    true_K <- sum(nclusters_per_size)
+    N <- sum(c(1:true_M) * nclusters_per_size)
+    id <- rep(1:true_K, times = rep(1:true_M, times = nclusters_per_size))
+    ## generate true entities
+    y <- cbind()
+    for (l in 1:nfields) {
+        y <- cbind(y, sample.int(n = ncat[l], size = true_K,
+            replace = TRUE))
+    }
+    data <- matrix(NA, nrow = N, ncol = nfields)
+    for (i in 1:N) {
+        for (l in 1:nfields) {
+            if (runif(1) < true_beta) {
+                data[i, l] <- sample.int(n = ncat[l],
+                    size = 1)
+            } else {
+                data[i, l] <- y[id[i], l]
+            }
+        }
+    }
+    x <- DataRemap(data)
+    return(x)
 }
diff --git a/README.md b/README.md
index 42cd56d..506f3f9 100644
--- a/README.md
+++ b/README.md
@@ -1,40 +1,79 @@
 # microclustr
-Performs entity resolution for data bases with categorical fields using partition-based Bayesian clustering models. Includes two new microclustering prior models for random partitions -- ESC models, and the traditional Dirichlet and Pitman-Yor process priors.
+R package `microclustr` performs entity resolution for databases with categorical fields using partition-based Bayesian clustering models. It includes a new family of microclustering prior models for random partitions -- exchangeable sequence of clusters (ESC) models, and the traditional Dirichlet and Pitman-Yor process priors.
 ## Installation
 ```r
-devtools::install_github("resteorts/microclustr", build_vignettes = TRUE)
+devtools::install_github("cleanzr/microclustr", build_vignettes = TRUE)
 ```
-## Citation
-
-This package implements the methods introduced in the following paper:
-
-> Betancourt, Brenda, Giacomo Zanella and Rebecca C. Steorts (2020). "Random Partitions Models for Microclustering Tasks".
 ## Background
-Entity resolution (record linkage or de-duplication) is used to join multiple databases to remove duplicate entities. Recent methods tackle the entity resolution problem as a clustering task. While traditional Bayesian random partition models assume that the size of each cluster grows linearly with the number of data points, this assumption is not appropriate for applications such as entity resolution. This problem requires models that yield clusters whose sizes grow sublinearly with the total number of data points -- `the microclustering property`. The `partitions` package includes two random partition models that satisfy the microclustering property and implements entity resolution with categorical data.
+Entity resolution (record linkage or de-duplication) is used to join multiple databases to remove duplicate entities. Recent methods tackle the entity resolution problem as a clustering task. While traditional Bayesian random partition models assume that the size of each cluster grows linearly with the number of data points, this assumption is not appropriate for applications such as entity resolution. This problem requires models that yield clusters whose sizes grow sublinearly with the total number of data points -- **the microclustering property**. The `microclustr` package includes random partition models that satisfy the microclustering property and implements entity resolution with categorical data.
 ## Main functions
-The package contains the implemetation of four random partition models that are used to perform entitiy resolution as a clustering task:
+The `microclustr` package contains the main function `SampleCluster`, which is used to perform entity resolution as a clustering task under various possible random partition priors. The possible choices of priors are:
+
+
+- "DP": Random partition induced by the Dirichlet process (DP), also known as the Chinese restaurant process prior.
+- "PY": Random partition induced by the Pitman-Yor process (PY).
+- "ESCNB": Exchangeable sequence of clusters (ESC) model with zero-truncated negative binomial distribution.
+- "ESCD": ESC model with Dirichlet distribution.
+- "ESCP": ESC model with zero-truncated Poisson distribution.
+- "ESCB": ESC model with zero-truncated binomial distribution with fixed maximum cluster size.
+- "ESCBshift": ESC model with shifted binomial distribution, non-fixed maximum cluster size.
-* Two traditional random partition models: Dirichlet process (DP) mixtures and Pitman–Yor process (PY) mixtures.
-* Two random partition models that exhibit the microclustering property, referred to as: The ESCNB model and the ESCD model.
+Run the following R code for more details on the main function:
+```r
+library(microclustr)
+?SampleCluster
+```
-The main function in `partitions` is `SampleCluster`, which performs entity resolution for the four models. Additionally, we have added the functions `mean_fnr` and `mean_fdr` to evaluate the record linkage performance when ground truth is available.
+Additionally, the functions `mean_fnr` and `mean_fdr` can be used to evaluate the record linkage performance when ground truth is available.
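+
+A minimal end-to-end sketch, adapted from the package's documentation examples (the ESCNB prior and the MCMC settings below are illustrative choices only):
+
+```r
+library(microclustr)
+# simulate categorical data with 50 clusters each of sizes 1 through 4
+nclusters_per_size <- c(50, 50, 50, 50)
+simulatedData <- SimData(nclusters_per_size, nfields = 5, ncat = rep(10, 5), true_beta = 0.01)
+# draw posterior samples of the partition under the ESCNB prior
+fit <- SampleCluster(data = simulatedData, Prior = "ESCNB", burn = 10, nsamples = 100)
+# ground-truth cluster membership implied by nclusters_per_size
+trueK <- sum(nclusters_per_size)
+trueid <- rep(1:trueK, times = rep(1:length(nclusters_per_size), times = nclusters_per_size))
+# average error rates over the posterior samples
+mean_fdr(fit$Z, trueid)
+mean_fnr(fit$Z, trueid)
+```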
 For more extensive documentation of the use of this package, please refer to the vignette.
 ```r
-??partitions
+?microclustr
 browseVignettes("microclustr")
 ```
+## Citation
+
+This package implements the methods introduced in the following papers:
+
+- Betancourt, B., Zanella, G., & Steorts, R. C. (2020). Random partition models for microclustering tasks. Journal of the American Statistical Association, 1-13.
+
+````{verbatim}
+@article{betancourt2020random,
+  title={Random partition models for microclustering tasks},
+  author={Betancourt, Brenda and Zanella, Giacomo and Steorts, Rebecca C},
+  journal={Journal of the American Statistical Association},
+  pages={1--13},
+  year={2020},
+  publisher={Taylor \& Francis}
+}
+````
+
+- Lee, C. J., & Sang, H. (2022). Why the Rich Get Richer? On the Balancedness of Random Partition Models. Proceedings of the 39th International Conference on Machine Learning (ICML), PMLR 162:12521 - 12541.
+
+````{verbatim}
+@InProceedings{lee2022why,
+  title={Why the Rich Get Richer? {O}n the Balancedness of Random Partition Models},
+  author={Lee, Changwoo J and Sang, Huiyan},
+  booktitle={Proceedings of the 39th International Conference on Machine Learning},
+  pages={12521--12541},
+  year={2022},
+  volume={162},
+  series={Proceedings of Machine Learning Research},
+  publisher={PMLR}
+}
+````
+
 ## Acknowledgements
 This work was supported through the National Science Foundation through NSF CAREER Award 1652431 (PI: Steorts).
diff --git a/man/DataRemap.Rd b/man/DataRemap.Rd
index ae0b0c4..be8bf6e 100644
--- a/man/DataRemap.Rd
+++ b/man/DataRemap.Rd
@@ -10,17 +10,17 @@ DataRemap(data)
 \item{data}{Data frame containing only categorical variables}
 }
 \value{
-Data frame of remapped values to consecutive
+Data frame of remapped values to consecutive
 list of integers
 }
 \description{
 Remap data to a list of consecutive integers per field
 }
 \examples{
-truePartition <- c(10,10,10,10)
+nclusters_per_size <- c(10,10,10,10)
 numberFields <- 5
 numberCategories <- rep(10,5)
 trueBeta <- 0.01
-data <- SimData(truePartition, numberFields, numberCategories, trueBeta)
+data <- SimData(nclusters_per_size, numberFields, numberCategories, trueBeta)
 DataRemap(data)
 }
diff --git a/man/SampleCluster.Rd b/man/SampleCluster.Rd
index 42279ab..641a8d7 100644
--- a/man/SampleCluster.Rd
+++ b/man/SampleCluster.Rd
@@ -2,14 +2,23 @@
 % Please edit documentation in R/samplesESC.R
 \name{SampleCluster}
 \alias{SampleCluster}
-\title{Posterior samples of cluster assignments and Prior parameters}
+\title{Perform entity resolution with random partition priors}
 \usage{
-SampleCluster(data, Prior, burn, nsamples, spacing = 1000, block_flag = TRUE)
+SampleCluster(
+  data,
+  Prior,
+  burn,
+  nsamples,
+  spacing = 1000,
+  block_flag = TRUE,
+  beta_known = NULL,
+  Nbinom = NULL
+)
 }
 \arguments{
-\item{data}{Data frame containing only categorical variables}
+\item{data}{Integer matrix of noisy categorical data, where rows correspond to records and columns correspond to fields.}
 
-\item{Prior}{Specify partition prior: "DP", "PY", "ESCNB"}
+\item{Prior}{Specify partition prior: "DP", "PY", "ESCNB", "ESCD", "ESCP", "ESCB", "ESCBshift". See the \strong{Details} section for their properties and differences.}
 
 \item{burn}{MCMC burn-in period}
@@ -18,11 +27,89 @@ SampleCluster(data, Prior, burn, nsamples, spacing = 1000, block_flag = TRUE)
 \item{spacing}{Thinning for chaperones algorithm (default 1000)}
 
 \item{block_flag}{TRUE for non-uniform chaperones (default)}
+
+\item{beta_known}{Default NULL; specify if the true distortion probabilities (with length equal to \code{ncol(data)}) are assumed to be known.}
+
+\item{Nbinom}{Default NULL; must be specified when \code{Prior} is "ESCB", where it gives the (fixed) maximum cluster size.}
 }
 \value{
 List with posterior samples for cluster assignments (Z),
-    Prior parameters and distortion probabilities (Params)
+Prior parameters and distortion probabilities (Params)
 }
 \description{
-Posterior samples of cluster assignments and Prior parameters
+\code{SampleCluster} runs a Bayesian entity resolution model for a categorical dataset under various possible random partition priors.
+The possible choices of random partition prior for the \code{Prior} argument are:
+\itemize{
+\item \code{"DP"}: Random partition induced by the Dirichlet process (DP), also known as the Chinese restaurant process prior.
+\item \code{"PY"}: Random partition induced by the Pitman-Yor process (PY).
+\item \code{"ESCNB"}: Exchangeable sequence of clusters (ESC) model with zero-truncated negative binomial distribution.
+\item \code{"ESCD"}: ESC model with Dirichlet distribution.
+\item \code{"ESCP"}: ESC model with zero-truncated Poisson distribution.
+\item \code{"ESCB"}: ESC model with zero-truncated binomial distribution with fixed maximum cluster size (given by \code{Nbinom}).
+\item \code{"ESCBshift"}: ESC model with shifted binomial distribution, non-fixed maximum cluster size.
+}
+The \strong{Details} section below describes the properties of these priors. For the details of the ESC models in particular, see Betancourt, Zanella and Steorts (2020) and Lee and Sang (2022).
+
+The independent fields model of Steorts, Hall, and Fienberg (2016) is used to model the noisy categorical data.
+Specifically, the density vector \eqn{\theta_l} within each field is fixed as the empirical distribution of the data; see Section 4 of Betancourt et al. (2020) for the detailed specification.
+}
+\details{
+The choice of random partition prior significantly affects the Bayesian entity resolution results.
+There are two major properties of random partition priors that affect the entity resolution result: \strong{microclustering} and \strong{balancedness}.
+
+\strong{Microclustering.} Traditional exchangeable random partition models, such as those induced by the Dirichlet process (DP) or the Pitman-Yor process (PY), assume that
+the number of data points in each cluster grows linearly with the total number of data points.
+This growth assumption is not desirable in entity resolution tasks, where a cluster represents duplicate records of one entity, so that
+cluster sizes should remain small, even for large data sets.
+The \emph{microclustering} property (Zanella, Betancourt and others, 2016) holds when cluster sizes grow sublinearly with the total number of data points,
+and the exchangeable sequence of clusters (ESC) model (Betancourt et al., 2020) is a very general class of random partition models that possesses the microclustering property.
+ESC models can directly control the prior distribution of the cluster sizes (\eqn{\mu}), which can be, for example, a zero-truncated negative binomial distribution, a zero-truncated Poisson distribution, and many others.
+
+\strong{Balancedness.} Traditional exchangeable random partition models, such as the Chinese restaurant process induced by the DP, often possess the "rich-get-richer" property.
+This creates a tendency for a few clusters to become large and dominate the whole dataset a priori, leading to an unbalanced partition.
+Lee and Sang (2022) studied the balancedness of random partition models and characterized the \emph{balance-averse} and \emph{balance-seeking} properties,
+which correspond to favoring unbalanced and balanced partitions in terms of probability, respectively.
+While the microclustering property has a similar rationale, limiting the growth rate of the largest cluster,
+the balancedness property describes how a model assigns probabilities to partitions with different levels of balancedness from a non-asymptotic point of view; the two properties complement each other.
+
+The table below summarizes the properties under different choices of the random partition prior:\tabular{lll}{
+   \code{Prior} \tab Microclustering \tab Balancedness \cr
+   \code{"DP"} \tab No \tab balance-averse \cr
+   \code{"PY"} \tab No \tab balance-averse \cr
+   \code{"ESCNB"} \tab Yes \tab balance-averse \cr
+   \code{"ESCD"} \tab Yes \tab N/A \cr
+   \code{"ESCP"} \tab Yes \tab balance-neutral \cr
+   \code{"ESCB"} \tab Yes, bounded microclustering \tab balance-seeking \cr
+   \code{"ESCBshift"} \tab Yes \tab balance-seeking \cr
+}
+
+
+Here \code{Prior = "ESCD"} is neither balance-averse nor balance-seeking,
+and \code{Prior = "ESCB"} has the \emph{bounded microclustering} property (Betancourt et al., 2022), where the maximum cluster size is bounded above by the fixed hyperparameter \code{Nbinom}.
+To let the binomial parameter (number of trials) also be random, use \code{Prior = "ESCBshift"}.
+Using a balance-seeking prior assigns less prior probability to partitions with many singleton clusters, thus regularizing the emergence of singleton clusters.
+
+For the posterior sampling of cluster assignments (partition), a tailored MCMC scheme named the \emph{chaperones algorithm} (Zanella, Betancourt and others, 2016) is used.
+}
+\examples{
+nclusters_per_size <- c(50,50,50,50)
+numberFields <- 5
+numberCategories <- rep(10,5)
+trueBeta <- 0.01
+# generate simulated data
+simulatedData <- SimData(nclusters_per_size, numberFields, numberCategories, trueBeta)
+# Fit ESCD model
+posteriorESCD <- SampleCluster(data=simulatedData, Prior="ESCD", burn=0, nsamples=10)
+
+}
+\references{
+Steorts, R. C., Hall, R., & Fienberg, S. E. (2016). A Bayesian approach to graphical record linkage and deduplication. Journal of the American Statistical Association, 111(516), 1660-1672.
+
+Zanella, G., Betancourt, B., Miller, J. W., Wallach, H., Zaidi, A., & Steorts, R. C. (2016). Flexible models for microclustering with application to entity resolution. Advances in neural information processing systems, 29.
+
+Betancourt, B., Zanella, G., & Steorts, R. C. (2020). Random partition models for microclustering tasks. Journal of the American Statistical Association, 1-13.
+
+Betancourt, B., Sosa, J., & Rodríguez, A. (2022). A prior for record linkage based on allelic partitions. Computational Statistics & Data Analysis, 172, 107474.
+
+Lee, C. J., & Sang, H. (2022). Why the Rich Get Richer? On the Balancedness of Random Partition Models. Proceedings of the 39th International Conference on Machine Learning (ICML), PMLR 162:12521 - 12541.
+}
diff --git a/man/SimData.Rd b/man/SimData.Rd
index 1020509..be2be69 100644
--- a/man/SimData.Rd
+++ b/man/SimData.Rd
@@ -2,29 +2,60 @@
 % Please edit documentation in R/simulation.R
 \name{SimData}
 \alias{SimData}
-\title{Generates a simulated dataset based on a true partition}
+\title{Generating a simulated dataset using the independent fields model}
 \usage{
-SimData(true_L, nfields, ncat, true_beta)
+SimData(nclusters_per_size, nfields, ncat, true_beta)
 }
 \arguments{
-\item{true_L}{Vector of size max cluster size with number of clusters of each size}
+\item{nclusters_per_size}{Integer vector, the number of clusters of each size.
+For example, \code{nclusters_per_size = c(3,0,2)} corresponds to three clusters of size 1 (singletons) and two clusters of size 3.
+This integer vector is also called an \emph{allelic partition}; see also Crane (2016) and Betancourt et al. (2022).}
 
-\item{nfields}{Number of fields}
+\item{nfields}{Integer, number of fields \eqn{L}}
 
-\item{ncat}{Vector with number of categories per field}
+\item{ncat}{Integer vector of \eqn{D_\ell}, the number of categories per field}
 
 \item{true_beta}{Distortion probability for the fields}
 }
 \value{
-Simulated data set
+Simulated data set, an integer (categorical value) matrix where each column corresponds to a field.
 }
 \description{
-Generates a simulated dataset based on a true partition
+\code{SimData} generates a simulated dataset using the independent fields model (Steorts et al., 2016).
+This data generation scheme has been used in Section 5.1 of Betancourt et al. (2020), Section 5 of Betancourt et al. (2022) and Section D.2 of Lee and Sang (2022).
+
+Let \eqn{m_i} be the number of clusters of size \eqn{i} provided by the \code{nclusters_per_size} argument.
+Then there are \eqn{n = \sum_{i} i \times m_i} entities in total, and \code{SimData} generates an \eqn{n \times }\code{nfields} tabular dataset
+whose \eqn{\ell}th field (column) contains \eqn{D_\ell} categories, for \eqn{\ell = 1,\dots,}\code{nfields}, with \eqn{D_\ell} provided by the \code{ncat} argument.
+
+\strong{Generative process}: First, each cluster- and field-specific "true" latent feature \eqn{y_{j\ell}} is generated independently from the discrete uniform distribution on \eqn{1,\dots,D_\ell}.
+Then, given a common distortion probability \eqn{\beta} and cluster membership \eqn{z_i}, the record \eqn{x_{i\ell}} is set as
+\deqn{x_{i\ell} = y_{z_i\ell}}
+with probability \eqn{1-\beta} (non-distorted) and
+\deqn{x_{i\ell} \sim DiscreteUnif(1,\dots,D_\ell)}
+with probability \eqn{\beta} (distorted), independently.
+The true cluster membership \eqn{z_1,\dots,z_n} is by default \code{rep(1:K, times=rep(1:length(nclusters_per_size), times=nclusters_per_size))}, where \code{K = sum(nclusters_per_size)} is the number of clusters.
 }
 \examples{
-truePartition <- c(2,2,2,2)
+nclusters_per_size <- c(2,2,2,2)
 numberFields <- 2
 numberCategories <- rep(5,2)
 trueBeta <- 0.01
-SimData(truePartition, numberFields, numberCategories, trueBeta)
+# generate data
+simulatedData <- SimData(nclusters_per_size, numberFields, numberCategories, trueBeta)
+# true number of clusters
+trueK = sum(nclusters_per_size)
+# true cluster membership vector
+trueid = rep(1:trueK, times=rep(1:length(nclusters_per_size), times=nclusters_per_size))
+}
+\references{
+Crane, H. (2016). The ubiquitous Ewens sampling formula. Statistical science, 31(1), 1-19.
+
+Steorts, R. C., Hall, R., & Fienberg, S. E. (2016). A Bayesian approach to graphical record linkage and deduplication. Journal of the American Statistical Association, 111(516), 1660-1672.
diff --git a/man/fdr_fun.Rd b/man/fdr_fun.Rd
index 156c9fe..3a8a852 100644
--- a/man/fdr_fun.Rd
+++ b/man/fdr_fun.Rd
@@ -2,25 +2,58 @@
% Please edit documentation in R/error_rates.R
\name{fdr_fun}
\alias{fdr_fun}
-\title{Calculates FDR when ground truth is available}
+\alias{mean_fdr}
+\title{Calculates the false discovery rate (FDR) when the ground truth is available}
\usage{
-fdr_fun(z, id)
+fdr_fun(z, id, use_apply = F)
+
+mean_fdr(zm, id, use_apply = F)
}
\arguments{
\item{z}{Vector of cluster assignments}

\item{id}{Vector of true cluster assignments (ground truth)}
+
+\item{use_apply}{Logical (default \code{FALSE}); whether to use \code{apply()} to calculate the rate.
+Setting \code{use_apply = TRUE} may be slower but is more memory efficient when \code{length(z)} is large.}
+
+\item{zm}{Matrix with posterior samples of cluster assignments, where each row corresponds to one sample from the posterior}
}
\value{
-FDR
+False discovery rate (FDR)
}
\description{
-Calculates FDR when ground truth is available
+The false discovery rate (FDR) of an estimated record linkage (partition) with respect to the ground truth is defined as (Steorts, 2015)
+\deqn{FDR = \frac{FP}{CL + FP}}
+where FP is the number of false positives (pairs not linked under the truth but linked under the estimate) and CL is the number of correct links (true positives).
+If both FP=0 and CL=0, define FDR = 0.
+
+FDR can also be defined as \eqn{FDR = 1 - Precision}, where \eqn{Precision = CL/(CL+FP)}.
+
+\code{fdr_fun} calculates the FDR for an estimated partition, and \code{mean_fdr} calculates the average FDR over posterior samples of the partition.
}
\examples{
-truePartition <- c(50,50,50,50)
-maxPartitionSize<- length(truePartition)
-uniqueNumberRecords <- sum(truePartition)
-id <- rep(1:uniqueNumberRecords, times=rep(1:maxPartitionSize, times=truePartition))
-fdr_fun(z = truePartition, id)
+nclusters_per_size <- c(50,50,50,50)
+numberFields <- 5
+numberCategories <- rep(10,5)
+trueBeta <- 0.01
+# generate simulated data
+simulatedData <- SimData(nclusters_per_size, numberFields, numberCategories, trueBeta)
+# Fit ESCD model
+posteriorESCD <- SampleCluster(data=simulatedData, Prior="ESCD", burn=0, nsamples=10)
+# true number of clusters
+trueK = sum(nclusters_per_size)
+# true cluster membership vector
+trueid = rep(1:trueK, times=rep(1:length(nclusters_per_size), times=nclusters_per_size))
+# FDR calculation for a single estimate
+fdr_fun(posteriorESCD$Z[10,], trueid)
+# average FDR calculation
+mean_fdr(posteriorESCD$Z, trueid)
+
}
\references{
+Steorts, R. C. (2015). Entity resolution with empirically motivated priors. Bayesian Analysis, 10(4), 849-875.
+}
+\seealso{
+\code{\link{fnr_fun}}, \code{\link{mean_fnr}}
}
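To make the pairwise-link definitions above concrete, both rates can be computed directly from link matrices in a few lines of base R. This is a minimal sketch of the idea (not the package's implementation); each unordered pair is counted twice, which leaves the ratios unchanged:

```r
# records i and j are "linked" when they share a cluster label
pairwise_rates <- function(z, id) {
  est <- outer(z, z, "==");   diag(est) <- FALSE  # estimated links
  tru <- outer(id, id, "=="); diag(tru) <- FALSE  # true links
  CL <- sum(est & tru)    # correct links (true positives)
  FP <- sum(est & !tru)   # false positives
  FN <- sum(!est & tru)   # false negatives
  c(FDR = if (CL + FP == 0) 0 else FP / (CL + FP),
    FNR = if (CL + FN == 0) 0 else FN / (CL + FN))
}
```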
diff --git a/man/fnr_fun.Rd b/man/fnr_fun.Rd
index d3e89fa..cb95598 100644
--- a/man/fnr_fun.Rd
+++ b/man/fnr_fun.Rd
@@ -2,25 +2,58 @@
% Please edit documentation in R/error_rates.R
\name{fnr_fun}
\alias{fnr_fun}
-\title{Calculates FNR when ground truth is available}
+\alias{mean_fnr}
+\title{Calculates the false negative rate (FNR) when the ground truth is available}
\usage{
-fnr_fun(z, id)
+fnr_fun(z, id, use_apply = F)
+
+mean_fnr(zm, id, use_apply = F)
}
\arguments{
-\item{z}{Vector of cluster assignments}
+\item{z}{Integer vector of cluster assignments}
+
+\item{id}{Integer vector of true cluster assignments (ground truth)}

-\item{id}{Vector of true cluster assignments (ground truth)}
+\item{use_apply}{Logical (default \code{FALSE}); whether to use \code{apply()} to calculate the rate.
+Setting \code{use_apply = TRUE} may be slower but is more memory efficient when \code{length(z)} is large.}
+
+\item{zm}{Matrix with posterior samples of cluster assignments, where each row corresponds to one sample from the posterior}
}
\value{
-FNR
+False negative rate (FNR)
}
\description{
-Calculates FNR when ground truth is available
+The false negative rate (FNR) of an estimated record linkage (partition) with respect to the ground truth is defined as (Steorts, 2015)
+\deqn{FNR = \frac{FN}{CL + FN}}
+where FN is the number of false negatives (pairs linked under the truth but not linked under the estimate) and CL is the number of correct links (true positives).
+If both FN=0 and CL=0, define FNR = 0.
+
+FNR can also be defined as \eqn{FNR = 1 - Recall}, where \eqn{Recall = CL/(CL+FN)}.
+
+\code{fnr_fun} calculates the FNR for an estimated partition, and \code{mean_fnr} calculates the average FNR over posterior samples of the partition.
}
\examples{
-truePartition <- c(50,50,50,50)
-maxPartitionSize<- length(truePartition)
-uniqueNumberRecords <- sum(truePartition)
-id <- rep(1:uniqueNumberRecords, times=rep(1:maxPartitionSize, times=truePartition))
-fnr_fun(z = truePartition, id)
+nclusters_per_size <- c(50,50,50,50)
+numberFields <- 5
+numberCategories <- rep(10,5)
+trueBeta <- 0.01
+# generate simulated data
+simulatedData <- SimData(nclusters_per_size, numberFields, numberCategories, trueBeta)
+# Fit ESCD model
+posteriorESCD <- SampleCluster(data=simulatedData, Prior="ESCD", burn=0, nsamples=10)
+# true number of clusters
+trueK = sum(nclusters_per_size)
+# true cluster membership vector
+trueid = rep(1:trueK, times=rep(1:length(nclusters_per_size), times=nclusters_per_size))
+# FNR calculation for a single estimate
+fnr_fun(posteriorESCD$Z[10,], trueid)
+# average FNR calculation
+mean_fnr(posteriorESCD$Z, trueid)
+
}
\references{
+Steorts, R. C. (2015). Entity resolution with empirically motivated priors. Bayesian Analysis, 10(4), 849-875.
+}
+\seealso{
+\code{\link{fdr_fun}}, \code{\link{mean_fdr}}
}
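A tiny worked example (ours, not from the package docs) of both error rates, following the definitions above:

```r
id <- c(1, 1, 2, 2)   # truth: links (1,2) and (3,4)
z  <- c(1, 1, 1, 2)   # estimate: links (1,2), (1,3), (2,3)
# CL = 1 (the link (1,2)), FP = 2 (links (1,3),(2,3)), FN = 1 (link (3,4))
fdr_fun(z, id)  # FP/(CL+FP) = 2/3
fnr_fun(z, id)  # FN/(CL+FN) = 1/2
```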
diff --git a/man/mean_fdr.Rd b/man/mean_fdr.Rd
deleted file mode 100644
index fdbd688..0000000
--- a/man/mean_fdr.Rd
+++ /dev/null
@@ -1,31 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/error_rates.R
-\name{mean_fdr}
-\alias{mean_fdr}
-\title{Calculates average FDR when ground truth is available}
-\usage{
-mean_fdr(zm, id)
-}
-\arguments{
-\item{zm}{Matrix with posterior samples of cluster assignments}
-
-\item{id}{Vector of true cluster assignments (ground truth)}
-}
-\value{
-Average FDR over posterior samples
-}
-\description{
-Calculates average FDR when ground truth is available
-}
-\examples{
-truePartition <- c(50,50,50,50)
-maxPartitionSize<- length(truePartition)
-uniqueNumberRecords <- sum(truePartition)
-id <- rep(1:uniqueNumberRecords, times=rep(1:maxPartitionSize, times=truePartition))
-numberFields <- 5
-numberCategories <- rep(10,5)
-trueBeta <- 0.01
-simulatedData <- SimData(truePartition, numberFields, numberCategories, trueBeta)
-posteriorESCD <- SampleCluster(data=simulatedData, Prior="ESCD", burn=0, nsamples=10)
-mean_fdr(zm = posteriorESCD$Z, id)
-}
diff --git a/man/mean_fnr.Rd b/man/mean_fnr.Rd
deleted file mode 100644
index 4f5042d..0000000
--- a/man/mean_fnr.Rd
+++ /dev/null
@@ -1,31 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/error_rates.R
-\name{mean_fnr}
-\alias{mean_fnr}
-\title{Calculates average FNR when ground truth is available}
-\usage{
-mean_fnr(zm, id)
-}
-\arguments{
-\item{zm}{Matrix with posterior samples of cluster assignments}
-
-\item{id}{Vector of true cluster assignments (ground truth)}
-}
-\value{
-Average FNR over posterior samples
-}
-\description{
-Calculates average FNR when ground truth is available
-}
-\examples{
-truePartition <- c(50,50,50,50)
-maxPartitionSize<- length(truePartition)
-uniqueNumberRecords <- sum(truePartition)
-id <- rep(1:uniqueNumberRecords, times=rep(1:maxPartitionSize, times=truePartition))
-numberFields <- 5
-numberCategories <- rep(10,5)
-trueBeta <- 0.01
-simulatedData <- SimData(truePartition, numberFields, numberCategories, trueBeta)
-posteriorESCD <- SampleCluster(data=simulatedData, Prior="ESCD", burn=0, nsamples=10)
-mean_fnr(zm = posteriorESCD$Z, id)
-}
diff --git a/man/microclustr-package.Rd b/man/microclustr-package.Rd
index e17a9cd..19d0be7 100644
--- a/man/microclustr-package.Rd
+++ b/man/microclustr-package.Rd
@@ -1,34 +1,36 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/package_description.R
+\docType{package}
\name{microclustr-package}
\alias{microclustr-package}
\alias{microclustr}
-\docType{package}
-\title{
-    A short title line describing what the package does
-}
+\title{Entity Resolution with Random Partition Priors for Microclustering}
\description{
-    A more detailed description of what the package does. A length
-    of about one to five lines is recommended.
+An implementation of the model in Betancourt, Zanella, and Steorts (2020), which fits microclustering models to categorical data.
+    The package provides a vignette for two proposed methods in the paper as well as two standard Bayesian non-parametric clustering approaches for entity resolution.
+    The experiments are reproducible and illustrated using a simple vignette.
+
+    (Update) An implementation of the ESC-Binomial and ESC-Poisson models in Lee and Sang (2022) has been added.
+
+    LICENSE: GPL-3 + file license.
}
-\details{
-    This section should provide a more detailed overview of how to use the
-    package, including the most important functions.
-}
-\author{
-Your Name, email optional.
+\examples{
-Maintainer: Your Name
+library(microclustr)
+?SimData
+?SampleCluster
+?mean_fnr
+?mean_fdr
}
\references{
-    This optional section can contain literature or other references for
-    background information.
-}
-\keyword{ package }
-\seealso{
-    Optional links to other man pages
+Steorts, R. C., Hall, R., & Fienberg, S. E. (2016). A Bayesian approach to graphical record linkage and deduplication. Journal of the American Statistical Association, 111(516), 1660-1672.
+
+Zanella, G., Betancourt, B., Miller, J. W., Wallach, H., Zaidi, A., & Steorts, R. C. (2016). Flexible models for microclustering with application to entity resolution. Advances in Neural Information Processing Systems, 29.
+
+Betancourt, B., Zanella, G., & Steorts, R. C. (2020). Random partition models for microclustering tasks. Journal of the American Statistical Association, 1-13.
+
+Lee, C. J., & Sang, H. (2022). Why the Rich Get Richer? On the Balancedness of Random Partition Models. Proceedings of the 39th International Conference on Machine Learning (ICML), PMLR 162:12521-12541.
}
-\examples{
-    \dontrun{
-        ## Optional simple examples of the most important functions
-        ## These can be in \dontrun{} and \donttest{} blocks.
-    }
+\author{
+Rebecca C. Steorts, Brenda Betancourt, Giacomo Zanella, Changwoo J. Lee, Huiyan Sang
}
diff --git a/microclustr.Rproj b/microclustr.Rproj
index b1ded45..9f96499 100644
--- a/microclustr.Rproj
+++ b/microclustr.Rproj
@@ -15,3 +15,4 @@ LaTeX: pdfLaTeX
BuildType: Package
PackageUseDevtools: Yes
PackageInstallArgs: --no-multiarch --with-keep.source
+PackageRoxygenize: rd,collate,namespace
diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp
index c3d8847..1813942 100644
--- a/src/RcppExports.cpp
+++ b/src/RcppExports.cpp
@@ -5,6 +5,11 @@
using namespace Rcpp;

+#ifdef RCPP_USE_GLOBAL_ROSTREAM
+Rcpp::Rostream<true>&  Rcpp::Rcout = Rcpp::Rcpp_cout_get();
+Rcpp::Rostream<false>& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get();
+#endif
+
// loglikxSP
double loglikxSP(NumericVector betas, IntegerMatrix x, IntegerVector z, List params);
RcppExport SEXP _microclustr_loglikxSP(SEXP betasSEXP, SEXP xSEXP, SEXP zSEXP, SEXP paramsSEXP) {
diff --git a/src/RcppExports.o b/src/RcppExports.o
index e95c110..6e65233 100644
Binary files a/src/RcppExports.o and b/src/RcppExports.o differ
diff --git a/src/logprior.h b/src/logprior.h
index c08a844..facda89 100644
--- a/src/logprior.h
+++ b/src/logprior.h
@@ -52,7 +52,29 @@ double logprior(double x0, int k0, int N, int Khat, NumericVector x1,
     log(1 - pow((1-pnb),rnb))) + term + (ag-1)*log(rnb) - bg*rnb + (ab - 1)*log(pnb) + (bb - 1)*log(1 - pnb);
   }
-
+  if (Prior=="ESCP"){
+    double lambda = xx1[0];
+    double ag = hpriorpar[0]; // gamma prior for lambda
+    double bg = hpriorpar[1];
+    double term=0.0;
+    //for (int i=0; i ...
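The remainder of this hunk is cut off in this copy of the diff. For intuition only, here is a rough R sketch of the log-prior term a zero-truncated Poisson ESC model would contribute, assuming (as the `// gamma prior for lambda` comment suggests) a Gamma(`ag`, `bg`) hyperprior on `lambda`. This is our reconstruction, not the package's actual C++:

```r
# log-prior contribution of cluster sizes under a zero-truncated
# Poisson(lambda) ESC model, plus a Gamma(ag, bg) hyperprior on lambda
log_prior_escp <- function(sizes, lambda, ag, bg) {
  K <- length(sizes)
  # zero-truncated Poisson: P(s) = lambda^s exp(-lambda) / (s! (1 - exp(-lambda)))
  loglik_sizes <- sum(sizes * log(lambda) - lgamma(sizes + 1)) -
    K * (lambda + log(1 - exp(-lambda)))
  log_hyper <- (ag - 1) * log(lambda) - bg * lambda  # Gamma density up to a constant
  loglik_sizes + log_hyper
}
```

The `K * log(1 - exp(-lambda))` piece is the zero-truncation normalizer, playing the same role as `log(1 - pow((1-pnb),rnb))` in the ESCNB branch visible above.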
diff --git a/vignettes/microclustr.Rmd b/vignettes/microclustr.Rmd
--- a/vignettes/microclustr.Rmd
+++ b/vignettes/microclustr.Rmd
- %\VignetteIndexEntry{Partitions}
+ %\VignetteIndexEntry{Entity resolution with microclustr package}
  %\VignetteEngine{knitr::rmarkdown}
  \usepackage[utf8]{inputenc}
---
@@ -13,11 +13,11 @@ vignette: >
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")
```

-We present a simulated data set from Betancourt, Zanella, and Steorts (2020), "Random Partitions Models for Microclustering Tasks" \emph{Journal of the American Statistical Association, Theory and Methods}, Minor Revision. The microclustr package performs entity resolution with categorical variables using partition-based Bayesian clustering models.
+The `microclustr` package performs entity resolution with categorical variables using partition-based Bayesian clustering models. It includes a new family of random partition models, the **exchangeable sequence of clusters (ESC) models** proposed in [Betancourt, Zanella, and Steorts (2020)](#references). Under mild conditions, ESC models possess the **microclustering** property [(Zanella et al., 2016)](#references), i.e. cluster sizes grow sublinearly with the total number of data points. The updated version 0.1.1 includes additional ESC models with the balance-seeking property [(Lee and Sang, 2022)](#references), which assign higher prior probability to more balanced partitions and thus discourage the emergence of singleton clusters.

Our goals include:

-- Creating a synthetic data set (with a fixed partition)
+- Creating a synthetic categorical data set
- Illustrating how the user can perform entity resolution using the microclustr package
- Illustrating how the user can calculate standard evaluation metrics when a unique identifier is known.

@@ -28,63 +28,86 @@
We first load all packages needed for this example.

```{r}
# load all packages need
# set seed for reproducibility
-library('microclustr')
+library(microclustr)
set.seed(123)
```

-## Creating a Synthetic Data Set with a Fixed Partition
+## Creating a Synthetic Data Set

-Now we create a synthetic data set, where we assume a fixed partition.
+Now we create a synthetic categorical data set based on the **independent fields model** of [Steorts, Hall and Fienberg (2016)](#references).

-Assume there are a maximum of four clusters. Assume that there are 50 records within each cluster. Assume that each record has 5 fields of categorical variable. Assume that there are 10 potential categories per field. Assume the distortion probability for each field is 0.01.
+Assume there are 200 unique records (clusters). Assume that each cluster contains between 1 and 4 records. Assume that each record has 5 categorical fields. Assume that there are 10 potential categories per field, uniformly distributed. Assume the distortion probability for each field is 0.01.

-Our synthetic data set produces duplicate records using the `SimData()` function, where there are 500 records in all with 200 unique records.
+Our synthetic data set produces duplicate records using the `SimData()` function, where there are 500 records in all with 200 unique records (clusters).

```{r}
# true partition to generate simulated data
-# 50 clusters of each size, max cluster size is 4
-truePartition <- c(50,50,50,50)
+# 50 clusters of each size (50 singletons, 50 clusters of size 2, ..., 50 clusters of size 4)
+nclusters_per_size <- c(50, 50, 50, 50)
# number of fields
numberFields <- 5
# number of categories per field
-numberCategories <- rep(10,5)
+numberCategories <- c(10, 10, 10, 10, 10)
# distortion probability for the fields
trueBeta <- 0.01
# generate simulated data
-simulatedData <- SimData(true_L = truePartition, nfields = numberFields, ncat = numberCategories, true_beta = trueBeta)
+simulatedData <- SimData(nclusters_per_size, numberFields, numberCategories, trueBeta)
# dimension of data set
dim(simulatedData)
+# true number of clusters: 200
+trueK = sum(nclusters_per_size)
+# true cluster membership vector (length 500)
+id = rep(1:trueK, times=rep(1:length(nclusters_per_size), times=nclusters_per_size))
```
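For intuition, the generative process documented in `?SimData` can be written in a few lines of R. This is our sketch of the independent fields scheme, not the package's `SimData()` implementation:

```r
# id: cluster membership vector; ncat: categories per field; beta: distortion prob.
sim_fields <- function(id, ncat, beta) {
  n <- length(id); L <- length(ncat)
  x <- matrix(0L, n, L)
  for (l in 1:L) {
    y <- sample.int(ncat[l], max(id), replace = TRUE)   # latent cluster features
    distort <- runif(n) < beta                          # per-record distortion flags
    # distorted records draw uniformly; the rest copy their cluster's feature
    x[, l] <- ifelse(distort, sample.int(ncat[l], n, replace = TRUE), y[id])
  }
  x
}
```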
-## Partition Priors for Entity Resolution
-
-This package contains the implementation of four random partition models used for entity resolution tasks:
-
-- Two traditional random partition models:
-
-1. Dirichlet process (DP) mixtures
-2. Pitman–Yor process (PY) mixtures.
-
-- Two random partition models that exhibit the microclustering property, which are
-
-Exchangeable Sequences of Clusters (ESC) models, which are referred to as (and are further defined in our paper):
-
-3. The ESCNB model
-4. The ESCD model
+## Random Partition Priors for Entity Resolution
+
+The main function of the `microclustr` package is `SampleCluster`, which implements several random partition models for entity resolution tasks, selected via the `Prior` argument:
+
+- `Prior="DP"`: Random partition induced by the Dirichlet process (DP), also known as the Chinese restaurant process prior.
+- `Prior="PY"`: Random partition induced by the Pitman-Yor process (PY).
+- `Prior="ESCNB"`: Exchangeable sequence of clusters (ESC) model with zero-truncated negative binomial distribution.
+- `Prior="ESCD"`: ESC model with Dirichlet distribution.
+- `Prior="ESCP"`: ESC model with zero-truncated Poisson distribution.
+- `Prior="ESCB"`: ESC model with zero-truncated binomial distribution with fixed maximum cluster size.
+- `Prior="ESCBshift"`: ESC model with shifted binomial distribution, non-fixed maximum cluster size.
+
+The choice of random partition prior significantly affects the Bayesian entity resolution results. Two major properties of a random partition prior drive this: **microclustering** and **balancedness**.
+
+**Microclustering**. Traditional exchangeable random partition models, such as those induced by the Dirichlet process (DP) or the Pitman-Yor process (PY), assume that the number of data points in each cluster grows linearly with the total number of data points. This growth assumption is not desirable in entity resolution tasks, where each cluster represents the duplicates of one record, so cluster sizes should remain small even for large data sets. The microclustering property [(Zanella et al., 2016)](#references) holds when cluster sizes grow sublinearly with the total number of data points, and the *exchangeable sequence of clusters (ESC) model* [(Betancourt, Zanella, and Steorts, 2020)](#references) is a very general class of random partition models that possesses the microclustering property. ESC models directly control the prior distribution of the cluster sizes, which can be a zero-truncated negative binomial, a zero-truncated Poisson, or many other distributions; the linear growth that microclustering rules out is illustrated in the sketch below.
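To see the linear growth in question, here is a small self-contained simulation (our illustration, independent of the package) of the Chinese restaurant process: the largest cluster keeps a non-vanishing share of the data as `n` grows.

```r
# simulate cluster sizes from a Chinese restaurant process (DP prior)
crp_sizes <- function(n, alpha = 1) {
  sizes <- 1                                   # first customer opens a cluster
  for (i in 2:n) {
    probs <- c(sizes, alpha)                   # existing clusters vs. a new one
    k <- sample.int(length(probs), 1, prob = probs)
    if (k > length(sizes)) sizes <- c(sizes, 1) else sizes[k] <- sizes[k] + 1
  }
  sizes
}
set.seed(123)
max(crp_sizes(1000)) / 1000   # share held by the largest cluster
max(crp_sizes(4000)) / 4000   # does not shrink toward 0 as n grows
```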
+**Balancedness**. Traditional exchangeable random partition models, such as the Chinese restaurant process induced by the DP, often possess the "rich-get-richer" property. A priori, a few clusters then tend to become large and dominate the whole dataset, leading to unbalanced partitions. [Lee and Sang (2022)](#references) studied the balancedness of random partition models and characterized the *balance-averse* and *balance-seeking* properties, which correspond to favoring unbalanced and balanced partitions in terms of probability, respectively. While the microclustering property has a similar rationale in that it limits the growth rate of the largest cluster, the balancedness property describes how a model assigns probabilities to partitions with different levels of balance from a non-asymptotic point of view; the two properties complement each other.
+
+The table below summarizes the properties for different choices of random partition priors:
+
+| `Prior` | Microclustering | Balancedness |
+| ----- | ----- | ----- |
+| `"DP"` | No | balance-averse |
+| `"PY"` | No | balance-averse |
+| `"ESCNB"` | Yes | balance-averse |
+| `"ESCD"` | Yes | N/A |
+| `"ESCP"` | Yes | balance-neutral |
+| `"ESCB"` | Yes, bounded microclustering | balance-seeking |
+| `"ESCBshift"` | Yes | balance-seeking |
+
+Here `Prior = "ESCD"` is neither balance-averse nor balance-seeking, and `Prior = "ESCB"` has the **bounded microclustering property** [(Betancourt et al., 2022)](#references), where the maximum cluster size is bounded above by the fixed hyperparameter `Nbinom`. To let the binomial parameter (number of trials) also be random, use `Prior = "ESCBshift"`. Using a balance-seeking prior assigns less prior probability to partitions with many singleton clusters, thus regularizing the emergence of singletons.

## Posterior Samples

In order to obtain posterior samples of the cluster assignments and the model parameters the user needs to specify the following:

-- the data,
+- the categorical data,
-- the random partition Prior ("DP", "PY", "ESCNB" or "ESCD"),
+- the random partition prior,
- the burn-in period,
- and the number of iterations for the Gibbs sampler to run.

## Investigation for Synthetic Data Set

-Let's investigate this for the synthetic data set where we draw a posterior sample from the ESCD model using our simulated data with a burn-in period of 5 points and 10 Gibbs sampler values.
+Let's investigate this for the synthetic data set, drawing posterior samples from the ESCD model using our simulated data with a burn-in period of 5 and 10 posterior samples.

```{r, eval=TRUE}
# example of drawing from the posterior with the ESCD prior
posteriorESCD <- SampleCluster(data=simulatedData, Prior="ESCD", burn=5, nsamples=10)
```

The output is a **list** with two elements:

-- `Z`: A matrix of size nsamples x N containing the samples of the cluster assignments.
+- `Z`: A matrix of size nsamples x the number of data points containing the samples of the cluster assignments.
-- `Params`: A matrix of size nsamples x # of model hyper-parameters containing the samples of the model hyper-parameters. The columns named beta_1 to beta_L correspond to the distortion probabilities of the fields in the data.
+- `Params`: A matrix of size nsamples x the number of model hyper-parameters containing the samples of the model hyper-parameters. The columns named beta_1 to beta_L correspond to the distortion probabilities of the fields in the data.

Observe that each row corresponds to an iteration of the Gibbs sampler and each column corresponds to a record: we have 500 records and 10 Gibbs iterations, as expected. We can inspect the first five rows and first 10 columns of the posterior output.
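One simple posterior summary (our addition, not in the vignette) is the number of distinct clusters in each draw, which can be compared against the true value of 200:

```r
# number of distinct clusters per posterior draw of Z
nclusters_per_draw <- apply(posteriorESCD$Z, 1, function(z) length(unique(z)))
summary(nclusters_per_draw)  # compare with the true number of clusters, 200
```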
@@ -112,9 +135,16 @@ head(posteriorESCD$Params)
Samples for the DP, PY, and ESCNB models can be similarly obtained as follows:

```{r, eval=FALSE}
-posteriorDP <- SampleCluster(simulatedData, "DP", 5, 10)
+# traditional random partition models
+posteriorDP <- SampleCluster(simulatedData, "DP", burn = 5, nsamples = 10)
posteriorPY <- SampleCluster(simulatedData, "PY", 5, 10)
+# zero-truncated negative binomial
posteriorESCNB <- SampleCluster(simulatedData, "ESCNB", 5, 10)
+# zero-truncated Poisson
+posteriorESCP <- SampleCluster(simulatedData, "ESCP", 5, 10)
+# zero-truncated (or shifted) binomial
+posteriorESCB <- SampleCluster(simulatedData, "ESCB", 5, 10, Nbinom = 10) # fixed Nbinom (maximum cluster size)
+posteriorESCBshift <- SampleCluster(simulatedData, "ESCBshift", 5, 10) # maximum cluster size not fixed
```

## Evaluation Metrics

If the ground truth for the partition of the data is available, the average False Negative Rate (FNR) and False Discovery Rate (FDR) over the posterior samples can be computed (for any model) using the mean_fnr and mean_fdr functions:

```{r, eval=TRUE}
-maxPartitionSize<- length(truePartition)
-uniqueNumberRecords <- sum(truePartition)
-
-#true_M <- length(truePartition)
-#true_K <- sum(truePartition)
-# true cluster assignments
-
-id <- rep(1:uniqueNumberRecords, times=rep(1:maxPartitionSize, times=truePartition))
+# fnr of one posterior sample
+fnr_fun(posteriorESCD$Z[10,], id)
+# fdr of one posterior sample
+fdr_fun(posteriorESCD$Z[10,], id)
# average fnr
mean_fnr(posteriorESCD$Z,id)
# average fdr
@@ -139,3 +165,16 @@ mean_fdr(posteriorESCD$Z,id)
```

Of course, in practice one would want to run the sampler much longer to calculate the error rates above.

+## References
+
+- Steorts, R. C., Hall, R., & Fienberg, S. E. (2016). A Bayesian approach to graphical record linkage and deduplication. *Journal of the American Statistical Association*, 111(516), 1660-1672.
+
+- Zanella, G., Betancourt, B., Miller, J. W., Wallach, H., Zaidi, A., & Steorts, R. C. (2016). Flexible models for microclustering with application to entity resolution. *Advances in Neural Information Processing Systems*, 29.
+
+- Betancourt, B., Zanella, G., & Steorts, R. C. (2020). Random partition models for microclustering tasks. *Journal of the American Statistical Association*, 1-13.
+
+- Betancourt, B., Sosa, J., & Rodríguez, A. (2022). A prior for record linkage based on allelic partitions. *Computational Statistics & Data Analysis*, 172, 107474.
+
+- Lee, C. J., & Sang, H. (2022). Why the Rich Get Richer? On the Balancedness of Random Partition Models. *Proceedings of the 39th International Conference on Machine Learning (ICML)*, PMLR 162:12521-12541.
diff --git a/vignettes/microclustr.html b/vignettes/microclustr.html
deleted file mode 100644
index 3513f5a..0000000
--- a/vignettes/microclustr.html
+++ /dev/null
@@ -1,468 +0,0 @@
-[468 lines removed: the built HTML vignette, dated 2020-09-15; its content is the rendered output of vignettes/microclustr.Rmd above]