cleanzr · changwoo-lee · Jun 13, 2022 · Jun 13, 2022 · Jun 13, 2022 · Jun 13, 2022
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -1,2 +1,4 @@
 ^.*\.Rproj$
 ^\.Rproj\.user$
+^doc$
+^Meta$
diff --git a/.gitignore b/.gitignore
@@ -18,3 +18,5 @@ vignettes/*.html
 vignettes/*.pdf
 # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3
 .Rproj.user
+/doc/
+/Meta/
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,22 +1,26 @@
 Package: microclustr
 Type: Package
 Title: Entity Resolution with Random Partition Priors for Microclustering
-Version: 0.1.0
-Date: 2020-09-15
+Version: 0.1.1
+Date: 2022-08-14
 Authors@R: c(
-    person(given=c("Rebecca", "C"), family="Steorts", role = c("aut","cre"), email="[email protected]"),
+    person(given=c("Rebecca", "C"), family="Steorts", role = c("aut"), email="[email protected]"),
     person("Brenda", "Betancourt", email = "[email protected]", role = c("aut")),
-    person("Giacomo", "Zanella", email = "[email protected]", role = c("aut"))
+    person("Giacomo", "Zanella", email = "[email protected]", role = c("aut")),
+    person("Changwoo", "Lee", email = "[email protected]", role = c("aut","cre")),
+    person("Huiyan", "Sang", email = "[email protected]", role = c("aut"))
     )
+Maintainer: Changwoo Lee <[email protected]>
 Depends: 
 	R (>= 3.2.4)
 Imports: 
 	Rcpp (>= 1.0.1),
         stats
+Encoding: UTF-8
 Suggests: 
         knitr
 Description: An implementation of the model in Betancourt, Zanella, Steorts (2020), which performs microclustering models for categorical data. The package provides a vignette for two proposed methods in the paper as well as two standard Bayesian non-parametric clustering approaches for entity resolution. The experiments are reproducible and illustrated using a simple vignette. LICENSE: GPL-3 + file license. 
 VignetteBuilder: knitr
 License: GPL-3
 LinkingTo: Rcpp
-RoxygenNote: 7.1.1.9000
+RoxygenNote: 7.2.0
diff --git a/R/error_rates.R b/R/error_rates.R
@@ -1,97 +1,158 @@
 ## functions to compute FNR and FDR ##
 
-#' Calculates FDR when ground truth is available
-#'
+#' @title Calculates false discovery rate (FDR) when the ground truth is available
+#' 
+#' @description False discovery rate (FDR) of the estimated record linkage (partition) based on the ground truth is defined as (Steorts, 2015)
+#' \deqn{FDR = \frac{FP}{CL + FP}}
+#' where FP is the number of false positives (not linked under the truth but linked under the estimate) and CL is the number of correct links (true positives). 
+#' If both FP=0 and CL=0, define FDR = 0. 
+#' 
+#' FDR can be also defined as \eqn{FDR = 1 - Precision}, where \eqn{Precision = CL/(CL+FP)}.
+#' 
+#' \code{fdr_fun} calculates FDR for an estimated partition, and \code{mean_fdr} calculates average FDR based on posterior samples of partition.
+#' 
 #' @param z Vector of cluster assignments
 #' @param id Vector of true cluster assignments (ground truth)
-#' @return FDR
+#' @param use_apply Logical (default F), whether to use \code{apply()} to calculate the rate. 
+#' Setting \code{use_apply = T} may be slower but memory efficient when \code{length(z)} is large.
+#' @return False discovery rate (FDR)
+#' 
+#' @references Steorts, R. C. (2015). Entity resolution with empirically motivated priors. Bayesian Analysis, 10(4), 849-875.
+#' @seealso \code{\link{fnr_fun}}, \code{\link{mean_fnr}}
 #' @export
 #' @examples
-#' truePartition <- c(50,50,50,50)
-#' maxPartitionSize<- length(truePartition)
-#' uniqueNumberRecords <- sum(truePartition)
-#' id <- rep(1:uniqueNumberRecords, times=rep(1:maxPartitionSize, times=truePartition))
-#' fdr_fun(z = truePartition, id)
-fdr_fun <- function(z, id) {
-	if (n_matches_fun(z) == 0) {
-		return(0)
-	} else {
-		return(sum(vapply(X = c(1:length(z)), FUN = i_false_det_fun, 
-			FUN.VALUE = 1, z. = z, id. = id))/(2 * 
-			n_matches_fun(z)))
-	}
+#' nclusters_per_size <- c(50,50,50,50)
+#' numberFields <- 5
+#' numberCategories <- rep(10,5)
+#' trueBeta <- 0.01
+#' # generate simulated data
+#' simulatedData <- SimData(nclusters_per_size, numberFields, numberCategories, trueBeta)
+#' # Fit ESCD model
+#' posteriorESCD <- SampleCluster(data=simulatedData, Prior="ESCD", burn=0, nsamples=10)
+#' # true number of clusters
+#' trueK = sum(nclusters_per_size)
+#' # true cluster membership vector
+#' trueid = rep(1:trueK, times=rep(1:length(nclusters_per_size), times=nclusters_per_size))
+#' # FDR calculation for a single estimate
+#' fdr_fun(posteriorESCD$Z[10,], trueid)
+#' # average FDR calculation
+#' mean_fdr(posteriorESCD$Z, trueid)
+#' 
+fdr_fun <- function(z, id, use_apply = F) {
+  if(!use_apply){ # use_apply = F, faster, but memory intensive
+    estimatelinks = outer_equal(z)
+    diag(estimatelinks) = F
+    truelinks = outer_equal(id)
+    diag(truelinks) = F
+    denom = sum(estimatelinks)
+    if(denom == 0){
+      return(0)
+    }else{
+      return(sum(estimatelinks & !truelinks)/denom)
+    }
+  }else{ # use_apply = T, slower, but not memory intensive
+    if (n_matches_fun(z) == 0) {
+      return(0)
+    } else {
+      return(sum(vapply(X = c(1:length(z)), FUN = i_false_det_fun, 
+                        FUN.VALUE = 1, z. = z, id. = id))/(2 * n_matches_fun(z)))
+    }
+  }
 }
 
-#' Calculates FNR when ground truth is available
-#'
-#' @param z Vector of cluster assignments
-#' @param id Vector of true cluster assignments (ground truth)
-#' @return FNR
-#' @export
-#' @examples
-#' truePartition <- c(50,50,50,50)
-#' maxPartitionSize<- length(truePartition)
-#' uniqueNumberRecords <- sum(truePartition)
-#' id <- rep(1:uniqueNumberRecords, times=rep(1:maxPartitionSize, times=truePartition))
-#' fnr_fun(z = truePartition, id)
-fnr_fun <- function(z, id) {
-	if (n_matches_fun(id) == 0) {
-		return(0)
-	} else {
-		return(sum(vapply(X = c(1:length(z)), FUN = i_false_neg_fun, 
-			FUN.VALUE = 1, z. = z, id. = id))/(2 * 
-			n_matches_fun(id)))
-	}
-}
 
-#' Calculates average FDR when ground truth is available
+#' @title Calculates false negative rate (FNR) when the ground truth is available
+#' 
+#' @description False negative rate (FNR) of the estimated record linkage (partition) based on the ground truth is defined as (Steorts, 2015)
+#' \deqn{FNR = \frac{FN}{CL + FN}}
+#' where FN is the number of false negatives (linked under the truth but not linked under the estimate) and CL is the number of correct links (true positives). 
+#' If both FN=0 and CL=0, define FNR = 0. 
+#' 
+#' FNR can be also defined as \eqn{FNR = 1 - Recall}, where \eqn{Recall = CL/(CL+FN)}.
 #'
-#' @param zm Matrix with posterior samples of cluster assignments
-#' @param id Vector of true cluster assignments (ground truth)
-#' @return Average FDR over posterior samples
+#' \code{fnr_fun} calculates FNR for an estimated partition, and \code{mean_fnr} calculates average FNR based on posterior samples of partition.
+#'
+#' @param z Integer vector of cluster assignments
+#' @param id Integer vector of true cluster assignments (ground truth)
+#' @param use_apply Logical (default F), whether to use \code{apply()} to calculate the rate. 
+#' Setting \code{use_apply = T} may be slower but memory efficient when \code{length(z)} is large.
+#' @return False negative rate (FNR)
+#' 
+#' @references Steorts, R. C. (2015). Entity resolution with empirically motivated priors. Bayesian Analysis, 10(4), 849-875.
+#' @seealso \code{\link{fdr_fun}}, \code{\link{mean_fdr}}
 #' @export
 #' @examples
-#' truePartition <- c(50,50,50,50)
-#' maxPartitionSize<- length(truePartition)
-#' uniqueNumberRecords <- sum(truePartition)
-#' id <- rep(1:uniqueNumberRecords, times=rep(1:maxPartitionSize, times=truePartition))
+#' nclusters_per_size <- c(50,50,50,50)
 #' numberFields <- 5
 #' numberCategories <- rep(10,5)
 #' trueBeta <- 0.01
-#' simulatedData <- SimData(truePartition, numberFields, numberCategories, trueBeta)
+#' # generate simulated data
+#' simulatedData <- SimData(nclusters_per_size, numberFields, numberCategories, trueBeta)
+#' # Fit ESCD model
 #' posteriorESCD <- SampleCluster(data=simulatedData, Prior="ESCD", burn=0, nsamples=10)
-#' mean_fdr(zm = posteriorESCD$Z, id)
-mean_fdr <- function(zm, id) {
+#' # true number of clusters
+#' trueK = sum(nclusters_per_size)
+#' # true cluster membership vector
+#' trueid = rep(1:trueK, times=rep(1:length(nclusters_per_size), times=nclusters_per_size))
+#' # FNR calculation for a single estimate
+#' fnr_fun(posteriorESCD$Z[10,], trueid)
+#' # average FNR calculation
+#' mean_fnr(posteriorESCD$Z, trueid)
+#' 
+fnr_fun <- function(z, id, use_apply = F) {
+  if(!use_apply){ # use_apply = F, faster, but memory intensive
+    estimatelinks = outer_equal(z)
+    diag(estimatelinks) = F
+    truelinks = outer_equal(id)
+    diag(truelinks) = F
+    denom = sum(truelinks)
+    if(denom == 0){
+      return(0)
+    }else{
+      return(sum(!estimatelinks & truelinks)/denom)
+    }
+  }else{ # use_apply = T, slower, but not memory intensive
+    if (n_matches_fun(id) == 0) {
+      return(0)
+    } else {
+      return(sum(vapply(X = c(1:length(z)), FUN = i_false_neg_fun, 
+                        FUN.VALUE = 1, z. = z, id. = id))/(2 * n_matches_fun(id)))
+    }  
+  }
+}
+
+
+#' @rdname fdr_fun 
+#' @param zm Matrix with posterior samples of cluster assignments, where each row corresponds to one sample from the posterior
+#' @export
+mean_fdr <- function(zm, id, use_apply = F) {
 	fdr_vec <- apply(X = zm, MARGIN = 1, FUN = fdr_fun, 
-		id = id)
+		id = id, use_apply = use_apply)
 	return(mean(fdr_vec))
 }
 
-#' Calculates average FNR when ground truth is available
-#'
-#' @param zm Matrix with posterior samples of cluster assignments
-#' @param id Vector of true cluster assignments (ground truth)
-#' @return Average FNR over posterior samples
+#' @rdname fnr_fun 
+#' @param zm Matrix with posterior samples of cluster assignments, where each row corresponds to one sample from the posterior
 #' @export
-#' @examples
-#' truePartition <- c(50,50,50,50)
-#' maxPartitionSize<- length(truePartition)
-#' uniqueNumberRecords <- sum(truePartition)
-#' id <- rep(1:uniqueNumberRecords, times=rep(1:maxPartitionSize, times=truePartition))
-#' numberFields <- 5
-#' numberCategories <- rep(10,5)
-#' trueBeta <- 0.01
-#' simulatedData <- SimData(truePartition, numberFields, numberCategories, trueBeta)
-#' posteriorESCD <- SampleCluster(data=simulatedData, Prior="ESCD", burn=0, nsamples=10)
-#' mean_fnr(zm = posteriorESCD$Z, id)
-mean_fnr <- function(zm, id) {
-	fnr_vec <- apply(X = zm, MARGIN = 1, FUN = fnr_fun, 
-		id = id)
-	return(mean(fnr_vec))
+mean_fnr <- function(zm, id, use_apply = F) {
+  fnr_vec <- apply(X = zm, MARGIN = 1, FUN = fnr_fun, 
+                   id = id, use_apply = use_apply)
+  return(mean(fnr_vec))
 }
 
 # auxiliary functions to compute fnr and fdr
 
+# faster version of base::outer(z, z, "==")
+outer_equal<- function(z){ 
+  z = as.integer(z)
+  n = length(z)
+  Y = rep.int(z, rep.int(n, n))
+  robj <- z==Y
+  dim(robj) = c(n,n)
+  robj
+}
+
+
 i_false_det_fun <- function(i, z., id.) {
 	i_match <- which(z.[-i] == z.[i])
 	i_match_true <- which(id.[-i] == id.[i])
@@ -108,10 +169,11 @@ i_false_neg_fun <- function(i, z., id.) {
 	return(i_false_neg)
 }
 
+# calculates all possible (undirected) links in the given partition.
 n_matches_fun <- function(z) {
 	sizes <- table(z)
 	n_match <- sum(vapply(X = sizes, FUN = function(s) {
 		return(choose(s, 2))
 	}, FUN.VALUE = 1))
 	return(sum(n_match))
-}
+}
diff --git a/R/package_description.R b/R/package_description.R
@@ -0,0 +1,33 @@
+#' @name microclustr-package
+#' @docType package
+#' @aliases microclustr
+#' 
+#' @title Entity Resolution with Random Partition Priors for Microclustering
+#' 
+#' @description An implementation of the model in Betancourt, Zanella, and Steorts (2020), which performs microclustering models for categorical data.
+#'  The package provides a vignette for two proposed methods in the paper as well as two standard Bayesian non-parametric clustering approaches for entity resolution.
+#'  The experiments are reproducible and illustrated using a simple vignette.
+#'  
+#'  (Update) An implementaton of the ESC-Binom, ESC-Poisson models in Lee and Sang (2022) has been added.
+#'  
+#'  LICENSE: GPL-3 + file license. 
+#' 
+#' @author Rebecca C. Steorts, Brenda Betancourt, Giacomo Zanella, Changwoo J. Lee, Huiyan Sang
+#' 
+#' @references   
+#' Steorts, R. C., Hall, R., & Fienberg, S. E. (2016). A Bayesian approach to graphical record linkage and deduplication. Journal of the American Statistical Association, 111(516), 1660-1672.
+#' 
+#' Zanella, G., Betancourt, B., Miller, J. W., Wallach, H., Zaidi, A., & Steorts, R. C. (2016). Flexible models for microclustering with application to entity resolution. Advances in neural information processing systems, 29.
+#' 
+#' Betancourt, B., Zanella, G., & Steorts, R. C. (2020). Random partition models for microclustering tasks. Journal of the American Statistical Association, 1-13.
+#' 
+#' Lee, C. J., & Sang, H. (2022). Why the Rich Get Richer? On the Balancedness of Random Partition Models. Proceedings of the 39th International Conference on Machine Learning (ICML), PMLR 162:12521 - 12541.
+#' 
+#' @examples 
+#' 
+#' library(microclustr)
+#' ?SimData
+#' ?SampleCluster
+#' ?mean_fnr
+#' ?mean_fdr
+NULL