From d49a26cfa11197f1dc61f519e59ac2c861e16f51 Mon Sep 17 00:00:00 2001
From: yhoriuchi <yusaku.horiuchi@gmail.com>
Date: Tue, 8 Aug 2023 08:23:23 -0400
Subject: [PATCH] accurate critical t-values, allow users to specify weights

---
 .DS_Store          | Bin 8196 -> 8196 bytes
 NAMESPACE          |   1 +
 R/pj_estimate.R    | 147 +++++++++++++++++++++++++++++++++------------
 R/projoint.R       |  18 +++++-
 man/pj_estimate.Rd |  20 +++++-
 man/projoint.Rd    |  22 ++++++-
 6 files changed, 164 insertions(+), 44 deletions(-)

diff --git a/.DS_Store b/.DS_Store
index 9604810f53813b1acc10e282804079754f3ce2dd..2165d2b56715edc2fa2728384aa8080acdad0274 100644
GIT binary patch
delta 18
ZcmZp1XmQw(Cd|k<`Ju4b=33!)9soUK1}6Xj

delta 16
XcmZp1XmQw(COp|)SbTG>a61nGGav<w

diff --git a/NAMESPACE b/NAMESPACE
index 2dcab1e..fed80ee 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -19,6 +19,7 @@ import(stringr)
 import(tidyr)
 import(tidyselect)
 importFrom(MASS,mvrnorm)
+importFrom(estimatr,lm_robust)
 importFrom(methods,is)
 importFrom(methods,new)
 importFrom(stats,setNames)
diff --git a/R/pj_estimate.R b/R/pj_estimate.R
index 4712809..3ee48de 100644
--- a/R/pj_estimate.R
+++ b/R/pj_estimate.R
@@ -4,6 +4,7 @@
 #'
 #' @import dplyr
 #' @import rlang
+#' @importFrom estimatr lm_robust
 #' @importFrom MASS mvrnorm
 #' @keywords internal
 #' @param .data A `projoint_data` object
@@ -18,21 +19,32 @@
 #' @param .ignore_position TRUE if you ignore the location of profile (left or right). Relevant only if .structure == "choice_level". Defaults to NULL.
 #' @param .n_sims The number of simulations. Relevant only if .method == "simulation" 
 #' @param .n_boot The number of bootstrapped samples. Relevant only if .method == "bootstrap"
+#' @param .weights_1 the weight to estimate IRR (see `lm_robust()`): NULL (default)
+#' @param .clusters_1 the clusters to estimate IRR (see `lm_robust()`): NULL (default)
+#' @param .se_type_1 the standard error type to estimate IRR (see `lm_robust()`): "classical" (default)
+#' @param .weights_2 the weight to estimate MM or AMCE (see `lm_robust()`): NULL (default)
+#' @param .clusters_2 the clusters to estimate MM or AMCE (see `lm_robust()`): NULL (default)
+#' @param .se_type_2 the standard error type to estimate MM or AMCE (see `lm_robust()`): "classical" (default)
 #' @return A data frame of estimates
 
-
 # .data = out1
 # .attribute = "att1"
-# .level = "level1"
+# .level = "level2"
 # .structure = "profile_level"
-# .estimand = "mm"
+# .estimand = "amce"
 # .se_method = "analytical"
 # .irr = NULL
-# .baseline = NULL
+# .baseline = "level1"
 # .remove_ties = TRUE
 # .ignore_position = NULL
 # .n_sims = NULL
 # .n_boot = NULL
+# .weights_1 <- NULL
+# .clusters_1 <- NULL
+# .se_type_1 <- "classical"
+# .weights_2 <- NULL
+# .clusters_2 <- NULL
+# .se_type_2 <- "classical"
 
 pj_estimate <- function(
     .data,
@@ -46,7 +58,13 @@ pj_estimate <- function(
     .remove_ties = TRUE,
     .ignore_position = NULL,
     .n_sims = NULL,
-    .n_boot = NULL
+    .n_boot = NULL,
+    .weights_1 = NULL,
+    .clusters_1 = NULL,
+    .se_type_1 = "classical",
+    .weights_2 = NULL,
+    .clusters_2 = NULL,
+    .se_type_2 = "classical"
 ){
   
   .dataframe <- .data@data
@@ -63,12 +81,13 @@ pj_estimate <- function(
   estimate <- NULL
   cov_mm_tau <- NULL
   reg_tau <- NULL
+  se <- NULL
   
   # check various settings --------------------------------------------------
   
   structure <- rlang::arg_match0(.structure, c("choice_level", "profile_level"))
   estimand  <- rlang::arg_match0(.estimand, c("mm", "amce"))
-  se_method    <- rlang::arg_match0(.se_method, c("analytical", "simulation", "bootstrap"))
+  se_method <- rlang::arg_match0(.se_method, c("analytical", "simulation", "bootstrap"))
   
   if(!is.null(.irr) & !is.numeric(.irr) & length(.irr) == 1){
     stop("The .irr argument must be either a numeric scalar or NULL.")
@@ -81,7 +100,7 @@ pj_estimate <- function(
   if (.structure == "profile_level" & !is.null(.ignore_position)){
     stop("The .ignore_position argument can be specified only when the .structure argument is choice_level.")
   }
-
+  
   if (.structure == "choice_level" & is.null(.ignore_position)){
     stop("Specify the .ignore_position argument.")
   }
@@ -101,20 +120,20 @@ pj_estimate <- function(
   if(.se_method == "simulation" & is.null(.n_sims)){
     stop("Specify the .n_sims arguement for simulation")
   }
-
+  
   if(.se_method != "simulation" & !is.null(.n_sims)){
     stop("You cannot specify the .n_sims arguement for analytical derivation or bootstrapping")
   }
-
+  
   if(.se_method == "bootstrap" & is.null(.n_boot)){
     stop("Specify the .n_boot arguement for bootstrapping")
   }
-
+  
   if(.se_method != "bootstrap" & !is.null(.n_boot)){
     stop("You cannot specify the .n_boot arguement for analytical derivation or simulation")
   }
   
-    
+  
   if (.structure == "choice_level" & .estimand == "mm" & .remove_ties == FALSE){
     stop("The .remove_ties argument should be TRUE to estimate choice-level MMs.")
   }
@@ -238,18 +257,22 @@ pj_estimate <- function(
       )
       
     }
-
+    
   }
-
+  
   # Estimate or specify tau -------------------------------------------------
   
   if (is.null(.irr)){
     
     # run intercept-only regression models
-    reg_irr <- stats::lm(agree ~ 1, data_for_irr)
+    reg_irr <- estimatr::lm_robust(agree ~ 1, 
+                                   weights = .weights_1, 
+                                   clusters = .clusters_1,
+                                   se_type = .se_type_1,
+                                   data = data_for_irr) %>% tidy()
     
-    irr     <- reg_irr$coefficient %>% as.numeric()
-    var_irr <- vcov(reg_irr) %>% as.numeric()
+    irr     <- reg_irr$estimate[1]
+    var_irr <- reg_irr$std.error[1]^2 
     
     tau     <- (1 - sqrt(1 - 2 * (1 - irr))) / 2
     var_tau <- 0.25 * (2 * irr - 1)^(-1) * var_irr
@@ -270,19 +293,33 @@ pj_estimate <- function(
   
   if (estimand == "mm"){
     
-    reg_mm  <- stats::lm(selected ~ 1, data_for_estimand)
+    reg_mm  <- estimatr::lm_robust(selected ~ 1, 
+                                   weights = .weights_2, 
+                                   clusters = .clusters_2,
+                                   se_type = .se_type_2,
+                                   data = data_for_estimand) %>% tidy()
+    
+    # the critical t-value
+    critical_t <- abs((reg_mm$conf.low[1] - reg_mm$estimate[1]) / reg_mm$std.error[1])
     
-    mm_uncorrected <-  reg_mm$coefficient %>% as.numeric()
-    var_uncorrected_mm <- vcov(reg_mm)  %>% as.numeric()
+    mm_uncorrected <-  reg_mm$estimate[1]
+    var_uncorrected_mm <- reg_mm$std.error[1]^2
     
   } 
   
   if (estimand == "amce") {
     
-    reg_amce  <- stats::lm(selected ~ x, data_for_estimand)
+    reg_amce  <- estimatr::lm_robust(selected ~ x, 
+                                     weights = .weights_2, 
+                                     clusters = .clusters_2,
+                                     se_type = .se_type_2,
+                                     data = data_for_estimand) %>% tidy()
     
-    amce_uncorrected <-  reg_amce$coefficient[2] %>% as.numeric()
-    var_uncorrected_amce <- vcov(reg_amce)[2, 2]  %>% as.numeric()
+    # the critical t-value
+    critical_t <- abs((reg_amce$conf.low[2] - reg_amce$estimate[2]) / reg_amce$std.error[2])
+    
+    amce_uncorrected <-  reg_amce$estimate[2]
+    var_uncorrected_amce <- reg_amce$std.error[2]^2
     
   } 
   
@@ -318,12 +355,12 @@ pj_estimate <- function(
       
       cov_mm0_irr  <- cov(d_cov0$selected, d_cov0$agree) / nrow(data_for_irr)
       cov_mm1_irr  <- cov(d_cov1$selected, d_cov1$agree) / nrow(data_for_irr)
-
+      
     } else {
       
       cov_mm0_irr  <- 0
       cov_mm1_irr  <- 0
-
+      
     }
     
     cov_mm0_tau  <- -0.5 * (2 * irr - 1)^(-1/2) * cov_mm0_irr
@@ -331,7 +368,7 @@ pj_estimate <- function(
     cov_amce_tau <- cov_mm1_tau - cov_mm0_tau
     
   }
-
+  
   # estimate and correct MMs ------------------------------------------------
   
   if (estimand == "mm"){
@@ -355,7 +392,9 @@ pj_estimate <- function(
                            "estimate" = c(mm_uncorrected, mm_corrected),
                            "se" = c(sqrt(var_uncorrected_mm),
                                     sqrt(var_corrected_mm))) %>% 
-        dplyr::mutate("tau" = tau)
+        dplyr::mutate("conf.low" = estimate - critical_t * se,
+                      "conf.high" = estimate + critical_t * se,
+                      "tau" = tau)
       
     } 
     
@@ -381,7 +420,9 @@ pj_estimate <- function(
       output <- data.frame("estimand" = c("mm_uncorrected", "mm_corrected"),
                            "estimate" = c(mm_uncorrected, mean(sim_mm$mm_corrected)),
                            "se" = c(sqrt(var_uncorrected_mm), sd(sim_mm$mm_corrected))) %>% 
-        dplyr::mutate("tau" = tau)
+        dplyr::mutate("conf.low" = estimate - critical_t * se,
+                      "conf.high" = estimate + critical_t * se,
+                      "tau" = tau)
       
     }  
     
@@ -411,14 +452,24 @@ pj_estimate <- function(
                            relationship = "many-to-many")
         
         # run intercept-only regression models
-        reg_irr <- stats::lm(agree ~ 1, bs_sample_1)
-        reg_mm  <- stats::lm(selected ~ 1, bs_sample_2)
+        reg_irr <- estimatr::lm_robust(agree ~ 1, 
+                                       weights = .weights_1, 
+                                       clusters = .clusters_1,
+                                       se_type = .se_type_1,
+                                       data = bs_sample_1) %>% tidy()
+        
+        reg_mm  <- estimatr::lm_robust(selected ~ 1, 
+                                       weights = .weights_2, 
+                                       clusters = .clusters_2,
+                                       se_type = .se_type_2,
+                                       data = bs_sample_2) %>% tidy()
         
         # calculate the means
-        mm_uncorrected <-  reg_mm$coefficient %>% as.numeric()
+        mm_uncorrected <-  reg_mm$estimate[1]
+        
         if (is.null(.irr)){
           
-          irr <- reg_irr$coefficient %>% as.numeric()
+          irr <- reg_irr$estimate[1]
           tau <- (1 - sqrt(1 - 2 * (1 - irr))) / 2
           
         } else {
@@ -445,7 +496,9 @@ pj_estimate <- function(
                                           mean(out %>% filter(estimand == "mm_corrected") %>% pull(estimate))),
                            "se"       = c(sd(out %>% filter(estimand == "mm_uncorrected") %>% pull(estimate)), 
                                           sd(out %>% filter(estimand == "mm_corrected") %>% pull(estimate)))) %>% 
-        dplyr::mutate("tau" = tau)
+        dplyr::mutate("conf.low" = estimate - critical_t * se,
+                      "conf.high" = estimate + critical_t * se,
+                      "tau" = tau)
     }
     
   } 
@@ -473,7 +526,9 @@ pj_estimate <- function(
                            "estimate" = c(amce_uncorrected, amce_corrected),
                            "se" = c(sqrt(var_uncorrected_amce),
                                     sqrt(var_corrected_amce))) %>% 
-        dplyr::mutate("tau" = tau)
+        dplyr::mutate("conf.low" = estimate - critical_t * se,
+                      "conf.high" = estimate + critical_t * se,
+                      "tau" = tau)
       
     } 
     
@@ -499,7 +554,9 @@ pj_estimate <- function(
       output <- data.frame("estimand" = c("amce_uncorrected", "amce_corrected"),
                            "estimate" = c(amce_uncorrected, mean(sim_amce$amce_corrected)),
                            "se" = c(sqrt(var_uncorrected_amce), sd(sim_amce$amce_corrected))) %>% 
-        dplyr::mutate("tau" = tau)
+        dplyr::mutate("conf.low" = estimate - critical_t * se,
+                      "conf.high" = estimate + critical_t * se,
+                      "tau" = tau)
       
     } 
     
@@ -529,14 +586,24 @@ pj_estimate <- function(
                            relationship = "many-to-many")
         
         # run intercept-only regression models
-        reg_irr  <- stats::lm(agree ~ 1, bs_sample_1)
-        reg_amce <- stats::lm(selected ~ x, bs_sample_2)
+        reg_irr <- estimatr::lm_robust(agree ~ 1, 
+                                       weights = .weights_1, 
+                                       clusters = .clusters_1,
+                                       se_type = .se_type_1,
+                                       data = bs_sample_1) %>% tidy()
+        
+        reg_amce  <- estimatr::lm_robust(selected ~ x, 
+                                         weights = .weights_2, 
+                                         clusters = .clusters_2,
+                                         se_type = .se_type_2,
+                                         bs_sample_2) %>% tidy()
+        
         
         # calculate the means
-        amce_uncorrected <-  reg_amce$coefficient[2] %>% as.numeric()
+        amce_uncorrected <-  reg_amce$estimate[2]
         if (is.null(.irr)){
           
-          irr <- reg_irr$coefficient %>% as.numeric()
+          irr <- reg_irr$estimate[1]
           tau <- (1 - sqrt(1 - 2 * (1 - irr))) / 2
           
         } else {
@@ -563,7 +630,9 @@ pj_estimate <- function(
                                           mean(out %>% filter(estimand == "amce_corrected") %>% pull(estimate))),
                            "se"       = c(sd(out %>% filter(estimand == "amce_uncorrected") %>% pull(estimate)), 
                                           sd(out %>% filter(estimand == "amce_corrected") %>% pull(estimate)))) %>% 
-        dplyr::mutate("tau" = tau)
+        dplyr::mutate("conf.low" = estimate - critical_t * se,
+                      "conf.high" = estimate + critical_t * se,
+                      "tau" = tau)
       
     }
   }  
diff --git a/R/projoint.R b/R/projoint.R
index 70fea59..b45eb95 100644
--- a/R/projoint.R
+++ b/R/projoint.R
@@ -8,6 +8,7 @@
 #' @importFrom MASS mvrnorm
 #' @importFrom methods is
 #' @importFrom methods new
+#' @importFrom estimatr lm_robust
 #' @param .data A `projoint_data` object
 #' @param .qoi A `projoint_qoi` object. If NULL, defaults to producing all MMs and all AMCEs.
 #' @param .structure Either "profile_level" or "choice_level"
@@ -18,6 +19,13 @@
 #' @param .se_method c("analytic", "simulation", "bootstrap") description
 #' @param .n_sims The number of simulations. Relevant only if .se_method == "simulation" 
 #' @param .n_boot The number of bootstrapped samples. Relevant only if .se_method == "bootstrap"
+#' @param .n_boot The number of bootstrapped samples. Relevant only if .method == "bootstrap"
+#' @param .weights_1 the weight to estimate IRR (see `lm_robust()`): NULL (default)
+#' @param .clusters_1 the clusters to estimate IRR (see `lm_robust()`): NULL (default)
+#' @param .se_type_1 the standard error type to estimate IRR (see `lm_robust()`): "classical" (default)
+#' @param .weights_2 the weight to estimate MM or AMCE (see `lm_robust()`): NULL (default)
+#' @param .clusters_2 the clusters to estimate MM or AMCE (see `lm_robust()`): NULL (default)
+#' @param .se_type_2 the standard error type to estimate MM or AMCE (see `lm_robust()`): "classical" (default)
 #' @return A `projoint_results` object
 #' @export
 #' @examples
@@ -51,13 +59,19 @@ projoint <- function(
     .remove_ties = TRUE,
     .ignore_position = NULL,
     .n_sims = NULL,
-    .n_boot = NULL
+    .n_boot = NULL,
+    .weights_1 = NULL,
+    .clusters_1 = NULL,
+    .se_type_1 = "classical",
+    .weights_2 = NULL,
+    .clusters_2 = NULL,
+    .se_type_2 = "classical"
 ){
   
   # bind variables locally to the function ----------------------------------
   
   .baseline <- NULL
-
+  
   # check various settings --------------------------------------------------
   # also see: many checks in pj_estimate()
   
diff --git a/man/pj_estimate.Rd b/man/pj_estimate.Rd
index 96d0f62..967b903 100644
--- a/man/pj_estimate.Rd
+++ b/man/pj_estimate.Rd
@@ -16,7 +16,13 @@ pj_estimate(
   .remove_ties = TRUE,
   .ignore_position = NULL,
   .n_sims = NULL,
-  .n_boot = NULL
+  .n_boot = NULL,
+  .weights_1 = NULL,
+  .clusters_1 = NULL,
+  .se_type_1 = "classical",
+  .weights_2 = NULL,
+  .clusters_2 = NULL,
+  .se_type_2 = "classical"
 )
 }
 \arguments{
@@ -43,6 +49,18 @@ pj_estimate(
 \item{.n_sims}{The number of simulations. Relevant only if .method == "simulation"}
 
 \item{.n_boot}{The number of bootstrapped samples. Relevant only if .method == "bootstrap"}
+
+\item{.weights_1}{the weight to estimate IRR (see `lm_robust()`): NULL (default)}
+
+\item{.clusters_1}{the clusters to estimate IRR (see `lm_robust()`): NULL (default)}
+
+\item{.se_type_1}{the standard error type to estimate IRR (see `lm_robust()`): "classical" (default)}
+
+\item{.weights_2}{the weight to estimate MM or AMCE (see `lm_robust()`): NULL (default)}
+
+\item{.clusters_2}{the clusters to estimate MM or AMCE (see `lm_robust()`): NULL (default)}
+
+\item{.se_type_2}{the standard error type to estimate MM or AMCE (see `lm_robust()`): "classical" (default)}
 }
 \value{
 A data frame of estimates
diff --git a/man/projoint.Rd b/man/projoint.Rd
index 237ebb4..66588aa 100644
--- a/man/projoint.Rd
+++ b/man/projoint.Rd
@@ -14,7 +14,13 @@ projoint(
   .remove_ties = TRUE,
   .ignore_position = NULL,
   .n_sims = NULL,
-  .n_boot = NULL
+  .n_boot = NULL,
+  .weights_1 = NULL,
+  .clusters_1 = NULL,
+  .se_type_1 = "classical",
+  .weights_2 = NULL,
+  .clusters_2 = NULL,
+  .se_type_2 = "classical"
 )
 }
 \arguments{
@@ -36,7 +42,19 @@ projoint(
 
 \item{.n_sims}{The number of simulations. Relevant only if .se_method == "simulation"}
 
-\item{.n_boot}{The number of bootstrapped samples. Relevant only if .se_method == "bootstrap"}
+\item{.n_boot}{The number of bootstrapped samples. Relevant only if .method == "bootstrap"}
+
+\item{.weights_1}{the weight to estimate IRR (see `lm_robust()`): NULL (default)}
+
+\item{.clusters_1}{the clusters to estimate IRR (see `lm_robust()`): NULL (default)}
+
+\item{.se_type_1}{the standard error type to estimate IRR (see `lm_robust()`): "classical" (default)}
+
+\item{.weights_2}{the weight to estimate MM or AMCE (see `lm_robust()`): NULL (default)}
+
+\item{.clusters_2}{the clusters to estimate MM or AMCE (see `lm_robust()`): NULL (default)}
+
+\item{.se_type_2}{the standard error type to estimate MM or AMCE (see `lm_robust()`): "classical" (default)}
 }
 \value{
 A `projoint_results` object