From ccebf6ce4484d8f677289b4ca53fccf0c39564d8 Mon Sep 17 00:00:00 2001 From: TsaiLintung <101155567+TsaiLintung@users.noreply.github.com> Date: Wed, 27 Dec 2023 08:08:56 +0800 Subject: [PATCH 01/15] remove didor --- development/did_or.R | 30 ------------------------------ 1 file changed, 30 deletions(-) delete mode 100644 development/did_or.R diff --git a/development/did_or.R b/development/did_or.R deleted file mode 100644 index 69fd9a0..0000000 --- a/development/did_or.R +++ /dev/null @@ -1,30 +0,0 @@ -library(did) -# install.packages("devtools") -#devtools::install_github("TsaiLintung/fastdid") -library(fastdid) - -simdt <- sim_did(1e+03, 5, cov = "cont", hetero = "all", balanced = TRUE, second_outcome = TRUE, - stratify = FALSE, - second_cov = TRUE) -dt <- simdt$dt - -did_result <- did::att_gt(yname = "y",gname = "G",idname = "unit",tname = "time",data = dt,base_period = "universal",est_method = "dr",cband = FALSE, - xformla = ~x+x2, - control_group = "notyettreated", - allow_unbalanced_panel = FALSE, #this is the only differece - clustervars = NULL, - bstrap = FALSE) -did_result2 <- did::att_gt(yname = "y",gname = "G",idname = "unit",tname = "time",data = dt,base_period = "universal",est_method = "dr",cband = FALSE, - xformla = ~x+x2, - control_group = "notyettreated", - allow_unbalanced_panel = TRUE, #this is the only differece - clustervars = NULL, - bstrap = FALSE) - -sum(did_result$att, na.rm = TRUE)/sum(did_result2$att, na.rm = TRUE) -sum(did_result$se, na.rm = TRUE)/sum(did_result2$se, na.rm = TRUE) #should be 1 if the SE is the same, but not?????? - -#very weird -#DRDID:drdid_rc1, line 203: inf.cont <- inf.cont.post - inf.cont.pre + inf.cont.ps + inf.cont.or -#DRDID:drdid_panel, line 169: inf.control <- (inf.cont.1 + inf.cont.2 - inf.cont.3) / mean(w.cont) -#influence from outcome have different sign?? \ No newline at end of file From 38516e1298089e2b195af9eaf3a923dfa98d54e4 Mon Sep 17 00:00:00 2001 From: TsaiLintung <101155567+TsaiLintung@users.noreply.github.com> Date: Fri, 29 Dec 2023 13:26:15 +0800 Subject: [PATCH 02/15] add benchmark compare --- development/compare_benchmark.R | 94 +++++++++++++++++++ development/{benchmark.R => solo_benchmark.R} | 0 interactive/rc_test.R | 50 ++++++++++ 3 files changed, 144 insertions(+) create mode 100644 development/compare_benchmark.R rename development/{benchmark.R => solo_benchmark.R} (100%) create mode 100644 interactive/rc_test.R diff --git a/development/compare_benchmark.R b/development/compare_benchmark.R new file mode 100644 index 0000000..0ce555b --- /dev/null +++ b/development/compare_benchmark.R @@ -0,0 +1,94 @@ +rm(list = ls()) +gc() + +library(peakRAM) + +library(magrittr) +library(ggplot2) +library(fixest) + +library(did) +#install_github("TsaiLintung/fastdid") +library(fastdid) +library(DiDforBigData) + + +data.table::setDTthreads(0) + +# setup ------------------------------------------------------------------------------------ + +min_order <- 2 +max_order <- 6 + +# functions for running the estimation ----------------------------------------------------- + +run_fastdid <- function(dt){ + result <- fastdid(dt, + timevar = "time", cohortvar = "G", unitvar = "unit", outcomevar = "y", + result_type = "group_time", copy = FALSE, allow_unbalance_panel = FALSE) +} + +run_did <- function(dt){ + result <- att_gt(yname = "y", tname = "time", idname = "unit", gname = "G", data = dt, + control_group = "notyettreated", bstrap = FALSE, cband = FALSE) +} + +run_dfbd <- function(dt){ + varnames = list() + varnames$time_name = "time" + varnames$outcome_name = "y" + varnames$cohort_name = "G" + varnames$id_name = "unit" + + # estimate the ATT for all cohorts at event time 1 only + result <- DiD(dt, varnames) +} + +run_benchmark <- function(min_order, max_order){ + all_bm <- data.table() + for(order in min_order:max_order){ + raw_dt <- sim_did(10^order, 10, seed = 1, cov = "cont", second_outcome = TRUE, second_cov = TRUE)[["dt"]] + + gc() + + dt <- copy(raw_dt) + fast_bm <- peakRAM(run_fastdid(dt)) + + gc() + + dt <- copy(raw_dt) + did_bm <- peakRAM(run_did(dt)) + + gc() + + dt <- copy(raw_dt) + dfbd_bm <- peakRAM(run_dfbd(dt)) + + gc() + + bm <- rbind(fast_bm, did_bm, dfbd_bm) |> as.data.table() + bm[, order := order] + all_bm <- rbind(all_bm, bm) + message(order) + } + + return(all_bm) + +} + +bm_result <- run_benchmark(min_order, max_order) + +# visualize benchmark result ------------------------------------------------------------------------------------ + +bm_result[Function_Call == "run_fastdid(dt)", pkg := "fastdid"] +bm_result[Function_Call == "run_dfbd(dt)", pkg := "dfbd"] +bm_result[Function_Call == "run_did(dt)", pkg := "did"] + +setnames(bm_result, c("Elapsed_Time_sec", "Peak_RAM_Used_MiB"), c("time", "RAM")) + +bm_result %>% ggplot(aes(x = order, y = time, color = pkg)) + geom_line() + geom_point() + coord_trans(y = "log10") + + ggtitle("Comparison of computing time") + +bm_result %>% ggplot(aes(x = order, y = RAM, color = pkg)) + geom_line() + geom_point() + + ggtitle("Comparison of peak RAM used") + diff --git a/development/benchmark.R b/development/solo_benchmark.R similarity index 100% rename from development/benchmark.R rename to development/solo_benchmark.R diff --git a/interactive/rc_test.R b/interactive/rc_test.R new file mode 100644 index 0000000..0f1f31a --- /dev/null +++ b/interactive/rc_test.R @@ -0,0 +1,50 @@ + +ipw_unbalanced_data <- function(method){ + result <- fastdid(dt2, timevar = "time", cohortvar = "G", unitvar = "unit",outcomevar = "y", result_type = "group_time", + control_type = method, + covariatesvar = c("x", "x2"), + allow_unbalance_panel = TRUE) + did_result <- did::att_gt(yname = "y",gname = "G",idname = "unit",tname = "time",data = dt2, + base_period = "universal",cband = FALSE, + est_method = drdid_rc1,xformla = ~x+x2, + control_group = "notyettreated", + allow_unbalanced_panel = TRUE, + clustervars = NULL, + bstrap = FALSE) + est_diff_ratio(result, did_result) +} + +ipw_balanced_data <- function(method){ + result <- fastdid(dt, timevar = "time", cohortvar = "G", unitvar = "unit",outcomevar = "y", result_type = "group_time", + control_type = method, + covariatesvar = c("x", "x2"), + allow_unbalance_panel = TRUE) + did_result <- did::att_gt(yname = "y",gname = "G",idname = "unit",tname = "time",data = dt, + base_period = "universal",cband = FALSE, + est_method = drdid_rc,xformla = ~x+x2, + control_group = "notyettreated", + allow_unbalanced_panel = TRUE, + clustervars = NULL, + bstrap = FALSE) + est_diff_ratio(result, did_result) +} + +simple_unbalanced <- function(){ + result <- fastdid(dt2, timevar = "time", cohortvar = "G", unitvar = "unit",outcomevar = "y", result_type = "group_time", + allow_unbalance_panel = TRUE) + did_result <- did::att_gt(yname = "y",gname = "G",idname = "unit",tname = "time",data = dt2,base_period = "universal",est_method = "ipw",cband = FALSE, + #xformla = ~x, + control_group = "notyettreated", + allow_unbalanced_panel = TRUE, + clustervars = NULL, + bstrap = FALSE) + + est_diff_ratio(result, did_result) +} + +run_new <- function(method){ + load_all() + print(simple_unbalanced()) + print(ipw_balanced_data(method)) + print(ipw_unbalanced_data(method)) +} From 1c4abf57bc951ea3cff203e94e27747a594a7f92 Mon Sep 17 00:00:00 2001 From: TsaiLintung <101155567+TsaiLintung@users.noreply.github.com> Date: Mon, 18 Mar 2024 16:06:51 +0800 Subject: [PATCH 03/15] added time varying control (no correctness test) --- R/estimate_gtatt.R | 25 ++++++++++++++--- R/fastdid.R | 43 ++++++++++++++++++++++------- R/sim_did.R | 13 +++++++-- R/validate_did.R | 19 ++++++------- development/buildtest.R | 2 +- inst/tinytest/test_1_fastdid.R | 12 +++++++- interactive/rc_test.R | 50 ---------------------------------- 7 files changed, 85 insertions(+), 79 deletions(-) delete mode 100644 interactive/rc_test.R diff --git a/R/estimate_gtatt.R b/R/estimate_gtatt.R index 2367d59..b30e529 100644 --- a/R/estimate_gtatt.R +++ b/R/estimate_gtatt.R @@ -1,4 +1,4 @@ -estimate_gtatt <- function(outcomes_list, outcomevar, covariates, control_type, weights, +estimate_gtatt <- function(outcomes_list, outcomevar, covariates, varycovariates, control_type, weights, cohort_sizes,cohorts,id_size,time_periods, control_option, allow_unbalance_panel) { @@ -40,15 +40,32 @@ estimate_gtatt <- function(outcomes_list, outcomevar, covariates, control_type, did_setup[get_cohort_pos(cohort_sizes, min_control_cohort, max_control_cohort)] <- 0 did_setup[get_cohort_pos(cohort_sizes, g)] <- 1 #treated cannot be controls, assign treated after control to overwrite + if(!is.null(varycovariates)){ + precov <- varycovariates[[base_period]] + names(precov) <- paste0("pre_", names(precov)) + + postcov <- varycovariates[[t]]-varycovariates[[base_period]] + names(postcov) <- paste0("post_", names(varycovariates[[t]])) + + if(is.null(covariates)){ + covariatesdt <- cbind(const = -1, cbind(precov, postcov)) #if covariates is NULL + } else { + covariatesdt <- cbind(covariates, cbind(precov, postcov)) #if covariates is NULL + } + + } else { + covariatesdt <- covariates + } + #construct the 2x2 dataset - cohort_did <- data.table(did_setup, outcomes[[t]], outcomes[[base_period]], weights, covariates) + cohort_did <- data.table(did_setup, outcomes[[t]], outcomes[[base_period]], weights, covariatesdt) setnames(cohort_did, c("did_setup", "V2", "V3", "weights"), c("D", "post.y", "pre.y", "weights")) #estimate did if(!allow_unbalance_panel){ - result <- estimate_did(cohort_did, colnames(covariates), control_type, last_coef, cache_ps_fit_list[[gt_name]], cache_hess_list[[gt_name]]) + result <- estimate_did(cohort_did, colnames(covariatesdt), control_type, last_coef, cache_ps_fit_list[[gt_name]], cache_hess_list[[gt_name]]) } else { - result <- estimate_did_rc(cohort_did, colnames(covariates), control_type, last_coef, cache_ps_fit_list[[gt_name]], cache_hess_list[[gt_name]]) + result <- estimate_did_rc(cohort_did, colnames(covariatesdt), control_type, last_coef, cache_ps_fit_list[[gt_name]], cache_hess_list[[gt_name]]) } diff --git a/R/fastdid.R b/R/fastdid.R index 0d2176c..9267c0f 100644 --- a/R/fastdid.R +++ b/R/fastdid.R @@ -16,7 +16,8 @@ #' @param biters The number of bootstrap iterations. Only relevant if boot = TRUE. Default is 1000. #' @param weightvar The name of the weight variable (optional). #' @param clustervar The name of the cluster variable, can only be used when boot == TRUE (optional). -#' @param covariatesvar A character vector containing the names of covariate variables (optional). +#' @param covariatesvar A character vector containing the names of time-invariant covariate variables (optional). +#' @param varycovariatesvar A character vector containing the names of time-varying covariate variables (optional). #' @param copy whether to copy the dataset before processing, set to false to speed up the process, but the input data will be altered. #' @param validate whether to validate the dataset before processing. #' @@ -58,7 +59,7 @@ fastdid <- function(data, timevar, cohortvar, unitvar, outcomevar, control_option="both",result_type="group_time", balanced_event_time = NULL, control_type = "ipw", allow_unbalance_panel = FALSE, boot=FALSE, biters = 1000, - weightvar=NULL,clustervar=NULL,covariatesvar = NULL, + weightvar=NULL,clustervar=NULL,covariatesvar = NULL,varycovariatesvar = NULL, copy = TRUE, validate = TRUE ){ @@ -77,7 +78,7 @@ fastdid <- function(data, check_set_arg(timevar, unitvar, cohortvar, "match", .choices = dt_names, .message = name_message) covariate_message <- "__ARG__ must be NULL or a character vector which are all names of columns from the dataset." - check_set_arg(covariatesvar, outcomevar, + check_set_arg(varycovariatesvar, covariatesvar, outcomevar, "NULL | multi match", .choices = dt_names, .message = covariate_message) checkvar_message <- "__ARG__ must be NULL or a character scalar if a name of columns from the dataset." @@ -97,14 +98,20 @@ fastdid <- function(data, stop("fastdid currently only supprts ipw when allowing for unbalanced panels.") } + if(allow_unbalance_panel == TRUE & !is.null(varycovariatesvar)){ + stop("fastdid currently only supprts time varying covariates when allowing for unbalanced panels.") + } + + if(any(covariatesvar %in% varycovariatesvar)){stop("time-varying var and invariant var have overlaps.")} + setnames(dt, c(timevar, cohortvar, unitvar), c("time", "G", "unit")) # validate data ----------------------------------------------------- if(validate){ - varnames <- c("time", "G", "unit", outcomevar,weightvar,clustervar,covariatesvar) - dt <- validate_did(dt, covariatesvar, varnames, balanced_event_time, allow_unbalance_panel) + varnames <- c("time", "G", "unit", outcomevar,weightvar,clustervar,covariatesvar, varycovariatesvar) + dt <- validate_did(dt, covariatesvar, varycovariatesvar, varnames, balanced_event_time, allow_unbalance_panel) } # preprocess ----------------------------------------------------------- @@ -151,9 +158,11 @@ fastdid <- function(data, #construct the outcomes list for fast access later id_size <- dt[, uniqueN(unit)] - outcomes_list <- list() + + # get auxiliary data ------------------------------ #loop for multiple outcome + outcomes_list <- list() for(outcol in outcomevar){ outcomes <- list() @@ -186,10 +195,7 @@ fastdid <- function(data, dt_inv <- dt[dt[, .I[1], by = unit]$V1] setorder(dt_inv, unit) #can't move this outside } - - - cohorts <- dt_inv[, unique(G)] cohort_sizes <- dt_inv[, .(cohort_size = .N) , by = G] @@ -200,6 +206,16 @@ fastdid <- function(data, covariates <- NULL } + varycovariates <- list() + if(!is.null(varycovariatesvar)){ + for(i in time_periods){ + start <- (i-1)*id_size+1 + end <- i*id_size + varycovariates[[i]] <- dt[seq(start,end), .SD, .SDcols = varycovariatesvar] + } + } else { + varycovariates <- NULL + } if(!is.null(clustervar)){ cluster <- dt_inv[, .SD, .SDcols = clustervar] |> unlist() } else {cluster <- NULL} @@ -207,11 +223,18 @@ fastdid <- function(data, if(!is.null(weightvar)){ weights <- dt_inv[, .SD, .SDcols = weightvar] |> unlist() } else {weights <- rep(1,id_size)} + + + # cluster <- ifelse(is.null(clustervar), NULL, dt_inv[, .SD, .SDcols = clustervar] |> unlist()) + #weights <- ifelse(is.null(weightvar), rep(1,id_size), dt_inv[, .SD, .SDcols = weightvar] |> unlist()) + # covariates <- ifelse(is.null(covariatesvar), NULL, cbind(const = -1, dt_inv[,.SD, .SDcols = covariatesvar])) + + # main part ------------------------------------------------- # attgt - gt_result_list <- estimate_gtatt(outcomes_list, outcomevar, covariates, control_type, weights, + gt_result_list <- estimate_gtatt(outcomes_list, outcomevar, covariates, varycovariates, control_type, weights, cohort_sizes,cohorts,id_size,time_periods, #info about the dt control_option, allow_unbalance_panel) diff --git a/R/sim_did.R b/R/sim_did.R index 2d7ced0..7ca136e 100644 --- a/R/sim_did.R +++ b/R/sim_did.R @@ -27,7 +27,7 @@ #' #' @export sim_did <- function(sample_size, time_period, untreated_prop = 0.3, epsilon_size = 0.001, - cov = "no", hetero = "all", second_outcome = FALSE, second_cov = FALSE, na = "none", + cov = "no", hetero = "all", second_outcome = FALSE, second_cov = FALSE, vary_cov = FALSE, na = "none", balanced = TRUE, seed = NA, stratify = FALSE, treatment_assign = "latent"){ if(!is.na(seed)){set.seed(seed)} @@ -91,8 +91,15 @@ sim_did <- function(sample_size, time_period, untreated_prop = 0.3, epsilon_size dt[, D := as.integer(time >= G)] + #add time_varying covariates + if(vary_cov){ + dt[, xvar := pmin(G, time_period)*time+rnorm(sample_size*time_period, 0,1)] #should be confounding....? + } else { + dt[, xvar := 1] + } + #untreated potential outcomes - dt[, y0 := unit_fe + time_fe + x*x_trend + x2*x_trend + rnorm(sample_size*time_period, sd = epsilon_size)] + dt[, y0 := unit_fe + time_fe + (x+x2)*x_trend + xvar + rnorm(sample_size*time_period, sd = epsilon_size)] #generate gtatt att <- CJ(G = 1:time_period, time = 1:time_period) @@ -112,7 +119,7 @@ sim_did <- function(sample_size, time_period, untreated_prop = 0.3, epsilon_size #potential outcome dt[, y1 := y0 + tau] dt[, y := y1*D + y0*(1-D)] - dt <- dt[, .SD, .SDcols = c("time", "G", "unit", "x", "x2", "y", "s")] + dt <- dt[, .SD, .SDcols = c("time", "G", "unit", "x", "x2", "y", "s", "xvar")] #additional ----------------- diff --git a/R/validate_did.R b/R/validate_did.R index ada0388..c0210c2 100644 --- a/R/validate_did.R +++ b/R/validate_did.R @@ -1,4 +1,4 @@ -validate_did <- function(dt,covariatesvar,varnames, balanced_event_time, allow_unbalance_panel){ +validate_did <- function(dt,covariatesvar, varycovariatesvar,varnames, balanced_event_time, allow_unbalance_panel){ raw_unit_size <- dt[, uniqueN(unit)] raw_time_size <- dt[, uniqueN(time)] @@ -8,7 +8,7 @@ validate_did <- function(dt,covariatesvar,varnames, balanced_event_time, allow_u } #doesn't allow missing value for now - for(col in c(covariatesvar, varnames)){ + for(col in varnames){ if(is.null(col)){next} na_obs <- whichNA(dt[[col]]) if(length(na_obs) != 0){ @@ -17,17 +17,16 @@ validate_did <- function(dt,covariatesvar,varnames, balanced_event_time, allow_u } } - if(!is.null(covariatesvar)){ - - if(uniqueN(dt, by = c("unit", covariatesvar)) > raw_unit_size){ - warning("some covariates is time-varying, fastdid only use the first observation for covariates.") - } - - for(cov in covariatesvar){ + if(!is.null(covariatesvar) & uniqueN(dt, by = c("unit", covariatesvar)) > raw_unit_size){ + warning("some covariates is time-varying, fastdid only use the first observation for covariates.") + } + + + if(!is.null(covariatesvar)|!is.null(varycovariatesvar)){ + for(cov in c(covariatesvar, varycovariatesvar)){ #check covaraites is not constant if(fnunique(dt[, get(cov)[1], by = "unit"][, V1]) == 1)stop(cov, " have no variation") } - } #check balanced panel diff --git a/development/buildtest.R b/development/buildtest.R index 1be45d6..26cbaf9 100644 --- a/development/buildtest.R +++ b/development/buildtest.R @@ -1,4 +1,4 @@ -setwd("~/GitHub/EventStudyCode") +setwd("~/GitHub/fastdid") library(devtools) library(tinytest) diff --git a/inst/tinytest/test_1_fastdid.R b/inst/tinytest/test_1_fastdid.R index eefd514..0c24c50 100644 --- a/inst/tinytest/test_1_fastdid.R +++ b/inst/tinytest/test_1_fastdid.R @@ -2,7 +2,7 @@ tol <- 1e-2 #allow 1% different between estimates simdt <- sim_did(1e+02, 10, cov = "cont", hetero = "all", balanced = TRUE, second_outcome = TRUE, seed = 1, - stratify = FALSE, second_cov = TRUE) + stratify = FALSE, second_cov = TRUE, vary_cov = TRUE) dt <- simdt$dt # basics --------------------------------------------- @@ -26,6 +26,16 @@ expect_silent(fastdid(dt, timevar = "time", cohortvar = "G", unitvar = "unit",ou info = "covariates dr") +expect_silent(fastdid(dt, timevar = "time", cohortvar = "G", unitvar = "unit",outcomevar = "y", result_type = "group_time", + control_type = "dr", + covariatesvar = c("x", "x2"), varycovariatesvar = "xvar"), + info = "with varying covariates dr") + +expect_silent(fastdid(dt, timevar = "time", cohortvar = "G", unitvar = "unit",outcomevar = "y", result_type = "group_time", + control_type = "dr", + varycovariatesvar = "xvar"), + info = "varying covariates only dr") + expect_silent(fastdid(dt[G != 3], timevar = "time", cohortvar = "G", unitvar = "unit",outcomevar = "y", result_type = "group_time"), info = "missing cohort") diff --git a/interactive/rc_test.R b/interactive/rc_test.R deleted file mode 100644 index 0f1f31a..0000000 --- a/interactive/rc_test.R +++ /dev/null @@ -1,50 +0,0 @@ - -ipw_unbalanced_data <- function(method){ - result <- fastdid(dt2, timevar = "time", cohortvar = "G", unitvar = "unit",outcomevar = "y", result_type = "group_time", - control_type = method, - covariatesvar = c("x", "x2"), - allow_unbalance_panel = TRUE) - did_result <- did::att_gt(yname = "y",gname = "G",idname = "unit",tname = "time",data = dt2, - base_period = "universal",cband = FALSE, - est_method = drdid_rc1,xformla = ~x+x2, - control_group = "notyettreated", - allow_unbalanced_panel = TRUE, - clustervars = NULL, - bstrap = FALSE) - est_diff_ratio(result, did_result) -} - -ipw_balanced_data <- function(method){ - result <- fastdid(dt, timevar = "time", cohortvar = "G", unitvar = "unit",outcomevar = "y", result_type = "group_time", - control_type = method, - covariatesvar = c("x", "x2"), - allow_unbalance_panel = TRUE) - did_result <- did::att_gt(yname = "y",gname = "G",idname = "unit",tname = "time",data = dt, - base_period = "universal",cband = FALSE, - est_method = drdid_rc,xformla = ~x+x2, - control_group = "notyettreated", - allow_unbalanced_panel = TRUE, - clustervars = NULL, - bstrap = FALSE) - est_diff_ratio(result, did_result) -} - -simple_unbalanced <- function(){ - result <- fastdid(dt2, timevar = "time", cohortvar = "G", unitvar = "unit",outcomevar = "y", result_type = "group_time", - allow_unbalance_panel = TRUE) - did_result <- did::att_gt(yname = "y",gname = "G",idname = "unit",tname = "time",data = dt2,base_period = "universal",est_method = "ipw",cband = FALSE, - #xformla = ~x, - control_group = "notyettreated", - allow_unbalanced_panel = TRUE, - clustervars = NULL, - bstrap = FALSE) - - est_diff_ratio(result, did_result) -} - -run_new <- function(method){ - load_all() - print(simple_unbalanced()) - print(ipw_balanced_data(method)) - print(ipw_unbalanced_data(method)) -} From 3e1801918b973918ebb3425ae91b7c6542716ebe Mon Sep 17 00:00:00 2001 From: TsaiLintung <101155567+TsaiLintung@users.noreply.github.com> Date: Mon, 18 Mar 2024 16:44:21 +0800 Subject: [PATCH 04/15] switch argument default from NULL to NA --- R/aggregate_gt.R | 2 +- R/estimate_did.R | 14 +++++------ R/estimate_did_rc.R | 12 +++++----- R/estimate_gtatt.R | 10 ++++---- R/fastdid.R | 44 +++++++++++++++++----------------- R/get_se.R | 6 ++--- R/utils.R | 26 ++++++++++++++++---- R/validate_did.R | 10 ++++---- inst/tinytest/test_1_fastdid.R | 1 - 9 files changed, 72 insertions(+), 53 deletions(-) diff --git a/R/aggregate_gt.R b/R/aggregate_gt.R index d90e51c..2fe0741 100644 --- a/R/aggregate_gt.R +++ b/R/aggregate_gt.R @@ -67,7 +67,7 @@ get_aggregate_scheme <- function(group_time, result_type, id_weights, id_cohorts #for balanced cohort composition in dynamic setting #a cohort us only used if it is seen for all dynamic time - if(result_type == "dynamic" & !is.null(balanced_event_time)){ + if(result_type == "dynamic" & !is.na(balanced_event_time)){ cohorts <- group_time[, .(max_et = max(time-G), min_et = min(time-G)), by = "G"] diff --git a/R/estimate_did.R b/R/estimate_did.R index ad3a56e..85b6821 100644 --- a/R/estimate_did.R +++ b/R/estimate_did.R @@ -7,14 +7,14 @@ estimate_did <- function(dt_did, covnames, control_type, data_pos <- which(dt_did[, !is.na(D)]) dt_did <- dt_did[data_pos] n <- dt_did[, .N] - if(is.null(covnames)){ - covvars <- NULL + if(allNA(covnames)){ + covvars <- NA } else { covvars <- as.matrix(dt_did[,.SD, .SDcols = covnames]) } - ipw <- control_type %in% c("ipw", "dr") & !is.null(covvars) - or <- control_type %in% c("reg", "dr") & !is.null(covvars) #OR is REG + ipw <- control_type %in% c("ipw", "dr") & !allNA(covvars) + or <- control_type %in% c("reg", "dr") & !allNA(covvars) #OR is REG # ipw -------- @@ -41,7 +41,7 @@ estimate_did <- function(dt_did, covnames, control_type, } else { #when using multiple outcome, ipw cache can be reused hess <- cache_hess prop_score_fit <- cache_ps_fit - logit_coef <- NULL #won't be needing the approximate cache + logit_coef <- NA #won't be needing the approximate cache } #get the results into the main did dt @@ -52,8 +52,8 @@ estimate_did <- function(dt_did, covnames, control_type, } else { prop_score_fit <- rep(1,n) - logit_coef <- NULL - hess <- NULL + logit_coef <- NA + hess <- NA dt_did[, treat_ipw_weight := weights*D] dt_did[, cont_ipw_weight := weights*(1-D)] diff --git a/R/estimate_did_rc.R b/R/estimate_did_rc.R index 9d21404..4107d76 100644 --- a/R/estimate_did_rc.R +++ b/R/estimate_did_rc.R @@ -21,14 +21,14 @@ estimate_did_rc <- function(dt_did, covnames, control_type, sum_weight_pre <- dt_did[, sum(inpre*weights)] sum_weight_post <- dt_did[, sum(inpost*weights)] - if(is.null(covnames)){ - covvars <- NULL + if(allNA(covnames)){ + covvars <- NA } else { covvars <- as.matrix(dt_did[,.SD, .SDcols = covnames]) } - ipw <- control_type %in% c("ipw", "dr") & !is.null(covvars) - or <- control_type %in% c("reg", "dr") & !is.null(covvars) #OR is REG + ipw <- control_type %in% c("ipw", "dr") & !allNA(covvars) + or <- control_type %in% c("reg", "dr") & !allNA(covvars) #OR is REG # ipw -------- @@ -62,8 +62,8 @@ estimate_did_rc <- function(dt_did, covnames, control_type, } else { prop_score_fit <- rep(1,n) - logit_coef <- NULL - hess <- NULL + logit_coef <- NA + hess <- NA dt_did[, treat_ipw_weight := weights*D] dt_did[, cont_ipw_weight := weights*(1-D)] diff --git a/R/estimate_gtatt.R b/R/estimate_gtatt.R index b30e529..2eb71c8 100644 --- a/R/estimate_gtatt.R +++ b/R/estimate_gtatt.R @@ -1,4 +1,4 @@ -estimate_gtatt <- function(outcomes_list, outcomevar, covariates, varycovariates, control_type, weights, +estimate_gtatt <- function(outcomes_list, outcomevar, varycovariatesvar, covariatesvar, covariates, varycovariates, control_type, weights, cohort_sizes,cohorts,id_size,time_periods, control_option, allow_unbalance_panel) { @@ -40,17 +40,17 @@ estimate_gtatt <- function(outcomes_list, outcomevar, covariates, varycovariates did_setup[get_cohort_pos(cohort_sizes, min_control_cohort, max_control_cohort)] <- 0 did_setup[get_cohort_pos(cohort_sizes, g)] <- 1 #treated cannot be controls, assign treated after control to overwrite - if(!is.null(varycovariates)){ + if(!allNA(varycovariatesvar)){ precov <- varycovariates[[base_period]] names(precov) <- paste0("pre_", names(precov)) postcov <- varycovariates[[t]]-varycovariates[[base_period]] names(postcov) <- paste0("post_", names(varycovariates[[t]])) - if(is.null(covariates)){ - covariatesdt <- cbind(const = -1, cbind(precov, postcov)) #if covariates is NULL + if(allNA(covariatesvar)){ + covariatesdt <- cbind(const = -1, cbind(precov, postcov)) #if covariates is na } else { - covariatesdt <- cbind(covariates, cbind(precov, postcov)) #if covariates is NULL + covariatesdt <- cbind(covariates, cbind(precov, postcov)) #if covariates is na } } else { diff --git a/R/fastdid.R b/R/fastdid.R index 9267c0f..623258f 100644 --- a/R/fastdid.R +++ b/R/fastdid.R @@ -9,7 +9,7 @@ #' @param outcomevar The name of the outcome variable. #' @param control_option The control units used for the DiD estimates. Default is "both". #' @param result_type A character string indicating the type of result to be returned. Default is "group_time". -#' @param balanced_event_time A numeric scalar that indicates the max event time to balance the cohort composition, only meaningful when result_type == "dynamic". Default is NULL +#' @param balanced_event_time A numeric scalar that indicates the max event time to balance the cohort composition, only meaningful when result_type == "dynamic". Default is NA #' @param control_type The method for controlling for covariates. "ipw" for inverse probability weighting, "reg" for outcome regression, or "dr" for doubly-robust #' @param allow_unbalance_panel Whether allow unbalance panel as input (if false will coerce the dataset to a balanced panel). Default is FALSE #' @param boot Logical, indicating whether bootstrapping should be performed. Default is FALSE. @@ -57,9 +57,9 @@ #' @keywords difference-in-differences fast computation panel data estimation did fastdid <- function(data, timevar, cohortvar, unitvar, outcomevar, - control_option="both",result_type="group_time", balanced_event_time = NULL, + control_option="both",result_type="group_time", balanced_event_time = NA, control_type = "ipw", allow_unbalance_panel = FALSE, boot=FALSE, biters = 1000, - weightvar=NULL,clustervar=NULL,covariatesvar = NULL,varycovariatesvar = NULL, + weightvar=NA,clustervar=NA,covariatesvar = NA,varycovariatesvar = NA, copy = TRUE, validate = TRUE ){ @@ -77,19 +77,19 @@ fastdid <- function(data, name_message <- "__ARG__ must be a character scalar and a name of a column from the dataset." check_set_arg(timevar, unitvar, cohortvar, "match", .choices = dt_names, .message = name_message) - covariate_message <- "__ARG__ must be NULL or a character vector which are all names of columns from the dataset." + covariate_message <- "__ARG__ must be NA or a character vector which are all names of columns from the dataset." check_set_arg(varycovariatesvar, covariatesvar, outcomevar, - "NULL | multi match", .choices = dt_names, .message = covariate_message) + "NA | multi match", .choices = dt_names, .message = covariate_message) - checkvar_message <- "__ARG__ must be NULL or a character scalar if a name of columns from the dataset." + checkvar_message <- "__ARG__ must be NA or a character scalar if a name of columns from the dataset." check_set_arg(weightvar, clustervar, - "NULL | match", .choices = dt_names, .message = checkvar_message) + "NA | match", .choices = dt_names, .message = checkvar_message) check_set_arg(control_option, "match", .choices = c("both", "never", "notyet")) #kinda bad names since did's notyet include both notyet and never check_set_arg(control_type, "match", .choices = c("ipw", "reg", "dr")) check_arg(copy, validate, boot, allow_unbalance_panel, "scalar logical") - if(!is.null(balanced_event_time)){ + if(!is.na(balanced_event_time)){ if(result_type != "dynamic"){stop("balanced_event_time is only meaningful with result_type == 'dynamic'")} check_arg(balanced_event_time, "numeric scalar") } @@ -98,11 +98,11 @@ fastdid <- function(data, stop("fastdid currently only supprts ipw when allowing for unbalanced panels.") } - if(allow_unbalance_panel == TRUE & !is.null(varycovariatesvar)){ + if(allow_unbalance_panel == TRUE & !allNA(varycovariatesvar)){ stop("fastdid currently only supprts time varying covariates when allowing for unbalanced panels.") } - if(any(covariatesvar %in% varycovariatesvar)){stop("time-varying var and invariant var have overlaps.")} + if(any(covariatesvar %in% varycovariatesvar) & !allNA(varycovariatesvar) & !allNA(covariatesvar)){stop("time-varying var and invariant var have overlaps.")} setnames(dt, c(timevar, cohortvar, unitvar), c("time", "G", "unit")) @@ -200,41 +200,41 @@ fastdid <- function(data, cohort_sizes <- dt_inv[, .(cohort_size = .N) , by = G] # the optional columns - if(!is.null(covariatesvar)){ + if(!allNA(covariatesvar)){ covariates <- cbind(const = -1, dt_inv[,.SD, .SDcols = covariatesvar]) # const } else { - covariates <- NULL + covariates <- NA } varycovariates <- list() - if(!is.null(varycovariatesvar)){ + if(!allNA(varycovariatesvar)){ for(i in time_periods){ start <- (i-1)*id_size+1 end <- i*id_size varycovariates[[i]] <- dt[seq(start,end), .SD, .SDcols = varycovariatesvar] } } else { - varycovariates <- NULL + varycovariates <- NA } - if(!is.null(clustervar)){ + if(!is.na(clustervar)){ cluster <- dt_inv[, .SD, .SDcols = clustervar] |> unlist() - } else {cluster <- NULL} + } else {cluster <- NA} - if(!is.null(weightvar)){ + if(!is.na(weightvar)){ weights <- dt_inv[, .SD, .SDcols = weightvar] |> unlist() } else {weights <- rep(1,id_size)} - # cluster <- ifelse(is.null(clustervar), NULL, dt_inv[, .SD, .SDcols = clustervar] |> unlist()) - #weights <- ifelse(is.null(weightvar), rep(1,id_size), dt_inv[, .SD, .SDcols = weightvar] |> unlist()) - # covariates <- ifelse(is.null(covariatesvar), NULL, cbind(const = -1, dt_inv[,.SD, .SDcols = covariatesvar])) + # cluster <- ifelse(is.na(clustervar), NA, dt_inv[, .SD, .SDcols = clustervar] |> unlist()) + #weights <- ifelse(is.na(weightvar), rep(1,id_size), dt_inv[, .SD, .SDcols = weightvar] |> unlist()) + # covariates <- ifelse(is.na(covariatesvar), NA, cbind(const = -1, dt_inv[,.SD, .SDcols = covariatesvar])) # main part ------------------------------------------------- # attgt - gt_result_list <- estimate_gtatt(outcomes_list, outcomevar, covariates, varycovariates, control_type, weights, + gt_result_list <- estimate_gtatt(outcomes_list, outcomevar, varycovariatesvar, covariatesvar, covariates, varycovariates, control_type, weights, cohort_sizes,cohorts,id_size,time_periods, #info about the dt control_option, allow_unbalance_panel) @@ -248,7 +248,7 @@ fastdid <- function(data, result_type, balanced_event_time) #get se from the influence function - agg_se <- get_se(agg_result$inf_matrix, boot, biters, cluster) + agg_se <- get_se(agg_result$inf_matrix, boot, biters, cluster, clustervar) # post process ----------------------------------------------- diff --git a/R/get_se.R b/R/get_se.R index 8fd8f46..8c71f74 100644 --- a/R/get_se.R +++ b/R/get_se.R @@ -1,10 +1,10 @@ -get_se <- function(inf_matrix, boot, biters, cluster) { +get_se <- function(inf_matrix, boot, biters, cluster, clustervar) { if(boot){ top_quant <- 0.75 bot_quant <- 0.25 - if(!is.null(cluster)){ + if(!allNA(clustervar)){ cluster_n <- stats::aggregate(cluster, by=list(cluster), length)[,2] inf_matrix <- fsum(inf_matrix, cluster) / cluster_n #the mean without 0 for each cluster of each setting } @@ -21,7 +21,7 @@ get_se <- function(inf_matrix, boot, biters, cluster) { se[se < sqrt(.Machine$double.eps)*10] <- NA } else { - if(!is.null(cluster)){stop("clustering only available with bootstrap")} + if(!allNA(clustervar)){stop("clustering only available with bootstrap")} inf_matrix <- inf_matrix |> as.data.table() se <- inf_matrix[, lapply(.SD, function(x) sqrt(sum(x^2, na.rm = TRUE)/length(x)^2))] %>% as.vector() #should maybe use n-1 but did use n diff --git a/R/utils.R b/R/utils.R index c2397f1..03478a8 100644 --- a/R/utils.R +++ b/R/utils.R @@ -1,6 +1,3 @@ - - - set_max_thread <- function(){ data.table::setDTthreads(0) options(kit.nThread = getDTthreads()) @@ -8,4 +5,25 @@ set_max_thread <- function(){ reverse_col <- function(x){ return(x[,ncol(x):1]) -} \ No newline at end of file +} + + +release <- function(p){ + for(name in names(p)){ + assign(name, p[[name]], envir = parent.frame()) #assign it to the envir where the func is called + } +} + +gather <- function(...){ + names <- substitute(list(...)) |> deparse() |> + str_remove_all("\\)|list\\(") |> str_flatten() |> str_split_1(",") |> str_trim() + values <- list(...) + p <- list() + + for(i in 1:length(names)){ + #p[[names[[i]]]] <- values[[i]] + p <- c(p, list(values[[i]])) #for nulls + } + names(p) <- names + return(p) +} diff --git a/R/validate_did.R b/R/validate_did.R index c0210c2..c847150 100644 --- a/R/validate_did.R +++ b/R/validate_did.R @@ -3,13 +3,13 @@ validate_did <- function(dt,covariatesvar, varycovariatesvar,varnames, balanced_ raw_time_size <- dt[, uniqueN(time)] - if(!is.null(balanced_event_time)){ + if(!is.na(balanced_event_time)){ if(balanced_event_time > dt[, max(time-G)]){stop("balanced_event_time is larger than the max event time in the data")} } #doesn't allow missing value for now for(col in varnames){ - if(is.null(col)){next} + if(is.na(col)){next} na_obs <- whichNA(dt[[col]]) if(length(na_obs) != 0){ warning("missing values detected in ", col, ", removing ", length(na_obs), " observation.") @@ -17,13 +17,15 @@ validate_did <- function(dt,covariatesvar, varycovariatesvar,varnames, balanced_ } } - if(!is.null(covariatesvar) & uniqueN(dt, by = c("unit", covariatesvar)) > raw_unit_size){ + + if(!allNA(covariatesvar) && uniqueN(dt, by = c("unit", covariatesvar)) > raw_unit_size){ warning("some covariates is time-varying, fastdid only use the first observation for covariates.") } - if(!is.null(covariatesvar)|!is.null(varycovariatesvar)){ + if(!allNA(covariatesvar)|!allNA(varycovariatesvar)){ for(cov in c(covariatesvar, varycovariatesvar)){ + if(is.na(cov)){next} #check covaraites is not constant if(fnunique(dt[, get(cov)[1], by = "unit"][, V1]) == 1)stop(cov, " have no variation") } diff --git a/inst/tinytest/test_1_fastdid.R b/inst/tinytest/test_1_fastdid.R index 0c24c50..2bf3b15 100644 --- a/inst/tinytest/test_1_fastdid.R +++ b/inst/tinytest/test_1_fastdid.R @@ -25,7 +25,6 @@ expect_silent(fastdid(dt, timevar = "time", cohortvar = "G", unitvar = "unit",ou covariatesvar = c("x", "x2")), info = "covariates dr") - expect_silent(fastdid(dt, timevar = "time", cohortvar = "G", unitvar = "unit",outcomevar = "y", result_type = "group_time", control_type = "dr", covariatesvar = c("x", "x2"), varycovariatesvar = "xvar"), From 4fa8aeb747b34482e8fd38d2210acf9862c13092 Mon Sep 17 00:00:00 2001 From: TsaiLintung <101155567+TsaiLintung@users.noreply.github.com> Date: Mon, 18 Mar 2024 17:55:37 +0800 Subject: [PATCH 05/15] reorganize with my gather-release pattern --- R/aggregate_gt.R | 83 ++++++++++++++++------ R/estimate_did.R | 6 +- R/estimate_did_rc.R | 6 +- R/estimate_gtatt.R | 7 +- R/fastdid.R | 163 +++++++++++++++++++++++++------------------- R/get_se.R | 31 --------- R/validate_did.R | 5 +- 7 files changed, 168 insertions(+), 133 deletions(-) delete mode 100644 R/get_se.R diff --git a/R/aggregate_gt.R b/R/aggregate_gt.R index 2fe0741..9094cb0 100644 --- a/R/aggregate_gt.R +++ b/R/aggregate_gt.R @@ -1,12 +1,15 @@ -aggregate_gt <- function(gt_result, cohort_sizes, - id_weights, id_cohorts, - result_type, balanced_event_time){ +aggregate_gt <- function(gt_result, auxdata, params){ + + release(auxdata) + release(params) gt_att <- gt_result$att gt_inf_func <- gt_result$inf_func gt <- gt_result$gt + id_cohorts <- dt_inv[, G] + - id_dt <- data.table(weight = id_weights/sum(id_weights), G = id_cohorts) + id_dt <- data.table(weight = weights/sum(weights), G = id_cohorts) pg_dt <- id_dt[, .(pg = sum(weight)), by = "G"] group_time <- gt |> merge(pg_dt, by = "G") @@ -22,31 +25,39 @@ aggregate_gt <- function(gt_result, cohort_sizes, } else { - agg_sch <- get_aggregate_scheme(group_time, result_type, id_weights, id_cohorts, balanced_event_time) + agg_sch <- get_aggregate_scheme(group_time, result_type, weights, id_cohorts, balanced_event_time) targets <- agg_sch$targets - weights <- as.matrix(agg_sch$weights) + agg_weights <- as.matrix(agg_sch$agg_weights) #aggregated att - agg_att <- weights %*% gt_att + agg_att <- agg_weights %*% gt_att #get the influence from weight estimation #this needs to be optimized!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - inf_weights <- sapply(asplit(weights, 1), function (x){ - get_weight_influence(x, gt_att, id_weights, id_cohorts, group_time[, .(G, time)]) + inf_weights <- sapply(asplit(agg_weights, 1), function (x){ + get_weight_influence(x, gt_att, weights, id_cohorts, group_time[, .(G, time)]) }) #aggregated influence function - inf_matrix <- (gt_inf_func %*% t(weights)) + inf_weights + inf_matrix <- (gt_inf_func %*% t(agg_weights)) + inf_weights } - return(list(inf_matrix = inf_matrix, agg_att = agg_att, targets = targets)) + + #get se!! + agg_se <- get_se(inf_matrix, boot, biters, cluster, clustervar) + + # post process + result <- data.table(targets, agg_att, agg_se) + names(result) <- c("target", "att", "se") + + return(result) } -get_aggregate_scheme <- function(group_time, result_type, id_weights, id_cohorts, balanced_event_time){ +get_aggregate_scheme <- function(group_time, result_type, weights, id_cohorts, balanced_event_time){ #browser() - weights <- data.table() + agg_weights <- data.table() gt_count <- group_time[, .N] bool_to_pn <- function(x){ifelse(x, 1, -1)} @@ -89,18 +100,18 @@ get_aggregate_scheme <- function(group_time, result_type, id_weights, id_cohorts group_time[, targeted := NULL] - weights <- rbind(weights, target_weights) + agg_weights <- rbind(agg_weights, target_weights) } - return(list(weights = weights, #a matrix of each target and gt's weight in it + return(list(agg_weights = agg_weights, #a matrix of each target and gt's weight in it targets = targets)) } -get_weight_influence <- function(agg_weights, gt_att, id_weights, id_cohorts, group) { +get_weight_influence <- function(agg_weights, gt_att, weights, id_cohorts, group) { keepers <- which(agg_weights > 0) - id_dt <- data.table(weight = id_weights/sum(id_weights), G = id_cohorts) + id_dt <- data.table(weight = weights/sum(weights), G = id_cohorts) pg_dt <- id_dt[, .(pg = sum(weight)), by = "G"] group <- group |> merge(pg_dt, by = "G") @@ -110,16 +121,48 @@ get_weight_influence <- function(agg_weights, gt_att, id_weights, id_cohorts, gr # effect of estimating weights in the numerator if1 <- sapply(keepers, function(k) { - (id_weights*BMisc::TorF(id_cohorts == group[k,G]) - group[k,pg]) / + (weights*BMisc::TorF(id_cohorts == group[k,G]) - group[k,pg]) / sum(group[keepers,pg]) }) # effect of estimating weights in the denominator if2 <- base::rowSums(sapply(keepers, function(k) { - id_weights*BMisc::TorF(id_cohorts == group[k,G]) - group[k,pg] + weights*BMisc::TorF(id_cohorts == group[k,G]) - group[k,pg] })) %*% t(group[keepers,pg]/(sum(group[keepers,pg])^2)) # return the influence function for the weights inf_weight <- (if1 - if2) %*% as.vector(gt_att[keepers]) inf_weight[abs(inf_weight) < sqrt(.Machine$double.eps)*10] <- 0 #fill zero return(inf_weight) -} \ No newline at end of file +} + +get_se <- function(inf_matrix, boot, biters, cluster, clustervar) { + + if(boot){ + + top_quant <- 0.75 + bot_quant <- 0.25 + if(!allNA(clustervar)){ + cluster_n <- stats::aggregate(cluster, by=list(cluster), length)[,2] + inf_matrix <- fsum(inf_matrix, cluster) / cluster_n #the mean without 0 for each cluster of each setting + } + + boot_results <- BMisc::multiplier_bootstrap(inf_matrix, biters = biters) %>% as.data.table() + + boot_top <- boot_results[, lapply(.SD, function(x) stats::quantile(x, top_quant, type=1, na.rm = TRUE)),] + boot_bot <- boot_results[, lapply(.SD, function(x) stats::quantile(x, bot_quant, type=1, na.rm = TRUE)),] + + dt_se <- rbind(boot_bot, boot_top) %>% transpose() + names(dt_se) <- c("boot_bot", "boot_top") + + se <- dt_se[,(boot_top-boot_bot)/(qnorm(top_quant) - qnorm(bot_quant))] + se[se < sqrt(.Machine$double.eps)*10] <- NA + + } else { + if(!allNA(clustervar)){stop("clustering only available with bootstrap")} + inf_matrix <- inf_matrix |> as.data.table() + se <- inf_matrix[, lapply(.SD, function(x) sqrt(sum(x^2, na.rm = TRUE)/length(x)^2))] %>% as.vector() #should maybe use n-1 but did use n + + } + return(unlist(se)) +} + diff --git a/R/estimate_did.R b/R/estimate_did.R index 85b6821..d1d58fe 100644 --- a/R/estimate_did.R +++ b/R/estimate_did.R @@ -13,9 +13,9 @@ estimate_did <- function(dt_did, covnames, control_type, covvars <- as.matrix(dt_did[,.SD, .SDcols = covnames]) } - ipw <- control_type %in% c("ipw", "dr") & !allNA(covvars) - or <- control_type %in% c("reg", "dr") & !allNA(covvars) #OR is REG - + ipw <- control_type %in% c("ipw", "dr") & !allNA(covnames) + or <- control_type %in% c("reg", "dr") & !allNA(covnames) #OR is REG + # ipw -------- if(ipw){ diff --git a/R/estimate_did_rc.R b/R/estimate_did_rc.R index 4107d76..351bfc9 100644 --- a/R/estimate_did_rc.R +++ b/R/estimate_did_rc.R @@ -27,9 +27,9 @@ estimate_did_rc <- function(dt_did, covnames, control_type, covvars <- as.matrix(dt_did[,.SD, .SDcols = covnames]) } - ipw <- control_type %in% c("ipw", "dr") & !allNA(covvars) - or <- control_type %in% c("reg", "dr") & !allNA(covvars) #OR is REG - + ipw <- control_type %in% c("ipw", "dr") & !allNA(covnames) + or <- control_type %in% c("reg", "dr") & !allNA(covnames) #OR is REG + # ipw -------- if(ipw){ diff --git a/R/estimate_gtatt.R b/R/estimate_gtatt.R index 2eb71c8..87639f5 100644 --- a/R/estimate_gtatt.R +++ b/R/estimate_gtatt.R @@ -1,7 +1,8 @@ -estimate_gtatt <- function(outcomes_list, outcomevar, varycovariatesvar, covariatesvar, covariates, varycovariates, control_type, weights, - cohort_sizes,cohorts,id_size,time_periods, - control_option, allow_unbalance_panel) { +estimate_gtatt <- function(auxdata, params) { + release(auxdata) + release(params) + treated_cohorts <- cohorts[!is.infinite(cohorts)] if(!Inf %in% cohorts & control_option != "notyet"){ diff --git a/R/fastdid.R b/R/fastdid.R index 623258f..68df16e 100644 --- a/R/fastdid.R +++ b/R/fastdid.R @@ -63,15 +63,7 @@ fastdid <- function(data, copy = TRUE, validate = TRUE ){ - # validation arguments -------------------------------------------------------- - - if(!is.data.table(data)){ - warning("coercing input into a data.table.") - data <- as.data.table(data) - } - - if(copy){dt <- copy(data)} else {dt <- data} dt_names <- names(dt) name_message <- "__ARG__ must be a character scalar and a name of a column from the dataset." @@ -93,30 +85,74 @@ fastdid <- function(data, if(result_type != "dynamic"){stop("balanced_event_time is only meaningful with result_type == 'dynamic'")} check_arg(balanced_event_time, "numeric scalar") } - if(allow_unbalance_panel == TRUE & control_type %in% c("dr", "reg")){ stop("fastdid currently only supprts ipw when allowing for unbalanced panels.") } - if(allow_unbalance_panel == TRUE & !allNA(varycovariatesvar)){ stop("fastdid currently only supprts time varying covariates when allowing for unbalanced panels.") } + if(any(covariatesvar %in% varycovariatesvar) & !allNA(varycovariatesvar) & !allNA(covariatesvar)){ + stop("time-varying var and invariant var have overlaps.") + } - if(any(covariatesvar %in% varycovariatesvar) & !allNA(varycovariatesvar) & !allNA(covariatesvar)){stop("time-varying var and invariant var have overlaps.")} + params <- gather(timevar, cohortvar, unitvar, outcomevar, + control_option,result_type, balanced_event_time, + control_type, allow_unbalance_panel , boot, biters, + weightvar,clustervar,covariatesvar,varycovariatesvar) - setnames(dt, c(timevar, cohortvar, unitvar), c("time", "G", "unit")) - # validate data ----------------------------------------------------- + if(!is.data.table(data)){ + warning("coercing input into a data.table.") + data <- as.data.table(data) + } + if(copy){dt <- copy(data)} else {dt <- data} + setnames(dt, c(timevar, cohortvar, unitvar), c("time", "G", "unit")) if(validate){ varnames <- c("time", "G", "unit", outcomevar,weightvar,clustervar,covariatesvar, varycovariatesvar) - dt <- validate_did(dt, covariatesvar, varycovariatesvar, varnames, balanced_event_time, allow_unbalance_panel) + dt <- validate_did(dt, varnames, params) } # preprocess ----------------------------------------------------------- #make dt conform to the WLOG assumptions of fastdid + coerce_result <- coerce_dt(dt, params) #also changed dt + dt <- coerce_result$dt + # get auxiliary data + auxdata <- get_auxdata(dt, params) + + # main part ------------------------------------------------- + + # estimate attgt + gt_result_list <- estimate_gtatt(auxdata, params) + + # aggregate the result + all_result <- data.table() + for(outcol in outcomevar){ + gt_result <- gt_result_list[[outcol]] + + # aggregate att and inf function and get se + result <- aggregate_gt(gt_result, auxdata, params) + + #convert "targets" back to meaningful parameter identifiers like cohort 1 post, time 2 post + result <- result |> convert_targets(result_type, coerce_result$time_change) + result[, outcome := outcol] + all_result <- rbind(all_result, result) + + rm(result) + + } + + return(all_result) + +} + +# small steps ---------------------------------------------------------------------- + +coerce_dt <- function(dt, params){ + + release(params) #change to int before sorting if(!is.numeric(dt[, G])){ @@ -135,11 +171,10 @@ fastdid <- function(data, } setorder(dt, time, G, unit) #sort the dataset essential for the sort-once-quick-access - #deal with time, coerice time to 1,2,3,4,5....... time_periods <- dt[, unique(time)] time_size <- length(time_periods) - + time_offset <- min(time_periods) - 1 #assume time starts at 1, first is min after sort :) if(time_offset != 0){ dt[, G := G-time_offset] @@ -156,11 +191,26 @@ fastdid <- function(data, dt[time != 1, time := (time-1)/time_step+1] } - #construct the outcomes list for fast access later + return(list(dt = dt, + time_change = list(time_step = time_step, + max_time = max(time_periods), + time_offset = time_offset))) + +} + +get_auxdata <- function(dt, params){ + + release(params) + + + + time_periods <- dt[, unique(time)] id_size <- dt[, uniqueN(unit)] # get auxiliary data ------------------------------ + setorder(dt, time, G, unit) #sort the dataset essential for the sort-once-quick-access + #construct the outcomes list for fast access later #loop for multiple outcome outcomes_list <- list() for(outcol in outcomevar){ @@ -184,28 +234,22 @@ fastdid <- function(data, } } - + outcomes_list[[outcol]] <- outcomes } - + #the time-invariant parts if(!allow_unbalance_panel){ dt_inv <- dt[1:id_size] } else { - dt_inv <- dt[dt[, .I[1], by = unit]$V1] + dt_inv <- dt[dt[, .I[1], by = unit]$V1] #the first observation setorder(dt_inv, unit) #can't move this outside } - + cohorts <- dt_inv[, unique(G)] cohort_sizes <- dt_inv[, .(cohort_size = .N) , by = G] - - # the optional columns - if(!allNA(covariatesvar)){ - covariates <- cbind(const = -1, dt_inv[,.SD, .SDcols = covariatesvar]) # const - } else { - covariates <- NA - } + # the optional columns varycovariates <- list() if(!allNA(varycovariatesvar)){ for(i in time_periods){ @@ -216,60 +260,35 @@ fastdid <- function(data, } else { varycovariates <- NA } + + if(!allNA(covariatesvar)){ + covariates <- cbind(const = -1, dt_inv[,.SD, .SDcols = covariatesvar]) + } else { + covariates <- NA + } + if(!is.na(clustervar)){ cluster <- dt_inv[, .SD, .SDcols = clustervar] |> unlist() - } else {cluster <- NA} + } else { + cluster <- NA + } if(!is.na(weightvar)){ weights <- dt_inv[, .SD, .SDcols = weightvar] |> unlist() - } else {weights <- rep(1,id_size)} - + } else { + weights <- rep(1, id_size) + } - # cluster <- ifelse(is.na(clustervar), NA, dt_inv[, .SD, .SDcols = clustervar] |> unlist()) - #weights <- ifelse(is.na(weightvar), rep(1,id_size), dt_inv[, .SD, .SDcols = weightvar] |> unlist()) - # covariates <- ifelse(is.na(covariatesvar), NA, cbind(const = -1, dt_inv[,.SD, .SDcols = covariatesvar])) + auxdata <- gather(time_periods, id_size, outcomes_list, dt_inv, cohorts, cohort_sizes, + varycovariates, covariates, cluster, weights) - - - # main part ------------------------------------------------- - - # attgt - gt_result_list <- estimate_gtatt(outcomes_list, outcomevar, varycovariatesvar, covariatesvar, covariates, varycovariates, control_type, weights, - cohort_sizes,cohorts,id_size,time_periods, #info about the dt - control_option, allow_unbalance_panel) - - all_result <- data.table() - for(outcol in outcomevar){ - gt_result <- gt_result_list[[outcol]] - - # aggregate att and inf function - agg_result <- aggregate_gt(gt_result, cohort_sizes, - weights, dt_inv[, G], - result_type, balanced_event_time) - - #get se from the influence function - agg_se <- get_se(agg_result$inf_matrix, boot, biters, cluster, clustervar) - - # post process ----------------------------------------------- - - result <- data.table(agg_result$targets, agg_result$agg_att, agg_se) - names(result) <- c("target", "att", "se") - - #convert "targets" back to meaningful parameter identifiers like cohort 1 post, time 2 post - result <- result |> convert_targets(result_type, time_offset, time_step, max(time_periods)) - result[, outcome := outcol] - all_result <- rbind(all_result, result) - - rm(result) - - } - - return(all_result) + return(auxdata) } -convert_targets <- function(results, result_type, - time_offset, time_step, max_time){ +convert_targets <- function(results, result_type, time_change){ + + release(time_change) #from coerce_dt if(result_type == "dynamic"){ setnames(results, "target", "event_time") diff --git a/R/get_se.R b/R/get_se.R deleted file mode 100644 index 8c71f74..0000000 --- a/R/get_se.R +++ /dev/null @@ -1,31 +0,0 @@ -get_se <- function(inf_matrix, boot, biters, cluster, clustervar) { - - if(boot){ - - top_quant <- 0.75 - bot_quant <- 0.25 - if(!allNA(clustervar)){ - cluster_n <- stats::aggregate(cluster, by=list(cluster), length)[,2] - inf_matrix <- fsum(inf_matrix, cluster) / cluster_n #the mean without 0 for each cluster of each setting - } - - boot_results <- BMisc::multiplier_bootstrap(inf_matrix, biters = biters) %>% as.data.table() - - boot_top <- boot_results[, lapply(.SD, function(x) stats::quantile(x, top_quant, type=1, na.rm = TRUE)),] - boot_bot <- boot_results[, lapply(.SD, function(x) stats::quantile(x, bot_quant, type=1, na.rm = TRUE)),] - - dt_se <- rbind(boot_bot, boot_top) %>% transpose() - names(dt_se) <- c("boot_bot", "boot_top") - - se <- dt_se[,(boot_top-boot_bot)/(qnorm(top_quant) - qnorm(bot_quant))] - se[se < sqrt(.Machine$double.eps)*10] <- NA - - } else { - if(!allNA(clustervar)){stop("clustering only available with bootstrap")} - inf_matrix <- inf_matrix |> as.data.table() - se <- inf_matrix[, lapply(.SD, function(x) sqrt(sum(x^2, na.rm = TRUE)/length(x)^2))] %>% as.vector() #should maybe use n-1 but did use n - - } - return(unlist(se)) -} - diff --git a/R/validate_did.R b/R/validate_did.R index c847150..6eca763 100644 --- a/R/validate_did.R +++ b/R/validate_did.R @@ -1,4 +1,7 @@ -validate_did <- function(dt,covariatesvar, varycovariatesvar,varnames, balanced_event_time, allow_unbalance_panel){ +validate_did <- function(dt,varnames,params){ + + release(params) + raw_unit_size <- dt[, uniqueN(unit)] raw_time_size <- dt[, uniqueN(time)] From 42dda9ddfdc3d30ec6c3f464883e5dc8b6019da4 Mon Sep 17 00:00:00 2001 From: TsaiLintung <101155567+TsaiLintung@users.noreply.github.com> Date: Tue, 19 Mar 2024 12:03:55 +0800 Subject: [PATCH 06/15] some reorganization --- R/aggregate_gt.R | 14 ++++--- R/estimate_did.R | 16 +++---- R/estimate_gtatt.R | 77 ++++++++++++++++++---------------- R/fastdid.R | 28 +++++-------- R/validate_did.R | 1 - inst/tinytest/test_1_fastdid.R | 4 ++ man/fastdid.Rd | 15 ++++--- man/sim_did.Rd | 1 + 8 files changed, 84 insertions(+), 72 deletions(-) diff --git a/R/aggregate_gt.R b/R/aggregate_gt.R index 9094cb0..3cb7a4a 100644 --- a/R/aggregate_gt.R +++ b/R/aggregate_gt.R @@ -3,12 +3,12 @@ aggregate_gt <- function(gt_result, auxdata, params){ release(auxdata) release(params) + #release the stuff gt_att <- gt_result$att gt_inf_func <- gt_result$inf_func gt <- gt_result$gt id_cohorts <- dt_inv[, G] - id_dt <- data.table(weight = weights/sum(weights), G = id_cohorts) pg_dt <- id_dt[, .(pg = sum(weight)), by = "G"] group_time <- gt |> merge(pg_dt, by = "G") @@ -19,12 +19,14 @@ aggregate_gt <- function(gt_result, auxdata, params){ if(result_type == "group_time"){ + #don't need to do anything targets <- group_time[, unique(G*max(time)+time)] inf_matrix <- gt_inf_func agg_att <- as.vector(gt_att) } else { + #get which gt(s) is a part of the aggregated param agg_sch <- get_aggregate_scheme(group_time, result_type, weights, id_cohorts, balanced_event_time) targets <- agg_sch$targets agg_weights <- as.matrix(agg_sch$agg_weights) @@ -43,12 +45,13 @@ aggregate_gt <- function(gt_result, auxdata, params){ } - #get se!! + #get se from influence function agg_se <- get_se(inf_matrix, boot, biters, cluster, clustervar) # post process result <- data.table(targets, agg_att, agg_se) names(result) <- c("target", "att", "se") + result[,outcome := gt_result$outname] return(result) } @@ -142,14 +145,15 @@ get_se <- function(inf_matrix, boot, biters, cluster, clustervar) { top_quant <- 0.75 bot_quant <- 0.25 if(!allNA(clustervar)){ + #take average within the cluster cluster_n <- stats::aggregate(cluster, by=list(cluster), length)[,2] inf_matrix <- fsum(inf_matrix, cluster) / cluster_n #the mean without 0 for each cluster of each setting } boot_results <- BMisc::multiplier_bootstrap(inf_matrix, biters = biters) %>% as.data.table() - boot_top <- boot_results[, lapply(.SD, function(x) stats::quantile(x, top_quant, type=1, na.rm = TRUE)),] - boot_bot <- boot_results[, lapply(.SD, function(x) stats::quantile(x, bot_quant, type=1, na.rm = TRUE)),] + boot_top <- boot_results[, lapply(.SD, function(x) stats::quantile(x, top_quant, type=1, na.rm = TRUE))] + boot_bot <- boot_results[, lapply(.SD, function(x) stats::quantile(x, bot_quant, type=1, na.rm = TRUE))] dt_se <- rbind(boot_bot, boot_top) %>% transpose() names(dt_se) <- c("boot_bot", "boot_top") @@ -158,7 +162,7 @@ get_se <- function(inf_matrix, boot, biters, cluster, clustervar) { se[se < sqrt(.Machine$double.eps)*10] <- NA } else { - if(!allNA(clustervar)){stop("clustering only available with bootstrap")} + inf_matrix <- inf_matrix |> as.data.table() se <- inf_matrix[, lapply(.SD, function(x) sqrt(sum(x^2, na.rm = TRUE)/length(x)^2))] %>% as.vector() #should maybe use n-1 but did use n diff --git a/R/estimate_did.R b/R/estimate_did.R index d1d58fe..597a328 100644 --- a/R/estimate_did.R +++ b/R/estimate_did.R @@ -1,4 +1,4 @@ -estimate_did <- function(dt_did, covnames, control_type, +estimate_did <- function(dt_did, covvars, control_type, last_coef = NULL, cache_ps_fit, cache_hess){ # preprocess -------- @@ -7,14 +7,15 @@ estimate_did <- function(dt_did, covnames, control_type, data_pos <- which(dt_did[, !is.na(D)]) dt_did <- dt_did[data_pos] n <- dt_did[, .N] - if(allNA(covnames)){ - covvars <- NA + + if(is.matrix(covvars)){ + ipw <- control_type %in% c("ipw", "dr") + or <- control_type %in% c("reg", "dr") + covvars <- covvars[data_pos,] } else { - covvars <- as.matrix(dt_did[,.SD, .SDcols = covnames]) + ipw <- FALSE + or <- FALSE } - - ipw <- control_type %in% c("ipw", "dr") & !allNA(covnames) - or <- control_type %in% c("reg", "dr") & !allNA(covnames) #OR is REG # ipw -------- @@ -104,6 +105,7 @@ estimate_did <- function(dt_did, covnames, control_type, M2 <- colMeans(dt_did[, cont_ipw_weight*(delta_y-weighted_cont_delta-or_delta)] * covvars) score_ps <- dt_did[, weights*(D-ps)] * covvars + asym_linear_ps <- score_ps %*% hess #ipw for control diff --git a/R/estimate_gtatt.R b/R/estimate_gtatt.R index 87639f5..abe05ab 100644 --- a/R/estimate_gtatt.R +++ b/R/estimate_gtatt.R @@ -3,25 +3,24 @@ estimate_gtatt <- function(auxdata, params) { release(auxdata) release(params) - treated_cohorts <- cohorts[!is.infinite(cohorts)] - if(!Inf %in% cohorts & control_option != "notyet"){ warning("no never-treated availble, switching to not-yet-treated control") control_option <- "notyet" } + + treated_cohorts <- cohorts[!is.infinite(cohorts)] max_control_cohort <- ifelse(control_option == "notyet", max(treated_cohorts), Inf) cache_ps_fit_list <- list() #for the first outcome won't be able to use cache, empty list returns null for call like cache_ps_fit[["1.3"]] cache_hess_list <- list() outcome_result_list <- list() - for(outcol in outcomevar){ outcomes <- outcomes_list[[outcol]] last_coef <- NULL gt <- data.table() gt_att <- c() - gt_inf_func <- data.table(placeholder = rep(NA, id_size)) + gt_inf_func <- data.table(placeholder = rep(NA, id_size)) #populate with NA for(t in time_periods){ for(g in cohorts){ @@ -31,9 +30,9 @@ estimate_gtatt <- function(auxdata, params) { #setup and checks base_period <- g-1 min_control_cohort <- ifelse(control_option == "never", Inf, max(t+1, base_period+1)) #not-yet treated / never treated in both base and "treated" period - if(t == base_period){next} #no treatmenteffect for the base period - if(base_period < min(time_periods)){next} #no treatmenteffect for the first period, since base period is not observed - if(g >= max_control_cohort){next} #no treatmenteffect for never treated or the last treated cohort (notyet) + if(t == base_period){next} #no treatment effect for the base period + if(base_period < min(time_periods)){next} #no treatment effect for the first period, since base period is not observed + if(g >= max_control_cohort){next} #no treatment effect for never treated or the last treated cohort (for not yet notyet) if(t >= max_control_cohort){next} #no control available if the last cohort is treated too #select the control and treated cohorts @@ -41,37 +40,24 @@ estimate_gtatt <- function(auxdata, params) { did_setup[get_cohort_pos(cohort_sizes, min_control_cohort, max_control_cohort)] <- 0 did_setup[get_cohort_pos(cohort_sizes, g)] <- 1 #treated cannot be controls, assign treated after control to overwrite - if(!allNA(varycovariatesvar)){ - precov <- varycovariates[[base_period]] - names(precov) <- paste0("pre_", names(precov)) - - postcov <- varycovariates[[t]]-varycovariates[[base_period]] - names(postcov) <- paste0("post_", names(varycovariates[[t]])) - - if(allNA(covariatesvar)){ - covariatesdt <- cbind(const = -1, cbind(precov, postcov)) #if covariates is na - } else { - covariatesdt <- cbind(covariates, cbind(precov, postcov)) #if covariates is na - } - - } else { - covariatesdt <- covariates - } + #construct the covariates matrix + covvars <- get_covvars(covariates, varycovariates) #construct the 2x2 dataset - cohort_did <- data.table(did_setup, outcomes[[t]], outcomes[[base_period]], weights, covariatesdt) - setnames(cohort_did, c("did_setup", "V2", "V3", "weights"), c("D", "post.y", "pre.y", "weights")) - + cohort_did <- data.table(did_setup, outcomes[[t]], outcomes[[base_period]], weights) + setnames(cohort_did, c("did_setup", "V2", "V3"), c("D", "post.y", "pre.y")) + #estimate did if(!allow_unbalance_panel){ - result <- estimate_did(cohort_did, colnames(covariatesdt), control_type, last_coef, cache_ps_fit_list[[gt_name]], cache_hess_list[[gt_name]]) + result <- estimate_did(cohort_did, covvars, control_type, + last_coef, cache_ps_fit_list[[gt_name]], cache_hess_list[[gt_name]]) #cache } else { - result <- estimate_did_rc(cohort_did, colnames(covariatesdt), control_type, last_coef, cache_ps_fit_list[[gt_name]], cache_hess_list[[gt_name]]) + result <- estimate_did_rc(cohort_did, covvars, control_type, + last_coef, cache_ps_fit_list[[gt_name]], cache_hess_list[[gt_name]]) #cache } - #collect the result - last_coef <- result$logit_coef + last_coef <- result$logit_coef #for faster convergence in next iter gt <- rbind(gt, data.table(G = g, time = t)) #the sequence matters for the weights gt_att <- c(gt_att, att = result$att) gt_inf_func[[gt_name]] <- result$inf_func @@ -86,10 +72,7 @@ estimate_gtatt <- function(auxdata, params) { gt_inf_func[,placeholder := NULL] names(gt_att) <- names(gt_inf_func) - outcome_result <- list(att = gt_att, inf_func = gt_inf_func, gt = gt) - outcome_result_list[[outcol]] <- outcome_result - - rm(outcome_result) + outcome_result_list[[outcol]] <- list(att = gt_att, inf_func = gt_inf_func, gt = gt, outname = outcol) } @@ -97,8 +80,32 @@ estimate_gtatt <- function(auxdata, params) { } get_cohort_pos <- function(cohort_sizes, start_cohort, end_cohort = start_cohort){ - #if(!start_cohort %in% cohort_sizes[,unique(G)]|!end_cohort %in% cohort_sizes[,unique(G)]) {stop("cohort not in cohort_sizes")} start <- cohort_sizes[G < start_cohort, sum(cohort_size)]+1 end <- cohort_sizes[G <= end_cohort, sum(cohort_size)] return(start:end) } + +get_covvars <- function(covariates, varycovariates){ + if(is.data.table(varycovariates)){ + + precov <- varycovariates[[base_period]] + names(precov) <- paste0("pre_", names(precov)) + postcov <- varycovariates[[t]]-varycovariates[[base_period]] + names(postcov) <- paste0("post_", names(varycovariates[[t]])) + + if(is.data.table(covariates)){ + covvars <- cbind(covariates, cbind(precov, postcov)) #if covariates is na + } else { + covvars <- cbind(const = -1, cbind(precov, postcov)) #if covariates is na + } + covvars <- as.matrix(covvars) + + } else { + if(is.data.table(covariates)){ + covvars <- as.matrix(covariates) #will be NA if covariatesvar have nothing + } else { + covvars <- NA + } + } + return(covvars) +} diff --git a/R/fastdid.R b/R/fastdid.R index 68df16e..9989fac 100644 --- a/R/fastdid.R +++ b/R/fastdid.R @@ -63,7 +63,7 @@ fastdid <- function(data, copy = TRUE, validate = TRUE ){ - # validation arguments -------------------------------------------------------- + # validate arguments -------------------------------------------------------- dt_names <- names(dt) name_message <- "__ARG__ must be a character scalar and a name of a column from the dataset." @@ -94,6 +94,9 @@ fastdid <- function(data, if(any(covariatesvar %in% varycovariatesvar) & !allNA(varycovariatesvar) & !allNA(covariatesvar)){ stop("time-varying var and invariant var have overlaps.") } + if(!boot & !allNA(clustervar)){ + stop("clustering only available with bootstrap") + } params <- gather(timevar, cohortvar, unitvar, outcomevar, control_option,result_type, balanced_event_time, @@ -106,6 +109,7 @@ fastdid <- function(data, warning("coercing input into a data.table.") data <- as.data.table(data) } + if(copy){dt <- copy(data)} else {dt <- data} setnames(dt, c(timevar, cohortvar, unitvar), c("time", "G", "unit")) @@ -127,22 +131,13 @@ fastdid <- function(data, # estimate attgt gt_result_list <- estimate_gtatt(auxdata, params) - # aggregate the result - all_result <- data.table() - for(outcol in outcomevar){ - gt_result <- gt_result_list[[outcol]] - - # aggregate att and inf function and get se - result <- aggregate_gt(gt_result, auxdata, params) - + # aggregate the result by outcome + all_result <- rbindlist(lapply(gt_result_list, function(x){ + result <- aggregate_gt(x, auxdata, params) #convert "targets" back to meaningful parameter identifiers like cohort 1 post, time 2 post result <- result |> convert_targets(result_type, coerce_result$time_change) - result[, outcome := outcol] - all_result <- rbind(all_result, result) - - rm(result) - - } + return(result) + })) return(all_result) @@ -202,12 +197,9 @@ get_auxdata <- function(dt, params){ release(params) - - time_periods <- dt[, unique(time)] id_size <- dt[, uniqueN(unit)] - # get auxiliary data ------------------------------ setorder(dt, time, G, unit) #sort the dataset essential for the sort-once-quick-access #construct the outcomes list for fast access later diff --git a/R/validate_did.R b/R/validate_did.R index 6eca763..746f6fb 100644 --- a/R/validate_did.R +++ b/R/validate_did.R @@ -5,7 +5,6 @@ validate_did <- function(dt,varnames,params){ raw_unit_size <- dt[, uniqueN(unit)] raw_time_size <- dt[, uniqueN(time)] - if(!is.na(balanced_event_time)){ if(balanced_event_time > dt[, max(time-G)]){stop("balanced_event_time is larger than the max event time in the data")} } diff --git a/inst/tinytest/test_1_fastdid.R b/inst/tinytest/test_1_fastdid.R index 2bf3b15..c73ee9f 100644 --- a/inst/tinytest/test_1_fastdid.R +++ b/inst/tinytest/test_1_fastdid.R @@ -7,6 +7,10 @@ dt <- simdt$dt # basics --------------------------------------------- +# fastdid(dt, timevar = "time", cohortvar = "G", unitvar = "unit",outcomevar = "y", result_type = "group_time", +# control_type = "ipw", +# covariatesvar = c("x", "x2")) + expect_silent(fastdid(dt, timevar = "time", cohortvar = "G", unitvar = "unit",outcomevar = "y", result_type = "group_time"), info = "simple call") diff --git a/man/fastdid.Rd b/man/fastdid.Rd index 35dd253..a2dfdd3 100644 --- a/man/fastdid.Rd +++ b/man/fastdid.Rd @@ -12,14 +12,15 @@ fastdid( outcomevar, control_option = "both", result_type = "group_time", - balanced_event_time = NULL, + balanced_event_time = NA, control_type = "ipw", allow_unbalance_panel = FALSE, boot = FALSE, biters = 1000, - weightvar = NULL, - clustervar = NULL, - covariatesvar = NULL, + weightvar = NA, + clustervar = NA, + covariatesvar = NA, + varycovariatesvar = NA, copy = TRUE, validate = TRUE ) @@ -39,7 +40,7 @@ fastdid( \item{result_type}{A character string indicating the type of result to be returned. Default is "group_time".} -\item{balanced_event_time}{A numeric scalar that indicates the max event time to balance the cohort composition, only meaningful when result_type == "dynamic". Default is NULL} +\item{balanced_event_time}{A numeric scalar that indicates the max event time to balance the cohort composition, only meaningful when result_type == "dynamic". Default is NA} \item{control_type}{The method for controlling for covariates. "ipw" for inverse probability weighting, "reg" for outcome regression, or "dr" for doubly-robust} @@ -53,7 +54,9 @@ fastdid( \item{clustervar}{The name of the cluster variable, can only be used when boot == TRUE (optional).} -\item{covariatesvar}{A character vector containing the names of covariate variables (optional).} +\item{covariatesvar}{A character vector containing the names of time-invariant covariate variables (optional).} + +\item{varycovariatesvar}{A character vector containing the names of time-varying covariate variables (optional).} \item{copy}{whether to copy the dataset before processing, set to false to speed up the process, but the input data will be altered.} diff --git a/man/sim_did.Rd b/man/sim_did.Rd index 664caf3..540022a 100644 --- a/man/sim_did.Rd +++ b/man/sim_did.Rd @@ -13,6 +13,7 @@ sim_did( hetero = "all", second_outcome = FALSE, second_cov = FALSE, + vary_cov = FALSE, na = "none", balanced = TRUE, seed = NA, From 11825e054bc58fb4d8dfbdae40f389e21b31ac92 Mon Sep 17 00:00:00 2001 From: TsaiLintung <101155567+TsaiLintung@users.noreply.github.com> Date: Tue, 19 Mar 2024 12:04:26 +0800 Subject: [PATCH 07/15] still organization --- R/estimate_did_rc.R | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/R/estimate_did_rc.R b/R/estimate_did_rc.R index 351bfc9..57e3680 100644 --- a/R/estimate_did_rc.R +++ b/R/estimate_did_rc.R @@ -1,18 +1,17 @@ -estimate_did_rc <- function(dt_did, covnames, control_type, +estimate_did_rc <- function(dt_did, covvars, control_type, last_coef = NULL, cache_ps_fit, cache_hess){ #TODO: skip if not enough valid data # preprocess -------- - oldn <- dt_did[, .N] - #separate the dataset into pre and post - + oldn <- dt_did[, .N] data_pos <- which(dt_did[, !is.na(D)]) dt_did <- dt_did[data_pos] n <- dt_did[, .N] + #separate the dataset into pre and post dt_did[, inpre := as.numeric(!is.na(pre.y))] dt_did[, inpost := as.numeric(!is.na(post.y))] n_pre <- dt_did[, sum(!is.na(pre.y))] @@ -21,14 +20,14 @@ estimate_did_rc <- function(dt_did, covnames, control_type, sum_weight_pre <- dt_did[, sum(inpre*weights)] sum_weight_post <- dt_did[, sum(inpost*weights)] - if(allNA(covnames)){ - covvars <- NA + if(is.matrix(covvars)){ + ipw <- control_type %in% c("ipw", "dr") + or <- control_type %in% c("reg", "dr") + covvars <- covvars[data_pos,] } else { - covvars <- as.matrix(dt_did[,.SD, .SDcols = covnames]) + ipw <- FALSE + or <- FALSE } - - ipw <- control_type %in% c("ipw", "dr") & !allNA(covnames) - or <- control_type %in% c("reg", "dr") & !allNA(covnames) #OR is REG # ipw -------- @@ -191,8 +190,8 @@ estimate_did_rc <- function(dt_did, covnames, control_type, inf_treat_post <- inf_treat_did_post+inf_treat_or_post inf_cont_pre <- inf_cont_did_pre+inf_cont_ipw_pre+inf_cont_or_pre inf_treat_pre <- inf_treat_did_pre+inf_treat_or_pre + #post process - inf_func_no_na_post <- (inf_treat_post - inf_cont_post) * oldn / n_post #adjust the value such that mean over the whole id size give the right result inf_func_no_na_post[is.na(inf_func_no_na_post)] <- 0 #fill 0 for NA part (no influce if not in this gt) From 3ac3577ae28f6574c52e28bb7ee6920228bac955 Mon Sep 17 00:00:00 2001 From: TsaiLintung <101155567+TsaiLintung@users.noreply.github.com> Date: Tue, 19 Mar 2024 15:39:04 +0800 Subject: [PATCH 08/15] abandon release and gather, more readable I hope --- R/aggregate_gt.R | 40 +++++++++----------- R/estimate_gtatt.R | 45 +++++++++++----------- R/fastdid.R | 94 ++++++++++++++++++++++++++-------------------- R/utils.R | 21 ----------- R/validate_did.R | 18 ++++----- 5 files changed, 100 insertions(+), 118 deletions(-) diff --git a/R/aggregate_gt.R b/R/aggregate_gt.R index 3cb7a4a..48b800f 100644 --- a/R/aggregate_gt.R +++ b/R/aggregate_gt.R @@ -1,52 +1,46 @@ -aggregate_gt <- function(gt_result, auxdata, params){ - - release(auxdata) - release(params) +aggregate_gt <- function(gt_result, aux, p){ + #release the stuff - gt_att <- gt_result$att - gt_inf_func <- gt_result$inf_func - gt <- gt_result$gt - id_cohorts <- dt_inv[, G] + id_cohorts <- aux$dt_inv[, G] - id_dt <- data.table(weight = weights/sum(weights), G = id_cohorts) + id_dt <- data.table(weight = aux$weights/sum(aux$weights), G = id_cohorts) pg_dt <- id_dt[, .(pg = sum(weight)), by = "G"] - group_time <- gt |> merge(pg_dt, by = "G") + group_time <- gt_result$gt |> merge(pg_dt, by = "G") setorder(group_time, time, G) #change the order to match the order in gtatt - gt_inf_func <- as.matrix(gt_inf_func) + gt_result$inf_func <- as.matrix(gt_result$inf_func) - if(result_type == "group_time"){ + if(p$result_type == "group_time"){ #don't need to do anything targets <- group_time[, unique(G*max(time)+time)] - inf_matrix <- gt_inf_func - agg_att <- as.vector(gt_att) + inf_matrix <- gt_result$inf_func + agg_att <- as.vector(gt_result$att) } else { #get which gt(s) is a part of the aggregated param - agg_sch <- get_aggregate_scheme(group_time, result_type, weights, id_cohorts, balanced_event_time) + agg_sch <- get_aggregate_scheme(group_time, p$result_type, aux$weights, id_cohorts, p$balanced_event_time) targets <- agg_sch$targets - agg_weights <- as.matrix(agg_sch$agg_weights) #aggregated att - agg_att <- agg_weights %*% gt_att + agg_att <- agg_sch$agg_weights %*% gt_result$att #get the influence from weight estimation #this needs to be optimized!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - inf_weights <- sapply(asplit(agg_weights, 1), function (x){ - get_weight_influence(x, gt_att, weights, id_cohorts, group_time[, .(G, time)]) + inf_weights <- sapply(asplit(agg_sch$agg_weights, 1), function (x){ + get_weight_influence(x, gt_result$att, aux$weights, id_cohorts, group_time[, .(G, time)]) }) #aggregated influence function - inf_matrix <- (gt_inf_func %*% t(agg_weights)) + inf_weights - + inf_matrix <- (gt_result$inf_func %*% t(agg_sch$agg_weights)) + inf_weights + } #get se from influence function - agg_se <- get_se(inf_matrix, boot, biters, cluster, clustervar) + agg_se <- get_se(inf_matrix, p$boot, p$biters, aux$cluster, p$clustervar) # post process result <- data.table(targets, agg_att, agg_se) @@ -106,7 +100,7 @@ get_aggregate_scheme <- function(group_time, result_type, weights, id_cohorts, b agg_weights <- rbind(agg_weights, target_weights) } - return(list(agg_weights = agg_weights, #a matrix of each target and gt's weight in it + return(list(agg_weights = as.matrix(agg_weights), #a matrix of each target and gt's weight in it targets = targets)) } diff --git a/R/estimate_gtatt.R b/R/estimate_gtatt.R index abe05ab..c6fdb23 100644 --- a/R/estimate_gtatt.R +++ b/R/estimate_gtatt.R @@ -1,58 +1,55 @@ -estimate_gtatt <- function(auxdata, params) { +estimate_gtatt <- function(aux, p) { - release(auxdata) - release(params) - - if(!Inf %in% cohorts & control_option != "notyet"){ + if(!Inf %in% aux$cohorts & p$control_option != "notyet"){ warning("no never-treated availble, switching to not-yet-treated control") - control_option <- "notyet" + p$control_option <- "notyet" } - treated_cohorts <- cohorts[!is.infinite(cohorts)] - max_control_cohort <- ifelse(control_option == "notyet", max(treated_cohorts), Inf) + treated_cohorts <- aux$cohorts[!is.infinite(aux$cohorts)] + max_control_cohort <- ifelse(p$control_option == "notyet", max(treated_cohorts), Inf) cache_ps_fit_list <- list() #for the first outcome won't be able to use cache, empty list returns null for call like cache_ps_fit[["1.3"]] cache_hess_list <- list() outcome_result_list <- list() - for(outcol in outcomevar){ + for(outcol in p$outcomevar){ - outcomes <- outcomes_list[[outcol]] + outcomes <- aux$outcomes_list[[outcol]] last_coef <- NULL gt <- data.table() gt_att <- c() - gt_inf_func <- data.table(placeholder = rep(NA, id_size)) #populate with NA + gt_inf_func <- data.table(placeholder = rep(NA, aux$id_size)) #populate with NA - for(t in time_periods){ - for(g in cohorts){ + for(t in aux$time_periods){ + for(g in aux$cohorts){ gt_name <- paste0(g, ".", t) #setup and checks base_period <- g-1 - min_control_cohort <- ifelse(control_option == "never", Inf, max(t+1, base_period+1)) #not-yet treated / never treated in both base and "treated" period + min_control_cohort <- ifelse(p$control_option == "never", Inf, max(t+1, base_period+1)) #not-yet treated / never treated in both base and "treated" period if(t == base_period){next} #no treatment effect for the base period - if(base_period < min(time_periods)){next} #no treatment effect for the first period, since base period is not observed + if(base_period < min(aux$time_periods)){next} #no treatment effect for the first period, since base period is not observed if(g >= max_control_cohort){next} #no treatment effect for never treated or the last treated cohort (for not yet notyet) if(t >= max_control_cohort){next} #no control available if the last cohort is treated too #select the control and treated cohorts - did_setup <- rep(NA, id_size) - did_setup[get_cohort_pos(cohort_sizes, min_control_cohort, max_control_cohort)] <- 0 - did_setup[get_cohort_pos(cohort_sizes, g)] <- 1 #treated cannot be controls, assign treated after control to overwrite + did_setup <- rep(NA, aux$id_size) + did_setup[get_cohort_pos(aux$cohort_sizes, min_control_cohort, max_control_cohort)] <- 0 + did_setup[get_cohort_pos(aux$cohort_sizes, g)] <- 1 #treated cannot be controls, assign treated after control to overwrite #construct the covariates matrix - covvars <- get_covvars(covariates, varycovariates) + covvars <- get_covvars(aux$covariates, aux$varycovariates) #construct the 2x2 dataset - cohort_did <- data.table(did_setup, outcomes[[t]], outcomes[[base_period]], weights) - setnames(cohort_did, c("did_setup", "V2", "V3"), c("D", "post.y", "pre.y")) + cohort_did <- data.table(did_setup, outcomes[[t]], outcomes[[base_period]], aux$weights) + names(cohort_did) <- c("D", "post.y", "pre.y", "weights") #estimate did - if(!allow_unbalance_panel){ - result <- estimate_did(cohort_did, covvars, control_type, + if(!p$allow_unbalance_panel){ + result <- estimate_did(cohort_did, covvars, p$control_type, last_coef, cache_ps_fit_list[[gt_name]], cache_hess_list[[gt_name]]) #cache } else { - result <- estimate_did_rc(cohort_did, covvars, control_type, + result <- estimate_did_rc(cohort_did, covvars, p$control_type, last_coef, cache_ps_fit_list[[gt_name]], cache_hess_list[[gt_name]]) #cache } diff --git a/R/fastdid.R b/R/fastdid.R index 9989fac..c2fe2e0 100644 --- a/R/fastdid.R +++ b/R/fastdid.R @@ -98,10 +98,22 @@ fastdid <- function(data, stop("clustering only available with bootstrap") } - params <- gather(timevar, cohortvar, unitvar, outcomevar, - control_option,result_type, balanced_event_time, - control_type, allow_unbalance_panel , boot, biters, - weightvar,clustervar,covariatesvar,varycovariatesvar) + p <- list(timevar = timevar, + cohortvar = cohortvar, + unitvar = unitvar, + outcomevar = outcomevar, + weightvar = weightvar, + clustervar = clustervar, + covariatesvar = covariatesvar, + varycovariatesvar = varycovariatesvar, + control_option = control_option, + result_type = result_type, + balanced_event_time = balanced_event_time, + control_type = control_type, + allow_unbalance_panel = allow_unbalance_panel, + boot = boot, + biters = biters) + # validate data ----------------------------------------------------- @@ -114,26 +126,26 @@ fastdid <- function(data, setnames(dt, c(timevar, cohortvar, unitvar), c("time", "G", "unit")) if(validate){ - varnames <- c("time", "G", "unit", outcomevar,weightvar,clustervar,covariatesvar, varycovariatesvar) - dt <- validate_did(dt, varnames, params) + varnames <- c("time", "G", "unit", outcomevar,weightvar,clustervar,covariatesvar,varycovariatesvar) + dt <- validate_did(dt, varnames, p) } # preprocess ----------------------------------------------------------- #make dt conform to the WLOG assumptions of fastdid - coerce_result <- coerce_dt(dt, params) #also changed dt + coerce_result <- coerce_dt(dt, p) #also changed dt dt <- coerce_result$dt # get auxiliary data - auxdata <- get_auxdata(dt, params) + aux <- get_auxdata(dt, p) # main part ------------------------------------------------- # estimate attgt - gt_result_list <- estimate_gtatt(auxdata, params) + gt_result_list <- estimate_gtatt(aux, p) # aggregate the result by outcome all_result <- rbindlist(lapply(gt_result_list, function(x){ - result <- aggregate_gt(x, auxdata, params) + result <- aggregate_gt(x, aux, p) #convert "targets" back to meaningful parameter identifiers like cohort 1 post, time 2 post result <- result |> convert_targets(result_type, coerce_result$time_change) return(result) @@ -145,9 +157,7 @@ fastdid <- function(data, # small steps ---------------------------------------------------------------------- -coerce_dt <- function(dt, params){ - - release(params) +coerce_dt <- function(dt, p){ #change to int before sorting if(!is.numeric(dt[, G])){ @@ -157,7 +167,7 @@ coerce_dt <- function(dt, params){ dt[, time := as.numeric(time)] } - if(allow_unbalance_panel){ + if(p$allow_unbalance_panel){ dt_inv_raw <- dt[dt[, .I[1], by = unit]$V1] setorder(dt_inv_raw, G) dt_inv_raw[, new_unit := 1:.N] #let unit start from 1 .... N, useful for knowing which unit is missing @@ -193,9 +203,7 @@ coerce_dt <- function(dt, params){ } -get_auxdata <- function(dt, params){ - - release(params) +get_auxdata <- function(dt, p){ time_periods <- dt[, unique(time)] id_size <- dt[, uniqueN(unit)] @@ -205,10 +213,10 @@ get_auxdata <- function(dt, params){ #construct the outcomes list for fast access later #loop for multiple outcome outcomes_list <- list() - for(outcol in outcomevar){ + for(outcol in p$outcomevar){ outcomes <- list() - if(!allow_unbalance_panel){ + if(!p$allow_unbalance_panel){ for(i in time_periods){ start <- (i-1)*id_size+1 end <- i*id_size @@ -231,7 +239,7 @@ get_auxdata <- function(dt, params){ } #the time-invariant parts - if(!allow_unbalance_panel){ + if(!p$allow_unbalance_panel){ dt_inv <- dt[1:id_size] } else { dt_inv <- dt[dt[, .I[1], by = unit]$V1] #the first observation @@ -243,44 +251,50 @@ get_auxdata <- function(dt, params){ # the optional columns varycovariates <- list() - if(!allNA(varycovariatesvar)){ + if(!allNA(p$varycovariatesvar)){ for(i in time_periods){ start <- (i-1)*id_size+1 end <- i*id_size - varycovariates[[i]] <- dt[seq(start,end), .SD, .SDcols = varycovariatesvar] + varycovariates[[i]] <- dt[seq(start,end), .SD, .SDcols = p$varycovariatesvar] } } else { varycovariates <- NA } - if(!allNA(covariatesvar)){ - covariates <- cbind(const = -1, dt_inv[,.SD, .SDcols = covariatesvar]) + if(!allNA(p$covariatesvar)){ + covariates <- cbind(const = -1, dt_inv[,.SD, .SDcols = p$covariatesvar]) } else { covariates <- NA } - if(!is.na(clustervar)){ - cluster <- dt_inv[, .SD, .SDcols = clustervar] |> unlist() + if(!is.na(p$clustervar)){ + cluster <- dt_inv[, .SD, .SDcols = p$clustervar] |> unlist() } else { cluster <- NA } - if(!is.na(weightvar)){ - weights <- dt_inv[, .SD, .SDcols = weightvar] |> unlist() + if(!is.na(p$weightvar)){ + weights <- dt_inv[, .SD, .SDcols = p$weightvar] |> unlist() } else { weights <- rep(1, id_size) } - auxdata <- gather(time_periods, id_size, outcomes_list, dt_inv, cohorts, cohort_sizes, - varycovariates, covariates, cluster, weights) + aux <- list(time_periods = time_periods, + id_size = id_size, + outcomes_list = outcomes_list, + dt_inv= dt_inv, + cohorts = cohorts, + cohort_sizes = cohort_sizes, + varycovariates = varycovariates, + covariates = covariates, + cluster = cluster, + weights = weights) - return(auxdata) + return(aux) } -convert_targets <- function(results, result_type, time_change){ - - release(time_change) #from coerce_dt +convert_targets <- function(results, result_type, t){ if(result_type == "dynamic"){ setnames(results, "target", "event_time") @@ -288,23 +302,23 @@ convert_targets <- function(results, result_type, time_change){ } else if (result_type == "cohort"){ results[, type := ifelse(target >= 0, "post", "pre")] - results[, target := recover_time(abs(target), time_offset, time_step)] + results[, target := recover_time(abs(target), t$time_offset, t$time_step)] setnames(results, "target", "cohort") } else if (result_type == "calendar"){ results[, type := ifelse(target >= 0, "post", "pre")] - results[, target := recover_time(abs(target), time_offset, time_step)] + results[, target := recover_time(abs(target), t$time_offset, t$time_step)] setnames(results, "target", "time") } else if (result_type == "group_time"){ - results[, cohort := floor((target-1)/max_time)] - results[, time := (target-cohort*max_time)] + results[, cohort := floor((target-1)/t$max_time)] + results[, time := (target-cohort*t$max_time)] #recover the time - results[, cohort := recover_time(cohort, time_offset, time_step)] - results[, time := recover_time(time, time_offset, time_step)] + results[, cohort := recover_time(cohort, t$time_offset, t$time_step)] + results[, time := recover_time(time, t$time_offset, t$time_step)] results[, target := NULL] diff --git a/R/utils.R b/R/utils.R index 03478a8..2a855a5 100644 --- a/R/utils.R +++ b/R/utils.R @@ -6,24 +6,3 @@ set_max_thread <- function(){ reverse_col <- function(x){ return(x[,ncol(x):1]) } - - -release <- function(p){ - for(name in names(p)){ - assign(name, p[[name]], envir = parent.frame()) #assign it to the envir where the func is called - } -} - -gather <- function(...){ - names <- substitute(list(...)) |> deparse() |> - str_remove_all("\\)|list\\(") |> str_flatten() |> str_split_1(",") |> str_trim() - values <- list(...) - p <- list() - - for(i in 1:length(names)){ - #p[[names[[i]]]] <- values[[i]] - p <- c(p, list(values[[i]])) #for nulls - } - names(p) <- names - return(p) -} diff --git a/R/validate_did.R b/R/validate_did.R index 746f6fb..839f751 100644 --- a/R/validate_did.R +++ b/R/validate_did.R @@ -1,12 +1,10 @@ -validate_did <- function(dt,varnames,params){ - - release(params) - +validate_did <- function(dt,varnames,p){ + raw_unit_size <- dt[, uniqueN(unit)] raw_time_size <- dt[, uniqueN(time)] - if(!is.na(balanced_event_time)){ - if(balanced_event_time > dt[, max(time-G)]){stop("balanced_event_time is larger than the max event time in the data")} + if(!is.na(p$balanced_event_time)){ + if(p$balanced_event_time > dt[, max(time-G)]){stop("balanced_event_time is larger than the max event time in the data")} } #doesn't allow missing value for now @@ -20,13 +18,13 @@ validate_did <- function(dt,varnames,params){ } - if(!allNA(covariatesvar) && uniqueN(dt, by = c("unit", covariatesvar)) > raw_unit_size){ + if(!allNA(p$covariatesvar) && uniqueN(dt, by = c("unit", p$covariatesvar)) > raw_unit_size){ warning("some covariates is time-varying, fastdid only use the first observation for covariates.") } - if(!allNA(covariatesvar)|!allNA(varycovariatesvar)){ - for(cov in c(covariatesvar, varycovariatesvar)){ + if(!allNA(p$covariatesvar)|!allNA(p$varycovariatesvar)){ + for(cov in c(p$covariatesvar, p$varycovariatesvar)){ if(is.na(cov)){next} #check covaraites is not constant if(fnunique(dt[, get(cov)[1], by = "unit"][, V1]) == 1)stop(cov, " have no variation") @@ -41,7 +39,7 @@ validate_did <- function(dt,varnames,params){ } #check if any is missing - if(!allow_unbalance_panel){ + if(!p$allow_unbalance_panel){ unit_count <- dt[, .(count = .N), by = unit] if(any(unit_count[, count < raw_time_size])){ mis_unit <- unit_count[count < raw_time_size] From 1f91ae87a1c3cd52ba64228ca84fd9497f17144f Mon Sep 17 00:00:00 2001 From: TsaiLintung <101155567+TsaiLintung@users.noreply.github.com> Date: Wed, 20 Mar 2024 15:00:09 +0800 Subject: [PATCH 09/15] hot fix for wrong varycovarites condition and some warning --- R/estimate_did.R | 13 +++++++++---- R/estimate_gtatt.R | 7 ++++--- R/fastdid.R | 2 +- R/sim_did.R | 2 +- development/playground.R | 28 ++++++++++++++++++++++++++++ 5 files changed, 43 insertions(+), 9 deletions(-) create mode 100644 development/playground.R diff --git a/R/estimate_did.R b/R/estimate_did.R index 597a328..e768b2c 100644 --- a/R/estimate_did.R +++ b/R/estimate_did.R @@ -1,8 +1,7 @@ estimate_did <- function(dt_did, covvars, control_type, last_coef = NULL, cache_ps_fit, cache_hess){ - + # preprocess -------- - oldn <- dt_did[, .N] data_pos <- which(dt_did[, !is.na(D)]) dt_did <- dt_did[data_pos] @@ -30,14 +29,20 @@ estimate_did <- function(dt_did, covvars, control_type, intercept = FALSE)) class(prop_score_est) <- "glm" #trick the vcov function to think that this is a glm object to dispatch the write method #const is implicitly put into the ipw formula, need to incorporate it manually - hess <- stats::vcov(prop_score_est) * n #for the influence function logit_coef <- prop_score_est$coefficients + + if(anyNA(logit_coef)){ + warning("some propensity score estimation resulted in NA coefficients, likely cause by perfect colinearity") + } + logit_coef[is.na(logit_coef)|abs(logit_coef) > 1e10] <- 0 #put extreme value and na to 0 prop_score_fit <- fitted(prop_score_est) if(max(prop_score_fit) >= 1){warning(paste0("support overlap condition violated for some group_time"))} prop_score_fit <- pmin(1-1e-16, prop_score_fit) #for the ipw + hess <- stats::vcov(prop_score_est) * n #for the influence function + hess[is.na(hess)|abs(hess) > 1e10] <- 0 } else { #when using multiple outcome, ipw cache can be reused hess <- cache_hess @@ -142,6 +147,7 @@ estimate_did <- function(dt_did, covvars, control_type, #get overall influence function + #if(dt_did[, mean(cont_ipw_weight)] < 1e-10){warning("little/no overlap in covariates between control and treat group, estimates are unstable.")} inf_cont <- (inf_cont_did+inf_cont_ipw+inf_cont_or)/dt_did[, mean(cont_ipw_weight)] inf_treat <- (inf_treat_did+inf_treat_or)/dt_did[,mean(treat_ipw_weight)] inf_func_no_na <- inf_treat - inf_cont @@ -151,7 +157,6 @@ estimate_did <- function(dt_did, covvars, control_type, inf_func_no_na <- inf_func_no_na * oldn / n #adjust the value such that mean over the whole id size give the right result inf_func[data_pos] <- inf_func_no_na - return(list(att = att, inf_func = inf_func, logit_coef = logit_coef, #for next gt cache_ps_fit = prop_score_fit, cache_hess = hess)) #for next outcome } diff --git a/R/estimate_gtatt.R b/R/estimate_gtatt.R index c6fdb23..80ecea3 100644 --- a/R/estimate_gtatt.R +++ b/R/estimate_gtatt.R @@ -38,7 +38,7 @@ estimate_gtatt <- function(aux, p) { did_setup[get_cohort_pos(aux$cohort_sizes, g)] <- 1 #treated cannot be controls, assign treated after control to overwrite #construct the covariates matrix - covvars <- get_covvars(aux$covariates, aux$varycovariates) + covvars <- get_covvars(aux$covariates, aux$varycovariates, base_period, t) #construct the 2x2 dataset cohort_did <- data.table(did_setup, outcomes[[t]], outcomes[[base_period]], aux$weights) @@ -82,8 +82,9 @@ get_cohort_pos <- function(cohort_sizes, start_cohort, end_cohort = start_cohort return(start:end) } -get_covvars <- function(covariates, varycovariates){ - if(is.data.table(varycovariates)){ +get_covvars <- function(covariates, varycovariates, base_period, t){ + + if(is.list(varycovariates)){ precov <- varycovariates[[base_period]] names(precov) <- paste0("pre_", names(precov)) diff --git a/R/fastdid.R b/R/fastdid.R index c2fe2e0..67b6513 100644 --- a/R/fastdid.R +++ b/R/fastdid.R @@ -59,7 +59,7 @@ fastdid <- function(data, timevar, cohortvar, unitvar, outcomevar, control_option="both",result_type="group_time", balanced_event_time = NA, control_type = "ipw", allow_unbalance_panel = FALSE, boot=FALSE, biters = 1000, - weightvar=NA,clustervar=NA,covariatesvar = NA,varycovariatesvar = NA, + weightvar=NA,clustervar=NA,covariatesvar = NA, varycovariatesvar = NA, copy = TRUE, validate = TRUE ){ diff --git a/R/sim_did.R b/R/sim_did.R index 7ca136e..d8ba2e5 100644 --- a/R/sim_did.R +++ b/R/sim_did.R @@ -93,7 +93,7 @@ sim_did <- function(sample_size, time_period, untreated_prop = 0.3, epsilon_size #add time_varying covariates if(vary_cov){ - dt[, xvar := pmin(G, time_period)*time+rnorm(sample_size*time_period, 0,1)] #should be confounding....? + dt[, xvar := pmin(G, time_period+4)*time+rnorm(sample_size*time_period, 0,30)] #should be confounding....? } else { dt[, xvar := 1] } diff --git a/development/playground.R b/development/playground.R new file mode 100644 index 0000000..88ebfb5 --- /dev/null +++ b/development/playground.R @@ -0,0 +1,28 @@ +setwd("~/GitHub/fastdid") + +library(devtools) +library(tinytest) +library(roxygen2) + +load_all() + + +tol <- 1e-2 #allow 1% different between estimates +simdt <- sim_did(1e+02, 4, cov = "cont", hetero = "all", balanced = TRUE, second_outcome = TRUE, seed = 1, + stratify = FALSE, second_cov = TRUE, vary_cov = TRUE) +dt <- simdt$dt + + +fastdid(dt, timevar = "time", cohortvar = "G", unitvar = "unit",outcomevar = "y", result_type = "group_time") + +fastdid(dt, timevar = "time", cohortvar = "G", unitvar = "unit",outcomevar = "y", result_type = "group_time", + control_type = "ipw", + varycovariatesvar = c("xvar")) + +fastdid(dt, timevar = "time", cohortvar = "G", unitvar = "unit",outcomevar = "y", result_type = "group_time", + control_type = "ipw", + varycovariatesvar = c("x")) + +fastdid(dt, timevar = "time", cohortvar = "G", unitvar = "unit",outcomevar = "y", result_type = "group_time", + control_type = "ipw", + covariatesvar = c("x")) \ No newline at end of file From f5c13a3d67b7fa425ec065cfe3821ce25b5a472a Mon Sep 17 00:00:00 2001 From: TsaiLintung <101155567+TsaiLintung@users.noreply.github.com> Date: Tue, 2 Apr 2024 12:29:59 +0800 Subject: [PATCH 10/15] fix bug with dt and data --- R/fastdid.R | 20 ++++++++++---------- README.md | 12 ++++++------ 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/R/fastdid.R b/R/fastdid.R index 67b6513..bb3e4c7 100644 --- a/R/fastdid.R +++ b/R/fastdid.R @@ -32,24 +32,24 @@ #' dt <- simdt$dt #' #' #basic call -#' result <- fastdid(dt, timevar = "time", cohortvar = "G", +#' result <- fastdid(data = dt, timevar = "time", cohortvar = "G", #' unitvar = "unit", outcomevar = "y", #' result_type = "group_time") #' #' #control for covariates -#' result2 <- fastdid(dt, timevar = "time", cohortvar = "G", +#' result2 <- fastdid(data = dt, timevar = "time", cohortvar = "G", #' unitvar = "unit", outcomevar = "y", #' result_type = "group_time", #' covariatesvar = c("x", "x2")) #' #' #bootstrap and clustering -#' result3 <- fastdid(dt, timevar = "time", cohortvar = "G", +#' result3 <- fastdid(data = dt, timevar = "time", cohortvar = "G", #' unitvar = "unit", outcomevar = "y", #' result_type = "group_time", #' boot = TRUE, clustervar = "x") #' #' #estimate for multiple outcomes -#' result4 <- fastdid(dt, #the dataset +#' result4 <- fastdid(data = dt, #the dataset #' timevar = "time", cohortvar = "G", unitvar = "unit", #' outcomevar = c("y", "y2"), #name of the outcome columns #' result_type = "group_time") @@ -65,6 +65,12 @@ fastdid <- function(data, # validate arguments -------------------------------------------------------- + if(!is.data.table(data)){ + warning("coercing input into a data.table.") + data <- as.data.table(data) + } + if(copy){dt <- copy(data)} else {dt <- data} + dt_names <- names(dt) name_message <- "__ARG__ must be a character scalar and a name of a column from the dataset." check_set_arg(timevar, unitvar, cohortvar, "match", .choices = dt_names, .message = name_message) @@ -117,12 +123,6 @@ fastdid <- function(data, # validate data ----------------------------------------------------- - if(!is.data.table(data)){ - warning("coercing input into a data.table.") - data <- as.data.table(data) - } - - if(copy){dt <- copy(data)} else {dt <- data} setnames(dt, c(timevar, cohortvar, unitvar), c("time", "G", "unit")) if(validate){ diff --git a/README.md b/README.md index e2e055e..3c8fd85 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,7 @@ devtools::install_github("TsaiLintung/fastdid") # Usage -`fastdid` is the main function provided by **fastdid**. When using `fastdid`, you need to provide the dataset (`dt`), specify the names of the relevant columns (`-var`), and the type of target (aggregated) parameters (`result_type` such as `"group_time"`, `"time"`, `"dynamic"`, or `"simple"`.) Here is a simple call. +`fastdid` is the main function provided by **fastdid**. When using `fastdid`, you need to provide the dataset (`data`), specify the names of the relevant columns (`-var`), and the type of target (aggregated) parameters (`result_type` such as `"group_time"`, `"time"`, `"dynamic"`, or `"simple"`.) Here is a simple call. ``` #loading the package @@ -30,7 +30,7 @@ simdt <- sim_did(1e+03, 10, cov = "cont", second_cov = TRUE, second_outcome = TR dt <- simdt$dt #calling fastdid -result <- fastdid(dt, #the dataset +result <- fastdid(data = dt, #the dataset timevar = "time", cohortvar = "G", unitvar = "unit", outcomevar = "y", #name of the columns result_type = "group_time") #the result type ``` @@ -38,7 +38,7 @@ result <- fastdid(dt, #the dataset You can control for covariates by providing the name of the data columns, and choose the method among doubly-robust (`"dr"`), inverse probability weight (`"ipw"`), and outcome regression (`"or"`). ``` -result <- fastdid(dt, +result <- fastdid(data = dt, timevar = "time", cohortvar = "G", unitvar = "unit", outcomevar = "y", result_type = "group_time", control_option = "dr", #choose the control method @@ -48,7 +48,7 @@ result <- fastdid(dt, While the default is to coerce the data into a balanced panel, you can allow for unbalanced panel. Note that currently only "ipw" is available when `allow_unbalance_panel = TRUE`. ``` -result <- fastdid(dt, +result <- fastdid(data = dt, timevar = "time", cohortvar = "G", unitvar = "unit", outcomevar = "y", result_type = "group_time", allow_unbalance_panel = TRUE, #allow for unbalanced panel @@ -59,7 +59,7 @@ result <- fastdid(dt, Clustered standard error can be obtained from multiplier bootstrap. ``` -result <- fastdid(dt, +result <- fastdid(data = dt, timevar = "time", cohortvar = "G", unitvar = "unit", outcomevar = "y", result_type = "group_time", clustervar = "x", boot = TRUE) #add clustering by using bootstrap @@ -69,7 +69,7 @@ Estimation for multiple outcomes can be done in one call by providing a vector o ``` #calling fastdid -result <- fastdid(dt, #the dataset +result <- fastdid(data = dt, #the dataset timevar = "time", cohortvar = "G", unitvar = "unit", outcomevar = c("y", "y2"), #name of the columns result_type = "group_time") #the result type ``` From ece5a371d94ce53639f547b327b9d3788da25ff2 Mon Sep 17 00:00:00 2001 From: TsaiLintung <101155567+TsaiLintung@users.noreply.github.com> Date: Tue, 2 Apr 2024 17:17:47 +0800 Subject: [PATCH 11/15] add max control cohort diff --- DESCRIPTION | 2 +- R/estimate_gtatt.R | 13 ++++++++++++- R/fastdid.R | 6 ++++-- inst/tinytest/test_1_fastdid.R | 4 ++++ man/fastdid.Rd | 11 ++++++----- 5 files changed, 27 insertions(+), 9 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index c9ba888..c9865a5 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -25,4 +25,4 @@ Imports: parglm Suggests: ggplot2, did Encoding: UTF-8 -RoxygenNote: 7.2.3 +RoxygenNote: 7.3.1 diff --git a/R/estimate_gtatt.R b/R/estimate_gtatt.R index 80ecea3..eb165d5 100644 --- a/R/estimate_gtatt.R +++ b/R/estimate_gtatt.R @@ -5,8 +5,12 @@ estimate_gtatt <- function(aux, p) { p$control_option <- "notyet" } + if(!is.na(p$max_control_cohort_diff)){ + warning("max_control_cohort_diff can only be used with not yet") + p$control_option <- "notyet" + } + treated_cohorts <- aux$cohorts[!is.infinite(aux$cohorts)] - max_control_cohort <- ifelse(p$control_option == "notyet", max(treated_cohorts), Inf) cache_ps_fit_list <- list() #for the first outcome won't be able to use cache, empty list returns null for call like cache_ps_fit[["1.3"]] cache_hess_list <- list() @@ -27,6 +31,13 @@ estimate_gtatt <- function(aux, p) { #setup and checks base_period <- g-1 min_control_cohort <- ifelse(p$control_option == "never", Inf, max(t+1, base_period+1)) #not-yet treated / never treated in both base and "treated" period + + if(!is.na(p$max_control_cohort_diff)){ + max_control_cohort <- min(g+p$max_control_cohort_diff, max(treated_cohorts)) + } else { + max_control_cohort <- ifelse(p$control_option == "notyet", max(treated_cohorts), Inf) + } + if(t == base_period){next} #no treatment effect for the base period if(base_period < min(aux$time_periods)){next} #no treatment effect for the first period, since base period is not observed if(g >= max_control_cohort){next} #no treatment effect for never treated or the last treated cohort (for not yet notyet) diff --git a/R/fastdid.R b/R/fastdid.R index bb3e4c7..04b9d84 100644 --- a/R/fastdid.R +++ b/R/fastdid.R @@ -60,7 +60,8 @@ fastdid <- function(data, control_option="both",result_type="group_time", balanced_event_time = NA, control_type = "ipw", allow_unbalance_panel = FALSE, boot=FALSE, biters = 1000, weightvar=NA,clustervar=NA,covariatesvar = NA, varycovariatesvar = NA, - copy = TRUE, validate = TRUE + copy = TRUE, validate = TRUE, + max_control_cohort_diff = NA ){ # validate arguments -------------------------------------------------------- @@ -118,7 +119,8 @@ fastdid <- function(data, control_type = control_type, allow_unbalance_panel = allow_unbalance_panel, boot = boot, - biters = biters) + biters = biters, + max_control_cohort_diff = max_control_cohort_diff) # validate data ----------------------------------------------------- diff --git a/inst/tinytest/test_1_fastdid.R b/inst/tinytest/test_1_fastdid.R index c73ee9f..7acb010 100644 --- a/inst/tinytest/test_1_fastdid.R +++ b/inst/tinytest/test_1_fastdid.R @@ -61,6 +61,10 @@ expect_silent(fastdid(dt, timevar = "time", cohortvar = "G", unitvar = "unit",ou expect_silent(fastdid(dt, timevar = "time", cohortvar = "G", unitvar = "unit",outcomevar = "y", result_type = "group_time", allow_unbalance_panel = TRUE), info = "balance panel false but dt is balance") +expect_silent(fastdid(dt, timevar = "time", cohortvar = "G", unitvar = "unit",outcomevar = "y", result_type = "group_time", allow_unbalance_panel = TRUE, + max_control_cohort_diff = 2), + info = "balance panel false but dt is balance") + # dt that needs adjustment --------------------------- base_result <- fastdid(dt, timevar = "time", cohortvar = "G", unitvar = "unit",outcomevar = "y", result_type = "group_time") diff --git a/man/fastdid.Rd b/man/fastdid.Rd index a2dfdd3..0ea6537 100644 --- a/man/fastdid.Rd +++ b/man/fastdid.Rd @@ -22,7 +22,8 @@ fastdid( covariatesvar = NA, varycovariatesvar = NA, copy = TRUE, - validate = TRUE + validate = TRUE, + max_control_cohort_diff = NA ) } \arguments{ @@ -74,24 +75,24 @@ simdt <- sim_did(1e+03, 10, cov = "cont", second_cov = TRUE, second_outcome = TR dt <- simdt$dt #basic call -result <- fastdid(dt, timevar = "time", cohortvar = "G", +result <- fastdid(data = dt, timevar = "time", cohortvar = "G", unitvar = "unit", outcomevar = "y", result_type = "group_time") #control for covariates -result2 <- fastdid(dt, timevar = "time", cohortvar = "G", +result2 <- fastdid(data = dt, timevar = "time", cohortvar = "G", unitvar = "unit", outcomevar = "y", result_type = "group_time", covariatesvar = c("x", "x2")) #bootstrap and clustering -result3 <- fastdid(dt, timevar = "time", cohortvar = "G", +result3 <- fastdid(data = dt, timevar = "time", cohortvar = "G", unitvar = "unit", outcomevar = "y", result_type = "group_time", boot = TRUE, clustervar = "x") #estimate for multiple outcomes -result4 <- fastdid(dt, #the dataset +result4 <- fastdid(data = dt, #the dataset timevar = "time", cohortvar = "G", unitvar = "unit", outcomevar = c("y", "y2"), #name of the outcome columns result_type = "group_time") From 19d2e78865565e0bf2d464995cefc2a0c7703716 Mon Sep 17 00:00:00 2001 From: TsaiLintung <101155567+TsaiLintung@users.noreply.github.com> Date: Thu, 11 Apr 2024 17:13:25 +0800 Subject: [PATCH 12/15] change source --- development/build_source.R | 5 +- development/fastdid_sourcever.R | 579 +++++++++++++++++++------------- 2 files changed, 345 insertions(+), 239 deletions(-) diff --git a/development/build_source.R b/development/build_source.R index cde2d46..24d1b5e 100644 --- a/development/build_source.R +++ b/development/build_source.R @@ -1,16 +1,17 @@ rm(list = ls()) gc() -setwd("~/GitHub/EventStudyCode") +setwd("~/GitHub/fastdid") source_files <- list.files("R", include.dirs = FALSE, full.names = TRUE) ver <- "0.9.2" vername <- "unbalanced" -sink("interactive/fastdid_sourcever.R") +sink("development/fastdid_sourcever.R") cat(paste0("#", as.character(Sys.Date()), "\n")) cat(paste0("message('loading fastdid source ver. ver: ", ver, " (", vername ,"), date: " , as.character(Sys.Date()), "')\n")) +cat("require(data.table);require(stringr);require(BMisc);require(collapse);require(dreamerr);require(parglm)\n") for(file in source_files){ current_file = readLines(file) cat(current_file, sep ="\n") diff --git a/development/fastdid_sourcever.R b/development/fastdid_sourcever.R index 78b3969..f9f4ae4 100644 --- a/development/fastdid_sourcever.R +++ b/development/fastdid_sourcever.R @@ -1,54 +1,63 @@ -#2023-12-21 -message('loading fastdid source ver. ver: 0.9.2 (unbalanced), date: 2023-12-21') -aggregate_gt <- function(gt_result, cohort_sizes, - id_weights, id_cohorts, - result_type, balanced_event_time){ +#2024-04-09 +message('loading fastdid source ver. ver: 0.9.2 (unbalanced), date: 2024-04-09') +require(data.table);require(stringr);require(BMisc);require(collapse);require(dreamerr);require(parglm) +aggregate_gt <- function(gt_result, aux, p){ - gt_att <- gt_result$att - gt_inf_func <- gt_result$inf_func - gt <- gt_result$gt + + #release the stuff + id_cohorts <- aux$dt_inv[, G] - id_dt <- data.table(weight = id_weights/sum(id_weights), G = id_cohorts) + id_dt <- data.table(weight = aux$weights/sum(aux$weights), G = id_cohorts) pg_dt <- id_dt[, .(pg = sum(weight)), by = "G"] - group_time <- gt |> merge(pg_dt, by = "G") + group_time <- gt_result$gt |> merge(pg_dt, by = "G") setorder(group_time, time, G) #change the order to match the order in gtatt - gt_inf_func <- as.matrix(gt_inf_func) + gt_result$inf_func <- as.matrix(gt_result$inf_func) - if(result_type == "group_time"){ + if(p$result_type == "group_time"){ + #don't need to do anything targets <- group_time[, unique(G*max(time)+time)] - inf_matrix <- gt_inf_func - agg_att <- as.vector(gt_att) + inf_matrix <- gt_result$inf_func + agg_att <- as.vector(gt_result$att) } else { - agg_sch <- get_aggregate_scheme(group_time, result_type, id_weights, id_cohorts, balanced_event_time) + #get which gt(s) is a part of the aggregated param + agg_sch <- get_aggregate_scheme(group_time, p$result_type, aux$weights, id_cohorts, p$balanced_event_time) targets <- agg_sch$targets - weights <- as.matrix(agg_sch$weights) #aggregated att - agg_att <- weights %*% gt_att + agg_att <- agg_sch$agg_weights %*% gt_result$att #get the influence from weight estimation #this needs to be optimized!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! - inf_weights <- sapply(asplit(weights, 1), function (x){ - get_weight_influence(x, gt_att, id_weights, id_cohorts, group_time[, .(G, time)]) + inf_weights <- sapply(asplit(agg_sch$agg_weights, 1), function (x){ + get_weight_influence(x, gt_result$att, aux$weights, id_cohorts, group_time[, .(G, time)]) }) #aggregated influence function - inf_matrix <- (gt_inf_func %*% t(weights)) + inf_weights - + inf_matrix <- (gt_result$inf_func %*% t(agg_sch$agg_weights)) + inf_weights + } - return(list(inf_matrix = inf_matrix, agg_att = agg_att, targets = targets)) + + #get se from influence function + agg_se <- get_se(inf_matrix, p$boot, p$biters, aux$cluster, p$clustervar) + + # post process + result <- data.table(targets, agg_att, agg_se) + names(result) <- c("target", "att", "se") + result[,outcome := gt_result$outname] + + return(result) } -get_aggregate_scheme <- function(group_time, result_type, id_weights, id_cohorts, balanced_event_time){ +get_aggregate_scheme <- function(group_time, result_type, weights, id_cohorts, balanced_event_time){ #browser() - weights <- data.table() + agg_weights <- data.table() gt_count <- group_time[, .N] bool_to_pn <- function(x){ifelse(x, 1, -1)} @@ -69,7 +78,7 @@ get_aggregate_scheme <- function(group_time, result_type, id_weights, id_cohorts #for balanced cohort composition in dynamic setting #a cohort us only used if it is seen for all dynamic time - if(result_type == "dynamic" & !is.null(balanced_event_time)){ + if(result_type == "dynamic" & !is.na(balanced_event_time)){ cohorts <- group_time[, .(max_et = max(time-G), min_et = min(time-G)), by = "G"] @@ -91,18 +100,18 @@ get_aggregate_scheme <- function(group_time, result_type, id_weights, id_cohorts group_time[, targeted := NULL] - weights <- rbind(weights, target_weights) + agg_weights <- rbind(agg_weights, target_weights) } - return(list(weights = weights, #a matrix of each target and gt's weight in it + return(list(agg_weights = as.matrix(agg_weights), #a matrix of each target and gt's weight in it targets = targets)) } -get_weight_influence <- function(agg_weights, gt_att, id_weights, id_cohorts, group) { +get_weight_influence <- function(agg_weights, gt_att, weights, id_cohorts, group) { keepers <- which(agg_weights > 0) - id_dt <- data.table(weight = id_weights/sum(id_weights), G = id_cohorts) + id_dt <- data.table(weight = weights/sum(weights), G = id_cohorts) pg_dt <- id_dt[, .(pg = sum(weight)), by = "G"] group <- group |> merge(pg_dt, by = "G") @@ -112,12 +121,12 @@ get_weight_influence <- function(agg_weights, gt_att, id_weights, id_cohorts, gr # effect of estimating weights in the numerator if1 <- sapply(keepers, function(k) { - (id_weights*BMisc::TorF(id_cohorts == group[k,G]) - group[k,pg]) / + (weights*BMisc::TorF(id_cohorts == group[k,G]) - group[k,pg]) / sum(group[keepers,pg]) }) # effect of estimating weights in the denominator if2 <- base::rowSums(sapply(keepers, function(k) { - id_weights*BMisc::TorF(id_cohorts == group[k,G]) - group[k,pg] + weights*BMisc::TorF(id_cohorts == group[k,G]) - group[k,pg] })) %*% t(group[keepers,pg]/(sum(group[keepers,pg])^2)) # return the influence function for the weights @@ -126,24 +135,57 @@ get_weight_influence <- function(agg_weights, gt_att, id_weights, id_cohorts, gr return(inf_weight) } -estimate_did <- function(dt_did, covnames, control_type, - last_coef = NULL, cache_ps_fit, cache_hess){ +get_se <- function(inf_matrix, boot, biters, cluster, clustervar) { + + if(boot){ + + top_quant <- 0.75 + bot_quant <- 0.25 + if(!allNA(clustervar)){ + #take average within the cluster + cluster_n <- stats::aggregate(cluster, by=list(cluster), length)[,2] + inf_matrix <- fsum(inf_matrix, cluster) / cluster_n #the mean without 0 for each cluster of each setting + } + + boot_results <- BMisc::multiplier_bootstrap(inf_matrix, biters = biters) %>% as.data.table() + + boot_top <- boot_results[, lapply(.SD, function(x) stats::quantile(x, top_quant, type=1, na.rm = TRUE))] + boot_bot <- boot_results[, lapply(.SD, function(x) stats::quantile(x, bot_quant, type=1, na.rm = TRUE))] + + dt_se <- rbind(boot_bot, boot_top) %>% transpose() + names(dt_se) <- c("boot_bot", "boot_top") + + se <- dt_se[,(boot_top-boot_bot)/(qnorm(top_quant) - qnorm(bot_quant))] + se[se < sqrt(.Machine$double.eps)*10] <- NA + + } else { + + inf_matrix <- inf_matrix |> as.data.table() + se <- inf_matrix[, lapply(.SD, function(x) sqrt(sum(x^2, na.rm = TRUE)/length(x)^2))] %>% as.vector() #should maybe use n-1 but did use n + + } + return(unlist(se)) +} - # preprocess -------- +estimate_did <- function(dt_did, covvars, control_type, + last_coef = NULL, cache_ps_fit, cache_hess){ + + # preprocess -------- oldn <- dt_did[, .N] data_pos <- which(dt_did[, !is.na(D)]) dt_did <- dt_did[data_pos] n <- dt_did[, .N] - if(is.null(covnames)){ - covvars <- NULL + + if(is.matrix(covvars)){ + ipw <- control_type %in% c("ipw", "dr") + or <- control_type %in% c("reg", "dr") + covvars <- covvars[data_pos,] } else { - covvars <- as.matrix(dt_did[,.SD, .SDcols = covnames]) + ipw <- FALSE + or <- FALSE } - - ipw <- control_type %in% c("ipw", "dr") & !is.null(covvars) - or <- control_type %in% c("reg", "dr") & !is.null(covvars) #OR is REG - + # ipw -------- if(ipw){ @@ -157,19 +199,25 @@ estimate_did <- function(dt_did, covnames, control_type, intercept = FALSE)) class(prop_score_est) <- "glm" #trick the vcov function to think that this is a glm object to dispatch the write method #const is implicitly put into the ipw formula, need to incorporate it manually - hess <- stats::vcov(prop_score_est) * n #for the influence function logit_coef <- prop_score_est$coefficients + + if(anyNA(logit_coef)){ + warning("some propensity score estimation resulted in NA coefficients, likely cause by perfect colinearity") + } + logit_coef[is.na(logit_coef)|abs(logit_coef) > 1e10] <- 0 #put extreme value and na to 0 prop_score_fit <- fitted(prop_score_est) if(max(prop_score_fit) >= 1){warning(paste0("support overlap condition violated for some group_time"))} prop_score_fit <- pmin(1-1e-16, prop_score_fit) #for the ipw + hess <- stats::vcov(prop_score_est) * n #for the influence function + hess[is.na(hess)|abs(hess) > 1e10] <- 0 } else { #when using multiple outcome, ipw cache can be reused hess <- cache_hess prop_score_fit <- cache_ps_fit - logit_coef <- NULL #won't be needing the approximate cache + logit_coef <- NA #won't be needing the approximate cache } #get the results into the main did dt @@ -180,8 +228,8 @@ estimate_did <- function(dt_did, covnames, control_type, } else { prop_score_fit <- rep(1,n) - logit_coef <- NULL - hess <- NULL + logit_coef <- NA + hess <- NA dt_did[, treat_ipw_weight := weights*D] dt_did[, cont_ipw_weight := weights*(1-D)] @@ -232,6 +280,7 @@ estimate_did <- function(dt_did, covnames, control_type, M2 <- colMeans(dt_did[, cont_ipw_weight*(delta_y-weighted_cont_delta-or_delta)] * covvars) score_ps <- dt_did[, weights*(D-ps)] * covvars + asym_linear_ps <- score_ps %*% hess #ipw for control @@ -268,6 +317,7 @@ estimate_did <- function(dt_did, covnames, control_type, #get overall influence function + #if(dt_did[, mean(cont_ipw_weight)] < 1e-10){warning("little/no overlap in covariates between control and treat group, estimates are unstable.")} inf_cont <- (inf_cont_did+inf_cont_ipw+inf_cont_or)/dt_did[, mean(cont_ipw_weight)] inf_treat <- (inf_treat_did+inf_treat_or)/dt_did[,mean(treat_ipw_weight)] inf_func_no_na <- inf_treat - inf_cont @@ -277,26 +327,24 @@ estimate_did <- function(dt_did, covnames, control_type, inf_func_no_na <- inf_func_no_na * oldn / n #adjust the value such that mean over the whole id size give the right result inf_func[data_pos] <- inf_func_no_na - return(list(att = att, inf_func = inf_func, logit_coef = logit_coef, #for next gt cache_ps_fit = prop_score_fit, cache_hess = hess)) #for next outcome } -estimate_did_rc <- function(dt_did, covnames, control_type, +estimate_did_rc <- function(dt_did, covvars, control_type, last_coef = NULL, cache_ps_fit, cache_hess){ #TODO: skip if not enough valid data # preprocess -------- - oldn <- dt_did[, .N] - #separate the dataset into pre and post - + oldn <- dt_did[, .N] data_pos <- which(dt_did[, !is.na(D)]) dt_did <- dt_did[data_pos] n <- dt_did[, .N] + #separate the dataset into pre and post dt_did[, inpre := as.numeric(!is.na(pre.y))] dt_did[, inpost := as.numeric(!is.na(post.y))] n_pre <- dt_did[, sum(!is.na(pre.y))] @@ -305,15 +353,15 @@ estimate_did_rc <- function(dt_did, covnames, control_type, sum_weight_pre <- dt_did[, sum(inpre*weights)] sum_weight_post <- dt_did[, sum(inpost*weights)] - if(is.null(covnames)){ - covvars <- NULL + if(is.matrix(covvars)){ + ipw <- control_type %in% c("ipw", "dr") + or <- control_type %in% c("reg", "dr") + covvars <- covvars[data_pos,] } else { - covvars <- as.matrix(dt_did[,.SD, .SDcols = covnames]) + ipw <- FALSE + or <- FALSE } - - ipw <- control_type %in% c("ipw", "dr") & !is.null(covvars) - or <- control_type %in% c("reg", "dr") & !is.null(covvars) #OR is REG - + # ipw -------- if(ipw){ @@ -346,8 +394,8 @@ estimate_did_rc <- function(dt_did, covnames, control_type, } else { prop_score_fit <- rep(1,n) - logit_coef <- NULL - hess <- NULL + logit_coef <- NA + hess <- NA dt_did[, treat_ipw_weight := weights*D] dt_did[, cont_ipw_weight := weights*(1-D)] @@ -475,8 +523,8 @@ estimate_did_rc <- function(dt_did, covnames, control_type, inf_treat_post <- inf_treat_did_post+inf_treat_or_post inf_cont_pre <- inf_cont_did_pre+inf_cont_ipw_pre+inf_cont_or_pre inf_treat_pre <- inf_treat_did_pre+inf_treat_or_pre + #post process - inf_func_no_na_post <- (inf_treat_post - inf_cont_post) * oldn / n_post #adjust the value such that mean over the whole id size give the right result inf_func_no_na_post[is.na(inf_func_no_na_post)] <- 0 #fill 0 for NA part (no influce if not in this gt) @@ -490,62 +538,74 @@ estimate_did_rc <- function(dt_did, covnames, control_type, cache_ps_fit = prop_score_fit, cache_hess = hess)) #for next outcome } -estimate_gtatt <- function(outcomes_list, outcomevar, covariates, control_type, weights, - cohort_sizes,cohorts,id_size,time_periods, - control_option, allow_unbalance_panel) { +estimate_gtatt <- function(aux, p) { - treated_cohorts <- cohorts[!is.infinite(cohorts)] - - if(!Inf %in% cohorts & control_option != "notyet"){ + if(!Inf %in% aux$cohorts & p$control_option != "notyet"){ warning("no never-treated availble, switching to not-yet-treated control") - control_option <- "notyet" + p$control_option <- "notyet" } - max_control_cohort <- ifelse(control_option == "notyet", max(treated_cohorts), Inf) + + if(!is.na(p$max_control_cohort_diff)){ + warning("max_control_cohort_diff can only be used with not yet") + p$control_option <- "notyet" + } + + treated_cohorts <- aux$cohorts[!is.infinite(aux$cohorts)] cache_ps_fit_list <- list() #for the first outcome won't be able to use cache, empty list returns null for call like cache_ps_fit[["1.3"]] cache_hess_list <- list() outcome_result_list <- list() - - for(outcol in outcomevar){ + for(outcol in p$outcomevar){ - outcomes <- outcomes_list[[outcol]] + outcomes <- aux$outcomes_list[[outcol]] last_coef <- NULL gt <- data.table() gt_att <- c() - gt_inf_func <- data.table(placeholder = rep(NA, id_size)) + gt_inf_func <- data.table(placeholder = rep(NA, aux$id_size)) #populate with NA - for(t in time_periods){ - for(g in cohorts){ + for(t in aux$time_periods){ + for(g in aux$cohorts){ gt_name <- paste0(g, ".", t) #setup and checks base_period <- g-1 - min_control_cohort <- ifelse(control_option == "never", Inf, max(t+1, base_period+1)) #not-yet treated / never treated in both base and "treated" period - if(t == base_period){next} #no treatmenteffect for the base period - if(base_period < min(time_periods)){next} #no treatmenteffect for the first period, since base period is not observed - if(g >= max_control_cohort){next} #no treatmenteffect for never treated or the last treated cohort (notyet) + min_control_cohort <- ifelse(p$control_option == "never", Inf, max(t+1, base_period+1)) #not-yet treated / never treated in both base and "treated" period + + if(!is.na(p$max_control_cohort_diff)){ + max_control_cohort <- min(g+p$max_control_cohort_diff, max(treated_cohorts)) + } else { + max_control_cohort <- ifelse(p$control_option == "notyet", max(treated_cohorts), Inf) + } + + if(t == base_period){next} #no treatment effect for the base period + if(base_period < min(aux$time_periods)){next} #no treatment effect for the first period, since base period is not observed + if(g >= max_control_cohort){next} #no treatment effect for never treated or the last treated cohort (for not yet notyet) if(t >= max_control_cohort){next} #no control available if the last cohort is treated too #select the control and treated cohorts - did_setup <- rep(NA, id_size) - did_setup[get_cohort_pos(cohort_sizes, min_control_cohort, max_control_cohort)] <- 0 - did_setup[get_cohort_pos(cohort_sizes, g)] <- 1 #treated cannot be controls, assign treated after control to overwrite + did_setup <- rep(NA, aux$id_size) + did_setup[get_cohort_pos(aux$cohort_sizes, min_control_cohort, max_control_cohort)] <- 0 + did_setup[get_cohort_pos(aux$cohort_sizes, g)] <- 1 #treated cannot be controls, assign treated after control to overwrite + + #construct the covariates matrix + covvars <- get_covvars(aux$covariates, aux$varycovariates, base_period, t) #construct the 2x2 dataset - cohort_did <- data.table(did_setup, outcomes[[t]], outcomes[[base_period]], weights, covariates) - setnames(cohort_did, c("did_setup", "V2", "V3", "weights"), c("D", "post.y", "pre.y", "weights")) - + cohort_did <- data.table(did_setup, outcomes[[t]], outcomes[[base_period]], aux$weights) + names(cohort_did) <- c("D", "post.y", "pre.y", "weights") + #estimate did - if(!allow_unbalance_panel){ - result <- estimate_did(cohort_did, colnames(covariates), control_type, last_coef, cache_ps_fit_list[[gt_name]], cache_hess_list[[gt_name]]) + if(!p$allow_unbalance_panel){ + result <- estimate_did(cohort_did, covvars, p$control_type, + last_coef, cache_ps_fit_list[[gt_name]], cache_hess_list[[gt_name]]) #cache } else { - result <- estimate_did_rc(cohort_did, colnames(covariates), control_type, last_coef, cache_ps_fit_list[[gt_name]], cache_hess_list[[gt_name]]) + result <- estimate_did_rc(cohort_did, covvars, p$control_type, + last_coef, cache_ps_fit_list[[gt_name]], cache_hess_list[[gt_name]]) #cache } - #collect the result - last_coef <- result$logit_coef + last_coef <- result$logit_coef #for faster convergence in next iter gt <- rbind(gt, data.table(G = g, time = t)) #the sequence matters for the weights gt_att <- c(gt_att, att = result$att) gt_inf_func[[gt_name]] <- result$inf_func @@ -560,10 +620,7 @@ estimate_gtatt <- function(outcomes_list, outcomevar, covariates, control_type, gt_inf_func[,placeholder := NULL] names(gt_att) <- names(gt_inf_func) - outcome_result <- list(att = gt_att, inf_func = gt_inf_func, gt = gt) - outcome_result_list[[outcol]] <- outcome_result - - rm(outcome_result) + outcome_result_list[[outcol]] <- list(att = gt_att, inf_func = gt_inf_func, gt = gt, outname = outcol) } @@ -571,12 +628,37 @@ estimate_gtatt <- function(outcomes_list, outcomevar, covariates, control_type, } get_cohort_pos <- function(cohort_sizes, start_cohort, end_cohort = start_cohort){ - #if(!start_cohort %in% cohort_sizes[,unique(G)]|!end_cohort %in% cohort_sizes[,unique(G)]) {stop("cohort not in cohort_sizes")} start <- cohort_sizes[G < start_cohort, sum(cohort_size)]+1 end <- cohort_sizes[G <= end_cohort, sum(cohort_size)] return(start:end) } +get_covvars <- function(covariates, varycovariates, base_period, t){ + + if(is.list(varycovariates)){ + + precov <- varycovariates[[base_period]] + names(precov) <- paste0("pre_", names(precov)) + postcov <- varycovariates[[t]]-varycovariates[[base_period]] + names(postcov) <- paste0("post_", names(varycovariates[[t]])) + + if(is.data.table(covariates)){ + covvars <- cbind(covariates, cbind(precov, postcov)) #if covariates is na + } else { + covvars <- cbind(const = -1, cbind(precov, postcov)) #if covariates is na + } + covvars <- as.matrix(covvars) + + } else { + if(is.data.table(covariates)){ + covvars <- as.matrix(covariates) #will be NA if covariatesvar have nothing + } else { + covvars <- NA + } + } + return(covvars) +} + #' Fast Staggered DID Estimation #' #' Performs Difference-in-Differences (DID) estimation fast. @@ -588,14 +670,15 @@ get_cohort_pos <- function(cohort_sizes, start_cohort, end_cohort = start_cohort #' @param outcomevar The name of the outcome variable. #' @param control_option The control units used for the DiD estimates. Default is "both". #' @param result_type A character string indicating the type of result to be returned. Default is "group_time". -#' @param balanced_event_time A numeric scalar that indicates the max event time to balance the cohort composition, only meaningful when result_type == "dynamic". Default is NULL +#' @param balanced_event_time A numeric scalar that indicates the max event time to balance the cohort composition, only meaningful when result_type == "dynamic". Default is NA #' @param control_type The method for controlling for covariates. "ipw" for inverse probability weighting, "reg" for outcome regression, or "dr" for doubly-robust #' @param allow_unbalance_panel Whether allow unbalance panel as input (if false will coerce the dataset to a balanced panel). Default is FALSE #' @param boot Logical, indicating whether bootstrapping should be performed. Default is FALSE. #' @param biters The number of bootstrap iterations. Only relevant if boot = TRUE. Default is 1000. #' @param weightvar The name of the weight variable (optional). #' @param clustervar The name of the cluster variable, can only be used when boot == TRUE (optional). -#' @param covariatesvar A character vector containing the names of covariate variables (optional). +#' @param covariatesvar A character vector containing the names of time-invariant covariate variables (optional). +#' @param varycovariatesvar A character vector containing the names of time-varying covariate variables (optional). #' @param copy whether to copy the dataset before processing, set to false to speed up the process, but the input data will be altered. #' @param validate whether to validate the dataset before processing. #' @@ -610,24 +693,24 @@ get_cohort_pos <- function(cohort_sizes, start_cohort, end_cohort = start_cohort #' dt <- simdt$dt #' #' #basic call -#' result <- fastdid(dt, timevar = "time", cohortvar = "G", +#' result <- fastdid(data = dt, timevar = "time", cohortvar = "G", #' unitvar = "unit", outcomevar = "y", #' result_type = "group_time") #' #' #control for covariates -#' result2 <- fastdid(dt, timevar = "time", cohortvar = "G", +#' result2 <- fastdid(data = dt, timevar = "time", cohortvar = "G", #' unitvar = "unit", outcomevar = "y", #' result_type = "group_time", #' covariatesvar = c("x", "x2")) #' #' #bootstrap and clustering -#' result3 <- fastdid(dt, timevar = "time", cohortvar = "G", +#' result3 <- fastdid(data = dt, timevar = "time", cohortvar = "G", #' unitvar = "unit", outcomevar = "y", #' result_type = "group_time", #' boot = TRUE, clustervar = "x") #' #' #estimate for multiple outcomes -#' result4 <- fastdid(dt, #the dataset +#' result4 <- fastdid(data = dt, #the dataset #' timevar = "time", cohortvar = "G", unitvar = "unit", #' outcomevar = c("y", "y2"), #name of the outcome columns #' result_type = "group_time") @@ -635,60 +718,109 @@ get_cohort_pos <- function(cohort_sizes, start_cohort, end_cohort = start_cohort #' @keywords difference-in-differences fast computation panel data estimation did fastdid <- function(data, timevar, cohortvar, unitvar, outcomevar, - control_option="both",result_type="group_time", balanced_event_time = NULL, + control_option="both",result_type="group_time", balanced_event_time = NA, control_type = "ipw", allow_unbalance_panel = FALSE, boot=FALSE, biters = 1000, - weightvar=NULL,clustervar=NULL,covariatesvar = NULL, - copy = TRUE, validate = TRUE + weightvar=NA,clustervar=NA,covariatesvar = NA, varycovariatesvar = NA, + copy = TRUE, validate = TRUE, + max_control_cohort_diff = NA ){ - - # validation arguments -------------------------------------------------------- - + # validate arguments -------------------------------------------------------- + if(!is.data.table(data)){ warning("coercing input into a data.table.") data <- as.data.table(data) } - if(copy){dt <- copy(data)} else {dt <- data} - + dt_names <- names(dt) name_message <- "__ARG__ must be a character scalar and a name of a column from the dataset." check_set_arg(timevar, unitvar, cohortvar, "match", .choices = dt_names, .message = name_message) - covariate_message <- "__ARG__ must be NULL or a character vector which are all names of columns from the dataset." - check_set_arg(covariatesvar, outcomevar, - "NULL | multi match", .choices = dt_names, .message = covariate_message) + covariate_message <- "__ARG__ must be NA or a character vector which are all names of columns from the dataset." + check_set_arg(varycovariatesvar, covariatesvar, outcomevar, + "NA | multi match", .choices = dt_names, .message = covariate_message) - checkvar_message <- "__ARG__ must be NULL or a character scalar if a name of columns from the dataset." + checkvar_message <- "__ARG__ must be NA or a character scalar if a name of columns from the dataset." check_set_arg(weightvar, clustervar, - "NULL | match", .choices = dt_names, .message = checkvar_message) + "NA | match", .choices = dt_names, .message = checkvar_message) check_set_arg(control_option, "match", .choices = c("both", "never", "notyet")) #kinda bad names since did's notyet include both notyet and never check_set_arg(control_type, "match", .choices = c("ipw", "reg", "dr")) check_arg(copy, validate, boot, allow_unbalance_panel, "scalar logical") - if(!is.null(balanced_event_time)){ + if(!is.na(balanced_event_time)){ if(result_type != "dynamic"){stop("balanced_event_time is only meaningful with result_type == 'dynamic'")} check_arg(balanced_event_time, "numeric scalar") } - if(allow_unbalance_panel == TRUE & control_type %in% c("dr", "reg")){ stop("fastdid currently only supprts ipw when allowing for unbalanced panels.") } + if(allow_unbalance_panel == TRUE & !allNA(varycovariatesvar)){ + stop("fastdid currently only supprts time varying covariates when allowing for unbalanced panels.") + } + if(any(covariatesvar %in% varycovariatesvar) & !allNA(varycovariatesvar) & !allNA(covariatesvar)){ + stop("time-varying var and invariant var have overlaps.") + } + if(!boot & !allNA(clustervar)){ + stop("clustering only available with bootstrap") + } + + p <- list(timevar = timevar, + cohortvar = cohortvar, + unitvar = unitvar, + outcomevar = outcomevar, + weightvar = weightvar, + clustervar = clustervar, + covariatesvar = covariatesvar, + varycovariatesvar = varycovariatesvar, + control_option = control_option, + result_type = result_type, + balanced_event_time = balanced_event_time, + control_type = control_type, + allow_unbalance_panel = allow_unbalance_panel, + boot = boot, + biters = biters, + max_control_cohort_diff = max_control_cohort_diff) + - setnames(dt, c(timevar, cohortvar, unitvar), c("time", "G", "unit")) - # validate data ----------------------------------------------------- + setnames(dt, c(timevar, cohortvar, unitvar), c("time", "G", "unit")) if(validate){ - varnames <- c("time", "G", "unit", outcomevar,weightvar,clustervar,covariatesvar) - dt <- validate_did(dt, covariatesvar, varnames, balanced_event_time, allow_unbalance_panel) + varnames <- c("time", "G", "unit", outcomevar,weightvar,clustervar,covariatesvar,varycovariatesvar) + dt <- validate_did(dt, varnames, p) } # preprocess ----------------------------------------------------------- #make dt conform to the WLOG assumptions of fastdid + coerce_result <- coerce_dt(dt, p) #also changed dt + dt <- coerce_result$dt + # get auxiliary data + aux <- get_auxdata(dt, p) + + # main part ------------------------------------------------- + + # estimate attgt + gt_result_list <- estimate_gtatt(aux, p) + + # aggregate the result by outcome + all_result <- rbindlist(lapply(gt_result_list, function(x){ + result <- aggregate_gt(x, aux, p) + #convert "targets" back to meaningful parameter identifiers like cohort 1 post, time 2 post + result <- result |> convert_targets(result_type, coerce_result$time_change) + return(result) + })) + + return(all_result) + +} + +# small steps ---------------------------------------------------------------------- + +coerce_dt <- function(dt, p){ #change to int before sorting if(!is.numeric(dt[, G])){ @@ -698,7 +830,7 @@ fastdid <- function(data, dt[, time := as.numeric(time)] } - if(allow_unbalance_panel){ + if(p$allow_unbalance_panel){ dt_inv_raw <- dt[dt[, .I[1], by = unit]$V1] setorder(dt_inv_raw, G) dt_inv_raw[, new_unit := 1:.N] #let unit start from 1 .... N, useful for knowing which unit is missing @@ -707,11 +839,10 @@ fastdid <- function(data, } setorder(dt, time, G, unit) #sort the dataset essential for the sort-once-quick-access - #deal with time, coerice time to 1,2,3,4,5....... time_periods <- dt[, unique(time)] time_size <- length(time_periods) - + time_offset <- min(time_periods) - 1 #assume time starts at 1, first is min after sort :) if(time_offset != 0){ dt[, G := G-time_offset] @@ -728,15 +859,27 @@ fastdid <- function(data, dt[time != 1, time := (time-1)/time_step+1] } - #construct the outcomes list for fast access later + return(list(dt = dt, + time_change = list(time_step = time_step, + max_time = max(time_periods), + time_offset = time_offset))) + +} + +get_auxdata <- function(dt, p){ + + time_periods <- dt[, unique(time)] id_size <- dt[, uniqueN(unit)] - outcomes_list <- list() + setorder(dt, time, G, unit) #sort the dataset essential for the sort-once-quick-access + + #construct the outcomes list for fast access later #loop for multiple outcome - for(outcol in outcomevar){ + outcomes_list <- list() + for(outcol in p$outcomevar){ outcomes <- list() - if(!allow_unbalance_panel){ + if(!p$allow_unbalance_panel){ for(i in time_periods){ start <- (i-1)*id_size+1 end <- i*id_size @@ -754,78 +897,67 @@ fastdid <- function(data, } } - + outcomes_list[[outcol]] <- outcomes } - + #the time-invariant parts - if(!allow_unbalance_panel){ + if(!p$allow_unbalance_panel){ dt_inv <- dt[1:id_size] } else { - dt_inv <- dt[dt[, .I[1], by = unit]$V1] + dt_inv <- dt[dt[, .I[1], by = unit]$V1] #the first observation setorder(dt_inv, unit) #can't move this outside } - - - cohorts <- dt_inv[, unique(G)] cohort_sizes <- dt_inv[, .(cohort_size = .N) , by = G] - + # the optional columns - if(!is.null(covariatesvar)){ - covariates <- cbind(const = -1, dt_inv[,.SD, .SDcols = covariatesvar]) # const + varycovariates <- list() + if(!allNA(p$varycovariatesvar)){ + for(i in time_periods){ + start <- (i-1)*id_size+1 + end <- i*id_size + varycovariates[[i]] <- dt[seq(start,end), .SD, .SDcols = p$varycovariatesvar] + } } else { - covariates <- NULL + varycovariates <- NA } - if(!is.null(clustervar)){ - cluster <- dt_inv[, .SD, .SDcols = clustervar] |> unlist() - } else {cluster <- NULL} - - if(!is.null(weightvar)){ - weights <- dt_inv[, .SD, .SDcols = weightvar] |> unlist() - } else {weights <- rep(1,id_size)} + if(!allNA(p$covariatesvar)){ + covariates <- cbind(const = -1, dt_inv[,.SD, .SDcols = p$covariatesvar]) + } else { + covariates <- NA + } - # main part ------------------------------------------------- - - # attgt - gt_result_list <- estimate_gtatt(outcomes_list, outcomevar, covariates, control_type, weights, - cohort_sizes,cohorts,id_size,time_periods, #info about the dt - control_option, allow_unbalance_panel) + if(!is.na(p$clustervar)){ + cluster <- dt_inv[, .SD, .SDcols = p$clustervar] |> unlist() + } else { + cluster <- NA + } - all_result <- data.table() - for(outcol in outcomevar){ - gt_result <- gt_result_list[[outcol]] - - # aggregate att and inf function - agg_result <- aggregate_gt(gt_result, cohort_sizes, - weights, dt_inv[, G], - result_type, balanced_event_time) - - #get se from the influence function - agg_se <- get_se(agg_result$inf_matrix, boot, biters, cluster) - - # post process ----------------------------------------------- - - result <- data.table(agg_result$targets, agg_result$agg_att, agg_se) - names(result) <- c("target", "att", "se") - - #convert "targets" back to meaningful parameter identifiers like cohort 1 post, time 2 post - result <- result |> convert_targets(result_type, time_offset, time_step, max(time_periods)) - result[, outcome := outcol] - all_result <- rbind(all_result, result) - - rm(result) - + if(!is.na(p$weightvar)){ + weights <- dt_inv[, .SD, .SDcols = p$weightvar] |> unlist() + } else { + weights <- rep(1, id_size) } - - return(all_result) + + aux <- list(time_periods = time_periods, + id_size = id_size, + outcomes_list = outcomes_list, + dt_inv= dt_inv, + cohorts = cohorts, + cohort_sizes = cohort_sizes, + varycovariates = varycovariates, + covariates = covariates, + cluster = cluster, + weights = weights) + + return(aux) } -convert_targets <- function(results, result_type, - time_offset, time_step, max_time){ +convert_targets <- function(results, result_type, t){ if(result_type == "dynamic"){ setnames(results, "target", "event_time") @@ -833,23 +965,23 @@ convert_targets <- function(results, result_type, } else if (result_type == "cohort"){ results[, type := ifelse(target >= 0, "post", "pre")] - results[, target := recover_time(abs(target), time_offset, time_step)] + results[, target := recover_time(abs(target), t$time_offset, t$time_step)] setnames(results, "target", "cohort") } else if (result_type == "calendar"){ results[, type := ifelse(target >= 0, "post", "pre")] - results[, target := recover_time(abs(target), time_offset, time_step)] + results[, target := recover_time(abs(target), t$time_offset, t$time_step)] setnames(results, "target", "time") } else if (result_type == "group_time"){ - results[, cohort := floor((target-1)/max_time)] - results[, time := (target-cohort*max_time)] + results[, cohort := floor((target-1)/t$max_time)] + results[, time := (target-cohort*t$max_time)] #recover the time - results[, cohort := recover_time(cohort, time_offset, time_step)] - results[, time := recover_time(time, time_offset, time_step)] + results[, cohort := recover_time(cohort, t$time_offset, t$time_step)] + results[, time := recover_time(time, t$time_offset, t$time_step)] results[, target := NULL] @@ -864,38 +996,6 @@ recover_time <- function(time, time_offset, time_step){ return(((time-1)*time_step)+1+time_offset) } -get_se <- function(inf_matrix, boot, biters, cluster) { - - if(boot){ - - top_quant <- 0.75 - bot_quant <- 0.25 - if(!is.null(cluster)){ - cluster_n <- stats::aggregate(cluster, by=list(cluster), length)[,2] - inf_matrix <- fsum(inf_matrix, cluster) / cluster_n #the mean without 0 for each cluster of each setting - } - - boot_results <- BMisc::multiplier_bootstrap(inf_matrix, biters = biters) %>% as.data.table() - - boot_top <- boot_results[, lapply(.SD, function(x) stats::quantile(x, top_quant, type=1, na.rm = TRUE)),] - boot_bot <- boot_results[, lapply(.SD, function(x) stats::quantile(x, bot_quant, type=1, na.rm = TRUE)),] - - dt_se <- rbind(boot_bot, boot_top) %>% transpose() - names(dt_se) <- c("boot_bot", "boot_top") - - se <- dt_se[,(boot_top-boot_bot)/(qnorm(top_quant) - qnorm(bot_quant))] - se[se < sqrt(.Machine$double.eps)*10] <- NA - - } else { - if(!is.null(cluster)){stop("clustering only available with bootstrap")} - inf_matrix <- inf_matrix |> as.data.table() - se <- inf_matrix[, lapply(.SD, function(x) sqrt(sum(x^2, na.rm = TRUE)/length(x)^2))] %>% as.vector() #should maybe use n-1 but did use n - - } - return(unlist(se)) -} - - NULL if(FALSE){ @@ -1008,7 +1108,7 @@ plot_did_dynamics <-function(dt, #' #' @export sim_did <- function(sample_size, time_period, untreated_prop = 0.3, epsilon_size = 0.001, - cov = "no", hetero = "all", second_outcome = FALSE, second_cov = FALSE, na = "none", + cov = "no", hetero = "all", second_outcome = FALSE, second_cov = FALSE, vary_cov = FALSE, na = "none", balanced = TRUE, seed = NA, stratify = FALSE, treatment_assign = "latent"){ if(!is.na(seed)){set.seed(seed)} @@ -1072,8 +1172,15 @@ sim_did <- function(sample_size, time_period, untreated_prop = 0.3, epsilon_size dt[, D := as.integer(time >= G)] + #add time_varying covariates + if(vary_cov){ + dt[, xvar := pmin(G, time_period+4)*time+rnorm(sample_size*time_period, 0,30)] #should be confounding....? + } else { + dt[, xvar := 1] + } + #untreated potential outcomes - dt[, y0 := unit_fe + time_fe + x*x_trend + x2*x_trend + rnorm(sample_size*time_period, sd = epsilon_size)] + dt[, y0 := unit_fe + time_fe + (x+x2)*x_trend + xvar + rnorm(sample_size*time_period, sd = epsilon_size)] #generate gtatt att <- CJ(G = 1:time_period, time = 1:time_period) @@ -1093,7 +1200,7 @@ sim_did <- function(sample_size, time_period, untreated_prop = 0.3, epsilon_size #potential outcome dt[, y1 := y0 + tau] dt[, y := y1*D + y0*(1-D)] - dt <- dt[, .SD, .SDcols = c("time", "G", "unit", "x", "x2", "y", "s")] + dt <- dt[, .SD, .SDcols = c("time", "G", "unit", "x", "x2", "y", "s", "xvar")] #additional ----------------- @@ -1120,9 +1227,6 @@ sim_did <- function(sample_size, time_period, untreated_prop = 0.3, epsilon_size - - - set_max_thread <- function(){ data.table::setDTthreads(0) options(kit.nThread = getDTthreads()) @@ -1132,18 +1236,18 @@ reverse_col <- function(x){ return(x[,ncol(x):1]) } -validate_did <- function(dt,covariatesvar,varnames, balanced_event_time, allow_unbalance_panel){ +validate_did <- function(dt,varnames,p){ + raw_unit_size <- dt[, uniqueN(unit)] raw_time_size <- dt[, uniqueN(time)] - - if(!is.null(balanced_event_time)){ - if(balanced_event_time > dt[, max(time-G)]){stop("balanced_event_time is larger than the max event time in the data")} + if(!is.na(p$balanced_event_time)){ + if(p$balanced_event_time > dt[, max(time-G)]){stop("balanced_event_time is larger than the max event time in the data")} } #doesn't allow missing value for now - for(col in c(covariatesvar, varnames)){ - if(is.null(col)){next} + for(col in varnames){ + if(is.na(col)){next} na_obs <- whichNA(dt[[col]]) if(length(na_obs) != 0){ warning("missing values detected in ", col, ", removing ", length(na_obs), " observation.") @@ -1151,17 +1255,18 @@ validate_did <- function(dt,covariatesvar,varnames, balanced_event_time, allow_u } } - if(!is.null(covariatesvar)){ - - if(uniqueN(dt, by = c("unit", covariatesvar)) > raw_unit_size){ - warning("some covariates is time-varying, fastdid only use the first observation for covariates.") - } - - for(cov in covariatesvar){ + + if(!allNA(p$covariatesvar) && uniqueN(dt, by = c("unit", p$covariatesvar)) > raw_unit_size){ + warning("some covariates is time-varying, fastdid only use the first observation for covariates.") + } + + + if(!allNA(p$covariatesvar)|!allNA(p$varycovariatesvar)){ + for(cov in c(p$covariatesvar, p$varycovariatesvar)){ + if(is.na(cov)){next} #check covaraites is not constant if(fnunique(dt[, get(cov)[1], by = "unit"][, V1]) == 1)stop(cov, " have no variation") } - } #check balanced panel @@ -1172,7 +1277,7 @@ validate_did <- function(dt,covariatesvar,varnames, balanced_event_time, allow_u } #check if any is missing - if(!allow_unbalance_panel){ + if(!p$allow_unbalance_panel){ unit_count <- dt[, .(count = .N), by = unit] if(any(unit_count[, count < raw_time_size])){ mis_unit <- unit_count[count < raw_time_size] From 9e606d285a89f1539e2621c8e4b437312e96263f Mon Sep 17 00:00:00 2001 From: TsaiLintung <101155567+TsaiLintung@users.noreply.github.com> Date: Tue, 7 May 2024 11:28:55 +0800 Subject: [PATCH 13/15] timing stuff mostly done --- R/estimate_gtatt.R | 66 ++++++++++++++++++++----------- R/fastdid.R | 19 +++++++-- inst/tinytest/test_1_fastdid.R | 15 ++++++- inst/tinytest/test_2_compare_gt.R | 30 ++++++++++++++ 4 files changed, 102 insertions(+), 28 deletions(-) diff --git a/R/estimate_gtatt.R b/R/estimate_gtatt.R index eb165d5..93a56ac 100644 --- a/R/estimate_gtatt.R +++ b/R/estimate_gtatt.R @@ -1,17 +1,11 @@ estimate_gtatt <- function(aux, p) { + #chcek if there is availble never-treated group if(!Inf %in% aux$cohorts & p$control_option != "notyet"){ warning("no never-treated availble, switching to not-yet-treated control") p$control_option <- "notyet" } - if(!is.na(p$max_control_cohort_diff)){ - warning("max_control_cohort_diff can only be used with not yet") - p$control_option <- "notyet" - } - - treated_cohorts <- aux$cohorts[!is.infinite(aux$cohorts)] - cache_ps_fit_list <- list() #for the first outcome won't be able to use cache, empty list returns null for call like cache_ps_fit[["1.3"]] cache_hess_list <- list() outcome_result_list <- list() @@ -27,26 +21,14 @@ estimate_gtatt <- function(aux, p) { for(g in aux$cohorts){ gt_name <- paste0(g, ".", t) - - #setup and checks - base_period <- g-1 - min_control_cohort <- ifelse(p$control_option == "never", Inf, max(t+1, base_period+1)) #not-yet treated / never treated in both base and "treated" period - - if(!is.na(p$max_control_cohort_diff)){ - max_control_cohort <- min(g+p$max_control_cohort_diff, max(treated_cohorts)) + if(p$base_period == "universal"){ + base_period <- g-1-p$anticipation } else { - max_control_cohort <- ifelse(p$control_option == "notyet", max(treated_cohorts), Inf) + base_period <- ifelse(t>=g, g-1-p$anticipation, t-1) } - if(t == base_period){next} #no treatment effect for the base period - if(base_period < min(aux$time_periods)){next} #no treatment effect for the first period, since base period is not observed - if(g >= max_control_cohort){next} #no treatment effect for never treated or the last treated cohort (for not yet notyet) - if(t >= max_control_cohort){next} #no control available if the last cohort is treated too - - #select the control and treated cohorts - did_setup <- rep(NA, aux$id_size) - did_setup[get_cohort_pos(aux$cohort_sizes, min_control_cohort, max_control_cohort)] <- 0 - did_setup[get_cohort_pos(aux$cohort_sizes, g)] <- 1 #treated cannot be controls, assign treated after control to overwrite + did_setup <- get_did_setup(g, t, base_period, aux, p) + if(is.null(did_setup)){next} #no gtatt if no did setup #construct the covariates matrix covvars <- get_covvars(aux$covariates, aux$varycovariates, base_period, t) @@ -87,6 +69,42 @@ estimate_gtatt <- function(aux, p) { return(outcome_result_list) } +get_did_setup <- function(g, t, base_period, aux, p){ + + treated_cohorts <- aux$cohorts[!is.infinite(aux$cohorts)] + + #setup and checks + + if(p$control_option == "never"){ + min_control_cohort <- Inf + } else { + if(!is.infinite(p$min_control_cohort_diff)){ + min_control_cohort <- max(g+p$min_control_cohort_diff, min(treated_cohorts)) + } else { + min_control_cohort <- max(t, base_period)+p$anticipation+1 + } + } + + if(!is.infinite(p$max_control_cohort_diff)){ + max_control_cohort <- min(g+p$max_control_cohort_diff, max(treated_cohorts)) + } else { + max_control_cohort <- ifelse(p$control_option == "notyet", max(treated_cohorts), Inf) + } + + if(t == base_period | #no treatment effect for the base period + base_period < min(aux$time_periods) | #no treatment effect for the first period, since base period is not observed + g >= max_control_cohort | #no treatment effect for never treated or the last treated cohort (for not yet notyet) + t >= max_control_cohort){ #no control available if the last cohort is treated too + return(NULL) + } else { + #select the control and treated cohorts + did_setup <- rep(NA, aux$id_size) + did_setup[get_cohort_pos(aux$cohort_sizes, min_control_cohort, max_control_cohort)] <- 0 + did_setup[get_cohort_pos(aux$cohort_sizes, g)] <- 1 #treated cannot be controls, assign treated after control to overwrite + return(did_setup) + } +} + get_cohort_pos <- function(cohort_sizes, start_cohort, end_cohort = start_cohort){ start <- cohort_sizes[G < start_cohort, sum(cohort_size)]+1 end <- cohort_sizes[G <= end_cohort, sum(cohort_size)] diff --git a/R/fastdid.R b/R/fastdid.R index bd4a437..3a3d1ba 100644 --- a/R/fastdid.R +++ b/R/fastdid.R @@ -20,6 +20,7 @@ #' @param varycovariatesvar A character vector containing the names of time-varying covariate variables (optional). #' @param copy whether to copy the dataset before processing, set to false to speed up the process, but the input data will be altered. #' @param validate whether to validate the dataset before processing. +#' @param anticipation periods with aniticipation (delta in CS, default is 0, reference period is g - delta - 1). #' #' @import data.table parglm stringr dreamerr BMisc #' @importFrom stats quantile vcov sd binomial fitted qnorm rnorm as.formula @@ -60,9 +61,9 @@ fastdid <- function(data, timevar, cohortvar, unitvar, outcomevar, control_option="both",result_type="group_time", balanced_event_time = NA, control_type = "ipw", allow_unbalance_panel = FALSE, boot=FALSE, biters = 1000, - weightvar=NA,clustervar=NA,covariatesvar = NA, varycovariatesvar = NA, + weightvar=NA,clustervar=NA, covariatesvar = NA, varycovariatesvar = NA, copy = TRUE, validate = TRUE, - max_control_cohort_diff = NA + max_control_cohort_diff = Inf, anticipation = 0, min_control_cohort_diff = -Inf, base_period = "universal" ){ # validate arguments -------------------------------------------------------- @@ -87,7 +88,9 @@ fastdid <- function(data, check_set_arg(control_option, "match", .choices = c("both", "never", "notyet")) #kinda bad names since did's notyet include both notyet and never check_set_arg(control_type, "match", .choices = c("ipw", "reg", "dr")) + check_set_arg(base_period, "match", .choices = c("varying", "universal")) check_arg(copy, validate, boot, allow_unbalance_panel, "scalar logical") + check_arg(max_control_cohort_diff, min_control_cohort_diff, anticipation, "scalar numeric") if(!is.na(balanced_event_time)){ if(result_type != "dynamic"){stop("balanced_event_time is only meaningful with result_type == 'dynamic'")} @@ -106,6 +109,13 @@ fastdid <- function(data, stop("clustering only available with bootstrap") } + # coerce non-sensible option + + if((!is.infinite(max_control_cohort_diff) | !is.infinite(min_control_cohort_diff)) & control_option == "never"){ + warning("control_cohort_diff can only be used with not yet") + p$control_option <- "notyet" + } + p <- list(timevar = timevar, cohortvar = cohortvar, unitvar = unitvar, @@ -121,7 +131,10 @@ fastdid <- function(data, allow_unbalance_panel = allow_unbalance_panel, boot = boot, biters = biters, - max_control_cohort_diff = max_control_cohort_diff) + max_control_cohort_diff = max_control_cohort_diff, + min_control_cohort_diff = min_control_cohort_diff, + anticipation = anticipation, + base_period = base_period) # validate data ----------------------------------------------------- diff --git a/inst/tinytest/test_1_fastdid.R b/inst/tinytest/test_1_fastdid.R index 7acb010..cf112ec 100644 --- a/inst/tinytest/test_1_fastdid.R +++ b/inst/tinytest/test_1_fastdid.R @@ -63,7 +63,20 @@ expect_silent(fastdid(dt, timevar = "time", cohortvar = "G", unitvar = "unit",ou expect_silent(fastdid(dt, timevar = "time", cohortvar = "G", unitvar = "unit",outcomevar = "y", result_type = "group_time", allow_unbalance_panel = TRUE, max_control_cohort_diff = 2), - info = "balance panel false but dt is balance") + info = "max control cohort diff") + +expect_silent(fastdid(dt, timevar = "time", cohortvar = "G", unitvar = "unit",outcomevar = "y", result_type = "group_time", allow_unbalance_panel = TRUE, + min_control_cohort_diff = 4), + info = "min control cohort diff") + +expect_silent(fastdid(dt, timevar = "time", cohortvar = "G", unitvar = "unit",outcomevar = "y", result_type = "group_time", allow_unbalance_panel = TRUE, + anticipation = 2), + info = "anticipation") + +expect_silent(fastdid(dt, timevar = "time", cohortvar = "G", unitvar = "unit",outcomevar = "y", result_type = "group_time", allow_unbalance_panel = TRUE, + base_period = "varying"), + info = "baseperiod vary") + # dt that needs adjustment --------------------------- diff --git a/inst/tinytest/test_2_compare_gt.R b/inst/tinytest/test_2_compare_gt.R index 4106183..479a8b2 100644 --- a/inst/tinytest/test_2_compare_gt.R +++ b/inst/tinytest/test_2_compare_gt.R @@ -10,6 +10,7 @@ dt <- simdt$dt est_diff_ratio <- function(result, did_result){ did_result_dt <- data.table(cohort = did_result$group, time = did_result$t, did_att = did_result$att, did_se = did_result$se) + did_result_dt <- did_result_dt[did_att != 0] compare <- did_result_dt |> merge(result, by = c("cohort", "time"), all = TRUE) att_diff_per <- compare[, sum(abs(did_att-att), na.rm = TRUE)/sum(did_att, na.rm = TRUE)] se_diff_per <- compare[, sum(abs(did_se-se), na.rm = TRUE)/sum(did_se, na.rm = TRUE)] @@ -85,6 +86,35 @@ expect_equal(est_diff_ratio(result, did_result), c(0,0), tolerance = tol, info = "covariates dr") rm(result, did_result) +# anticipation ----------------- + +result <- fastdid(dt, timevar = "time", cohortvar = "G", unitvar = "unit",outcomevar = "y", result_type = "group_time", + control_type = "dr", + covariatesvar = c("x"), anticipation = 2) +did_result <- did::att_gt(yname = "y",gname = "G",idname = "unit",tname = "time",data = dt,base_period = "universal",est_method = "dr",cband = FALSE, + xformla = ~x, + control_group = "notyettreated", + clustervars = NULL, + bstrap = FALSE, anticipation = 2) + +expect_equal(est_diff_ratio(result, did_result), c(0,0), tolerance = tol, + info = "anticipation") +rm(result, did_result) + +# alternative base period ----------------- + +result <- fastdid(dt, timevar = "time", cohortvar = "G", unitvar = "unit",outcomevar = "y", result_type = "group_time", + control_type = "dr", + covariatesvar = c("x"), base_period = "varying") +did_result <- did::att_gt(yname = "y",gname = "G",idname = "unit",tname = "time",data = dt,base_period = "varying",est_method = "dr",cband = FALSE, + xformla = ~x, + control_group = "notyettreated", + clustervars = NULL, + bstrap = FALSE) + +expect_equal(est_diff_ratio(result, did_result), c(0,0), tolerance = tol, + info = "varying base period") +rm(result, did_result) # bootstrap -------------------- From 7237787631b8b569879c5a7030e4cfde6a3a18c1 Mon Sep 17 00:00:00 2001 From: TsaiLintung <101155567+TsaiLintung@users.noreply.github.com> Date: Tue, 7 May 2024 11:41:56 +0800 Subject: [PATCH 14/15] add filtervar --- R/estimate_gtatt.R | 5 +++++ R/fastdid.R | 22 ++++++++++++++++++---- R/validate_did.R | 5 ++++- inst/tinytest/test_1_fastdid.R | 8 ++++++++ 4 files changed, 35 insertions(+), 5 deletions(-) diff --git a/R/estimate_gtatt.R b/R/estimate_gtatt.R index 93a56ac..ad0a531 100644 --- a/R/estimate_gtatt.R +++ b/R/estimate_gtatt.R @@ -101,6 +101,11 @@ get_did_setup <- function(g, t, base_period, aux, p){ did_setup <- rep(NA, aux$id_size) did_setup[get_cohort_pos(aux$cohort_sizes, min_control_cohort, max_control_cohort)] <- 0 did_setup[get_cohort_pos(aux$cohort_sizes, g)] <- 1 #treated cannot be controls, assign treated after control to overwrite + + if(!is.na(p$filtervar)){ + did_setup[!aux$filters[[base_period]]] <- NA #only use units with filter == TRUE at base period + } + return(did_setup) } } diff --git a/R/fastdid.R b/R/fastdid.R index 3a3d1ba..d950f85 100644 --- a/R/fastdid.R +++ b/R/fastdid.R @@ -61,7 +61,7 @@ fastdid <- function(data, timevar, cohortvar, unitvar, outcomevar, control_option="both",result_type="group_time", balanced_event_time = NA, control_type = "ipw", allow_unbalance_panel = FALSE, boot=FALSE, biters = 1000, - weightvar=NA,clustervar=NA, covariatesvar = NA, varycovariatesvar = NA, + weightvar=NA,clustervar=NA, covariatesvar = NA, varycovariatesvar = NA, filtervar = NA, copy = TRUE, validate = TRUE, max_control_cohort_diff = Inf, anticipation = 0, min_control_cohort_diff = -Inf, base_period = "universal" ){ @@ -83,7 +83,7 @@ fastdid <- function(data, "NA | multi match", .choices = dt_names, .message = covariate_message) checkvar_message <- "__ARG__ must be NA or a character scalar if a name of columns from the dataset." - check_set_arg(weightvar, clustervar, + check_set_arg(weightvar, clustervar, filtervar, "NA | match", .choices = dt_names, .message = checkvar_message) check_set_arg(control_option, "match", .choices = c("both", "never", "notyet")) #kinda bad names since did's notyet include both notyet and never @@ -122,6 +122,7 @@ fastdid <- function(data, outcomevar = outcomevar, weightvar = weightvar, clustervar = clustervar, + filtervar = filtervar, covariatesvar = covariatesvar, varycovariatesvar = varycovariatesvar, control_option = control_option, @@ -142,7 +143,7 @@ fastdid <- function(data, setnames(dt, c(timevar, cohortvar, unitvar), c("time", "G", "unit")) if(validate){ - varnames <- c("time", "G", "unit", outcomevar,weightvar,clustervar,covariatesvar,varycovariatesvar) + varnames <- c("time", "G", "unit",outcomevar,weightvar,clustervar,covariatesvar,varycovariatesvar,filtervar) dt <- validate_did(dt, varnames, p) } @@ -277,6 +278,18 @@ get_auxdata <- function(dt, p){ varycovariates <- NA } + # filters + filters <- list() + if(!is.na(p$filtervar)){ + for(i in time_periods){ + start <- (i-1)*id_size+1 + end <- i*id_size + filters[[i]] <- dt[seq(start,end), .SD, .SDcols = p$filtervar] + } + } else { + filters <- NA + } + if(!allNA(p$covariatesvar)){ covariates <- cbind(const = -1, dt_inv[,.SD, .SDcols = p$covariatesvar]) } else { @@ -304,7 +317,8 @@ get_auxdata <- function(dt, p){ varycovariates = varycovariates, covariates = covariates, cluster = cluster, - weights = weights) + weights = weights, + filters = filters) return(aux) diff --git a/R/validate_did.R b/R/validate_did.R index 839f751..c4a9844 100644 --- a/R/validate_did.R +++ b/R/validate_did.R @@ -7,6 +7,10 @@ validate_did <- function(dt,varnames,p){ if(p$balanced_event_time > dt[, max(time-G)]){stop("balanced_event_time is larger than the max event time in the data")} } + if(!is.na(p$filtervar) && !is.logical(dt[[p$filtervar]])){ + stop("filter var needs to be a logical column") + } + #doesn't allow missing value for now for(col in varnames){ if(is.na(col)){next} @@ -17,7 +21,6 @@ validate_did <- function(dt,varnames,p){ } } - if(!allNA(p$covariatesvar) && uniqueN(dt, by = c("unit", p$covariatesvar)) > raw_unit_size){ warning("some covariates is time-varying, fastdid only use the first observation for covariates.") } diff --git a/inst/tinytest/test_1_fastdid.R b/inst/tinytest/test_1_fastdid.R index cf112ec..207f497 100644 --- a/inst/tinytest/test_1_fastdid.R +++ b/inst/tinytest/test_1_fastdid.R @@ -77,6 +77,14 @@ expect_silent(fastdid(dt, timevar = "time", cohortvar = "G", unitvar = "unit",ou base_period = "varying"), info = "baseperiod vary") +# filtervar ------------------------------------------ + +dt2 <- copy(dt) +dt2[, f := x>0] + +expect_silent(fastdid(dt2, timevar = "time", cohortvar = "G", unitvar = "unit",outcomevar = "y", result_type = "group_time", allow_unbalance_panel = TRUE, + base_period = "varying", filtervar = "f"), + info = "filtervar") # dt that needs adjustment --------------------------- From 3d1b82be8afdf8f4e766a02f9df84d22f470f811 Mon Sep 17 00:00:00 2001 From: TsaiLintung <101155567+TsaiLintung@users.noreply.github.com> Date: Tue, 7 May 2024 13:42:24 +0800 Subject: [PATCH 15/15] 0.9.3 --- R/fastdid.R | 4 + R/global.R | 16 +-- R/sim_did.R | 1 + README.md | 29 ++-- development/build_source.R | 18 +-- .../{fastdid_0_9_2.R => fastdid_0_9_3.R} | 135 +++++++++++++----- development/source_head.R | 3 - man/fastdid.Rd | 16 ++- man/sim_did.Rd | 2 + 9 files changed, 148 insertions(+), 76 deletions(-) rename development/{fastdid_0_9_2.R => fastdid_0_9_3.R} (91%) delete mode 100644 development/source_head.R diff --git a/R/fastdid.R b/R/fastdid.R index d950f85..45a28fe 100644 --- a/R/fastdid.R +++ b/R/fastdid.R @@ -18,9 +18,13 @@ #' @param clustervar The name of the cluster variable, can only be used when boot == TRUE (optional). #' @param covariatesvar A character vector containing the names of time-invariant covariate variables (optional). #' @param varycovariatesvar A character vector containing the names of time-varying covariate variables (optional). +#' @param filtervar A logical vector that specifies which units to use (can be time varying, units will only be included in 2x2 when filtervar is TRUE is the base period) #' @param copy whether to copy the dataset before processing, set to false to speed up the process, but the input data will be altered. #' @param validate whether to validate the dataset before processing. #' @param anticipation periods with aniticipation (delta in CS, default is 0, reference period is g - delta - 1). +#' @param min_control_cohort_diff the min cohort difference between treated and control group +#' @param max_control_cohort_diff the max cohort difference between treated and control group +#' @param base_period same as did #' #' @import data.table parglm stringr dreamerr BMisc #' @importFrom stats quantile vcov sd binomial fitted qnorm rnorm as.formula diff --git a/R/global.R b/R/global.R index 0985e52..76be0cc 100644 --- a/R/global.R +++ b/R/global.R @@ -1,24 +1,10 @@ NULL -if(FALSE){ - text <- ". agg_weight att att_cont att_treat attgt cohort cohort_size - conf_lwb conf_upb const cont_ipw_weight count delta_y element_rect - element_text event_time pg placeholder post.y pre.y ps s se target - tau time_fe treat_ipw_weight treat_latent type unit unit_fe weight x - x2 x_trend y y0 y1 y2 time outcome V1 att_cont_post att_cont_pre att_treat_post att_treat_pre inpost - inpre max_et min_et new_unit or_delta or_delta_post or_delta_pre - targeted used" - text <- text |> str_remove_all("\\\n") |>str_split(" ") |> unlist() - text <- text[text!=""] - text <- text |> str_flatten(collapse = "','") - text <- paste0("c('", text, "')") -} - # quiets concerns of R CMD check re: the .'s that appear in pipelines and data.table variables utils::globalVariables(c('.','agg_weight','att','att_cont','att_treat','attgt','cohort','cohort_size','conf_lwb','conf_upb', 'const','cont_ipw_weight','count','delta_y','element_rect','element_text','event_time','pg','placeholder', 'post.y','pre.y','ps','s','se','target','tau','time_fe', 'treat_ipw_weight','treat_latent','type','unit','unit_fe','weight','x','x2', - 'x_trend','y','y0','y1','y2', 'time', 'weights', 'outcome', "G", "D", + 'x_trend','y','y0','y1','y2', 'time', 'weights', 'outcome', "G", "D", 'xvar', 'V1','att_cont_post','att_cont_pre','att_treat_post','att_treat_pre','inpost','inpre','max_et','min_et','new_unit','or_delta','or_delta_post','or_delta_pre','targeted','used')) diff --git a/R/sim_did.R b/R/sim_did.R index d8ba2e5..8a43e13 100644 --- a/R/sim_did.R +++ b/R/sim_did.R @@ -15,6 +15,7 @@ #' @param seed Seed for random number generation. #' @param stratify Whether to stratify the dataset based on a binary covariate. #' @param treatment_assign The method for treatment assignment ("latent" or "uniform"). +#' @param vary_cov include time-varying covariates #' #' @return A list containing the simulated dataset (dt) and the treatment effect values (att). #' diff --git a/README.md b/README.md index 3c8fd85..f6bc369 100644 --- a/README.md +++ b/README.md @@ -65,7 +65,7 @@ result <- fastdid(data = dt, clustervar = "x", boot = TRUE) #add clustering by using bootstrap ``` -Estimation for multiple outcomes can be done in one call by providing a vector of outcome column names (saves a lot of time when controlling for covariates since logit estimands can be recycled across outcomes). +Estimation for multiple outcomes can be done in one call by providing a vector of outcome column names (saves a lot of time when controlling for covariates since logit estimates can be recycled across outcomes). ``` #calling fastdid @@ -122,9 +122,9 @@ Aggregated parameters: `fastdid` aggregates in the same function. ## Other -1. **fastdid** currently only offers inverse probability weights estimators for controlling for covariates when allowing for unbalanced panels. -2. **fastdid** only uses the time before the event as base periods ("universal" in `attgt`) -4. **fastdid** currently only reports the pointwise confidence intervals, instead of the simultaneously valid confidence intervals (check section 4.1 of Callaway and Sant'Anna's (2021) for more detail.) +1. **fastdid** only offers inverse probability weights estimators for controlling for covariates when allowing for unbalanced panels. +2. **fastdid** use universal base periods as default. +4. **fastdid** only reports the pointwise confidence intervals, instead of the simultaneously valid confidence intervals (check section 4.1 of Callaway and Sant'Anna's (2021) for more detail.) # Roadmap @@ -134,17 +134,26 @@ Aggregated parameters: `fastdid` aggregates in the same function. - Min/max event time and balanced composition :white_check_mark: - DR and OR estimators :white_check_mark: - Allowing for unbalanced panels :white_check_mark: (*well, not fully because DR and OR still need to be added*) -- Larger-than-memory data support +- Anticipation :white_check_mark: +- Varying base periods :white_check_mark: - User-provided aggregation scheme -- drop-in interface for did -- Anticipation -- Varying base periods - User-provided control formula -- Simultaneously valid confidence bands -- Further optimization! +- simultaneously valid confidence bands +- Further optimization + +# Source version + +Since **fastdid** is not on CRAN yet, it needs to be converted to R scripts to be used in some restricted environments. This can be done with `development/build_source.R`. After changing the working directory, the script will produce `development/fastdid_VERNAME.R`, which can be sourced to mimic the functionalities of the package. # Update +## 0.9.3 (2024/5/7) + +- add anticipation and varying base period option +- add min and max control cohort difference +- add time-varying control ([reference](https://arxiv.org/abs/2202.02903)) +- add filtervar + ## 0.9.2 (2023/12/20) - add support to doubly robust and outcome regression estimators diff --git a/development/build_source.R b/development/build_source.R index 7d3bbc3..d29baf2 100644 --- a/development/build_source.R +++ b/development/build_source.R @@ -2,24 +2,20 @@ rm(list = ls()) gc() library(stringr) -library(here) -source_files <- list.files(here("R"), include.dirs = FALSE, full.names = TRUE) +setwd("~/GitHub/fastdid") #change this if necessary +source_files <- list.files("R", include.dirs = FALSE, full.names = TRUE) -ver <- "0.9.2" -vername <- "unbalanced" +ver <- "0.9.3" +vername <- "timing" dep <- c("data.table", "stringr", "BMisc", "collapse", "dreamerr", "parglm") -sink(here("development/source_head.R")) +sink(paste0("development/fastdid_", str_replace_all(ver, "\\.", "_"), ".R")) cat(paste0("#", as.character(Sys.Date()), "\n")) cat(paste0("message('loading fastdid source ver. ver: ", ver, " (", vername ,"), date: " , as.character(Sys.Date()), "')\n")) -cat(paste0("require(", dep, ");") |> str_flatten()) -sink() - -sink(paste0("development/fastdid_", str_replace_all(ver, "\\.", "_"), ".R")) -cat(readLines(here("development/source_head.R")), sep ="\n") +cat(paste0("require(", dep, ");" |> str_flatten(), "\n")) for(file in source_files){ - current_file = readLines(file) + current_file = readLines(file, warn = FALSE) cat(current_file, sep ="\n") cat("\n") } diff --git a/development/fastdid_0_9_2.R b/development/fastdid_0_9_3.R similarity index 91% rename from development/fastdid_0_9_2.R rename to development/fastdid_0_9_3.R index f9f4ae4..379a0de 100644 --- a/development/fastdid_0_9_2.R +++ b/development/fastdid_0_9_3.R @@ -1,6 +1,11 @@ -#2024-04-09 -message('loading fastdid source ver. ver: 0.9.2 (unbalanced), date: 2024-04-09') -require(data.table);require(stringr);require(BMisc);require(collapse);require(dreamerr);require(parglm) +#2024-05-07 +message('loading fastdid source ver. ver: 0.9.3 (timing), date: 2024-05-07') +require(data.table); + require(stringr); + require(BMisc); + require(collapse); + require(dreamerr); + require(parglm); aggregate_gt <- function(gt_result, aux, p){ @@ -540,18 +545,12 @@ estimate_did_rc <- function(dt_did, covvars, control_type, estimate_gtatt <- function(aux, p) { + #chcek if there is availble never-treated group if(!Inf %in% aux$cohorts & p$control_option != "notyet"){ warning("no never-treated availble, switching to not-yet-treated control") p$control_option <- "notyet" } - if(!is.na(p$max_control_cohort_diff)){ - warning("max_control_cohort_diff can only be used with not yet") - p$control_option <- "notyet" - } - - treated_cohorts <- aux$cohorts[!is.infinite(aux$cohorts)] - cache_ps_fit_list <- list() #for the first outcome won't be able to use cache, empty list returns null for call like cache_ps_fit[["1.3"]] cache_hess_list <- list() outcome_result_list <- list() @@ -567,26 +566,14 @@ estimate_gtatt <- function(aux, p) { for(g in aux$cohorts){ gt_name <- paste0(g, ".", t) - - #setup and checks - base_period <- g-1 - min_control_cohort <- ifelse(p$control_option == "never", Inf, max(t+1, base_period+1)) #not-yet treated / never treated in both base and "treated" period - - if(!is.na(p$max_control_cohort_diff)){ - max_control_cohort <- min(g+p$max_control_cohort_diff, max(treated_cohorts)) + if(p$base_period == "universal"){ + base_period <- g-1-p$anticipation } else { - max_control_cohort <- ifelse(p$control_option == "notyet", max(treated_cohorts), Inf) + base_period <- ifelse(t>=g, g-1-p$anticipation, t-1) } - if(t == base_period){next} #no treatment effect for the base period - if(base_period < min(aux$time_periods)){next} #no treatment effect for the first period, since base period is not observed - if(g >= max_control_cohort){next} #no treatment effect for never treated or the last treated cohort (for not yet notyet) - if(t >= max_control_cohort){next} #no control available if the last cohort is treated too - - #select the control and treated cohorts - did_setup <- rep(NA, aux$id_size) - did_setup[get_cohort_pos(aux$cohort_sizes, min_control_cohort, max_control_cohort)] <- 0 - did_setup[get_cohort_pos(aux$cohort_sizes, g)] <- 1 #treated cannot be controls, assign treated after control to overwrite + did_setup <- get_did_setup(g, t, base_period, aux, p) + if(is.null(did_setup)){next} #no gtatt if no did setup #construct the covariates matrix covvars <- get_covvars(aux$covariates, aux$varycovariates, base_period, t) @@ -627,6 +614,47 @@ estimate_gtatt <- function(aux, p) { return(outcome_result_list) } +get_did_setup <- function(g, t, base_period, aux, p){ + + treated_cohorts <- aux$cohorts[!is.infinite(aux$cohorts)] + + #setup and checks + + if(p$control_option == "never"){ + min_control_cohort <- Inf + } else { + if(!is.infinite(p$min_control_cohort_diff)){ + min_control_cohort <- max(g+p$min_control_cohort_diff, min(treated_cohorts)) + } else { + min_control_cohort <- max(t, base_period)+p$anticipation+1 + } + } + + if(!is.infinite(p$max_control_cohort_diff)){ + max_control_cohort <- min(g+p$max_control_cohort_diff, max(treated_cohorts)) + } else { + max_control_cohort <- ifelse(p$control_option == "notyet", max(treated_cohorts), Inf) + } + + if(t == base_period | #no treatment effect for the base period + base_period < min(aux$time_periods) | #no treatment effect for the first period, since base period is not observed + g >= max_control_cohort | #no treatment effect for never treated or the last treated cohort (for not yet notyet) + t >= max_control_cohort){ #no control available if the last cohort is treated too + return(NULL) + } else { + #select the control and treated cohorts + did_setup <- rep(NA, aux$id_size) + did_setup[get_cohort_pos(aux$cohort_sizes, min_control_cohort, max_control_cohort)] <- 0 + did_setup[get_cohort_pos(aux$cohort_sizes, g)] <- 1 #treated cannot be controls, assign treated after control to overwrite + + if(!is.na(p$filtervar)){ + did_setup[!aux$filters[[base_period]]] <- NA #only use units with filter == TRUE at base period + } + + return(did_setup) + } +} + get_cohort_pos <- function(cohort_sizes, start_cohort, end_cohort = start_cohort){ start <- cohort_sizes[G < start_cohort, sum(cohort_size)]+1 end <- cohort_sizes[G <= end_cohort, sum(cohort_size)] @@ -679,11 +707,17 @@ get_covvars <- function(covariates, varycovariates, base_period, t){ #' @param clustervar The name of the cluster variable, can only be used when boot == TRUE (optional). #' @param covariatesvar A character vector containing the names of time-invariant covariate variables (optional). #' @param varycovariatesvar A character vector containing the names of time-varying covariate variables (optional). +#' @param filtervar A logical vector that specifies which units to use (can be time varying, units will only be included in 2x2 when filtervar is TRUE is the base period) #' @param copy whether to copy the dataset before processing, set to false to speed up the process, but the input data will be altered. #' @param validate whether to validate the dataset before processing. +#' @param anticipation periods with aniticipation (delta in CS, default is 0, reference period is g - delta - 1). +#' @param min_control_cohort_diff the min cohort difference between treated and control group +#' @param max_control_cohort_diff the max cohort difference between treated and control group +#' @param base_period same as did #' -#' @import data.table parglm stringr collapse dreamerr BMisc +#' @import data.table parglm stringr dreamerr BMisc #' @importFrom stats quantile vcov sd binomial fitted qnorm rnorm as.formula +#' @importFrom collapse allNA fnrow whichNA fnunique fsum na_insert #' @return A data.table containing the estimated treatment effects and standard errors. #' @export #' @@ -720,9 +754,9 @@ fastdid <- function(data, timevar, cohortvar, unitvar, outcomevar, control_option="both",result_type="group_time", balanced_event_time = NA, control_type = "ipw", allow_unbalance_panel = FALSE, boot=FALSE, biters = 1000, - weightvar=NA,clustervar=NA,covariatesvar = NA, varycovariatesvar = NA, + weightvar=NA,clustervar=NA, covariatesvar = NA, varycovariatesvar = NA, filtervar = NA, copy = TRUE, validate = TRUE, - max_control_cohort_diff = NA + max_control_cohort_diff = Inf, anticipation = 0, min_control_cohort_diff = -Inf, base_period = "universal" ){ # validate arguments -------------------------------------------------------- @@ -742,12 +776,14 @@ fastdid <- function(data, "NA | multi match", .choices = dt_names, .message = covariate_message) checkvar_message <- "__ARG__ must be NA or a character scalar if a name of columns from the dataset." - check_set_arg(weightvar, clustervar, + check_set_arg(weightvar, clustervar, filtervar, "NA | match", .choices = dt_names, .message = checkvar_message) check_set_arg(control_option, "match", .choices = c("both", "never", "notyet")) #kinda bad names since did's notyet include both notyet and never check_set_arg(control_type, "match", .choices = c("ipw", "reg", "dr")) + check_set_arg(base_period, "match", .choices = c("varying", "universal")) check_arg(copy, validate, boot, allow_unbalance_panel, "scalar logical") + check_arg(max_control_cohort_diff, min_control_cohort_diff, anticipation, "scalar numeric") if(!is.na(balanced_event_time)){ if(result_type != "dynamic"){stop("balanced_event_time is only meaningful with result_type == 'dynamic'")} @@ -766,12 +802,20 @@ fastdid <- function(data, stop("clustering only available with bootstrap") } + # coerce non-sensible option + + if((!is.infinite(max_control_cohort_diff) | !is.infinite(min_control_cohort_diff)) & control_option == "never"){ + warning("control_cohort_diff can only be used with not yet") + p$control_option <- "notyet" + } + p <- list(timevar = timevar, cohortvar = cohortvar, unitvar = unitvar, outcomevar = outcomevar, weightvar = weightvar, clustervar = clustervar, + filtervar = filtervar, covariatesvar = covariatesvar, varycovariatesvar = varycovariatesvar, control_option = control_option, @@ -781,7 +825,10 @@ fastdid <- function(data, allow_unbalance_panel = allow_unbalance_panel, boot = boot, biters = biters, - max_control_cohort_diff = max_control_cohort_diff) + max_control_cohort_diff = max_control_cohort_diff, + min_control_cohort_diff = min_control_cohort_diff, + anticipation = anticipation, + base_period = base_period) # validate data ----------------------------------------------------- @@ -789,7 +836,7 @@ fastdid <- function(data, setnames(dt, c(timevar, cohortvar, unitvar), c("time", "G", "unit")) if(validate){ - varnames <- c("time", "G", "unit", outcomevar,weightvar,clustervar,covariatesvar,varycovariatesvar) + varnames <- c("time", "G", "unit",outcomevar,weightvar,clustervar,covariatesvar,varycovariatesvar,filtervar) dt <- validate_did(dt, varnames, p) } @@ -924,6 +971,18 @@ get_auxdata <- function(dt, p){ varycovariates <- NA } + # filters + filters <- list() + if(!is.na(p$filtervar)){ + for(i in time_periods){ + start <- (i-1)*id_size+1 + end <- i*id_size + filters[[i]] <- dt[seq(start,end), .SD, .SDcols = p$filtervar] + } + } else { + filters <- NA + } + if(!allNA(p$covariatesvar)){ covariates <- cbind(const = -1, dt_inv[,.SD, .SDcols = p$covariatesvar]) } else { @@ -951,7 +1010,8 @@ get_auxdata <- function(dt, p){ varycovariates = varycovariates, covariates = covariates, cluster = cluster, - weights = weights) + weights = weights, + filters = filters) return(aux) @@ -1017,7 +1077,7 @@ utils::globalVariables(c('.','agg_weight','att','att_cont','att_treat','attgt',' 'const','cont_ipw_weight','count','delta_y','element_rect','element_text','event_time','pg','placeholder', 'post.y','pre.y','ps','s','se','target','tau','time_fe', 'treat_ipw_weight','treat_latent','type','unit','unit_fe','weight','x','x2', - 'x_trend','y','y0','y1','y2', 'time', 'weights', 'outcome', + 'x_trend','y','y0','y1','y2', 'time', 'weights', 'outcome', "G", "D", 'V1','att_cont_post','att_cont_pre','att_treat_post','att_treat_pre','inpost','inpre','max_et','min_et','new_unit','or_delta','or_delta_post','or_delta_pre','targeted','used')) @@ -1245,6 +1305,10 @@ validate_did <- function(dt,varnames,p){ if(p$balanced_event_time > dt[, max(time-G)]){stop("balanced_event_time is larger than the max event time in the data")} } + if(!is.na(p$filtervar) && !is.logical(dt[[p$filtervar]])){ + stop("filter var needs to be a logical column") + } + #doesn't allow missing value for now for(col in varnames){ if(is.na(col)){next} @@ -1255,7 +1319,6 @@ validate_did <- function(dt,varnames,p){ } } - if(!allNA(p$covariatesvar) && uniqueN(dt, by = c("unit", p$covariatesvar)) > raw_unit_size){ warning("some covariates is time-varying, fastdid only use the first observation for covariates.") } diff --git a/development/source_head.R b/development/source_head.R deleted file mode 100644 index ea6800f..0000000 --- a/development/source_head.R +++ /dev/null @@ -1,3 +0,0 @@ -#2024-04-17 -message('loading fastdid source ver. ver: 0.9.2 (unbalanced), date: 2024-04-17') -require(data.table);require(stringr);require(BMisc);require(collapse);require(dreamerr);require(parglm); \ No newline at end of file diff --git a/man/fastdid.Rd b/man/fastdid.Rd index 0ea6537..3ca9594 100644 --- a/man/fastdid.Rd +++ b/man/fastdid.Rd @@ -21,9 +21,13 @@ fastdid( clustervar = NA, covariatesvar = NA, varycovariatesvar = NA, + filtervar = NA, copy = TRUE, validate = TRUE, - max_control_cohort_diff = NA + max_control_cohort_diff = Inf, + anticipation = 0, + min_control_cohort_diff = -Inf, + base_period = "universal" ) } \arguments{ @@ -59,9 +63,19 @@ fastdid( \item{varycovariatesvar}{A character vector containing the names of time-varying covariate variables (optional).} +\item{filtervar}{A logical vector that specifies which units to use (can be time varying, units will only be included in 2x2 when filtervar is TRUE is the base period)} + \item{copy}{whether to copy the dataset before processing, set to false to speed up the process, but the input data will be altered.} \item{validate}{whether to validate the dataset before processing.} + +\item{max_control_cohort_diff}{the max cohort difference between treated and control group} + +\item{anticipation}{periods with aniticipation (delta in CS, default is 0, reference period is g - delta - 1).} + +\item{min_control_cohort_diff}{the min cohort difference between treated and control group} + +\item{base_period}{same as did} } \value{ A data.table containing the estimated treatment effects and standard errors. diff --git a/man/sim_did.Rd b/man/sim_did.Rd index 540022a..e7fbb3d 100644 --- a/man/sim_did.Rd +++ b/man/sim_did.Rd @@ -38,6 +38,8 @@ sim_did( \item{second_cov}{Whether to include a second covariate.} +\item{vary_cov}{include time-varying covariates} + \item{na}{Whether to generate missing data ("none", "y", "x", or "both").} \item{balanced}{Whether to balance the dataset by random sampling.}