From ccebf6ce4484d8f677289b4ca53fccf0c39564d8 Mon Sep 17 00:00:00 2001
From: TsaiLintung <101155567+TsaiLintung@users.noreply.github.com>
Date: Wed, 27 Dec 2023 08:08:56 +0800
Subject: [PATCH 01/15] remove didor

---
 development/did_or.R | 30 ------------------------------
 1 file changed, 30 deletions(-)
 delete mode 100644 development/did_or.R

diff --git a/development/did_or.R b/development/did_or.R
deleted file mode 100644
index 69fd9a0..0000000
--- a/development/did_or.R
+++ /dev/null
@@ -1,30 +0,0 @@
-library(did)
-# install.packages("devtools")
-#devtools::install_github("TsaiLintung/fastdid")
-library(fastdid)
-
-simdt <- sim_did(1e+03, 5, cov = "cont", hetero = "all", balanced = TRUE, second_outcome = TRUE,
-                  stratify = FALSE,
-                 second_cov = TRUE)
-dt <- simdt$dt
-
-did_result <- did::att_gt(yname = "y",gname = "G",idname = "unit",tname = "time",data = dt,base_period = "universal",est_method = "dr",cband = FALSE,
-                          xformla = ~x+x2,
-                          control_group = "notyettreated",
-                          allow_unbalanced_panel = FALSE, #this is the only differece
-                          clustervars = NULL,
-                          bstrap = FALSE)
-did_result2 <- did::att_gt(yname = "y",gname = "G",idname = "unit",tname = "time",data = dt,base_period = "universal",est_method = "dr",cband = FALSE,
-                           xformla = ~x+x2,
-                           control_group = "notyettreated",
-                           allow_unbalanced_panel = TRUE, #this is the only differece
-                           clustervars = NULL,
-                           bstrap = FALSE)
-
-sum(did_result$att, na.rm = TRUE)/sum(did_result2$att, na.rm = TRUE)
-sum(did_result$se, na.rm = TRUE)/sum(did_result2$se, na.rm = TRUE) #should be 1 if the SE is the same, but not??????
-
-#very weird
-#DRDID:drdid_rc1, line 203: inf.cont <- inf.cont.post - inf.cont.pre + inf.cont.ps + inf.cont.or
-#DRDID:drdid_panel, line 169: inf.control <- (inf.cont.1 + inf.cont.2 - inf.cont.3) / mean(w.cont)
-#influence from outcome have different sign??
\ No newline at end of file

From 38516e1298089e2b195af9eaf3a923dfa98d54e4 Mon Sep 17 00:00:00 2001
From: TsaiLintung <101155567+TsaiLintung@users.noreply.github.com>
Date: Fri, 29 Dec 2023 13:26:15 +0800
Subject: [PATCH 02/15] add benchmark compare

---
 development/compare_benchmark.R               | 94 +++++++++++++++++++
 development/{benchmark.R => solo_benchmark.R} |  0
 interactive/rc_test.R                         | 50 ++++++++++
 3 files changed, 144 insertions(+)
 create mode 100644 development/compare_benchmark.R
 rename development/{benchmark.R => solo_benchmark.R} (100%)
 create mode 100644 interactive/rc_test.R

diff --git a/development/compare_benchmark.R b/development/compare_benchmark.R
new file mode 100644
index 0000000..0ce555b
--- /dev/null
+++ b/development/compare_benchmark.R
@@ -0,0 +1,94 @@
+rm(list = ls())
+gc()
+
+library(peakRAM)
+
+library(magrittr)
+library(ggplot2)
+library(fixest)
+
+library(did)
+#install_github("TsaiLintung/fastdid")
+library(fastdid)
+library(DiDforBigData)
+
+
+data.table::setDTthreads(0)
+
+# setup ------------------------------------------------------------------------------------
+
+min_order <- 2
+max_order <- 6
+
+# functions for running the estimation -----------------------------------------------------
+
+run_fastdid <- function(dt){
+  result <- fastdid(dt, 
+                    timevar = "time", cohortvar = "G", unitvar = "unit", outcomevar = "y", 
+                    result_type = "group_time", copy = FALSE, allow_unbalance_panel = FALSE)
+}
+
+run_did <- function(dt){
+  result <- att_gt(yname = "y", tname = "time", idname = "unit", gname = "G", data = dt,
+                   control_group = "notyettreated", bstrap = FALSE, cband = FALSE)
+}
+
+run_dfbd <- function(dt){
+  varnames = list()
+  varnames$time_name = "time"
+  varnames$outcome_name = "y"
+  varnames$cohort_name = "G"
+  varnames$id_name = "unit"
+  
+  # estimate the ATT for all cohorts at event time 1 only
+  result <- DiD(dt, varnames)
+}
+
+run_benchmark <- function(min_order, max_order){
+  all_bm <- data.table()
+  for(order in min_order:max_order){
+    raw_dt <- sim_did(10^order, 10, seed = 1, cov = "cont", second_outcome = TRUE, second_cov = TRUE)[["dt"]]
+    
+    gc()
+    
+    dt <- copy(raw_dt)
+    fast_bm <- peakRAM(run_fastdid(dt))
+    
+    gc()
+    
+    dt <- copy(raw_dt)
+    did_bm <- peakRAM(run_did(dt))
+    
+    gc()
+    
+    dt <- copy(raw_dt)
+    dfbd_bm <- peakRAM(run_dfbd(dt))
+    
+    gc()
+    
+    bm <- rbind(fast_bm, did_bm, dfbd_bm) |> as.data.table()
+    bm[, order := order] 
+    all_bm <- rbind(all_bm, bm)
+    message(order)
+  }
+  
+  return(all_bm)
+  
+}
+
+bm_result <- run_benchmark(min_order, max_order)
+
+# visualize benchmark result ------------------------------------------------------------------------------------ 
+
+bm_result[Function_Call == "run_fastdid(dt)", pkg := "fastdid"]
+bm_result[Function_Call == "run_dfbd(dt)", pkg := "dfbd"]
+bm_result[Function_Call == "run_did(dt)", pkg := "did"]
+
+setnames(bm_result, c("Elapsed_Time_sec", "Peak_RAM_Used_MiB"), c("time", "RAM"))
+
+bm_result %>% ggplot(aes(x = order, y = time, color = pkg)) + geom_line() + geom_point() + coord_trans(y = "log10") + 
+  ggtitle("Comparison of computing time")
+
+bm_result %>% ggplot(aes(x = order, y = RAM, color = pkg)) + geom_line() + geom_point() + 
+  ggtitle("Comparison of peak RAM used")
+
diff --git a/development/benchmark.R b/development/solo_benchmark.R
similarity index 100%
rename from development/benchmark.R
rename to development/solo_benchmark.R
diff --git a/interactive/rc_test.R b/interactive/rc_test.R
new file mode 100644
index 0000000..0f1f31a
--- /dev/null
+++ b/interactive/rc_test.R
@@ -0,0 +1,50 @@
+
+ipw_unbalanced_data <- function(method){
+  result <- fastdid(dt2, timevar = "time", cohortvar = "G", unitvar = "unit",outcomevar = "y",  result_type = "group_time",
+                    control_type = method,
+                    covariatesvar = c("x", "x2"),
+                    allow_unbalance_panel = TRUE)
+  did_result <- did::att_gt(yname = "y",gname = "G",idname = "unit",tname = "time",data = dt2,
+                            base_period = "universal",cband = FALSE,
+                            est_method = drdid_rc1,xformla = ~x+x2,
+                            control_group = "notyettreated",
+                            allow_unbalanced_panel = TRUE,
+                            clustervars = NULL,
+                            bstrap = FALSE)
+  est_diff_ratio(result, did_result)
+}
+
+ipw_balanced_data <- function(method){
+  result <- fastdid(dt, timevar = "time", cohortvar = "G", unitvar = "unit",outcomevar = "y",  result_type = "group_time",
+                    control_type = method,
+                    covariatesvar = c("x", "x2"),
+                    allow_unbalance_panel = TRUE)
+  did_result <- did::att_gt(yname = "y",gname = "G",idname = "unit",tname = "time",data = dt,
+                            base_period = "universal",cband = FALSE,
+                            est_method = drdid_rc,xformla = ~x+x2,
+                            control_group = "notyettreated",
+                            allow_unbalanced_panel = TRUE,
+                            clustervars = NULL,
+                            bstrap = FALSE)
+  est_diff_ratio(result, did_result)
+}
+
+simple_unbalanced <- function(){
+  result <- fastdid(dt2, timevar = "time", cohortvar = "G", unitvar = "unit",outcomevar = "y",  result_type = "group_time",
+                    allow_unbalance_panel = TRUE)
+  did_result <- did::att_gt(yname = "y",gname = "G",idname = "unit",tname = "time",data = dt2,base_period = "universal",est_method = "ipw",cband = FALSE,
+                            #xformla = ~x,
+                            control_group = "notyettreated",
+                            allow_unbalanced_panel = TRUE,
+                            clustervars = NULL,
+                            bstrap = FALSE)
+  
+  est_diff_ratio(result, did_result)
+}
+
+run_new <- function(method){
+  load_all()
+  print(simple_unbalanced())
+  print(ipw_balanced_data(method))
+  print(ipw_unbalanced_data(method))
+}

From 1c4abf57bc951ea3cff203e94e27747a594a7f92 Mon Sep 17 00:00:00 2001
From: TsaiLintung <101155567+TsaiLintung@users.noreply.github.com>
Date: Mon, 18 Mar 2024 16:06:51 +0800
Subject: [PATCH 03/15] added time varying control (no correctness test)

---
 R/estimate_gtatt.R             | 25 ++++++++++++++---
 R/fastdid.R                    | 43 ++++++++++++++++++++++-------
 R/sim_did.R                    | 13 +++++++--
 R/validate_did.R               | 19 ++++++-------
 development/buildtest.R        |  2 +-
 inst/tinytest/test_1_fastdid.R | 12 +++++++-
 interactive/rc_test.R          | 50 ----------------------------------
 7 files changed, 85 insertions(+), 79 deletions(-)
 delete mode 100644 interactive/rc_test.R

diff --git a/R/estimate_gtatt.R b/R/estimate_gtatt.R
index 2367d59..b30e529 100644
--- a/R/estimate_gtatt.R
+++ b/R/estimate_gtatt.R
@@ -1,4 +1,4 @@
-estimate_gtatt <- function(outcomes_list, outcomevar, covariates, control_type, weights, 
+estimate_gtatt <- function(outcomes_list, outcomevar, covariates, varycovariates, control_type, weights, 
                            cohort_sizes,cohorts,id_size,time_periods,
                            control_option, allow_unbalance_panel) {
 
@@ -40,15 +40,32 @@ estimate_gtatt <- function(outcomes_list, outcomevar, covariates, control_type,
         did_setup[get_cohort_pos(cohort_sizes, min_control_cohort, max_control_cohort)] <- 0
         did_setup[get_cohort_pos(cohort_sizes, g)] <- 1 #treated cannot be controls, assign treated after control to overwrite
         
+        if(!is.null(varycovariates)){
+          precov <- varycovariates[[base_period]]
+          names(precov) <- paste0("pre_", names(precov))
+          
+          postcov <- varycovariates[[t]]-varycovariates[[base_period]]
+          names(postcov) <- paste0("post_", names(varycovariates[[t]]))
+          
+          if(is.null(covariates)){
+            covariatesdt <- cbind(const = -1, cbind(precov, postcov)) #if covariates is NULL
+          } else {
+            covariatesdt <- cbind(covariates, cbind(precov, postcov)) #if covariates is NULL
+          }
+          
+        } else {
+          covariatesdt <- covariates
+        } 
+        
         #construct the 2x2 dataset
-        cohort_did <- data.table(did_setup, outcomes[[t]], outcomes[[base_period]], weights, covariates)
+        cohort_did <- data.table(did_setup, outcomes[[t]], outcomes[[base_period]], weights, covariatesdt)
         setnames(cohort_did, c("did_setup", "V2", "V3", "weights"), c("D", "post.y", "pre.y", "weights"))
  
         #estimate did
         if(!allow_unbalance_panel){
-          result <- estimate_did(cohort_did, colnames(covariates), control_type, last_coef, cache_ps_fit_list[[gt_name]], cache_hess_list[[gt_name]])
+          result <- estimate_did(cohort_did, colnames(covariatesdt), control_type, last_coef, cache_ps_fit_list[[gt_name]], cache_hess_list[[gt_name]])
         } else {
-          result <- estimate_did_rc(cohort_did, colnames(covariates), control_type, last_coef, cache_ps_fit_list[[gt_name]], cache_hess_list[[gt_name]])
+          result <- estimate_did_rc(cohort_did, colnames(covariatesdt), control_type, last_coef, cache_ps_fit_list[[gt_name]], cache_hess_list[[gt_name]])
         }
        
       
diff --git a/R/fastdid.R b/R/fastdid.R
index 0d2176c..9267c0f 100644
--- a/R/fastdid.R
+++ b/R/fastdid.R
@@ -16,7 +16,8 @@
 #' @param biters The number of bootstrap iterations. Only relevant if boot = TRUE. Default is 1000.
 #' @param weightvar The name of the weight variable (optional).
 #' @param clustervar The name of the cluster variable, can only be used when boot == TRUE (optional).
-#' @param covariatesvar A character vector containing the names of covariate variables (optional).
+#' @param covariatesvar A character vector containing the names of time-invariant covariate variables (optional).
+#' @param varycovariatesvar A character vector containing the names of time-varying covariate variables (optional).
 #' @param copy whether to copy the dataset before processing, set to false to speed up the process, but the input data will be altered.
 #' @param validate whether to validate the dataset before processing.
 #' 
@@ -58,7 +59,7 @@ fastdid <- function(data,
                     timevar, cohortvar, unitvar, outcomevar, 
                     control_option="both",result_type="group_time", balanced_event_time = NULL,
                     control_type = "ipw", allow_unbalance_panel = FALSE, boot=FALSE, biters = 1000,
-                    weightvar=NULL,clustervar=NULL,covariatesvar = NULL,
+                    weightvar=NULL,clustervar=NULL,covariatesvar = NULL,varycovariatesvar = NULL,
                     copy = TRUE, validate = TRUE
                     ){
   
@@ -77,7 +78,7 @@ fastdid <- function(data,
   check_set_arg(timevar, unitvar, cohortvar, "match", .choices = dt_names, .message = name_message)
   
   covariate_message <- "__ARG__ must be NULL or a character vector which are all names of columns from the dataset."
-  check_set_arg(covariatesvar, outcomevar, 
+  check_set_arg(varycovariatesvar, covariatesvar, outcomevar, 
             "NULL | multi match", .choices = dt_names, .message = covariate_message)
   
   checkvar_message <- "__ARG__ must be NULL or a character scalar if a name of columns from the dataset."
@@ -97,14 +98,20 @@ fastdid <- function(data,
     stop("fastdid currently only supprts ipw when allowing for unbalanced panels.")
   }
   
+  if(allow_unbalance_panel == TRUE & !is.null(varycovariatesvar)){
+    stop("fastdid currently only supprts time varying covariates when allowing for unbalanced panels.")
+  }
+  
+  if(any(covariatesvar %in% varycovariatesvar)){stop("time-varying var and invariant var have overlaps.")}
+  
   setnames(dt, c(timevar, cohortvar, unitvar), c("time", "G", "unit"))
 
   # validate data -----------------------------------------------------
   
   
   if(validate){
-    varnames <- c("time", "G", "unit", outcomevar,weightvar,clustervar,covariatesvar)
-    dt <- validate_did(dt, covariatesvar, varnames, balanced_event_time, allow_unbalance_panel)
+    varnames <- c("time", "G", "unit", outcomevar,weightvar,clustervar,covariatesvar, varycovariatesvar)
+    dt <- validate_did(dt, covariatesvar, varycovariatesvar, varnames, balanced_event_time, allow_unbalance_panel)
   }
   
   # preprocess -----------------------------------------------------------
@@ -151,9 +158,11 @@ fastdid <- function(data,
   
   #construct the outcomes list for fast access later
   id_size <- dt[, uniqueN(unit)]
-  outcomes_list <- list()
+  
+  # get auxiliary data ------------------------------
   
   #loop for multiple outcome
+  outcomes_list <- list()
   for(outcol in outcomevar){
     outcomes <- list()
     
@@ -186,10 +195,7 @@ fastdid <- function(data,
     dt_inv <- dt[dt[, .I[1], by = unit]$V1]
     setorder(dt_inv, unit) #can't move this outside
   }
-  
-  
 
-  
   cohorts <- dt_inv[, unique(G)]
   cohort_sizes <- dt_inv[, .(cohort_size = .N) , by = G]
 
@@ -200,6 +206,16 @@ fastdid <- function(data,
     covariates <- NULL
   }
   
+  varycovariates <- list()
+  if(!is.null(varycovariatesvar)){
+    for(i in time_periods){
+      start <- (i-1)*id_size+1
+      end <- i*id_size
+      varycovariates[[i]] <- dt[seq(start,end), .SD, .SDcols = varycovariatesvar]
+    }
+  } else {
+    varycovariates <- NULL
+  }
   if(!is.null(clustervar)){
     cluster <- dt_inv[, .SD, .SDcols = clustervar] |> unlist()
   } else {cluster <- NULL}
@@ -207,11 +223,18 @@ fastdid <- function(data,
   if(!is.null(weightvar)){
     weights <- dt_inv[, .SD, .SDcols = weightvar] |> unlist()
   } else {weights <- rep(1,id_size)}
+
+  
+  # cluster <- ifelse(is.null(clustervar), NULL, dt_inv[, .SD, .SDcols = clustervar] |> unlist())
+  #weights <- ifelse(is.null(weightvar), rep(1,id_size), dt_inv[, .SD, .SDcols = weightvar] |> unlist())
+  # covariates <- ifelse(is.null(covariatesvar), NULL, cbind(const = -1, dt_inv[,.SD, .SDcols = covariatesvar]))
+  
+ 
   
   # main part  -------------------------------------------------
 
   # attgt
-  gt_result_list <- estimate_gtatt(outcomes_list, outcomevar, covariates, control_type, weights,
+  gt_result_list <- estimate_gtatt(outcomes_list, outcomevar, covariates, varycovariates, control_type, weights,
                                    cohort_sizes,cohorts,id_size,time_periods, #info about the dt
                                    control_option, allow_unbalance_panel)
   
diff --git a/R/sim_did.R b/R/sim_did.R
index 2d7ced0..7ca136e 100644
--- a/R/sim_did.R
+++ b/R/sim_did.R
@@ -27,7 +27,7 @@
 #'
 #' @export
 sim_did <- function(sample_size, time_period, untreated_prop = 0.3, epsilon_size = 0.001,
-                    cov = "no", hetero = "all", second_outcome = FALSE, second_cov = FALSE, na = "none", 
+                    cov = "no", hetero = "all", second_outcome = FALSE, second_cov = FALSE, vary_cov = FALSE, na = "none", 
                     balanced = TRUE, seed = NA, stratify = FALSE, treatment_assign = "latent"){
   
   if(!is.na(seed)){set.seed(seed)}
@@ -91,8 +91,15 @@ sim_did <- function(sample_size, time_period, untreated_prop = 0.3, epsilon_size
   
   dt[, D := as.integer(time >= G)]
   
+  #add time_varying covariates
+  if(vary_cov){
+    dt[, xvar := pmin(G, time_period)*time+rnorm(sample_size*time_period, 0,1)] #should be confounding....?
+  } else {
+    dt[, xvar := 1]
+  }
+  
   #untreated potential outcomes
-  dt[, y0 := unit_fe + time_fe + x*x_trend + x2*x_trend + rnorm(sample_size*time_period, sd = epsilon_size)]
+  dt[, y0 := unit_fe + time_fe + (x+x2)*x_trend + xvar + rnorm(sample_size*time_period, sd = epsilon_size)]
   
   #generate gtatt
   att <- CJ(G = 1:time_period, time = 1:time_period)
@@ -112,7 +119,7 @@ sim_did <- function(sample_size, time_period, untreated_prop = 0.3, epsilon_size
   #potential outcome
   dt[, y1 := y0 + tau]
   dt[, y := y1*D + y0*(1-D)]
-  dt <- dt[, .SD, .SDcols = c("time", "G", "unit", "x", "x2", "y", "s")]
+  dt <- dt[, .SD, .SDcols = c("time", "G", "unit", "x", "x2", "y", "s", "xvar")]
   
   #additional -----------------
   
diff --git a/R/validate_did.R b/R/validate_did.R
index ada0388..c0210c2 100644
--- a/R/validate_did.R
+++ b/R/validate_did.R
@@ -1,4 +1,4 @@
-validate_did <- function(dt,covariatesvar,varnames, balanced_event_time, allow_unbalance_panel){
+validate_did <- function(dt,covariatesvar, varycovariatesvar,varnames, balanced_event_time, allow_unbalance_panel){
   raw_unit_size <- dt[, uniqueN(unit)]
   raw_time_size <- dt[, uniqueN(time)]
   
@@ -8,7 +8,7 @@ validate_did <- function(dt,covariatesvar,varnames, balanced_event_time, allow_u
   }
   
   #doesn't allow missing value for now
-  for(col in c(covariatesvar, varnames)){
+  for(col in varnames){
     if(is.null(col)){next}
     na_obs <- whichNA(dt[[col]])
     if(length(na_obs) != 0){
@@ -17,17 +17,16 @@ validate_did <- function(dt,covariatesvar,varnames, balanced_event_time, allow_u
     }
   }
   
-  if(!is.null(covariatesvar)){
-    
-    if(uniqueN(dt, by = c("unit", covariatesvar)) > raw_unit_size){
-      warning("some covariates is time-varying, fastdid only use the first observation for covariates.")
-    }
-    
-    for(cov in covariatesvar){
+  if(!is.null(covariatesvar) & uniqueN(dt, by = c("unit", covariatesvar)) > raw_unit_size){
+    warning("some covariates is time-varying, fastdid only use the first observation for covariates.")
+  }
+  
+  
+  if(!is.null(covariatesvar)|!is.null(varycovariatesvar)){
+    for(cov in c(covariatesvar, varycovariatesvar)){
       #check covaraites is not constant  
       if(fnunique(dt[, get(cov)[1], by = "unit"][, V1]) == 1)stop(cov, " have no variation")
     }
-
   }
   
   #check balanced panel
diff --git a/development/buildtest.R b/development/buildtest.R
index 1be45d6..26cbaf9 100644
--- a/development/buildtest.R
+++ b/development/buildtest.R
@@ -1,4 +1,4 @@
-setwd("~/GitHub/EventStudyCode")
+setwd("~/GitHub/fastdid")
 
 library(devtools)
 library(tinytest)
diff --git a/inst/tinytest/test_1_fastdid.R b/inst/tinytest/test_1_fastdid.R
index eefd514..0c24c50 100644
--- a/inst/tinytest/test_1_fastdid.R
+++ b/inst/tinytest/test_1_fastdid.R
@@ -2,7 +2,7 @@
 
 tol <- 1e-2 #allow 1% different between estimates
 simdt <- sim_did(1e+02, 10, cov = "cont", hetero = "all", balanced = TRUE, second_outcome = TRUE, seed = 1, 
-                 stratify = FALSE, second_cov = TRUE)
+                 stratify = FALSE, second_cov = TRUE, vary_cov = TRUE)
 dt <- simdt$dt
 
 # basics ---------------------------------------------
@@ -26,6 +26,16 @@ expect_silent(fastdid(dt, timevar = "time", cohortvar = "G", unitvar = "unit",ou
               info = "covariates dr")
 
 
+expect_silent(fastdid(dt, timevar = "time", cohortvar = "G", unitvar = "unit",outcomevar = "y",  result_type = "group_time",
+                      control_type = "dr",
+                      covariatesvar = c("x", "x2"), varycovariatesvar = "xvar"),
+              info = "with varying covariates dr")
+
+expect_silent(fastdid(dt, timevar = "time", cohortvar = "G", unitvar = "unit",outcomevar = "y",  result_type = "group_time",
+                      control_type = "dr",
+                      varycovariatesvar = "xvar"),
+              info = "varying covariates only dr")
+
 expect_silent(fastdid(dt[G != 3], timevar = "time", cohortvar = "G", unitvar = "unit",outcomevar = "y",  result_type = "group_time"),
               info = "missing cohort")
 
diff --git a/interactive/rc_test.R b/interactive/rc_test.R
deleted file mode 100644
index 0f1f31a..0000000
--- a/interactive/rc_test.R
+++ /dev/null
@@ -1,50 +0,0 @@
-
-ipw_unbalanced_data <- function(method){
-  result <- fastdid(dt2, timevar = "time", cohortvar = "G", unitvar = "unit",outcomevar = "y",  result_type = "group_time",
-                    control_type = method,
-                    covariatesvar = c("x", "x2"),
-                    allow_unbalance_panel = TRUE)
-  did_result <- did::att_gt(yname = "y",gname = "G",idname = "unit",tname = "time",data = dt2,
-                            base_period = "universal",cband = FALSE,
-                            est_method = drdid_rc1,xformla = ~x+x2,
-                            control_group = "notyettreated",
-                            allow_unbalanced_panel = TRUE,
-                            clustervars = NULL,
-                            bstrap = FALSE)
-  est_diff_ratio(result, did_result)
-}
-
-ipw_balanced_data <- function(method){
-  result <- fastdid(dt, timevar = "time", cohortvar = "G", unitvar = "unit",outcomevar = "y",  result_type = "group_time",
-                    control_type = method,
-                    covariatesvar = c("x", "x2"),
-                    allow_unbalance_panel = TRUE)
-  did_result <- did::att_gt(yname = "y",gname = "G",idname = "unit",tname = "time",data = dt,
-                            base_period = "universal",cband = FALSE,
-                            est_method = drdid_rc,xformla = ~x+x2,
-                            control_group = "notyettreated",
-                            allow_unbalanced_panel = TRUE,
-                            clustervars = NULL,
-                            bstrap = FALSE)
-  est_diff_ratio(result, did_result)
-}
-
-simple_unbalanced <- function(){
-  result <- fastdid(dt2, timevar = "time", cohortvar = "G", unitvar = "unit",outcomevar = "y",  result_type = "group_time",
-                    allow_unbalance_panel = TRUE)
-  did_result <- did::att_gt(yname = "y",gname = "G",idname = "unit",tname = "time",data = dt2,base_period = "universal",est_method = "ipw",cband = FALSE,
-                            #xformla = ~x,
-                            control_group = "notyettreated",
-                            allow_unbalanced_panel = TRUE,
-                            clustervars = NULL,
-                            bstrap = FALSE)
-  
-  est_diff_ratio(result, did_result)
-}
-
-run_new <- function(method){
-  load_all()
-  print(simple_unbalanced())
-  print(ipw_balanced_data(method))
-  print(ipw_unbalanced_data(method))
-}

From 3e1801918b973918ebb3425ae91b7c6542716ebe Mon Sep 17 00:00:00 2001
From: TsaiLintung <101155567+TsaiLintung@users.noreply.github.com>
Date: Mon, 18 Mar 2024 16:44:21 +0800
Subject: [PATCH 04/15] switch argument default from NULL to NA

---
 R/aggregate_gt.R               |  2 +-
 R/estimate_did.R               | 14 +++++------
 R/estimate_did_rc.R            | 12 +++++-----
 R/estimate_gtatt.R             | 10 ++++----
 R/fastdid.R                    | 44 +++++++++++++++++-----------------
 R/get_se.R                     |  6 ++---
 R/utils.R                      | 26 ++++++++++++++++----
 R/validate_did.R               | 10 ++++----
 inst/tinytest/test_1_fastdid.R |  1 -
 9 files changed, 72 insertions(+), 53 deletions(-)

diff --git a/R/aggregate_gt.R b/R/aggregate_gt.R
index d90e51c..2fe0741 100644
--- a/R/aggregate_gt.R
+++ b/R/aggregate_gt.R
@@ -67,7 +67,7 @@ get_aggregate_scheme <- function(group_time, result_type, id_weights, id_cohorts
   
   #for balanced cohort composition in dynamic setting
   #a cohort us only used if it is seen for all dynamic time
-  if(result_type == "dynamic" & !is.null(balanced_event_time)){
+  if(result_type == "dynamic" & !is.na(balanced_event_time)){
 
     cohorts <- group_time[, .(max_et = max(time-G),
                               min_et = min(time-G)), by = "G"]
diff --git a/R/estimate_did.R b/R/estimate_did.R
index ad3a56e..85b6821 100644
--- a/R/estimate_did.R
+++ b/R/estimate_did.R
@@ -7,14 +7,14 @@ estimate_did <- function(dt_did, covnames, control_type,
   data_pos <-  which(dt_did[, !is.na(D)])
   dt_did <- dt_did[data_pos]
   n <- dt_did[, .N]
-  if(is.null(covnames)){
-    covvars <- NULL
+  if(allNA(covnames)){
+    covvars <- NA
   } else {
     covvars <- as.matrix(dt_did[,.SD, .SDcols = covnames])
   }
 
-  ipw <- control_type %in% c("ipw", "dr") & !is.null(covvars)
-  or <- control_type %in% c("reg", "dr") & !is.null(covvars) #OR is REG
+  ipw <- control_type %in% c("ipw", "dr") & !allNA(covvars)
+  or <- control_type %in% c("reg", "dr") & !allNA(covvars) #OR is REG
 
   # ipw --------
 
@@ -41,7 +41,7 @@ estimate_did <- function(dt_did, covnames, control_type,
     } else { #when using multiple outcome, ipw cache can be reused
       hess <- cache_hess
       prop_score_fit <- cache_ps_fit
-      logit_coef <- NULL #won't be needing the approximate cache
+      logit_coef <- NA #won't be needing the approximate cache
     }
 
     #get the results into the main did dt
@@ -52,8 +52,8 @@ estimate_did <- function(dt_did, covnames, control_type,
   } else {
 
     prop_score_fit <- rep(1,n)
-    logit_coef <- NULL
-    hess <- NULL
+    logit_coef <- NA
+    hess <- NA
 
     dt_did[, treat_ipw_weight := weights*D]
     dt_did[, cont_ipw_weight := weights*(1-D)]
diff --git a/R/estimate_did_rc.R b/R/estimate_did_rc.R
index 9d21404..4107d76 100644
--- a/R/estimate_did_rc.R
+++ b/R/estimate_did_rc.R
@@ -21,14 +21,14 @@ estimate_did_rc <- function(dt_did, covnames, control_type,
   sum_weight_pre <- dt_did[, sum(inpre*weights)]
   sum_weight_post <- dt_did[, sum(inpost*weights)]
   
-  if(is.null(covnames)){
-    covvars <- NULL
+  if(allNA(covnames)){
+    covvars <- NA
   } else {
     covvars <- as.matrix(dt_did[,.SD, .SDcols = covnames])
   }
   
-  ipw <- control_type %in% c("ipw", "dr") & !is.null(covvars)
-  or <- control_type %in% c("reg", "dr") & !is.null(covvars) #OR is REG
+  ipw <- control_type %in% c("ipw", "dr") & !allNA(covvars)
+  or <- control_type %in% c("reg", "dr") & !allNA(covvars) #OR is REG
   
   # ipw --------
   
@@ -62,8 +62,8 @@ estimate_did_rc <- function(dt_did, covnames, control_type,
   } else {
     
     prop_score_fit <- rep(1,n)
-    logit_coef <- NULL
-    hess <- NULL
+    logit_coef <- NA
+    hess <- NA
     dt_did[, treat_ipw_weight := weights*D]
     dt_did[, cont_ipw_weight := weights*(1-D)]
     
diff --git a/R/estimate_gtatt.R b/R/estimate_gtatt.R
index b30e529..2eb71c8 100644
--- a/R/estimate_gtatt.R
+++ b/R/estimate_gtatt.R
@@ -1,4 +1,4 @@
-estimate_gtatt <- function(outcomes_list, outcomevar, covariates, varycovariates, control_type, weights, 
+estimate_gtatt <- function(outcomes_list, outcomevar, varycovariatesvar, covariatesvar, covariates, varycovariates, control_type, weights, 
                            cohort_sizes,cohorts,id_size,time_periods,
                            control_option, allow_unbalance_panel) {
 
@@ -40,17 +40,17 @@ estimate_gtatt <- function(outcomes_list, outcomevar, covariates, varycovariates
         did_setup[get_cohort_pos(cohort_sizes, min_control_cohort, max_control_cohort)] <- 0
         did_setup[get_cohort_pos(cohort_sizes, g)] <- 1 #treated cannot be controls, assign treated after control to overwrite
         
-        if(!is.null(varycovariates)){
+        if(!allNA(varycovariatesvar)){
           precov <- varycovariates[[base_period]]
           names(precov) <- paste0("pre_", names(precov))
           
           postcov <- varycovariates[[t]]-varycovariates[[base_period]]
           names(postcov) <- paste0("post_", names(varycovariates[[t]]))
           
-          if(is.null(covariates)){
-            covariatesdt <- cbind(const = -1, cbind(precov, postcov)) #if covariates is NULL
+          if(allNA(covariatesvar)){
+            covariatesdt <- cbind(const = -1, cbind(precov, postcov)) #if covariates is na
           } else {
-            covariatesdt <- cbind(covariates, cbind(precov, postcov)) #if covariates is NULL
+            covariatesdt <- cbind(covariates, cbind(precov, postcov)) #if covariates is na
           }
           
         } else {
diff --git a/R/fastdid.R b/R/fastdid.R
index 9267c0f..623258f 100644
--- a/R/fastdid.R
+++ b/R/fastdid.R
@@ -9,7 +9,7 @@
 #' @param outcomevar The name of the outcome variable.
 #' @param control_option The control units used for the DiD estimates. Default is "both".
 #' @param result_type A character string indicating the type of result to be returned. Default is "group_time".
-#' @param balanced_event_time A numeric scalar that indicates the max event time to balance the cohort composition, only meaningful when result_type == "dynamic". Default is NULL
+#' @param balanced_event_time A numeric scalar that indicates the max event time to balance the cohort composition, only meaningful when result_type == "dynamic". Default is NA
 #' @param control_type The method for controlling for covariates. "ipw" for inverse probability weighting, "reg" for outcome regression, or "dr" for doubly-robust
 #' @param allow_unbalance_panel Whether allow unbalance panel as input (if false will coerce the dataset to a balanced panel). Default is FALSE 
 #' @param boot Logical, indicating whether bootstrapping should be performed. Default is FALSE.
@@ -57,9 +57,9 @@
 #' @keywords difference-in-differences fast computation panel data estimation did
 fastdid <- function(data,
                     timevar, cohortvar, unitvar, outcomevar, 
-                    control_option="both",result_type="group_time", balanced_event_time = NULL,
+                    control_option="both",result_type="group_time", balanced_event_time = NA,
                     control_type = "ipw", allow_unbalance_panel = FALSE, boot=FALSE, biters = 1000,
-                    weightvar=NULL,clustervar=NULL,covariatesvar = NULL,varycovariatesvar = NULL,
+                    weightvar=NA,clustervar=NA,covariatesvar = NA,varycovariatesvar = NA,
                     copy = TRUE, validate = TRUE
                     ){
   
@@ -77,19 +77,19 @@ fastdid <- function(data,
   name_message <- "__ARG__ must be a character scalar and a name of a column from the dataset."
   check_set_arg(timevar, unitvar, cohortvar, "match", .choices = dt_names, .message = name_message)
   
-  covariate_message <- "__ARG__ must be NULL or a character vector which are all names of columns from the dataset."
+  covariate_message <- "__ARG__ must be NA or a character vector which are all names of columns from the dataset."
   check_set_arg(varycovariatesvar, covariatesvar, outcomevar, 
-            "NULL | multi match", .choices = dt_names, .message = covariate_message)
+            "NA | multi match", .choices = dt_names, .message = covariate_message)
   
-  checkvar_message <- "__ARG__ must be NULL or a character scalar if a name of columns from the dataset."
+  checkvar_message <- "__ARG__ must be NA or a character scalar if a name of columns from the dataset."
   check_set_arg(weightvar, clustervar,
-            "NULL | match", .choices = dt_names, .message = checkvar_message)
+            "NA | match", .choices = dt_names, .message = checkvar_message)
   
   check_set_arg(control_option, "match", .choices = c("both", "never", "notyet")) #kinda bad names since did's notyet include both notyet and never
   check_set_arg(control_type, "match", .choices = c("ipw", "reg", "dr")) 
   check_arg(copy, validate, boot, allow_unbalance_panel, "scalar logical")
   
-  if(!is.null(balanced_event_time)){
+  if(!is.na(balanced_event_time)){
     if(result_type != "dynamic"){stop("balanced_event_time is only meaningful with result_type == 'dynamic'")}
     check_arg(balanced_event_time, "numeric scalar")
   }
@@ -98,11 +98,11 @@ fastdid <- function(data,
     stop("fastdid currently only supprts ipw when allowing for unbalanced panels.")
   }
   
-  if(allow_unbalance_panel == TRUE & !is.null(varycovariatesvar)){
+  if(allow_unbalance_panel == TRUE & !allNA(varycovariatesvar)){
     stop("fastdid currently only supprts time varying covariates when allowing for unbalanced panels.")
   }
   
-  if(any(covariatesvar %in% varycovariatesvar)){stop("time-varying var and invariant var have overlaps.")}
+  if(any(covariatesvar %in% varycovariatesvar) & !allNA(varycovariatesvar) & !allNA(covariatesvar)){stop("time-varying var and invariant var have overlaps.")}
   
   setnames(dt, c(timevar, cohortvar, unitvar), c("time", "G", "unit"))
 
@@ -200,41 +200,41 @@ fastdid <- function(data,
   cohort_sizes <- dt_inv[, .(cohort_size = .N) , by = G]
 
   # the optional columns
-  if(!is.null(covariatesvar)){
+  if(!allNA(covariatesvar)){
     covariates <- cbind(const = -1, dt_inv[,.SD, .SDcols = covariatesvar]) # const 
   } else {
-    covariates <- NULL
+    covariates <- NA
   }
   
   varycovariates <- list()
-  if(!is.null(varycovariatesvar)){
+  if(!allNA(varycovariatesvar)){
     for(i in time_periods){
       start <- (i-1)*id_size+1
       end <- i*id_size
       varycovariates[[i]] <- dt[seq(start,end), .SD, .SDcols = varycovariatesvar]
     }
   } else {
-    varycovariates <- NULL
+    varycovariates <- NA
   }
-  if(!is.null(clustervar)){
+  if(!is.na(clustervar)){
     cluster <- dt_inv[, .SD, .SDcols = clustervar] |> unlist()
-  } else {cluster <- NULL}
+  } else {cluster <- NA}
   
-  if(!is.null(weightvar)){
+  if(!is.na(weightvar)){
     weights <- dt_inv[, .SD, .SDcols = weightvar] |> unlist()
   } else {weights <- rep(1,id_size)}
 
   
-  # cluster <- ifelse(is.null(clustervar), NULL, dt_inv[, .SD, .SDcols = clustervar] |> unlist())
-  #weights <- ifelse(is.null(weightvar), rep(1,id_size), dt_inv[, .SD, .SDcols = weightvar] |> unlist())
-  # covariates <- ifelse(is.null(covariatesvar), NULL, cbind(const = -1, dt_inv[,.SD, .SDcols = covariatesvar]))
+  # cluster <- ifelse(is.na(clustervar), NA, dt_inv[, .SD, .SDcols = clustervar] |> unlist())
+  #weights <- ifelse(is.na(weightvar), rep(1,id_size), dt_inv[, .SD, .SDcols = weightvar] |> unlist())
+  # covariates <- ifelse(is.na(covariatesvar), NA, cbind(const = -1, dt_inv[,.SD, .SDcols = covariatesvar]))
   
  
   
   # main part  -------------------------------------------------
 
   # attgt
-  gt_result_list <- estimate_gtatt(outcomes_list, outcomevar, covariates, varycovariates, control_type, weights,
+  gt_result_list <- estimate_gtatt(outcomes_list, outcomevar, varycovariatesvar, covariatesvar, covariates, varycovariates, control_type, weights,
                                    cohort_sizes,cohorts,id_size,time_periods, #info about the dt
                                    control_option, allow_unbalance_panel)
   
@@ -248,7 +248,7 @@ fastdid <- function(data,
                                result_type, balanced_event_time)
     
     #get se from the influence function
-    agg_se <- get_se(agg_result$inf_matrix, boot, biters, cluster)
+    agg_se <- get_se(agg_result$inf_matrix, boot, biters, cluster, clustervar)
     
     # post process -----------------------------------------------
     
diff --git a/R/get_se.R b/R/get_se.R
index 8fd8f46..8c71f74 100644
--- a/R/get_se.R
+++ b/R/get_se.R
@@ -1,10 +1,10 @@
-get_se <- function(inf_matrix, boot, biters, cluster) {
+get_se <- function(inf_matrix, boot, biters, cluster, clustervar) {
   
   if(boot){
 
     top_quant <- 0.75
     bot_quant <- 0.25
-    if(!is.null(cluster)){
+    if(!allNA(clustervar)){
       cluster_n <- stats::aggregate(cluster, by=list(cluster), length)[,2]
       inf_matrix <- fsum(inf_matrix, cluster) / cluster_n #the mean without 0 for each cluster of each setting
     }
@@ -21,7 +21,7 @@ get_se <- function(inf_matrix, boot, biters, cluster) {
     se[se < sqrt(.Machine$double.eps)*10] <- NA
     
   } else {
-    if(!is.null(cluster)){stop("clustering only available with bootstrap")}
+    if(!allNA(clustervar)){stop("clustering only available with bootstrap")}
     inf_matrix <- inf_matrix  |> as.data.table()
     se <- inf_matrix[, lapply(.SD, function(x) sqrt(sum(x^2, na.rm = TRUE)/length(x)^2))] %>% as.vector() #should maybe use n-1 but did use n
     
diff --git a/R/utils.R b/R/utils.R
index c2397f1..03478a8 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -1,6 +1,3 @@
-
-
-
 set_max_thread <- function(){
   data.table::setDTthreads(0)
   options(kit.nThread = getDTthreads())
@@ -8,4 +5,25 @@ set_max_thread <- function(){
 
 reverse_col <- function(x){
   return(x[,ncol(x):1])
-}
\ No newline at end of file
+}
+
+
+release <- function(p){
+  for(name in names(p)){
+    assign(name, p[[name]], envir = parent.frame()) #assign it to the envir where the func is called
+  }
+}
+
+gather <- function(...){
+  names <- substitute(list(...)) |> deparse() |>
+    str_remove_all("\\)|list\\(") |> str_flatten() |> str_split_1(",") |> str_trim()
+  values <- list(...)
+  p <- list()
+  
+  for(i in 1:length(names)){
+    #p[[names[[i]]]] <- values[[i]]
+    p <- c(p, list(values[[i]])) #for nulls
+  }
+  names(p) <- names
+  return(p)
+}
diff --git a/R/validate_did.R b/R/validate_did.R
index c0210c2..c847150 100644
--- a/R/validate_did.R
+++ b/R/validate_did.R
@@ -3,13 +3,13 @@ validate_did <- function(dt,covariatesvar, varycovariatesvar,varnames, balanced_
   raw_time_size <- dt[, uniqueN(time)]
   
   
-  if(!is.null(balanced_event_time)){
+  if(!is.na(balanced_event_time)){
     if(balanced_event_time > dt[, max(time-G)]){stop("balanced_event_time is larger than the max event time in the data")}
   }
   
   #doesn't allow missing value for now
   for(col in varnames){
-    if(is.null(col)){next}
+    if(is.na(col)){next}
     na_obs <- whichNA(dt[[col]])
     if(length(na_obs) != 0){
       warning("missing values detected in ", col, ", removing ", length(na_obs), " observation.")
@@ -17,13 +17,15 @@ validate_did <- function(dt,covariatesvar, varycovariatesvar,varnames, balanced_
     }
   }
   
-  if(!is.null(covariatesvar) & uniqueN(dt, by = c("unit", covariatesvar)) > raw_unit_size){
+  
+  if(!allNA(covariatesvar) && uniqueN(dt, by = c("unit", covariatesvar)) > raw_unit_size){
     warning("some covariates is time-varying, fastdid only use the first observation for covariates.")
   }
   
   
-  if(!is.null(covariatesvar)|!is.null(varycovariatesvar)){
+  if(!allNA(covariatesvar)|!allNA(varycovariatesvar)){
     for(cov in c(covariatesvar, varycovariatesvar)){
+      if(is.na(cov)){next}
       #check covaraites is not constant  
       if(fnunique(dt[, get(cov)[1], by = "unit"][, V1]) == 1)stop(cov, " have no variation")
     }
diff --git a/inst/tinytest/test_1_fastdid.R b/inst/tinytest/test_1_fastdid.R
index 0c24c50..2bf3b15 100644
--- a/inst/tinytest/test_1_fastdid.R
+++ b/inst/tinytest/test_1_fastdid.R
@@ -25,7 +25,6 @@ expect_silent(fastdid(dt, timevar = "time", cohortvar = "G", unitvar = "unit",ou
                       covariatesvar = c("x", "x2")),
               info = "covariates dr")
 
-
 expect_silent(fastdid(dt, timevar = "time", cohortvar = "G", unitvar = "unit",outcomevar = "y",  result_type = "group_time",
                       control_type = "dr",
                       covariatesvar = c("x", "x2"), varycovariatesvar = "xvar"),

From 4fa8aeb747b34482e8fd38d2210acf9862c13092 Mon Sep 17 00:00:00 2001
From: TsaiLintung <101155567+TsaiLintung@users.noreply.github.com>
Date: Mon, 18 Mar 2024 17:55:37 +0800
Subject: [PATCH 05/15] reorganize with my gather-release pattern

---
 R/aggregate_gt.R    |  83 ++++++++++++++++------
 R/estimate_did.R    |   6 +-
 R/estimate_did_rc.R |   6 +-
 R/estimate_gtatt.R  |   7 +-
 R/fastdid.R         | 163 +++++++++++++++++++++++++-------------------
 R/get_se.R          |  31 ---------
 R/validate_did.R    |   5 +-
 7 files changed, 168 insertions(+), 133 deletions(-)
 delete mode 100644 R/get_se.R

diff --git a/R/aggregate_gt.R b/R/aggregate_gt.R
index 2fe0741..9094cb0 100644
--- a/R/aggregate_gt.R
+++ b/R/aggregate_gt.R
@@ -1,12 +1,15 @@
-aggregate_gt <- function(gt_result, cohort_sizes, 
-                         id_weights, id_cohorts,
-                         result_type, balanced_event_time){
+aggregate_gt <- function(gt_result, auxdata, params){
+  
+  release(auxdata)
+  release(params)
   
   gt_att <- gt_result$att
   gt_inf_func <- gt_result$inf_func
   gt <- gt_result$gt
+  id_cohorts <- dt_inv[, G]
+  
   
-  id_dt <- data.table(weight = id_weights/sum(id_weights), G = id_cohorts)
+  id_dt <- data.table(weight = weights/sum(weights), G = id_cohorts)
   pg_dt <- id_dt[, .(pg = sum(weight)), by = "G"]
   group_time <- gt |> merge(pg_dt, by = "G")
   
@@ -22,31 +25,39 @@ aggregate_gt <- function(gt_result, cohort_sizes,
     
   } else {
     
-    agg_sch <- get_aggregate_scheme(group_time, result_type, id_weights, id_cohorts, balanced_event_time)
+    agg_sch <- get_aggregate_scheme(group_time, result_type, weights, id_cohorts, balanced_event_time)
     targets <- agg_sch$targets
-    weights <- as.matrix(agg_sch$weights)
+    agg_weights <- as.matrix(agg_sch$agg_weights)
     
     #aggregated att
-    agg_att <- weights %*% gt_att
+    agg_att <- agg_weights %*% gt_att
     
     #get the influence from weight estimation
     #this needs to be optimized!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-    inf_weights <- sapply(asplit(weights, 1), function (x){
-      get_weight_influence(x, gt_att, id_weights, id_cohorts, group_time[, .(G, time)])
+    inf_weights <- sapply(asplit(agg_weights, 1), function (x){
+      get_weight_influence(x, gt_att, weights, id_cohorts, group_time[, .(G, time)])
     })
     
     #aggregated influence function
-    inf_matrix <- (gt_inf_func %*% t(weights)) + inf_weights 
+    inf_matrix <- (gt_inf_func %*% t(agg_weights)) + inf_weights 
 
   }
-  return(list(inf_matrix = inf_matrix, agg_att = agg_att, targets = targets))
+  
+  #get se!!
+  agg_se <- get_se(inf_matrix, boot, biters, cluster, clustervar)
+  
+  # post process
+  result <- data.table(targets, agg_att, agg_se)
+  names(result) <- c("target", "att", "se")
+  
+  return(result)
 }
 
-get_aggregate_scheme <- function(group_time, result_type, id_weights, id_cohorts, balanced_event_time){
+get_aggregate_scheme <- function(group_time, result_type, weights, id_cohorts, balanced_event_time){
   
   #browser()
   
-  weights <- data.table()
+  agg_weights <- data.table()
   gt_count <- group_time[, .N]
   
   bool_to_pn <- function(x){ifelse(x, 1, -1)}
@@ -89,18 +100,18 @@ get_aggregate_scheme <- function(group_time, result_type, id_weights, id_cohorts
     
     group_time[, targeted := NULL]
     
-    weights <- rbind(weights, target_weights)
+    agg_weights <- rbind(agg_weights, target_weights)
   }
   
-  return(list(weights = weights, #a matrix of each target and gt's weight in it 
+  return(list(agg_weights = agg_weights, #a matrix of each target and gt's weight in it 
               targets = targets)) 
 }
 
-get_weight_influence <- function(agg_weights, gt_att, id_weights, id_cohorts, group) {
+get_weight_influence <- function(agg_weights, gt_att, weights, id_cohorts, group) {
 
   keepers <- which(agg_weights > 0)
   
-  id_dt <- data.table(weight = id_weights/sum(id_weights), G = id_cohorts)
+  id_dt <- data.table(weight = weights/sum(weights), G = id_cohorts)
   pg_dt <- id_dt[, .(pg = sum(weight)), by = "G"]
   group <- group |> merge(pg_dt, by = "G")
   
@@ -110,16 +121,48 @@ get_weight_influence <- function(agg_weights, gt_att, id_weights, id_cohorts, gr
 
   # effect of estimating weights in the numerator
   if1 <- sapply(keepers, function(k) {
-    (id_weights*BMisc::TorF(id_cohorts == group[k,G]) - group[k,pg]) /
+    (weights*BMisc::TorF(id_cohorts == group[k,G]) - group[k,pg]) /
       sum(group[keepers,pg])
   })
   # effect of estimating weights in the denominator
   if2 <- base::rowSums(sapply(keepers, function(k) {
-    id_weights*BMisc::TorF(id_cohorts == group[k,G]) - group[k,pg]
+    weights*BMisc::TorF(id_cohorts == group[k,G]) - group[k,pg]
   })) %*%
     t(group[keepers,pg]/(sum(group[keepers,pg])^2))
   # return the influence function for the weights
   inf_weight <- (if1 - if2) %*% as.vector(gt_att[keepers])
   inf_weight[abs(inf_weight) < sqrt(.Machine$double.eps)*10] <- 0 #fill zero 
   return(inf_weight)
-}
\ No newline at end of file
+}
+
+get_se <- function(inf_matrix, boot, biters, cluster, clustervar) {
+  
+  if(boot){
+    
+    top_quant <- 0.75
+    bot_quant <- 0.25
+    if(!allNA(clustervar)){
+      cluster_n <- stats::aggregate(cluster, by=list(cluster), length)[,2]
+      inf_matrix <- fsum(inf_matrix, cluster) / cluster_n #the mean without 0 for each cluster of each setting
+    }
+    
+    boot_results <- BMisc::multiplier_bootstrap(inf_matrix, biters = biters) %>% as.data.table()
+    
+    boot_top <- boot_results[, lapply(.SD, function(x) stats::quantile(x, top_quant, type=1, na.rm = TRUE)),]
+    boot_bot <- boot_results[, lapply(.SD, function(x) stats::quantile(x, bot_quant, type=1, na.rm = TRUE)),]
+    
+    dt_se <- rbind(boot_bot, boot_top) %>% transpose()
+    names(dt_se) <- c("boot_bot", "boot_top")
+    
+    se <- dt_se[,(boot_top-boot_bot)/(qnorm(top_quant) - qnorm(bot_quant))]
+    se[se < sqrt(.Machine$double.eps)*10] <- NA
+    
+  } else {
+    if(!allNA(clustervar)){stop("clustering only available with bootstrap")}
+    inf_matrix <- inf_matrix  |> as.data.table()
+    se <- inf_matrix[, lapply(.SD, function(x) sqrt(sum(x^2, na.rm = TRUE)/length(x)^2))] %>% as.vector() #should maybe use n-1 but did use n
+    
+  }
+  return(unlist(se))
+}
+
diff --git a/R/estimate_did.R b/R/estimate_did.R
index 85b6821..d1d58fe 100644
--- a/R/estimate_did.R
+++ b/R/estimate_did.R
@@ -13,9 +13,9 @@ estimate_did <- function(dt_did, covnames, control_type,
     covvars <- as.matrix(dt_did[,.SD, .SDcols = covnames])
   }
 
-  ipw <- control_type %in% c("ipw", "dr") & !allNA(covvars)
-  or <- control_type %in% c("reg", "dr") & !allNA(covvars) #OR is REG
-
+  ipw <- control_type %in% c("ipw", "dr") & !allNA(covnames)
+  or <- control_type %in% c("reg", "dr") & !allNA(covnames) #OR is REG
+  
   # ipw --------
 
   if(ipw){
diff --git a/R/estimate_did_rc.R b/R/estimate_did_rc.R
index 4107d76..351bfc9 100644
--- a/R/estimate_did_rc.R
+++ b/R/estimate_did_rc.R
@@ -27,9 +27,9 @@ estimate_did_rc <- function(dt_did, covnames, control_type,
     covvars <- as.matrix(dt_did[,.SD, .SDcols = covnames])
   }
   
-  ipw <- control_type %in% c("ipw", "dr") & !allNA(covvars)
-  or <- control_type %in% c("reg", "dr") & !allNA(covvars) #OR is REG
-  
+  ipw <- control_type %in% c("ipw", "dr") & !allNA(covnames)
+  or <- control_type %in% c("reg", "dr") & !allNA(covnames) #OR is REG
+
   # ipw --------
   
   if(ipw){
diff --git a/R/estimate_gtatt.R b/R/estimate_gtatt.R
index 2eb71c8..87639f5 100644
--- a/R/estimate_gtatt.R
+++ b/R/estimate_gtatt.R
@@ -1,7 +1,8 @@
-estimate_gtatt <- function(outcomes_list, outcomevar, varycovariatesvar, covariatesvar, covariates, varycovariates, control_type, weights, 
-                           cohort_sizes,cohorts,id_size,time_periods,
-                           control_option, allow_unbalance_panel) {
+estimate_gtatt <- function(auxdata, params) {
 
+  release(auxdata)
+  release(params)
+  
   treated_cohorts <- cohorts[!is.infinite(cohorts)]
   
   if(!Inf %in% cohorts & control_option != "notyet"){
diff --git a/R/fastdid.R b/R/fastdid.R
index 623258f..68df16e 100644
--- a/R/fastdid.R
+++ b/R/fastdid.R
@@ -63,15 +63,7 @@ fastdid <- function(data,
                     copy = TRUE, validate = TRUE
                     ){
   
-  
   # validation arguments --------------------------------------------------------
-  
-  if(!is.data.table(data)){
-    warning("coercing input into a data.table.")
-    data <- as.data.table(data)
-  } 
-  
-  if(copy){dt <- copy(data)} else {dt <- data}
 
   dt_names <- names(dt)
   name_message <- "__ARG__ must be a character scalar and a name of a column from the dataset."
@@ -93,30 +85,74 @@ fastdid <- function(data,
     if(result_type != "dynamic"){stop("balanced_event_time is only meaningful with result_type == 'dynamic'")}
     check_arg(balanced_event_time, "numeric scalar")
   }
-
   if(allow_unbalance_panel == TRUE & control_type %in% c("dr", "reg")){
     stop("fastdid currently only supprts ipw when allowing for unbalanced panels.")
   }
-  
   if(allow_unbalance_panel == TRUE & !allNA(varycovariatesvar)){
     stop("fastdid currently only supprts time varying covariates when allowing for unbalanced panels.")
   }
+  if(any(covariatesvar %in% varycovariatesvar) & !allNA(varycovariatesvar) & !allNA(covariatesvar)){
+    stop("time-varying var and invariant var have overlaps.")
+  }
   
-  if(any(covariatesvar %in% varycovariatesvar) & !allNA(varycovariatesvar) & !allNA(covariatesvar)){stop("time-varying var and invariant var have overlaps.")}
+  params <- gather(timevar, cohortvar, unitvar, outcomevar, 
+                   control_option,result_type, balanced_event_time,
+                   control_type, allow_unbalance_panel , boot, biters,
+                   weightvar,clustervar,covariatesvar,varycovariatesvar)
   
-  setnames(dt, c(timevar, cohortvar, unitvar), c("time", "G", "unit"))
-
   # validate data -----------------------------------------------------
   
+  if(!is.data.table(data)){
+    warning("coercing input into a data.table.")
+    data <- as.data.table(data)
+  } 
+  if(copy){dt <- copy(data)} else {dt <- data}
+  setnames(dt, c(timevar, cohortvar, unitvar), c("time", "G", "unit"))
   
   if(validate){
     varnames <- c("time", "G", "unit", outcomevar,weightvar,clustervar,covariatesvar, varycovariatesvar)
-    dt <- validate_did(dt, covariatesvar, varycovariatesvar, varnames, balanced_event_time, allow_unbalance_panel)
+    dt <- validate_did(dt, varnames, params)
   }
   
   # preprocess -----------------------------------------------------------
   
   #make dt conform to the WLOG assumptions of fastdid
+  coerce_result <- coerce_dt(dt, params) #also changed dt
+  dt <- coerce_result$dt
+  # get auxiliary data
+  auxdata <- get_auxdata(dt, params)
+  
+  # main part  -------------------------------------------------
+
+  # estimate attgt
+  gt_result_list <- estimate_gtatt(auxdata, params)
+  
+  # aggregate the result
+  all_result <- data.table()
+  for(outcol in outcomevar){
+    gt_result <- gt_result_list[[outcol]]
+    
+    # aggregate att and inf function and get se
+    result <- aggregate_gt(gt_result, auxdata, params)
+    
+    #convert "targets" back to meaningful parameter identifiers like cohort 1 post, time 2 post 
+    result <- result |> convert_targets(result_type, coerce_result$time_change)
+    result[, outcome := outcol]
+    all_result <- rbind(all_result, result)
+    
+    rm(result)
+    
+  }
+
+  return(all_result)
+  
+}
+
+# small steps ----------------------------------------------------------------------
+
+coerce_dt <- function(dt, params){
+  
+  release(params)
   
   #change to int before sorting
   if(!is.numeric(dt[, G])){
@@ -135,11 +171,10 @@ fastdid <- function(data,
   }
   
   setorder(dt, time, G, unit) #sort the dataset essential for the sort-once-quick-access 
-
   #deal with time, coerice time to 1,2,3,4,5.......
   time_periods <- dt[, unique(time)]
   time_size <- length(time_periods)
-
+  
   time_offset <- min(time_periods) - 1 #assume time starts at 1, first is min after sort :)
   if(time_offset != 0){
     dt[, G := G-time_offset]
@@ -156,11 +191,26 @@ fastdid <- function(data,
     dt[time != 1, time := (time-1)/time_step+1]
   }
   
-  #construct the outcomes list for fast access later
+  return(list(dt = dt,
+              time_change = list(time_step = time_step,
+                            max_time = max(time_periods),
+                            time_offset = time_offset)))
+  
+}
+
+get_auxdata <- function(dt, params){
+  
+  release(params)
+
+  
+
+  time_periods <- dt[, unique(time)]
   id_size <- dt[, uniqueN(unit)]
   
   # get auxiliary data ------------------------------
+  setorder(dt, time, G, unit) #sort the dataset essential for the sort-once-quick-access 
   
+  #construct the outcomes list for fast access later
   #loop for multiple outcome
   outcomes_list <- list()
   for(outcol in outcomevar){
@@ -184,28 +234,22 @@ fastdid <- function(data,
       }
       
     }
-
+    
     outcomes_list[[outcol]] <- outcomes
   }
-
+  
   #the time-invariant parts 
   if(!allow_unbalance_panel){
     dt_inv <- dt[1:id_size]
   } else {
-    dt_inv <- dt[dt[, .I[1], by = unit]$V1]
+    dt_inv <- dt[dt[, .I[1], by = unit]$V1] #the first observation
     setorder(dt_inv, unit) #can't move this outside
   }
-
+  
   cohorts <- dt_inv[, unique(G)]
   cohort_sizes <- dt_inv[, .(cohort_size = .N) , by = G]
-
-  # the optional columns
-  if(!allNA(covariatesvar)){
-    covariates <- cbind(const = -1, dt_inv[,.SD, .SDcols = covariatesvar]) # const 
-  } else {
-    covariates <- NA
-  }
   
+  # the optional columns
   varycovariates <- list()
   if(!allNA(varycovariatesvar)){
     for(i in time_periods){
@@ -216,60 +260,35 @@ fastdid <- function(data,
   } else {
     varycovariates <- NA
   }
+  
+  if(!allNA(covariatesvar)){
+    covariates <- cbind(const = -1, dt_inv[,.SD, .SDcols = covariatesvar])
+  } else {
+    covariates <- NA
+  }
+  
   if(!is.na(clustervar)){
     cluster <- dt_inv[, .SD, .SDcols = clustervar] |> unlist()
-  } else {cluster <- NA}
+  } else {
+    cluster <- NA
+  }
   
   if(!is.na(weightvar)){
     weights <- dt_inv[, .SD, .SDcols = weightvar] |> unlist()
-  } else {weights <- rep(1,id_size)}
-
+  } else {
+    weights <- rep(1, id_size)
+  }
   
-  # cluster <- ifelse(is.na(clustervar), NA, dt_inv[, .SD, .SDcols = clustervar] |> unlist())
-  #weights <- ifelse(is.na(weightvar), rep(1,id_size), dt_inv[, .SD, .SDcols = weightvar] |> unlist())
-  # covariates <- ifelse(is.na(covariatesvar), NA, cbind(const = -1, dt_inv[,.SD, .SDcols = covariatesvar]))
+  auxdata <- gather(time_periods, id_size, outcomes_list, dt_inv, cohorts, cohort_sizes, 
+                    varycovariates, covariates, cluster, weights)
   
- 
-  
-  # main part  -------------------------------------------------
-
-  # attgt
-  gt_result_list <- estimate_gtatt(outcomes_list, outcomevar, varycovariatesvar, covariatesvar, covariates, varycovariates, control_type, weights,
-                                   cohort_sizes,cohorts,id_size,time_periods, #info about the dt
-                                   control_option, allow_unbalance_panel)
-  
-  all_result <- data.table()
-  for(outcol in outcomevar){
-    gt_result <- gt_result_list[[outcol]]
-    
-    # aggregate att and inf function
-    agg_result <- aggregate_gt(gt_result, cohort_sizes, 
-                               weights, dt_inv[, G],
-                               result_type, balanced_event_time)
-    
-    #get se from the influence function
-    agg_se <- get_se(agg_result$inf_matrix, boot, biters, cluster, clustervar)
-    
-    # post process -----------------------------------------------
-    
-    result <- data.table(agg_result$targets, agg_result$agg_att, agg_se)
-    names(result) <- c("target", "att", "se")
-    
-    #convert "targets" back to meaningful parameter identifiers like cohort 1 post, time 2 post 
-    result <- result |> convert_targets(result_type, time_offset, time_step, max(time_periods))
-    result[, outcome := outcol]
-    all_result <- rbind(all_result, result)
-    
-    rm(result)
-    
-  }
-
-  return(all_result)
+  return(auxdata)
   
 }
 
-convert_targets <- function(results, result_type, 
-                            time_offset, time_step, max_time){
+convert_targets <- function(results, result_type, time_change){
+  
+  release(time_change) #from coerce_dt
   
   if(result_type == "dynamic"){
     setnames(results, "target", "event_time")
diff --git a/R/get_se.R b/R/get_se.R
deleted file mode 100644
index 8c71f74..0000000
--- a/R/get_se.R
+++ /dev/null
@@ -1,31 +0,0 @@
-get_se <- function(inf_matrix, boot, biters, cluster, clustervar) {
-  
-  if(boot){
-
-    top_quant <- 0.75
-    bot_quant <- 0.25
-    if(!allNA(clustervar)){
-      cluster_n <- stats::aggregate(cluster, by=list(cluster), length)[,2]
-      inf_matrix <- fsum(inf_matrix, cluster) / cluster_n #the mean without 0 for each cluster of each setting
-    }
-
-    boot_results <- BMisc::multiplier_bootstrap(inf_matrix, biters = biters) %>% as.data.table()
-    
-    boot_top <- boot_results[, lapply(.SD, function(x) stats::quantile(x, top_quant, type=1, na.rm = TRUE)),]
-    boot_bot <- boot_results[, lapply(.SD, function(x) stats::quantile(x, bot_quant, type=1, na.rm = TRUE)),]
-    
-    dt_se <- rbind(boot_bot, boot_top) %>% transpose()
-    names(dt_se) <- c("boot_bot", "boot_top")
-
-    se <- dt_se[,(boot_top-boot_bot)/(qnorm(top_quant) - qnorm(bot_quant))]
-    se[se < sqrt(.Machine$double.eps)*10] <- NA
-    
-  } else {
-    if(!allNA(clustervar)){stop("clustering only available with bootstrap")}
-    inf_matrix <- inf_matrix  |> as.data.table()
-    se <- inf_matrix[, lapply(.SD, function(x) sqrt(sum(x^2, na.rm = TRUE)/length(x)^2))] %>% as.vector() #should maybe use n-1 but did use n
-    
-  }
-  return(unlist(se))
-}
-
diff --git a/R/validate_did.R b/R/validate_did.R
index c847150..6eca763 100644
--- a/R/validate_did.R
+++ b/R/validate_did.R
@@ -1,4 +1,7 @@
-validate_did <- function(dt,covariatesvar, varycovariatesvar,varnames, balanced_event_time, allow_unbalance_panel){
+validate_did <- function(dt,varnames,params){
+  
+  release(params)
+  
   raw_unit_size <- dt[, uniqueN(unit)]
   raw_time_size <- dt[, uniqueN(time)]
   

From 42dda9ddfdc3d30ec6c3f464883e5dc8b6019da4 Mon Sep 17 00:00:00 2001
From: TsaiLintung <101155567+TsaiLintung@users.noreply.github.com>
Date: Tue, 19 Mar 2024 12:03:55 +0800
Subject: [PATCH 06/15] some reorganization

---
 R/aggregate_gt.R               | 14 ++++---
 R/estimate_did.R               | 16 +++----
 R/estimate_gtatt.R             | 77 ++++++++++++++++++----------------
 R/fastdid.R                    | 28 +++++--------
 R/validate_did.R               |  1 -
 inst/tinytest/test_1_fastdid.R |  4 ++
 man/fastdid.Rd                 | 15 ++++---
 man/sim_did.Rd                 |  1 +
 8 files changed, 84 insertions(+), 72 deletions(-)

diff --git a/R/aggregate_gt.R b/R/aggregate_gt.R
index 9094cb0..3cb7a4a 100644
--- a/R/aggregate_gt.R
+++ b/R/aggregate_gt.R
@@ -3,12 +3,12 @@ aggregate_gt <- function(gt_result, auxdata, params){
   release(auxdata)
   release(params)
   
+  #release the stuff
   gt_att <- gt_result$att
   gt_inf_func <- gt_result$inf_func
   gt <- gt_result$gt
   id_cohorts <- dt_inv[, G]
   
-  
   id_dt <- data.table(weight = weights/sum(weights), G = id_cohorts)
   pg_dt <- id_dt[, .(pg = sum(weight)), by = "G"]
   group_time <- gt |> merge(pg_dt, by = "G")
@@ -19,12 +19,14 @@ aggregate_gt <- function(gt_result, auxdata, params){
   
   if(result_type == "group_time"){
     
+    #don't need to do anything
     targets <- group_time[, unique(G*max(time)+time)]
     inf_matrix <- gt_inf_func
     agg_att <- as.vector(gt_att)
     
   } else {
     
+    #get which gt(s) is a part of the aggregated param
     agg_sch <- get_aggregate_scheme(group_time, result_type, weights, id_cohorts, balanced_event_time)
     targets <- agg_sch$targets
     agg_weights <- as.matrix(agg_sch$agg_weights)
@@ -43,12 +45,13 @@ aggregate_gt <- function(gt_result, auxdata, params){
 
   }
   
-  #get se!!
+  #get se from influence function
   agg_se <- get_se(inf_matrix, boot, biters, cluster, clustervar)
   
   # post process
   result <- data.table(targets, agg_att, agg_se)
   names(result) <- c("target", "att", "se")
+  result[,outcome := gt_result$outname]
   
   return(result)
 }
@@ -142,14 +145,15 @@ get_se <- function(inf_matrix, boot, biters, cluster, clustervar) {
     top_quant <- 0.75
     bot_quant <- 0.25
     if(!allNA(clustervar)){
+      #take average within the cluster
       cluster_n <- stats::aggregate(cluster, by=list(cluster), length)[,2]
       inf_matrix <- fsum(inf_matrix, cluster) / cluster_n #the mean without 0 for each cluster of each setting
     }
     
     boot_results <- BMisc::multiplier_bootstrap(inf_matrix, biters = biters) %>% as.data.table()
     
-    boot_top <- boot_results[, lapply(.SD, function(x) stats::quantile(x, top_quant, type=1, na.rm = TRUE)),]
-    boot_bot <- boot_results[, lapply(.SD, function(x) stats::quantile(x, bot_quant, type=1, na.rm = TRUE)),]
+    boot_top <- boot_results[, lapply(.SD, function(x) stats::quantile(x, top_quant, type=1, na.rm = TRUE))]
+    boot_bot <- boot_results[, lapply(.SD, function(x) stats::quantile(x, bot_quant, type=1, na.rm = TRUE))]
     
     dt_se <- rbind(boot_bot, boot_top) %>% transpose()
     names(dt_se) <- c("boot_bot", "boot_top")
@@ -158,7 +162,7 @@ get_se <- function(inf_matrix, boot, biters, cluster, clustervar) {
     se[se < sqrt(.Machine$double.eps)*10] <- NA
     
   } else {
-    if(!allNA(clustervar)){stop("clustering only available with bootstrap")}
+
     inf_matrix <- inf_matrix  |> as.data.table()
     se <- inf_matrix[, lapply(.SD, function(x) sqrt(sum(x^2, na.rm = TRUE)/length(x)^2))] %>% as.vector() #should maybe use n-1 but did use n
     
diff --git a/R/estimate_did.R b/R/estimate_did.R
index d1d58fe..597a328 100644
--- a/R/estimate_did.R
+++ b/R/estimate_did.R
@@ -1,4 +1,4 @@
-estimate_did <- function(dt_did, covnames, control_type,
+estimate_did <- function(dt_did, covvars, control_type,
                          last_coef = NULL, cache_ps_fit, cache_hess){
 
   # preprocess --------
@@ -7,14 +7,15 @@ estimate_did <- function(dt_did, covnames, control_type,
   data_pos <-  which(dt_did[, !is.na(D)])
   dt_did <- dt_did[data_pos]
   n <- dt_did[, .N]
-  if(allNA(covnames)){
-    covvars <- NA
+  
+  if(is.matrix(covvars)){
+    ipw <- control_type %in% c("ipw", "dr") 
+    or <- control_type %in% c("reg", "dr")
+    covvars <- covvars[data_pos,] 
   } else {
-    covvars <- as.matrix(dt_did[,.SD, .SDcols = covnames])
+    ipw <- FALSE
+    or <- FALSE
   }
-
-  ipw <- control_type %in% c("ipw", "dr") & !allNA(covnames)
-  or <- control_type %in% c("reg", "dr") & !allNA(covnames) #OR is REG
   
   # ipw --------
 
@@ -104,6 +105,7 @@ estimate_did <- function(dt_did, covnames, control_type,
     M2 <- colMeans(dt_did[, cont_ipw_weight*(delta_y-weighted_cont_delta-or_delta)] * covvars)
 
     score_ps <- dt_did[, weights*(D-ps)] * covvars
+    
     asym_linear_ps <- score_ps %*% hess
 
     #ipw for control
diff --git a/R/estimate_gtatt.R b/R/estimate_gtatt.R
index 87639f5..abe05ab 100644
--- a/R/estimate_gtatt.R
+++ b/R/estimate_gtatt.R
@@ -3,25 +3,24 @@ estimate_gtatt <- function(auxdata, params) {
   release(auxdata)
   release(params)
   
-  treated_cohorts <- cohorts[!is.infinite(cohorts)]
-  
   if(!Inf %in% cohorts & control_option != "notyet"){
     warning("no never-treated availble, switching to not-yet-treated control")
     control_option <- "notyet"
   }
+  
+  treated_cohorts <- cohorts[!is.infinite(cohorts)]
   max_control_cohort <- ifelse(control_option == "notyet", max(treated_cohorts), Inf) 
   
   cache_ps_fit_list <- list() #for the first outcome won't be able to use cache, empty list returns null for call like cache_ps_fit[["1.3"]]
   cache_hess_list <- list()
   outcome_result_list <- list()
-  
   for(outcol in outcomevar){
     
     outcomes <- outcomes_list[[outcol]]
     last_coef <- NULL
     gt <- data.table()
     gt_att <- c()
-    gt_inf_func <- data.table(placeholder = rep(NA, id_size))
+    gt_inf_func <- data.table(placeholder = rep(NA, id_size)) #populate with NA
     
     for(t in time_periods){
       for(g in cohorts){
@@ -31,9 +30,9 @@ estimate_gtatt <- function(auxdata, params) {
         #setup and checks
         base_period <- g-1
         min_control_cohort <- ifelse(control_option == "never", Inf, max(t+1, base_period+1)) #not-yet treated / never treated in both base and "treated" period
-        if(t == base_period){next} #no treatmenteffect for the base period
-        if(base_period < min(time_periods)){next} #no treatmenteffect for the first period, since base period is not observed
-        if(g >= max_control_cohort){next} #no treatmenteffect for never treated or the last treated cohort (notyet)
+        if(t == base_period){next} #no treatment effect for the base period
+        if(base_period < min(time_periods)){next} #no treatment effect for the first period, since base period is not observed
+        if(g >= max_control_cohort){next} #no treatment effect for never treated or the last treated cohort (for not yet notyet)
         if(t >= max_control_cohort){next} #no control available if the last cohort is treated too
         
         #select the control and treated cohorts
@@ -41,37 +40,24 @@ estimate_gtatt <- function(auxdata, params) {
         did_setup[get_cohort_pos(cohort_sizes, min_control_cohort, max_control_cohort)] <- 0
         did_setup[get_cohort_pos(cohort_sizes, g)] <- 1 #treated cannot be controls, assign treated after control to overwrite
         
-        if(!allNA(varycovariatesvar)){
-          precov <- varycovariates[[base_period]]
-          names(precov) <- paste0("pre_", names(precov))
-          
-          postcov <- varycovariates[[t]]-varycovariates[[base_period]]
-          names(postcov) <- paste0("post_", names(varycovariates[[t]]))
-          
-          if(allNA(covariatesvar)){
-            covariatesdt <- cbind(const = -1, cbind(precov, postcov)) #if covariates is na
-          } else {
-            covariatesdt <- cbind(covariates, cbind(precov, postcov)) #if covariates is na
-          }
-          
-        } else {
-          covariatesdt <- covariates
-        } 
+        #construct the covariates matrix
+        covvars <- get_covvars(covariates, varycovariates)
         
         #construct the 2x2 dataset
-        cohort_did <- data.table(did_setup, outcomes[[t]], outcomes[[base_period]], weights, covariatesdt)
-        setnames(cohort_did, c("did_setup", "V2", "V3", "weights"), c("D", "post.y", "pre.y", "weights"))
- 
+        cohort_did <- data.table(did_setup, outcomes[[t]], outcomes[[base_period]], weights)
+        setnames(cohort_did, c("did_setup", "V2", "V3"), c("D", "post.y", "pre.y"))
+        
         #estimate did
         if(!allow_unbalance_panel){
-          result <- estimate_did(cohort_did, colnames(covariatesdt), control_type, last_coef, cache_ps_fit_list[[gt_name]], cache_hess_list[[gt_name]])
+          result <- estimate_did(cohort_did, covvars, control_type, 
+                                 last_coef, cache_ps_fit_list[[gt_name]], cache_hess_list[[gt_name]]) #cache
         } else {
-          result <- estimate_did_rc(cohort_did, colnames(covariatesdt), control_type, last_coef, cache_ps_fit_list[[gt_name]], cache_hess_list[[gt_name]])
+          result <- estimate_did_rc(cohort_did, covvars, control_type, 
+                                    last_coef, cache_ps_fit_list[[gt_name]], cache_hess_list[[gt_name]]) #cache
         }
        
-      
         #collect the result
-        last_coef <- result$logit_coef
+        last_coef <- result$logit_coef #for faster convergence in next iter
         gt <- rbind(gt, data.table(G = g, time = t)) #the sequence matters for the weights
         gt_att <- c(gt_att, att = result$att)
         gt_inf_func[[gt_name]] <- result$inf_func
@@ -86,10 +72,7 @@ estimate_gtatt <- function(auxdata, params) {
     
     gt_inf_func[,placeholder := NULL]
     names(gt_att) <- names(gt_inf_func)
-    outcome_result <- list(att = gt_att, inf_func = gt_inf_func, gt = gt)
-    outcome_result_list[[outcol]] <- outcome_result
-    
-    rm(outcome_result)
+    outcome_result_list[[outcol]] <- list(att = gt_att, inf_func = gt_inf_func, gt = gt, outname = outcol)
 
   }
   
@@ -97,8 +80,32 @@ estimate_gtatt <- function(auxdata, params) {
 }
 
 get_cohort_pos <- function(cohort_sizes, start_cohort, end_cohort = start_cohort){
-  #if(!start_cohort %in% cohort_sizes[,unique(G)]|!end_cohort %in% cohort_sizes[,unique(G)]) {stop("cohort not in cohort_sizes")}
   start <- cohort_sizes[G < start_cohort, sum(cohort_size)]+1
   end <- cohort_sizes[G <= end_cohort, sum(cohort_size)]
   return(start:end)
 }
+
+get_covvars <- function(covariates, varycovariates){
+  if(is.data.table(varycovariates)){
+    
+    precov <- varycovariates[[base_period]]
+    names(precov) <- paste0("pre_", names(precov))
+    postcov <- varycovariates[[t]]-varycovariates[[base_period]]
+    names(postcov) <- paste0("post_", names(varycovariates[[t]]))
+    
+    if(is.data.table(covariates)){
+      covvars <- cbind(covariates, cbind(precov, postcov)) #if covariates is na
+    } else {
+      covvars <- cbind(const = -1, cbind(precov, postcov)) #if covariates is na
+    }
+    covvars <- as.matrix(covvars)
+    
+  } else {
+    if(is.data.table(covariates)){
+      covvars <- as.matrix(covariates) #will be NA if covariatesvar have nothing
+    } else {
+      covvars <- NA
+    }
+  }
+  return(covvars)
+}
diff --git a/R/fastdid.R b/R/fastdid.R
index 68df16e..9989fac 100644
--- a/R/fastdid.R
+++ b/R/fastdid.R
@@ -63,7 +63,7 @@ fastdid <- function(data,
                     copy = TRUE, validate = TRUE
                     ){
   
-  # validation arguments --------------------------------------------------------
+  # validate arguments --------------------------------------------------------
 
   dt_names <- names(dt)
   name_message <- "__ARG__ must be a character scalar and a name of a column from the dataset."
@@ -94,6 +94,9 @@ fastdid <- function(data,
   if(any(covariatesvar %in% varycovariatesvar) & !allNA(varycovariatesvar) & !allNA(covariatesvar)){
     stop("time-varying var and invariant var have overlaps.")
   }
+  if(!boot & !allNA(clustervar)){
+    stop("clustering only available with bootstrap")
+  }
   
   params <- gather(timevar, cohortvar, unitvar, outcomevar, 
                    control_option,result_type, balanced_event_time,
@@ -106,6 +109,7 @@ fastdid <- function(data,
     warning("coercing input into a data.table.")
     data <- as.data.table(data)
   } 
+  
   if(copy){dt <- copy(data)} else {dt <- data}
   setnames(dt, c(timevar, cohortvar, unitvar), c("time", "G", "unit"))
   
@@ -127,22 +131,13 @@ fastdid <- function(data,
   # estimate attgt
   gt_result_list <- estimate_gtatt(auxdata, params)
   
-  # aggregate the result
-  all_result <- data.table()
-  for(outcol in outcomevar){
-    gt_result <- gt_result_list[[outcol]]
-    
-    # aggregate att and inf function and get se
-    result <- aggregate_gt(gt_result, auxdata, params)
-    
+  # aggregate the result by outcome
+  all_result <- rbindlist(lapply(gt_result_list, function(x){
+    result <- aggregate_gt(x, auxdata, params)
     #convert "targets" back to meaningful parameter identifiers like cohort 1 post, time 2 post 
     result <- result |> convert_targets(result_type, coerce_result$time_change)
-    result[, outcome := outcol]
-    all_result <- rbind(all_result, result)
-    
-    rm(result)
-    
-  }
+    return(result)
+  }))
 
   return(all_result)
   
@@ -202,12 +197,9 @@ get_auxdata <- function(dt, params){
   
   release(params)
 
-  
-
   time_periods <- dt[, unique(time)]
   id_size <- dt[, uniqueN(unit)]
   
-  # get auxiliary data ------------------------------
   setorder(dt, time, G, unit) #sort the dataset essential for the sort-once-quick-access 
   
   #construct the outcomes list for fast access later
diff --git a/R/validate_did.R b/R/validate_did.R
index 6eca763..746f6fb 100644
--- a/R/validate_did.R
+++ b/R/validate_did.R
@@ -5,7 +5,6 @@ validate_did <- function(dt,varnames,params){
   raw_unit_size <- dt[, uniqueN(unit)]
   raw_time_size <- dt[, uniqueN(time)]
   
-  
   if(!is.na(balanced_event_time)){
     if(balanced_event_time > dt[, max(time-G)]){stop("balanced_event_time is larger than the max event time in the data")}
   }
diff --git a/inst/tinytest/test_1_fastdid.R b/inst/tinytest/test_1_fastdid.R
index 2bf3b15..c73ee9f 100644
--- a/inst/tinytest/test_1_fastdid.R
+++ b/inst/tinytest/test_1_fastdid.R
@@ -7,6 +7,10 @@ dt <- simdt$dt
 
 # basics ---------------------------------------------
 
+# fastdid(dt, timevar = "time", cohortvar = "G", unitvar = "unit",outcomevar = "y",  result_type = "group_time",
+#         control_type = "ipw",
+#         covariatesvar = c("x", "x2"))
+
 expect_silent(fastdid(dt, timevar = "time", cohortvar = "G", unitvar = "unit",outcomevar = "y",  result_type = "group_time"),
               info = "simple call")
 
diff --git a/man/fastdid.Rd b/man/fastdid.Rd
index 35dd253..a2dfdd3 100644
--- a/man/fastdid.Rd
+++ b/man/fastdid.Rd
@@ -12,14 +12,15 @@ fastdid(
   outcomevar,
   control_option = "both",
   result_type = "group_time",
-  balanced_event_time = NULL,
+  balanced_event_time = NA,
   control_type = "ipw",
   allow_unbalance_panel = FALSE,
   boot = FALSE,
   biters = 1000,
-  weightvar = NULL,
-  clustervar = NULL,
-  covariatesvar = NULL,
+  weightvar = NA,
+  clustervar = NA,
+  covariatesvar = NA,
+  varycovariatesvar = NA,
   copy = TRUE,
   validate = TRUE
 )
@@ -39,7 +40,7 @@ fastdid(
 
 \item{result_type}{A character string indicating the type of result to be returned. Default is "group_time".}
 
-\item{balanced_event_time}{A numeric scalar that indicates the max event time to balance the cohort composition, only meaningful when result_type == "dynamic". Default is NULL}
+\item{balanced_event_time}{A numeric scalar that indicates the max event time to balance the cohort composition, only meaningful when result_type == "dynamic". Default is NA}
 
 \item{control_type}{The method for controlling for covariates. "ipw" for inverse probability weighting, "reg" for outcome regression, or "dr" for doubly-robust}
 
@@ -53,7 +54,9 @@ fastdid(
 
 \item{clustervar}{The name of the cluster variable, can only be used when boot == TRUE (optional).}
 
-\item{covariatesvar}{A character vector containing the names of covariate variables (optional).}
+\item{covariatesvar}{A character vector containing the names of time-invariant covariate variables (optional).}
+
+\item{varycovariatesvar}{A character vector containing the names of time-varying covariate variables (optional).}
 
 \item{copy}{whether to copy the dataset before processing, set to false to speed up the process, but the input data will be altered.}
 
diff --git a/man/sim_did.Rd b/man/sim_did.Rd
index 664caf3..540022a 100644
--- a/man/sim_did.Rd
+++ b/man/sim_did.Rd
@@ -13,6 +13,7 @@ sim_did(
   hetero = "all",
   second_outcome = FALSE,
   second_cov = FALSE,
+  vary_cov = FALSE,
   na = "none",
   balanced = TRUE,
   seed = NA,

From 11825e054bc58fb4d8dfbdae40f389e21b31ac92 Mon Sep 17 00:00:00 2001
From: TsaiLintung <101155567+TsaiLintung@users.noreply.github.com>
Date: Tue, 19 Mar 2024 12:04:26 +0800
Subject: [PATCH 07/15] still organization

---
 R/estimate_did_rc.R | 21 ++++++++++-----------
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/R/estimate_did_rc.R b/R/estimate_did_rc.R
index 351bfc9..57e3680 100644
--- a/R/estimate_did_rc.R
+++ b/R/estimate_did_rc.R
@@ -1,18 +1,17 @@
-estimate_did_rc <- function(dt_did, covnames, control_type,
+estimate_did_rc <- function(dt_did, covvars, control_type,
                          last_coef = NULL, cache_ps_fit, cache_hess){
   
   #TODO: skip if not enough valid data
   
   # preprocess --------
   
-  oldn <- dt_did[, .N]
   
-  #separate the dataset into pre and post
-
+  oldn <- dt_did[, .N]
   data_pos <-  which(dt_did[, !is.na(D)])
   dt_did <- dt_did[data_pos]
   n <- dt_did[, .N]
   
+  #separate the dataset into pre and post
   dt_did[, inpre := as.numeric(!is.na(pre.y))]
   dt_did[, inpost := as.numeric(!is.na(post.y))]
   n_pre <- dt_did[, sum(!is.na(pre.y))]
@@ -21,14 +20,14 @@ estimate_did_rc <- function(dt_did, covnames, control_type,
   sum_weight_pre <- dt_did[, sum(inpre*weights)]
   sum_weight_post <- dt_did[, sum(inpost*weights)]
   
-  if(allNA(covnames)){
-    covvars <- NA
+  if(is.matrix(covvars)){
+    ipw <- control_type %in% c("ipw", "dr") 
+    or <- control_type %in% c("reg", "dr")
+    covvars <- covvars[data_pos,] 
   } else {
-    covvars <- as.matrix(dt_did[,.SD, .SDcols = covnames])
+    ipw <- FALSE
+    or <- FALSE
   }
-  
-  ipw <- control_type %in% c("ipw", "dr") & !allNA(covnames)
-  or <- control_type %in% c("reg", "dr") & !allNA(covnames) #OR is REG
 
   # ipw --------
   
@@ -191,8 +190,8 @@ estimate_did_rc <- function(dt_did, covnames, control_type,
   inf_treat_post <- inf_treat_did_post+inf_treat_or_post
   inf_cont_pre <- inf_cont_did_pre+inf_cont_ipw_pre+inf_cont_or_pre
   inf_treat_pre <- inf_treat_did_pre+inf_treat_or_pre
+  
   #post process
-
   inf_func_no_na_post <- (inf_treat_post - inf_cont_post) * oldn / n_post #adjust the value such that mean over the whole id size give the right result
   inf_func_no_na_post[is.na(inf_func_no_na_post)] <- 0 #fill 0 for NA part (no influce if not in this gt)
   

From 3ac3577ae28f6574c52e28bb7ee6920228bac955 Mon Sep 17 00:00:00 2001
From: TsaiLintung <101155567+TsaiLintung@users.noreply.github.com>
Date: Tue, 19 Mar 2024 15:39:04 +0800
Subject: [PATCH 08/15] abandon release and gather, more readable I hope

---
 R/aggregate_gt.R   | 40 +++++++++-----------
 R/estimate_gtatt.R | 45 +++++++++++-----------
 R/fastdid.R        | 94 ++++++++++++++++++++++++++--------------------
 R/utils.R          | 21 -----------
 R/validate_did.R   | 18 ++++-----
 5 files changed, 100 insertions(+), 118 deletions(-)

diff --git a/R/aggregate_gt.R b/R/aggregate_gt.R
index 3cb7a4a..48b800f 100644
--- a/R/aggregate_gt.R
+++ b/R/aggregate_gt.R
@@ -1,52 +1,46 @@
-aggregate_gt <- function(gt_result, auxdata, params){
-  
-  release(auxdata)
-  release(params)
+aggregate_gt <- function(gt_result, aux, p){
   
+
   #release the stuff
-  gt_att <- gt_result$att
-  gt_inf_func <- gt_result$inf_func
-  gt <- gt_result$gt
-  id_cohorts <- dt_inv[, G]
+  id_cohorts <- aux$dt_inv[, G]
   
-  id_dt <- data.table(weight = weights/sum(weights), G = id_cohorts)
+  id_dt <- data.table(weight = aux$weights/sum(aux$weights), G = id_cohorts)
   pg_dt <- id_dt[, .(pg = sum(weight)), by = "G"]
-  group_time <- gt |> merge(pg_dt, by = "G")
+  group_time <- gt_result$gt |> merge(pg_dt, by = "G")
   
   setorder(group_time, time, G) #change the order to match the order in gtatt
   
-  gt_inf_func <- as.matrix(gt_inf_func)
+  gt_result$inf_func <- as.matrix(gt_result$inf_func)
   
-  if(result_type == "group_time"){
+  if(p$result_type == "group_time"){
     
     #don't need to do anything
     targets <- group_time[, unique(G*max(time)+time)]
-    inf_matrix <- gt_inf_func
-    agg_att <- as.vector(gt_att)
+    inf_matrix <- gt_result$inf_func
+    agg_att <- as.vector(gt_result$att)
     
   } else {
     
     #get which gt(s) is a part of the aggregated param
-    agg_sch <- get_aggregate_scheme(group_time, result_type, weights, id_cohorts, balanced_event_time)
+    agg_sch <- get_aggregate_scheme(group_time, p$result_type, aux$weights, id_cohorts, p$balanced_event_time)
     targets <- agg_sch$targets
-    agg_weights <- as.matrix(agg_sch$agg_weights)
     
     #aggregated att
-    agg_att <- agg_weights %*% gt_att
+    agg_att <- agg_sch$agg_weights %*% gt_result$att
     
     #get the influence from weight estimation
     #this needs to be optimized!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-    inf_weights <- sapply(asplit(agg_weights, 1), function (x){
-      get_weight_influence(x, gt_att, weights, id_cohorts, group_time[, .(G, time)])
+    inf_weights <- sapply(asplit(agg_sch$agg_weights, 1), function (x){
+      get_weight_influence(x, gt_result$att, aux$weights, id_cohorts, group_time[, .(G, time)])
     })
     
     #aggregated influence function
-    inf_matrix <- (gt_inf_func %*% t(agg_weights)) + inf_weights 
-
+    inf_matrix <- (gt_result$inf_func %*% t(agg_sch$agg_weights)) + inf_weights 
+    
   }
   
   #get se from influence function
-  agg_se <- get_se(inf_matrix, boot, biters, cluster, clustervar)
+  agg_se <- get_se(inf_matrix, p$boot, p$biters, aux$cluster, p$clustervar)
   
   # post process
   result <- data.table(targets, agg_att, agg_se)
@@ -106,7 +100,7 @@ get_aggregate_scheme <- function(group_time, result_type, weights, id_cohorts, b
     agg_weights <- rbind(agg_weights, target_weights)
   }
   
-  return(list(agg_weights = agg_weights, #a matrix of each target and gt's weight in it 
+  return(list(agg_weights = as.matrix(agg_weights), #a matrix of each target and gt's weight in it 
               targets = targets)) 
 }
 
diff --git a/R/estimate_gtatt.R b/R/estimate_gtatt.R
index abe05ab..c6fdb23 100644
--- a/R/estimate_gtatt.R
+++ b/R/estimate_gtatt.R
@@ -1,58 +1,55 @@
-estimate_gtatt <- function(auxdata, params) {
+estimate_gtatt <- function(aux, p) {
 
-  release(auxdata)
-  release(params)
-  
-  if(!Inf %in% cohorts & control_option != "notyet"){
+  if(!Inf %in% aux$cohorts & p$control_option != "notyet"){
     warning("no never-treated availble, switching to not-yet-treated control")
-    control_option <- "notyet"
+    p$control_option <- "notyet"
   }
   
-  treated_cohorts <- cohorts[!is.infinite(cohorts)]
-  max_control_cohort <- ifelse(control_option == "notyet", max(treated_cohorts), Inf) 
+  treated_cohorts <- aux$cohorts[!is.infinite(aux$cohorts)]
+  max_control_cohort <- ifelse(p$control_option == "notyet", max(treated_cohorts), Inf) 
   
   cache_ps_fit_list <- list() #for the first outcome won't be able to use cache, empty list returns null for call like cache_ps_fit[["1.3"]]
   cache_hess_list <- list()
   outcome_result_list <- list()
-  for(outcol in outcomevar){
+  for(outcol in p$outcomevar){
     
-    outcomes <- outcomes_list[[outcol]]
+    outcomes <- aux$outcomes_list[[outcol]]
     last_coef <- NULL
     gt <- data.table()
     gt_att <- c()
-    gt_inf_func <- data.table(placeholder = rep(NA, id_size)) #populate with NA
+    gt_inf_func <- data.table(placeholder = rep(NA, aux$id_size)) #populate with NA
     
-    for(t in time_periods){
-      for(g in cohorts){
+    for(t in aux$time_periods){
+      for(g in aux$cohorts){
         
         gt_name <- paste0(g, ".", t)
         
         #setup and checks
         base_period <- g-1
-        min_control_cohort <- ifelse(control_option == "never", Inf, max(t+1, base_period+1)) #not-yet treated / never treated in both base and "treated" period
+        min_control_cohort <- ifelse(p$control_option  == "never", Inf, max(t+1, base_period+1)) #not-yet treated / never treated in both base and "treated" period
         if(t == base_period){next} #no treatment effect for the base period
-        if(base_period < min(time_periods)){next} #no treatment effect for the first period, since base period is not observed
+        if(base_period < min(aux$time_periods)){next} #no treatment effect for the first period, since base period is not observed
         if(g >= max_control_cohort){next} #no treatment effect for never treated or the last treated cohort (for not yet notyet)
         if(t >= max_control_cohort){next} #no control available if the last cohort is treated too
         
         #select the control and treated cohorts
-        did_setup <- rep(NA, id_size)
-        did_setup[get_cohort_pos(cohort_sizes, min_control_cohort, max_control_cohort)] <- 0
-        did_setup[get_cohort_pos(cohort_sizes, g)] <- 1 #treated cannot be controls, assign treated after control to overwrite
+        did_setup <- rep(NA, aux$id_size)
+        did_setup[get_cohort_pos(aux$cohort_sizes, min_control_cohort, max_control_cohort)] <- 0
+        did_setup[get_cohort_pos(aux$cohort_sizes, g)] <- 1 #treated cannot be controls, assign treated after control to overwrite
         
         #construct the covariates matrix
-        covvars <- get_covvars(covariates, varycovariates)
+        covvars <- get_covvars(aux$covariates, aux$varycovariates)
         
         #construct the 2x2 dataset
-        cohort_did <- data.table(did_setup, outcomes[[t]], outcomes[[base_period]], weights)
-        setnames(cohort_did, c("did_setup", "V2", "V3"), c("D", "post.y", "pre.y"))
+        cohort_did <- data.table(did_setup, outcomes[[t]], outcomes[[base_period]], aux$weights)
+        names(cohort_did) <- c("D", "post.y", "pre.y", "weights")
         
         #estimate did
-        if(!allow_unbalance_panel){
-          result <- estimate_did(cohort_did, covvars, control_type, 
+        if(!p$allow_unbalance_panel){
+          result <- estimate_did(cohort_did, covvars, p$control_type, 
                                  last_coef, cache_ps_fit_list[[gt_name]], cache_hess_list[[gt_name]]) #cache
         } else {
-          result <- estimate_did_rc(cohort_did, covvars, control_type, 
+          result <- estimate_did_rc(cohort_did, covvars, p$control_type, 
                                     last_coef, cache_ps_fit_list[[gt_name]], cache_hess_list[[gt_name]]) #cache
         }
        
diff --git a/R/fastdid.R b/R/fastdid.R
index 9989fac..c2fe2e0 100644
--- a/R/fastdid.R
+++ b/R/fastdid.R
@@ -98,10 +98,22 @@ fastdid <- function(data,
     stop("clustering only available with bootstrap")
   }
   
-  params <- gather(timevar, cohortvar, unitvar, outcomevar, 
-                   control_option,result_type, balanced_event_time,
-                   control_type, allow_unbalance_panel , boot, biters,
-                   weightvar,clustervar,covariatesvar,varycovariatesvar)
+  p <- list(timevar = timevar,
+            cohortvar = cohortvar,
+            unitvar = unitvar,
+            outcomevar =  outcomevar,
+            weightvar = weightvar,
+            clustervar = clustervar,
+            covariatesvar = covariatesvar,
+            varycovariatesvar = varycovariatesvar,
+            control_option = control_option,
+            result_type = result_type,
+            balanced_event_time = balanced_event_time,
+            control_type = control_type,
+            allow_unbalance_panel = allow_unbalance_panel,
+            boot = boot, 
+            biters = biters)
+  
   
   # validate data -----------------------------------------------------
   
@@ -114,26 +126,26 @@ fastdid <- function(data,
   setnames(dt, c(timevar, cohortvar, unitvar), c("time", "G", "unit"))
   
   if(validate){
-    varnames <- c("time", "G", "unit", outcomevar,weightvar,clustervar,covariatesvar, varycovariatesvar)
-    dt <- validate_did(dt, varnames, params)
+    varnames <- c("time", "G", "unit", outcomevar,weightvar,clustervar,covariatesvar,varycovariatesvar)
+    dt <- validate_did(dt, varnames, p)
   }
   
   # preprocess -----------------------------------------------------------
   
   #make dt conform to the WLOG assumptions of fastdid
-  coerce_result <- coerce_dt(dt, params) #also changed dt
+  coerce_result <- coerce_dt(dt, p) #also changed dt
   dt <- coerce_result$dt
   # get auxiliary data
-  auxdata <- get_auxdata(dt, params)
+  aux <- get_auxdata(dt, p)
   
   # main part  -------------------------------------------------
 
   # estimate attgt
-  gt_result_list <- estimate_gtatt(auxdata, params)
+  gt_result_list <- estimate_gtatt(aux, p)
   
   # aggregate the result by outcome
   all_result <- rbindlist(lapply(gt_result_list, function(x){
-    result <- aggregate_gt(x, auxdata, params)
+    result <- aggregate_gt(x, aux, p)
     #convert "targets" back to meaningful parameter identifiers like cohort 1 post, time 2 post 
     result <- result |> convert_targets(result_type, coerce_result$time_change)
     return(result)
@@ -145,9 +157,7 @@ fastdid <- function(data,
 
 # small steps ----------------------------------------------------------------------
 
-coerce_dt <- function(dt, params){
-  
-  release(params)
+coerce_dt <- function(dt, p){
   
   #change to int before sorting
   if(!is.numeric(dt[, G])){
@@ -157,7 +167,7 @@ coerce_dt <- function(dt, params){
     dt[, time := as.numeric(time)] 
   }
   
-  if(allow_unbalance_panel){
+  if(p$allow_unbalance_panel){
     dt_inv_raw <- dt[dt[, .I[1], by = unit]$V1]
     setorder(dt_inv_raw, G)
     dt_inv_raw[, new_unit := 1:.N] #let unit start from 1 .... N, useful for knowing which unit is missing
@@ -193,9 +203,7 @@ coerce_dt <- function(dt, params){
   
 }
 
-get_auxdata <- function(dt, params){
-  
-  release(params)
+get_auxdata <- function(dt, p){
 
   time_periods <- dt[, unique(time)]
   id_size <- dt[, uniqueN(unit)]
@@ -205,10 +213,10 @@ get_auxdata <- function(dt, params){
   #construct the outcomes list for fast access later
   #loop for multiple outcome
   outcomes_list <- list()
-  for(outcol in outcomevar){
+  for(outcol in p$outcomevar){
     outcomes <- list()
     
-    if(!allow_unbalance_panel){
+    if(!p$allow_unbalance_panel){
       for(i in time_periods){
         start <- (i-1)*id_size+1
         end <- i*id_size
@@ -231,7 +239,7 @@ get_auxdata <- function(dt, params){
   }
   
   #the time-invariant parts 
-  if(!allow_unbalance_panel){
+  if(!p$allow_unbalance_panel){
     dt_inv <- dt[1:id_size]
   } else {
     dt_inv <- dt[dt[, .I[1], by = unit]$V1] #the first observation
@@ -243,44 +251,50 @@ get_auxdata <- function(dt, params){
   
   # the optional columns
   varycovariates <- list()
-  if(!allNA(varycovariatesvar)){
+  if(!allNA(p$varycovariatesvar)){
     for(i in time_periods){
       start <- (i-1)*id_size+1
       end <- i*id_size
-      varycovariates[[i]] <- dt[seq(start,end), .SD, .SDcols = varycovariatesvar]
+      varycovariates[[i]] <- dt[seq(start,end), .SD, .SDcols = p$varycovariatesvar]
     }
   } else {
     varycovariates <- NA
   }
   
-  if(!allNA(covariatesvar)){
-    covariates <- cbind(const = -1, dt_inv[,.SD, .SDcols = covariatesvar])
+  if(!allNA(p$covariatesvar)){
+    covariates <- cbind(const = -1, dt_inv[,.SD, .SDcols = p$covariatesvar])
   } else {
     covariates <- NA
   }
   
-  if(!is.na(clustervar)){
-    cluster <- dt_inv[, .SD, .SDcols = clustervar] |> unlist()
+  if(!is.na(p$clustervar)){
+    cluster <- dt_inv[, .SD, .SDcols = p$clustervar] |> unlist()
   } else {
     cluster <- NA
   }
   
-  if(!is.na(weightvar)){
-    weights <- dt_inv[, .SD, .SDcols = weightvar] |> unlist()
+  if(!is.na(p$weightvar)){
+    weights <- dt_inv[, .SD, .SDcols = p$weightvar] |> unlist()
   } else {
     weights <- rep(1, id_size)
   }
   
-  auxdata <- gather(time_periods, id_size, outcomes_list, dt_inv, cohorts, cohort_sizes, 
-                    varycovariates, covariates, cluster, weights)
+  aux <- list(time_periods = time_periods,
+              id_size = id_size,
+              outcomes_list = outcomes_list,
+              dt_inv= dt_inv,
+              cohorts = cohorts,
+              cohort_sizes = cohort_sizes, 
+              varycovariates = varycovariates,
+              covariates = covariates,
+              cluster = cluster,
+              weights = weights)
   
-  return(auxdata)
+  return(aux)
   
 }
 
-convert_targets <- function(results, result_type, time_change){
-  
-  release(time_change) #from coerce_dt
+convert_targets <- function(results, result_type, t){
   
   if(result_type == "dynamic"){
     setnames(results, "target", "event_time")
@@ -288,23 +302,23 @@ convert_targets <- function(results, result_type, time_change){
   } else if (result_type == "cohort"){
     
     results[, type := ifelse(target >= 0, "post", "pre")]
-    results[, target := recover_time(abs(target), time_offset, time_step)]
+    results[, target := recover_time(abs(target), t$time_offset, t$time_step)]
     setnames(results, "target", "cohort")
     
   } else if (result_type == "calendar"){
     
     results[, type := ifelse(target >= 0, "post", "pre")]
-    results[, target := recover_time(abs(target), time_offset, time_step)]
+    results[, target := recover_time(abs(target), t$time_offset, t$time_step)]
     setnames(results, "target", "time")
     
   } else if (result_type == "group_time"){
     
-    results[, cohort := floor((target-1)/max_time)]
-    results[, time := (target-cohort*max_time)]
+    results[, cohort := floor((target-1)/t$max_time)]
+    results[, time := (target-cohort*t$max_time)]
     
     #recover the time
-    results[, cohort := recover_time(cohort, time_offset, time_step)]
-    results[, time := recover_time(time, time_offset, time_step)]
+    results[, cohort := recover_time(cohort, t$time_offset, t$time_step)]
+    results[, time := recover_time(time, t$time_offset, t$time_step)]
     
     results[, target := NULL]
     
diff --git a/R/utils.R b/R/utils.R
index 03478a8..2a855a5 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -6,24 +6,3 @@ set_max_thread <- function(){
 reverse_col <- function(x){
   return(x[,ncol(x):1])
 }
-
-
-release <- function(p){
-  for(name in names(p)){
-    assign(name, p[[name]], envir = parent.frame()) #assign it to the envir where the func is called
-  }
-}
-
-gather <- function(...){
-  names <- substitute(list(...)) |> deparse() |>
-    str_remove_all("\\)|list\\(") |> str_flatten() |> str_split_1(",") |> str_trim()
-  values <- list(...)
-  p <- list()
-  
-  for(i in 1:length(names)){
-    #p[[names[[i]]]] <- values[[i]]
-    p <- c(p, list(values[[i]])) #for nulls
-  }
-  names(p) <- names
-  return(p)
-}
diff --git a/R/validate_did.R b/R/validate_did.R
index 746f6fb..839f751 100644
--- a/R/validate_did.R
+++ b/R/validate_did.R
@@ -1,12 +1,10 @@
-validate_did <- function(dt,varnames,params){
-  
-  release(params)
-  
+validate_did <- function(dt,varnames,p){
+
   raw_unit_size <- dt[, uniqueN(unit)]
   raw_time_size <- dt[, uniqueN(time)]
   
-  if(!is.na(balanced_event_time)){
-    if(balanced_event_time > dt[, max(time-G)]){stop("balanced_event_time is larger than the max event time in the data")}
+  if(!is.na(p$balanced_event_time)){
+    if(p$balanced_event_time > dt[, max(time-G)]){stop("balanced_event_time is larger than the max event time in the data")}
   }
   
   #doesn't allow missing value for now
@@ -20,13 +18,13 @@ validate_did <- function(dt,varnames,params){
   }
   
   
-  if(!allNA(covariatesvar) && uniqueN(dt, by = c("unit", covariatesvar)) > raw_unit_size){
+  if(!allNA(p$covariatesvar) && uniqueN(dt, by = c("unit", p$covariatesvar)) > raw_unit_size){
     warning("some covariates is time-varying, fastdid only use the first observation for covariates.")
   }
   
   
-  if(!allNA(covariatesvar)|!allNA(varycovariatesvar)){
-    for(cov in c(covariatesvar, varycovariatesvar)){
+  if(!allNA(p$covariatesvar)|!allNA(p$varycovariatesvar)){
+    for(cov in c(p$covariatesvar, p$varycovariatesvar)){
       if(is.na(cov)){next}
       #check covaraites is not constant  
       if(fnunique(dt[, get(cov)[1], by = "unit"][, V1]) == 1)stop(cov, " have no variation")
@@ -41,7 +39,7 @@ validate_did <- function(dt,varnames,params){
   }
   
   #check if any is missing
-  if(!allow_unbalance_panel){
+  if(!p$allow_unbalance_panel){
     unit_count <- dt[, .(count = .N), by = unit]
     if(any(unit_count[, count < raw_time_size])){
       mis_unit <- unit_count[count < raw_time_size]

From 1f91ae87a1c3cd52ba64228ca84fd9497f17144f Mon Sep 17 00:00:00 2001
From: TsaiLintung <101155567+TsaiLintung@users.noreply.github.com>
Date: Wed, 20 Mar 2024 15:00:09 +0800
Subject: [PATCH 09/15] hot fix for wrong varycovarites condition and some
 warning

---
 R/estimate_did.R         | 13 +++++++++----
 R/estimate_gtatt.R       |  7 ++++---
 R/fastdid.R              |  2 +-
 R/sim_did.R              |  2 +-
 development/playground.R | 28 ++++++++++++++++++++++++++++
 5 files changed, 43 insertions(+), 9 deletions(-)
 create mode 100644 development/playground.R

diff --git a/R/estimate_did.R b/R/estimate_did.R
index 597a328..e768b2c 100644
--- a/R/estimate_did.R
+++ b/R/estimate_did.R
@@ -1,8 +1,7 @@
 estimate_did <- function(dt_did, covvars, control_type,
                          last_coef = NULL, cache_ps_fit, cache_hess){
-
+  
   # preprocess --------
-
   oldn <- dt_did[, .N]
   data_pos <-  which(dt_did[, !is.na(D)])
   dt_did <- dt_did[data_pos]
@@ -30,14 +29,20 @@ estimate_did <- function(dt_did, covvars, control_type,
                                                     intercept = FALSE))
       class(prop_score_est) <- "glm" #trick the vcov function to think that this is a glm object to dispatch the write method
       #const is implicitly put into the ipw formula, need to incorporate it manually
-      hess <- stats::vcov(prop_score_est) * n #for the influence function
 
       logit_coef <-  prop_score_est$coefficients
+      
+      if(anyNA(logit_coef)){
+        warning("some propensity score estimation resulted in NA coefficients, likely cause by perfect colinearity")
+      }
+      
       logit_coef[is.na(logit_coef)|abs(logit_coef) > 1e10] <- 0 #put extreme value and na to 0
       prop_score_fit <- fitted(prop_score_est)
       if(max(prop_score_fit) >= 1){warning(paste0("support overlap condition violated for some group_time"))}
       prop_score_fit <- pmin(1-1e-16, prop_score_fit) #for the ipw
 
+      hess <- stats::vcov(prop_score_est) * n #for the influence function
+      hess[is.na(hess)|abs(hess) > 1e10] <- 0
 
     } else { #when using multiple outcome, ipw cache can be reused
       hess <- cache_hess
@@ -142,6 +147,7 @@ estimate_did <- function(dt_did, covvars, control_type,
 
 
   #get overall influence function
+  #if(dt_did[, mean(cont_ipw_weight)] < 1e-10){warning("little/no overlap in covariates between control and treat group, estimates are unstable.")}
   inf_cont <- (inf_cont_did+inf_cont_ipw+inf_cont_or)/dt_did[, mean(cont_ipw_weight)]
   inf_treat <- (inf_treat_did+inf_treat_or)/dt_did[,mean(treat_ipw_weight)]
   inf_func_no_na <- inf_treat - inf_cont
@@ -151,7 +157,6 @@ estimate_did <- function(dt_did, covvars, control_type,
   inf_func_no_na <- inf_func_no_na * oldn / n #adjust the value such that mean over the whole id size give the right result
   inf_func[data_pos] <- inf_func_no_na
 
-  
   return(list(att = att, inf_func = inf_func, logit_coef = logit_coef, #for next gt
               cache_ps_fit = prop_score_fit, cache_hess = hess)) #for next outcome
 }
diff --git a/R/estimate_gtatt.R b/R/estimate_gtatt.R
index c6fdb23..80ecea3 100644
--- a/R/estimate_gtatt.R
+++ b/R/estimate_gtatt.R
@@ -38,7 +38,7 @@ estimate_gtatt <- function(aux, p) {
         did_setup[get_cohort_pos(aux$cohort_sizes, g)] <- 1 #treated cannot be controls, assign treated after control to overwrite
         
         #construct the covariates matrix
-        covvars <- get_covvars(aux$covariates, aux$varycovariates)
+        covvars <- get_covvars(aux$covariates, aux$varycovariates, base_period, t)
         
         #construct the 2x2 dataset
         cohort_did <- data.table(did_setup, outcomes[[t]], outcomes[[base_period]], aux$weights)
@@ -82,8 +82,9 @@ get_cohort_pos <- function(cohort_sizes, start_cohort, end_cohort = start_cohort
   return(start:end)
 }
 
-get_covvars <- function(covariates, varycovariates){
-  if(is.data.table(varycovariates)){
+get_covvars <- function(covariates, varycovariates, base_period, t){
+  
+  if(is.list(varycovariates)){
     
     precov <- varycovariates[[base_period]]
     names(precov) <- paste0("pre_", names(precov))
diff --git a/R/fastdid.R b/R/fastdid.R
index c2fe2e0..67b6513 100644
--- a/R/fastdid.R
+++ b/R/fastdid.R
@@ -59,7 +59,7 @@ fastdid <- function(data,
                     timevar, cohortvar, unitvar, outcomevar, 
                     control_option="both",result_type="group_time", balanced_event_time = NA,
                     control_type = "ipw", allow_unbalance_panel = FALSE, boot=FALSE, biters = 1000,
-                    weightvar=NA,clustervar=NA,covariatesvar = NA,varycovariatesvar = NA,
+                    weightvar=NA,clustervar=NA,covariatesvar = NA, varycovariatesvar = NA,
                     copy = TRUE, validate = TRUE
                     ){
   
diff --git a/R/sim_did.R b/R/sim_did.R
index 7ca136e..d8ba2e5 100644
--- a/R/sim_did.R
+++ b/R/sim_did.R
@@ -93,7 +93,7 @@ sim_did <- function(sample_size, time_period, untreated_prop = 0.3, epsilon_size
   
   #add time_varying covariates
   if(vary_cov){
-    dt[, xvar := pmin(G, time_period)*time+rnorm(sample_size*time_period, 0,1)] #should be confounding....?
+    dt[, xvar := pmin(G, time_period+4)*time+rnorm(sample_size*time_period, 0,30)] #should be confounding....?
   } else {
     dt[, xvar := 1]
   }
diff --git a/development/playground.R b/development/playground.R
new file mode 100644
index 0000000..88ebfb5
--- /dev/null
+++ b/development/playground.R
@@ -0,0 +1,28 @@
+setwd("~/GitHub/fastdid")
+
+library(devtools)
+library(tinytest)
+library(roxygen2)
+
+load_all()
+
+
+tol <- 1e-2 #allow 1% different between estimates
+simdt <- sim_did(1e+02, 4, cov = "cont", hetero = "all", balanced = TRUE, second_outcome = TRUE, seed = 1, 
+                 stratify = FALSE, second_cov = TRUE, vary_cov = TRUE)
+dt <- simdt$dt
+
+
+fastdid(dt, timevar = "time", cohortvar = "G", unitvar = "unit",outcomevar = "y",  result_type = "group_time")
+
+fastdid(dt, timevar = "time", cohortvar = "G", unitvar = "unit",outcomevar = "y",  result_type = "group_time",
+                  control_type = "ipw",
+                  varycovariatesvar = c("xvar"))
+
+fastdid(dt, timevar = "time", cohortvar = "G", unitvar = "unit",outcomevar = "y",  result_type = "group_time",
+        control_type = "ipw",
+        varycovariatesvar = c("x"))
+
+fastdid(dt, timevar = "time", cohortvar = "G", unitvar = "unit",outcomevar = "y",  result_type = "group_time",
+        control_type = "ipw",
+        covariatesvar = c("x"))
\ No newline at end of file

From f5c13a3d67b7fa425ec065cfe3821ce25b5a472a Mon Sep 17 00:00:00 2001
From: TsaiLintung <101155567+TsaiLintung@users.noreply.github.com>
Date: Tue, 2 Apr 2024 12:29:59 +0800
Subject: [PATCH 10/15] fix bug with dt and data

---
 R/fastdid.R | 20 ++++++++++----------
 README.md   | 12 ++++++------
 2 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/R/fastdid.R b/R/fastdid.R
index 67b6513..bb3e4c7 100644
--- a/R/fastdid.R
+++ b/R/fastdid.R
@@ -32,24 +32,24 @@
 #' dt <- simdt$dt
 #' 
 #' #basic call
-#' result <- fastdid(dt, timevar = "time", cohortvar = "G", 
+#' result <- fastdid(data = dt, timevar = "time", cohortvar = "G", 
 #'                   unitvar = "unit", outcomevar = "y",  
 #'                   result_type = "group_time")
 #' 
 #' #control for covariates
-#' result2 <- fastdid(dt, timevar = "time", cohortvar = "G", 
+#' result2 <- fastdid(data = dt, timevar = "time", cohortvar = "G", 
 #'                    unitvar = "unit", outcomevar = "y",  
 #'                    result_type = "group_time",
 #'                    covariatesvar = c("x", "x2"))
 #'                   
 #' #bootstrap and clustering
-#' result3 <- fastdid(dt, timevar = "time", cohortvar = "G", 
+#' result3 <- fastdid(data = dt, timevar = "time", cohortvar = "G", 
 #'                    unitvar = "unit", outcomevar = "y",  
 #'                    result_type = "group_time",
 #'                    boot = TRUE, clustervar = "x")
 #'
 #' #estimate for multiple outcomes
-#' result4 <- fastdid(dt, #the dataset
+#' result4 <- fastdid(data = dt, #the dataset
 #'                    timevar = "time", cohortvar = "G", unitvar = "unit", 
 #'                    outcomevar = c("y", "y2"), #name of the outcome columns
 #'                    result_type = "group_time") 
@@ -65,6 +65,12 @@ fastdid <- function(data,
   
   # validate arguments --------------------------------------------------------
 
+  if(!is.data.table(data)){
+    warning("coercing input into a data.table.")
+    data <- as.data.table(data)
+  } 
+  if(copy){dt <- copy(data)} else {dt <- data}
+  
   dt_names <- names(dt)
   name_message <- "__ARG__ must be a character scalar and a name of a column from the dataset."
   check_set_arg(timevar, unitvar, cohortvar, "match", .choices = dt_names, .message = name_message)
@@ -117,12 +123,6 @@ fastdid <- function(data,
   
   # validate data -----------------------------------------------------
   
-  if(!is.data.table(data)){
-    warning("coercing input into a data.table.")
-    data <- as.data.table(data)
-  } 
-  
-  if(copy){dt <- copy(data)} else {dt <- data}
   setnames(dt, c(timevar, cohortvar, unitvar), c("time", "G", "unit"))
   
   if(validate){
diff --git a/README.md b/README.md
index e2e055e..3c8fd85 100644
--- a/README.md
+++ b/README.md
@@ -19,7 +19,7 @@ devtools::install_github("TsaiLintung/fastdid")
 
 # Usage
 
-`fastdid` is the main function provided by **fastdid**. When using `fastdid`, you need to provide the dataset (`dt`), specify the names of the relevant columns (`-var`), and the type of target (aggregated) parameters (`result_type` such as `"group_time"`, `"time"`, `"dynamic"`, or `"simple"`.) Here is a simple call. 
+`fastdid` is the main function provided by **fastdid**. When using `fastdid`, you need to provide the dataset (`data`), specify the names of the relevant columns (`-var`), and the type of target (aggregated) parameters (`result_type` such as `"group_time"`, `"time"`, `"dynamic"`, or `"simple"`.) Here is a simple call. 
 
 ```
 #loading the package
@@ -30,7 +30,7 @@ simdt <- sim_did(1e+03, 10, cov = "cont", second_cov = TRUE, second_outcome = TR
 dt <- simdt$dt
 
 #calling fastdid
-result <- fastdid(dt, #the dataset
+result <- fastdid(data = dt, #the dataset
                   timevar = "time", cohortvar = "G", unitvar = "unit", outcomevar = "y", #name of the columns
                   result_type = "group_time") #the result type
 ```
@@ -38,7 +38,7 @@ result <- fastdid(dt, #the dataset
 You can control for covariates by providing the name of the data columns, and choose the method among doubly-robust (`"dr"`), inverse probability weight (`"ipw"`), and outcome regression (`"or"`). 
 
 ```
-result <- fastdid(dt, 
+result <- fastdid(data = dt, 
                   timevar = "time", cohortvar = "G", unitvar = "unit", outcomevar = "y",
                   result_type = "group_time",
                   control_option = "dr", #choose the control method
@@ -48,7 +48,7 @@ result <- fastdid(dt,
 While the default is to coerce the data into a balanced panel, you can allow for unbalanced panel. Note that currently only "ipw" is available when `allow_unbalance_panel = TRUE`.
 
 ```
-result <- fastdid(dt, 
+result <- fastdid(data = dt, 
                   timevar = "time", cohortvar = "G", unitvar = "unit", outcomevar = "y",
                   result_type = "group_time",
                   allow_unbalance_panel = TRUE, #allow for unbalanced panel
@@ -59,7 +59,7 @@ result <- fastdid(dt,
 Clustered standard error can be obtained from multiplier bootstrap. 
 
 ```
-result <- fastdid(dt,
+result <- fastdid(data = dt,
                   timevar = "time", cohortvar = "G", unitvar = "unit", outcomevar = "y",
                   result_type = "group_time",
                   clustervar = "x", boot = TRUE) #add clustering by using bootstrap
@@ -69,7 +69,7 @@ Estimation for multiple outcomes can be done in one call by providing a vector o
 
 ```
 #calling fastdid
-result <- fastdid(dt, #the dataset
+result <- fastdid(data = dt, #the dataset
                   timevar = "time", cohortvar = "G", unitvar = "unit", outcomevar = c("y", "y2"), #name of the columns
                   result_type = "group_time") #the result type
 ```

From ece5a371d94ce53639f547b327b9d3788da25ff2 Mon Sep 17 00:00:00 2001
From: TsaiLintung <101155567+TsaiLintung@users.noreply.github.com>
Date: Tue, 2 Apr 2024 17:17:47 +0800
Subject: [PATCH 11/15] add max control cohort diff

---
 DESCRIPTION                    |  2 +-
 R/estimate_gtatt.R             | 13 ++++++++++++-
 R/fastdid.R                    |  6 ++++--
 inst/tinytest/test_1_fastdid.R |  4 ++++
 man/fastdid.Rd                 | 11 ++++++-----
 5 files changed, 27 insertions(+), 9 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index c9ba888..c9865a5 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -25,4 +25,4 @@ Imports:
     parglm
 Suggests: ggplot2, did
 Encoding: UTF-8
-RoxygenNote: 7.2.3
+RoxygenNote: 7.3.1
diff --git a/R/estimate_gtatt.R b/R/estimate_gtatt.R
index 80ecea3..eb165d5 100644
--- a/R/estimate_gtatt.R
+++ b/R/estimate_gtatt.R
@@ -5,8 +5,12 @@ estimate_gtatt <- function(aux, p) {
     p$control_option <- "notyet"
   }
   
+  if(!is.na(p$max_control_cohort_diff)){
+    warning("max_control_cohort_diff can only be used with not yet")
+    p$control_option <- "notyet"
+  }
+  
   treated_cohorts <- aux$cohorts[!is.infinite(aux$cohorts)]
-  max_control_cohort <- ifelse(p$control_option == "notyet", max(treated_cohorts), Inf) 
   
   cache_ps_fit_list <- list() #for the first outcome won't be able to use cache, empty list returns null for call like cache_ps_fit[["1.3"]]
   cache_hess_list <- list()
@@ -27,6 +31,13 @@ estimate_gtatt <- function(aux, p) {
         #setup and checks
         base_period <- g-1
         min_control_cohort <- ifelse(p$control_option  == "never", Inf, max(t+1, base_period+1)) #not-yet treated / never treated in both base and "treated" period
+        
+        if(!is.na(p$max_control_cohort_diff)){
+          max_control_cohort <- min(g+p$max_control_cohort_diff, max(treated_cohorts))
+        } else {
+          max_control_cohort <- ifelse(p$control_option == "notyet", max(treated_cohorts), Inf) 
+        }
+        
         if(t == base_period){next} #no treatment effect for the base period
         if(base_period < min(aux$time_periods)){next} #no treatment effect for the first period, since base period is not observed
         if(g >= max_control_cohort){next} #no treatment effect for never treated or the last treated cohort (for not yet notyet)
diff --git a/R/fastdid.R b/R/fastdid.R
index bb3e4c7..04b9d84 100644
--- a/R/fastdid.R
+++ b/R/fastdid.R
@@ -60,7 +60,8 @@ fastdid <- function(data,
                     control_option="both",result_type="group_time", balanced_event_time = NA,
                     control_type = "ipw", allow_unbalance_panel = FALSE, boot=FALSE, biters = 1000,
                     weightvar=NA,clustervar=NA,covariatesvar = NA, varycovariatesvar = NA,
-                    copy = TRUE, validate = TRUE
+                    copy = TRUE, validate = TRUE,
+                    max_control_cohort_diff = NA
                     ){
   
   # validate arguments --------------------------------------------------------
@@ -118,7 +119,8 @@ fastdid <- function(data,
             control_type = control_type,
             allow_unbalance_panel = allow_unbalance_panel,
             boot = boot, 
-            biters = biters)
+            biters = biters,
+            max_control_cohort_diff = max_control_cohort_diff)
   
   
   # validate data -----------------------------------------------------
diff --git a/inst/tinytest/test_1_fastdid.R b/inst/tinytest/test_1_fastdid.R
index c73ee9f..7acb010 100644
--- a/inst/tinytest/test_1_fastdid.R
+++ b/inst/tinytest/test_1_fastdid.R
@@ -61,6 +61,10 @@ expect_silent(fastdid(dt, timevar = "time", cohortvar = "G", unitvar = "unit",ou
 expect_silent(fastdid(dt, timevar = "time", cohortvar = "G", unitvar = "unit",outcomevar = "y",  result_type = "group_time", allow_unbalance_panel = TRUE),
               info = "balance panel false but dt is balance")
 
+expect_silent(fastdid(dt, timevar = "time", cohortvar = "G", unitvar = "unit",outcomevar = "y",  result_type = "group_time", allow_unbalance_panel = TRUE,
+                      max_control_cohort_diff = 2),
+              info = "balance panel false but dt is balance")
+
 # dt that needs adjustment ---------------------------
 
 base_result <- fastdid(dt, timevar = "time", cohortvar = "G", unitvar = "unit",outcomevar = "y",  result_type = "group_time")
diff --git a/man/fastdid.Rd b/man/fastdid.Rd
index a2dfdd3..0ea6537 100644
--- a/man/fastdid.Rd
+++ b/man/fastdid.Rd
@@ -22,7 +22,8 @@ fastdid(
   covariatesvar = NA,
   varycovariatesvar = NA,
   copy = TRUE,
-  validate = TRUE
+  validate = TRUE,
+  max_control_cohort_diff = NA
 )
 }
 \arguments{
@@ -74,24 +75,24 @@ simdt <- sim_did(1e+03, 10, cov = "cont", second_cov = TRUE, second_outcome = TR
 dt <- simdt$dt
 
 #basic call
-result <- fastdid(dt, timevar = "time", cohortvar = "G", 
+result <- fastdid(data = dt, timevar = "time", cohortvar = "G", 
                   unitvar = "unit", outcomevar = "y",  
                   result_type = "group_time")
 
 #control for covariates
-result2 <- fastdid(dt, timevar = "time", cohortvar = "G", 
+result2 <- fastdid(data = dt, timevar = "time", cohortvar = "G", 
                    unitvar = "unit", outcomevar = "y",  
                    result_type = "group_time",
                    covariatesvar = c("x", "x2"))
                   
 #bootstrap and clustering
-result3 <- fastdid(dt, timevar = "time", cohortvar = "G", 
+result3 <- fastdid(data = dt, timevar = "time", cohortvar = "G", 
                    unitvar = "unit", outcomevar = "y",  
                    result_type = "group_time",
                    boot = TRUE, clustervar = "x")
 
 #estimate for multiple outcomes
-result4 <- fastdid(dt, #the dataset
+result4 <- fastdid(data = dt, #the dataset
                    timevar = "time", cohortvar = "G", unitvar = "unit", 
                    outcomevar = c("y", "y2"), #name of the outcome columns
                    result_type = "group_time") 

From 19d2e78865565e0bf2d464995cefc2a0c7703716 Mon Sep 17 00:00:00 2001
From: TsaiLintung <101155567+TsaiLintung@users.noreply.github.com>
Date: Thu, 11 Apr 2024 17:13:25 +0800
Subject: [PATCH 12/15] change source

---
 development/build_source.R      |   5 +-
 development/fastdid_sourcever.R | 579 +++++++++++++++++++-------------
 2 files changed, 345 insertions(+), 239 deletions(-)

diff --git a/development/build_source.R b/development/build_source.R
index cde2d46..24d1b5e 100644
--- a/development/build_source.R
+++ b/development/build_source.R
@@ -1,16 +1,17 @@
 rm(list = ls())
 gc()
 
-setwd("~/GitHub/EventStudyCode")
+setwd("~/GitHub/fastdid")
 
 source_files <- list.files("R", include.dirs = FALSE, full.names = TRUE)
 
 ver <- "0.9.2"
 vername <- "unbalanced"
 
-sink("interactive/fastdid_sourcever.R")
+sink("development/fastdid_sourcever.R")
 cat(paste0("#", as.character(Sys.Date()), "\n"))
 cat(paste0("message('loading fastdid source ver. ver: ", ver, " (", vername ,"), date: " , as.character(Sys.Date()), "')\n"))
+cat("require(data.table);require(stringr);require(BMisc);require(collapse);require(dreamerr);require(parglm)\n")
 for(file in source_files){
   current_file = readLines(file)
   cat(current_file, sep ="\n")
diff --git a/development/fastdid_sourcever.R b/development/fastdid_sourcever.R
index 78b3969..f9f4ae4 100644
--- a/development/fastdid_sourcever.R
+++ b/development/fastdid_sourcever.R
@@ -1,54 +1,63 @@
-#2023-12-21
-message('loading fastdid source ver. ver: 0.9.2 (unbalanced), date: 2023-12-21')
-aggregate_gt <- function(gt_result, cohort_sizes, 
-                         id_weights, id_cohorts,
-                         result_type, balanced_event_time){
+#2024-04-09
+message('loading fastdid source ver. ver: 0.9.2 (unbalanced), date: 2024-04-09')
+require(data.table);require(stringr);require(BMisc);require(collapse);require(dreamerr);require(parglm)
+aggregate_gt <- function(gt_result, aux, p){
   
-  gt_att <- gt_result$att
-  gt_inf_func <- gt_result$inf_func
-  gt <- gt_result$gt
+
+  #release the stuff
+  id_cohorts <- aux$dt_inv[, G]
   
-  id_dt <- data.table(weight = id_weights/sum(id_weights), G = id_cohorts)
+  id_dt <- data.table(weight = aux$weights/sum(aux$weights), G = id_cohorts)
   pg_dt <- id_dt[, .(pg = sum(weight)), by = "G"]
-  group_time <- gt |> merge(pg_dt, by = "G")
+  group_time <- gt_result$gt |> merge(pg_dt, by = "G")
   
   setorder(group_time, time, G) #change the order to match the order in gtatt
   
-  gt_inf_func <- as.matrix(gt_inf_func)
+  gt_result$inf_func <- as.matrix(gt_result$inf_func)
   
-  if(result_type == "group_time"){
+  if(p$result_type == "group_time"){
     
+    #don't need to do anything
     targets <- group_time[, unique(G*max(time)+time)]
-    inf_matrix <- gt_inf_func
-    agg_att <- as.vector(gt_att)
+    inf_matrix <- gt_result$inf_func
+    agg_att <- as.vector(gt_result$att)
     
   } else {
     
-    agg_sch <- get_aggregate_scheme(group_time, result_type, id_weights, id_cohorts, balanced_event_time)
+    #get which gt(s) is a part of the aggregated param
+    agg_sch <- get_aggregate_scheme(group_time, p$result_type, aux$weights, id_cohorts, p$balanced_event_time)
     targets <- agg_sch$targets
-    weights <- as.matrix(agg_sch$weights)
     
     #aggregated att
-    agg_att <- weights %*% gt_att
+    agg_att <- agg_sch$agg_weights %*% gt_result$att
     
     #get the influence from weight estimation
     #this needs to be optimized!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-    inf_weights <- sapply(asplit(weights, 1), function (x){
-      get_weight_influence(x, gt_att, id_weights, id_cohorts, group_time[, .(G, time)])
+    inf_weights <- sapply(asplit(agg_sch$agg_weights, 1), function (x){
+      get_weight_influence(x, gt_result$att, aux$weights, id_cohorts, group_time[, .(G, time)])
     })
     
     #aggregated influence function
-    inf_matrix <- (gt_inf_func %*% t(weights)) + inf_weights 
-
+    inf_matrix <- (gt_result$inf_func %*% t(agg_sch$agg_weights)) + inf_weights 
+    
   }
-  return(list(inf_matrix = inf_matrix, agg_att = agg_att, targets = targets))
+  
+  #get se from influence function
+  agg_se <- get_se(inf_matrix, p$boot, p$biters, aux$cluster, p$clustervar)
+  
+  # post process
+  result <- data.table(targets, agg_att, agg_se)
+  names(result) <- c("target", "att", "se")
+  result[,outcome := gt_result$outname]
+  
+  return(result)
 }
 
-get_aggregate_scheme <- function(group_time, result_type, id_weights, id_cohorts, balanced_event_time){
+get_aggregate_scheme <- function(group_time, result_type, weights, id_cohorts, balanced_event_time){
   
   #browser()
   
-  weights <- data.table()
+  agg_weights <- data.table()
   gt_count <- group_time[, .N]
   
   bool_to_pn <- function(x){ifelse(x, 1, -1)}
@@ -69,7 +78,7 @@ get_aggregate_scheme <- function(group_time, result_type, id_weights, id_cohorts
   
   #for balanced cohort composition in dynamic setting
   #a cohort us only used if it is seen for all dynamic time
-  if(result_type == "dynamic" & !is.null(balanced_event_time)){
+  if(result_type == "dynamic" & !is.na(balanced_event_time)){
 
     cohorts <- group_time[, .(max_et = max(time-G),
                               min_et = min(time-G)), by = "G"]
@@ -91,18 +100,18 @@ get_aggregate_scheme <- function(group_time, result_type, id_weights, id_cohorts
     
     group_time[, targeted := NULL]
     
-    weights <- rbind(weights, target_weights)
+    agg_weights <- rbind(agg_weights, target_weights)
   }
   
-  return(list(weights = weights, #a matrix of each target and gt's weight in it 
+  return(list(agg_weights = as.matrix(agg_weights), #a matrix of each target and gt's weight in it 
               targets = targets)) 
 }
 
-get_weight_influence <- function(agg_weights, gt_att, id_weights, id_cohorts, group) {
+get_weight_influence <- function(agg_weights, gt_att, weights, id_cohorts, group) {
 
   keepers <- which(agg_weights > 0)
   
-  id_dt <- data.table(weight = id_weights/sum(id_weights), G = id_cohorts)
+  id_dt <- data.table(weight = weights/sum(weights), G = id_cohorts)
   pg_dt <- id_dt[, .(pg = sum(weight)), by = "G"]
   group <- group |> merge(pg_dt, by = "G")
   
@@ -112,12 +121,12 @@ get_weight_influence <- function(agg_weights, gt_att, id_weights, id_cohorts, gr
 
   # effect of estimating weights in the numerator
   if1 <- sapply(keepers, function(k) {
-    (id_weights*BMisc::TorF(id_cohorts == group[k,G]) - group[k,pg]) /
+    (weights*BMisc::TorF(id_cohorts == group[k,G]) - group[k,pg]) /
       sum(group[keepers,pg])
   })
   # effect of estimating weights in the denominator
   if2 <- base::rowSums(sapply(keepers, function(k) {
-    id_weights*BMisc::TorF(id_cohorts == group[k,G]) - group[k,pg]
+    weights*BMisc::TorF(id_cohorts == group[k,G]) - group[k,pg]
   })) %*%
     t(group[keepers,pg]/(sum(group[keepers,pg])^2))
   # return the influence function for the weights
@@ -126,24 +135,57 @@ get_weight_influence <- function(agg_weights, gt_att, id_weights, id_cohorts, gr
   return(inf_weight)
 }
 
-estimate_did <- function(dt_did, covnames, control_type,
-                         last_coef = NULL, cache_ps_fit, cache_hess){
+get_se <- function(inf_matrix, boot, biters, cluster, clustervar) {
+  
+  if(boot){
+    
+    top_quant <- 0.75
+    bot_quant <- 0.25
+    if(!allNA(clustervar)){
+      #take average within the cluster
+      cluster_n <- stats::aggregate(cluster, by=list(cluster), length)[,2]
+      inf_matrix <- fsum(inf_matrix, cluster) / cluster_n #the mean without 0 for each cluster of each setting
+    }
+    
+    boot_results <- BMisc::multiplier_bootstrap(inf_matrix, biters = biters) %>% as.data.table()
+    
+    boot_top <- boot_results[, lapply(.SD, function(x) stats::quantile(x, top_quant, type=1, na.rm = TRUE))]
+    boot_bot <- boot_results[, lapply(.SD, function(x) stats::quantile(x, bot_quant, type=1, na.rm = TRUE))]
+    
+    dt_se <- rbind(boot_bot, boot_top) %>% transpose()
+    names(dt_se) <- c("boot_bot", "boot_top")
+    
+    se <- dt_se[,(boot_top-boot_bot)/(qnorm(top_quant) - qnorm(bot_quant))]
+    se[se < sqrt(.Machine$double.eps)*10] <- NA
+    
+  } else {
+
+    inf_matrix <- inf_matrix  |> as.data.table()
+    se <- inf_matrix[, lapply(.SD, function(x) sqrt(sum(x^2, na.rm = TRUE)/length(x)^2))] %>% as.vector() #should maybe use n-1 but did use n
+    
+  }
+  return(unlist(se))
+}
 
-  # preprocess --------
 
+estimate_did <- function(dt_did, covvars, control_type,
+                         last_coef = NULL, cache_ps_fit, cache_hess){
+  
+  # preprocess --------
   oldn <- dt_did[, .N]
   data_pos <-  which(dt_did[, !is.na(D)])
   dt_did <- dt_did[data_pos]
   n <- dt_did[, .N]
-  if(is.null(covnames)){
-    covvars <- NULL
+  
+  if(is.matrix(covvars)){
+    ipw <- control_type %in% c("ipw", "dr") 
+    or <- control_type %in% c("reg", "dr")
+    covvars <- covvars[data_pos,] 
   } else {
-    covvars <- as.matrix(dt_did[,.SD, .SDcols = covnames])
+    ipw <- FALSE
+    or <- FALSE
   }
-
-  ipw <- control_type %in% c("ipw", "dr") & !is.null(covvars)
-  or <- control_type %in% c("reg", "dr") & !is.null(covvars) #OR is REG
-
+  
   # ipw --------
 
   if(ipw){
@@ -157,19 +199,25 @@ estimate_did <- function(dt_did, covnames, control_type,
                                                     intercept = FALSE))
       class(prop_score_est) <- "glm" #trick the vcov function to think that this is a glm object to dispatch the write method
       #const is implicitly put into the ipw formula, need to incorporate it manually
-      hess <- stats::vcov(prop_score_est) * n #for the influence function
 
       logit_coef <-  prop_score_est$coefficients
+      
+      if(anyNA(logit_coef)){
+        warning("some propensity score estimation resulted in NA coefficients, likely cause by perfect colinearity")
+      }
+      
       logit_coef[is.na(logit_coef)|abs(logit_coef) > 1e10] <- 0 #put extreme value and na to 0
       prop_score_fit <- fitted(prop_score_est)
       if(max(prop_score_fit) >= 1){warning(paste0("support overlap condition violated for some group_time"))}
       prop_score_fit <- pmin(1-1e-16, prop_score_fit) #for the ipw
 
+      hess <- stats::vcov(prop_score_est) * n #for the influence function
+      hess[is.na(hess)|abs(hess) > 1e10] <- 0
 
     } else { #when using multiple outcome, ipw cache can be reused
       hess <- cache_hess
       prop_score_fit <- cache_ps_fit
-      logit_coef <- NULL #won't be needing the approximate cache
+      logit_coef <- NA #won't be needing the approximate cache
     }
 
     #get the results into the main did dt
@@ -180,8 +228,8 @@ estimate_did <- function(dt_did, covnames, control_type,
   } else {
 
     prop_score_fit <- rep(1,n)
-    logit_coef <- NULL
-    hess <- NULL
+    logit_coef <- NA
+    hess <- NA
 
     dt_did[, treat_ipw_weight := weights*D]
     dt_did[, cont_ipw_weight := weights*(1-D)]
@@ -232,6 +280,7 @@ estimate_did <- function(dt_did, covnames, control_type,
     M2 <- colMeans(dt_did[, cont_ipw_weight*(delta_y-weighted_cont_delta-or_delta)] * covvars)
 
     score_ps <- dt_did[, weights*(D-ps)] * covvars
+    
     asym_linear_ps <- score_ps %*% hess
 
     #ipw for control
@@ -268,6 +317,7 @@ estimate_did <- function(dt_did, covnames, control_type,
 
 
   #get overall influence function
+  #if(dt_did[, mean(cont_ipw_weight)] < 1e-10){warning("little/no overlap in covariates between control and treat group, estimates are unstable.")}
   inf_cont <- (inf_cont_did+inf_cont_ipw+inf_cont_or)/dt_did[, mean(cont_ipw_weight)]
   inf_treat <- (inf_treat_did+inf_treat_or)/dt_did[,mean(treat_ipw_weight)]
   inf_func_no_na <- inf_treat - inf_cont
@@ -277,26 +327,24 @@ estimate_did <- function(dt_did, covnames, control_type,
   inf_func_no_na <- inf_func_no_na * oldn / n #adjust the value such that mean over the whole id size give the right result
   inf_func[data_pos] <- inf_func_no_na
 
-  
   return(list(att = att, inf_func = inf_func, logit_coef = logit_coef, #for next gt
               cache_ps_fit = prop_score_fit, cache_hess = hess)) #for next outcome
 }
 
-estimate_did_rc <- function(dt_did, covnames, control_type,
+estimate_did_rc <- function(dt_did, covvars, control_type,
                          last_coef = NULL, cache_ps_fit, cache_hess){
   
   #TODO: skip if not enough valid data
   
   # preprocess --------
   
-  oldn <- dt_did[, .N]
   
-  #separate the dataset into pre and post
-
+  oldn <- dt_did[, .N]
   data_pos <-  which(dt_did[, !is.na(D)])
   dt_did <- dt_did[data_pos]
   n <- dt_did[, .N]
   
+  #separate the dataset into pre and post
   dt_did[, inpre := as.numeric(!is.na(pre.y))]
   dt_did[, inpost := as.numeric(!is.na(post.y))]
   n_pre <- dt_did[, sum(!is.na(pre.y))]
@@ -305,15 +353,15 @@ estimate_did_rc <- function(dt_did, covnames, control_type,
   sum_weight_pre <- dt_did[, sum(inpre*weights)]
   sum_weight_post <- dt_did[, sum(inpost*weights)]
   
-  if(is.null(covnames)){
-    covvars <- NULL
+  if(is.matrix(covvars)){
+    ipw <- control_type %in% c("ipw", "dr") 
+    or <- control_type %in% c("reg", "dr")
+    covvars <- covvars[data_pos,] 
   } else {
-    covvars <- as.matrix(dt_did[,.SD, .SDcols = covnames])
+    ipw <- FALSE
+    or <- FALSE
   }
-  
-  ipw <- control_type %in% c("ipw", "dr") & !is.null(covvars)
-  or <- control_type %in% c("reg", "dr") & !is.null(covvars) #OR is REG
-  
+
   # ipw --------
   
   if(ipw){
@@ -346,8 +394,8 @@ estimate_did_rc <- function(dt_did, covnames, control_type,
   } else {
     
     prop_score_fit <- rep(1,n)
-    logit_coef <- NULL
-    hess <- NULL
+    logit_coef <- NA
+    hess <- NA
     dt_did[, treat_ipw_weight := weights*D]
     dt_did[, cont_ipw_weight := weights*(1-D)]
     
@@ -475,8 +523,8 @@ estimate_did_rc <- function(dt_did, covnames, control_type,
   inf_treat_post <- inf_treat_did_post+inf_treat_or_post
   inf_cont_pre <- inf_cont_did_pre+inf_cont_ipw_pre+inf_cont_or_pre
   inf_treat_pre <- inf_treat_did_pre+inf_treat_or_pre
+  
   #post process
-
   inf_func_no_na_post <- (inf_treat_post - inf_cont_post) * oldn / n_post #adjust the value such that mean over the whole id size give the right result
   inf_func_no_na_post[is.na(inf_func_no_na_post)] <- 0 #fill 0 for NA part (no influce if not in this gt)
   
@@ -490,62 +538,74 @@ estimate_did_rc <- function(dt_did, covnames, control_type,
               cache_ps_fit = prop_score_fit, cache_hess = hess)) #for next outcome
 }
 
-estimate_gtatt <- function(outcomes_list, outcomevar, covariates, control_type, weights, 
-                           cohort_sizes,cohorts,id_size,time_periods,
-                           control_option, allow_unbalance_panel) {
+estimate_gtatt <- function(aux, p) {
 
-  treated_cohorts <- cohorts[!is.infinite(cohorts)]
-  
-  if(!Inf %in% cohorts & control_option != "notyet"){
+  if(!Inf %in% aux$cohorts & p$control_option != "notyet"){
     warning("no never-treated availble, switching to not-yet-treated control")
-    control_option <- "notyet"
+    p$control_option <- "notyet"
   }
-  max_control_cohort <- ifelse(control_option == "notyet", max(treated_cohorts), Inf) 
+  
+  if(!is.na(p$max_control_cohort_diff)){
+    warning("max_control_cohort_diff can only be used with not yet")
+    p$control_option <- "notyet"
+  }
+  
+  treated_cohorts <- aux$cohorts[!is.infinite(aux$cohorts)]
   
   cache_ps_fit_list <- list() #for the first outcome won't be able to use cache, empty list returns null for call like cache_ps_fit[["1.3"]]
   cache_hess_list <- list()
   outcome_result_list <- list()
-  
-  for(outcol in outcomevar){
+  for(outcol in p$outcomevar){
     
-    outcomes <- outcomes_list[[outcol]]
+    outcomes <- aux$outcomes_list[[outcol]]
     last_coef <- NULL
     gt <- data.table()
     gt_att <- c()
-    gt_inf_func <- data.table(placeholder = rep(NA, id_size))
+    gt_inf_func <- data.table(placeholder = rep(NA, aux$id_size)) #populate with NA
     
-    for(t in time_periods){
-      for(g in cohorts){
+    for(t in aux$time_periods){
+      for(g in aux$cohorts){
         
         gt_name <- paste0(g, ".", t)
         
         #setup and checks
         base_period <- g-1
-        min_control_cohort <- ifelse(control_option == "never", Inf, max(t+1, base_period+1)) #not-yet treated / never treated in both base and "treated" period
-        if(t == base_period){next} #no treatmenteffect for the base period
-        if(base_period < min(time_periods)){next} #no treatmenteffect for the first period, since base period is not observed
-        if(g >= max_control_cohort){next} #no treatmenteffect for never treated or the last treated cohort (notyet)
+        min_control_cohort <- ifelse(p$control_option  == "never", Inf, max(t+1, base_period+1)) #not-yet treated / never treated in both base and "treated" period
+        
+        if(!is.na(p$max_control_cohort_diff)){
+          max_control_cohort <- min(g+p$max_control_cohort_diff, max(treated_cohorts))
+        } else {
+          max_control_cohort <- ifelse(p$control_option == "notyet", max(treated_cohorts), Inf) 
+        }
+        
+        if(t == base_period){next} #no treatment effect for the base period
+        if(base_period < min(aux$time_periods)){next} #no treatment effect for the first period, since base period is not observed
+        if(g >= max_control_cohort){next} #no treatment effect for never treated or the last treated cohort (for not yet notyet)
         if(t >= max_control_cohort){next} #no control available if the last cohort is treated too
         
         #select the control and treated cohorts
-        did_setup <- rep(NA, id_size)
-        did_setup[get_cohort_pos(cohort_sizes, min_control_cohort, max_control_cohort)] <- 0
-        did_setup[get_cohort_pos(cohort_sizes, g)] <- 1 #treated cannot be controls, assign treated after control to overwrite
+        did_setup <- rep(NA, aux$id_size)
+        did_setup[get_cohort_pos(aux$cohort_sizes, min_control_cohort, max_control_cohort)] <- 0
+        did_setup[get_cohort_pos(aux$cohort_sizes, g)] <- 1 #treated cannot be controls, assign treated after control to overwrite
+        
+        #construct the covariates matrix
+        covvars <- get_covvars(aux$covariates, aux$varycovariates, base_period, t)
         
         #construct the 2x2 dataset
-        cohort_did <- data.table(did_setup, outcomes[[t]], outcomes[[base_period]], weights, covariates)
-        setnames(cohort_did, c("did_setup", "V2", "V3", "weights"), c("D", "post.y", "pre.y", "weights"))
- 
+        cohort_did <- data.table(did_setup, outcomes[[t]], outcomes[[base_period]], aux$weights)
+        names(cohort_did) <- c("D", "post.y", "pre.y", "weights")
+        
         #estimate did
-        if(!allow_unbalance_panel){
-          result <- estimate_did(cohort_did, colnames(covariates), control_type, last_coef, cache_ps_fit_list[[gt_name]], cache_hess_list[[gt_name]])
+        if(!p$allow_unbalance_panel){
+          result <- estimate_did(cohort_did, covvars, p$control_type, 
+                                 last_coef, cache_ps_fit_list[[gt_name]], cache_hess_list[[gt_name]]) #cache
         } else {
-          result <- estimate_did_rc(cohort_did, colnames(covariates), control_type, last_coef, cache_ps_fit_list[[gt_name]], cache_hess_list[[gt_name]])
+          result <- estimate_did_rc(cohort_did, covvars, p$control_type, 
+                                    last_coef, cache_ps_fit_list[[gt_name]], cache_hess_list[[gt_name]]) #cache
         }
        
-      
         #collect the result
-        last_coef <- result$logit_coef
+        last_coef <- result$logit_coef #for faster convergence in next iter
         gt <- rbind(gt, data.table(G = g, time = t)) #the sequence matters for the weights
         gt_att <- c(gt_att, att = result$att)
         gt_inf_func[[gt_name]] <- result$inf_func
@@ -560,10 +620,7 @@ estimate_gtatt <- function(outcomes_list, outcomevar, covariates, control_type,
     
     gt_inf_func[,placeholder := NULL]
     names(gt_att) <- names(gt_inf_func)
-    outcome_result <- list(att = gt_att, inf_func = gt_inf_func, gt = gt)
-    outcome_result_list[[outcol]] <- outcome_result
-    
-    rm(outcome_result)
+    outcome_result_list[[outcol]] <- list(att = gt_att, inf_func = gt_inf_func, gt = gt, outname = outcol)
 
   }
   
@@ -571,12 +628,37 @@ estimate_gtatt <- function(outcomes_list, outcomevar, covariates, control_type,
 }
 
 get_cohort_pos <- function(cohort_sizes, start_cohort, end_cohort = start_cohort){
-  #if(!start_cohort %in% cohort_sizes[,unique(G)]|!end_cohort %in% cohort_sizes[,unique(G)]) {stop("cohort not in cohort_sizes")}
   start <- cohort_sizes[G < start_cohort, sum(cohort_size)]+1
   end <- cohort_sizes[G <= end_cohort, sum(cohort_size)]
   return(start:end)
 }
 
+get_covvars <- function(covariates, varycovariates, base_period, t){
+  
+  if(is.list(varycovariates)){
+    
+    precov <- varycovariates[[base_period]]
+    names(precov) <- paste0("pre_", names(precov))
+    postcov <- varycovariates[[t]]-varycovariates[[base_period]]
+    names(postcov) <- paste0("post_", names(varycovariates[[t]]))
+    
+    if(is.data.table(covariates)){
+      covvars <- cbind(covariates, cbind(precov, postcov)) #if covariates is na
+    } else {
+      covvars <- cbind(const = -1, cbind(precov, postcov)) #if covariates is na
+    }
+    covvars <- as.matrix(covvars)
+    
+  } else {
+    if(is.data.table(covariates)){
+      covvars <- as.matrix(covariates) #will be NA if covariatesvar have nothing
+    } else {
+      covvars <- NA
+    }
+  }
+  return(covvars)
+}
+
 #' Fast Staggered DID Estimation
 #'
 #' Performs Difference-in-Differences (DID) estimation fast.
@@ -588,14 +670,15 @@ get_cohort_pos <- function(cohort_sizes, start_cohort, end_cohort = start_cohort
 #' @param outcomevar The name of the outcome variable.
 #' @param control_option The control units used for the DiD estimates. Default is "both".
 #' @param result_type A character string indicating the type of result to be returned. Default is "group_time".
-#' @param balanced_event_time A numeric scalar that indicates the max event time to balance the cohort composition, only meaningful when result_type == "dynamic". Default is NULL
+#' @param balanced_event_time A numeric scalar that indicates the max event time to balance the cohort composition, only meaningful when result_type == "dynamic". Default is NA
 #' @param control_type The method for controlling for covariates. "ipw" for inverse probability weighting, "reg" for outcome regression, or "dr" for doubly-robust
 #' @param allow_unbalance_panel Whether allow unbalance panel as input (if false will coerce the dataset to a balanced panel). Default is FALSE 
 #' @param boot Logical, indicating whether bootstrapping should be performed. Default is FALSE.
 #' @param biters The number of bootstrap iterations. Only relevant if boot = TRUE. Default is 1000.
 #' @param weightvar The name of the weight variable (optional).
 #' @param clustervar The name of the cluster variable, can only be used when boot == TRUE (optional).
-#' @param covariatesvar A character vector containing the names of covariate variables (optional).
+#' @param covariatesvar A character vector containing the names of time-invariant covariate variables (optional).
+#' @param varycovariatesvar A character vector containing the names of time-varying covariate variables (optional).
 #' @param copy whether to copy the dataset before processing, set to false to speed up the process, but the input data will be altered.
 #' @param validate whether to validate the dataset before processing.
 #' 
@@ -610,24 +693,24 @@ get_cohort_pos <- function(cohort_sizes, start_cohort, end_cohort = start_cohort
 #' dt <- simdt$dt
 #' 
 #' #basic call
-#' result <- fastdid(dt, timevar = "time", cohortvar = "G", 
+#' result <- fastdid(data = dt, timevar = "time", cohortvar = "G", 
 #'                   unitvar = "unit", outcomevar = "y",  
 #'                   result_type = "group_time")
 #' 
 #' #control for covariates
-#' result2 <- fastdid(dt, timevar = "time", cohortvar = "G", 
+#' result2 <- fastdid(data = dt, timevar = "time", cohortvar = "G", 
 #'                    unitvar = "unit", outcomevar = "y",  
 #'                    result_type = "group_time",
 #'                    covariatesvar = c("x", "x2"))
 #'                   
 #' #bootstrap and clustering
-#' result3 <- fastdid(dt, timevar = "time", cohortvar = "G", 
+#' result3 <- fastdid(data = dt, timevar = "time", cohortvar = "G", 
 #'                    unitvar = "unit", outcomevar = "y",  
 #'                    result_type = "group_time",
 #'                    boot = TRUE, clustervar = "x")
 #'
 #' #estimate for multiple outcomes
-#' result4 <- fastdid(dt, #the dataset
+#' result4 <- fastdid(data = dt, #the dataset
 #'                    timevar = "time", cohortvar = "G", unitvar = "unit", 
 #'                    outcomevar = c("y", "y2"), #name of the outcome columns
 #'                    result_type = "group_time") 
@@ -635,60 +718,109 @@ get_cohort_pos <- function(cohort_sizes, start_cohort, end_cohort = start_cohort
 #' @keywords difference-in-differences fast computation panel data estimation did
 fastdid <- function(data,
                     timevar, cohortvar, unitvar, outcomevar, 
-                    control_option="both",result_type="group_time", balanced_event_time = NULL,
+                    control_option="both",result_type="group_time", balanced_event_time = NA,
                     control_type = "ipw", allow_unbalance_panel = FALSE, boot=FALSE, biters = 1000,
-                    weightvar=NULL,clustervar=NULL,covariatesvar = NULL,
-                    copy = TRUE, validate = TRUE
+                    weightvar=NA,clustervar=NA,covariatesvar = NA, varycovariatesvar = NA,
+                    copy = TRUE, validate = TRUE,
+                    max_control_cohort_diff = NA
                     ){
   
-  
-  # validation arguments --------------------------------------------------------
-  
+  # validate arguments --------------------------------------------------------
+
   if(!is.data.table(data)){
     warning("coercing input into a data.table.")
     data <- as.data.table(data)
   } 
-  
   if(copy){dt <- copy(data)} else {dt <- data}
-
+  
   dt_names <- names(dt)
   name_message <- "__ARG__ must be a character scalar and a name of a column from the dataset."
   check_set_arg(timevar, unitvar, cohortvar, "match", .choices = dt_names, .message = name_message)
   
-  covariate_message <- "__ARG__ must be NULL or a character vector which are all names of columns from the dataset."
-  check_set_arg(covariatesvar, outcomevar, 
-            "NULL | multi match", .choices = dt_names, .message = covariate_message)
+  covariate_message <- "__ARG__ must be NA or a character vector which are all names of columns from the dataset."
+  check_set_arg(varycovariatesvar, covariatesvar, outcomevar, 
+            "NA | multi match", .choices = dt_names, .message = covariate_message)
   
-  checkvar_message <- "__ARG__ must be NULL or a character scalar if a name of columns from the dataset."
+  checkvar_message <- "__ARG__ must be NA or a character scalar if a name of columns from the dataset."
   check_set_arg(weightvar, clustervar,
-            "NULL | match", .choices = dt_names, .message = checkvar_message)
+            "NA | match", .choices = dt_names, .message = checkvar_message)
   
   check_set_arg(control_option, "match", .choices = c("both", "never", "notyet")) #kinda bad names since did's notyet include both notyet and never
   check_set_arg(control_type, "match", .choices = c("ipw", "reg", "dr")) 
   check_arg(copy, validate, boot, allow_unbalance_panel, "scalar logical")
   
-  if(!is.null(balanced_event_time)){
+  if(!is.na(balanced_event_time)){
     if(result_type != "dynamic"){stop("balanced_event_time is only meaningful with result_type == 'dynamic'")}
     check_arg(balanced_event_time, "numeric scalar")
   }
-
   if(allow_unbalance_panel == TRUE & control_type %in% c("dr", "reg")){
     stop("fastdid currently only supprts ipw when allowing for unbalanced panels.")
   }
+  if(allow_unbalance_panel == TRUE & !allNA(varycovariatesvar)){
+    stop("fastdid currently only supprts time varying covariates when allowing for unbalanced panels.")
+  }
+  if(any(covariatesvar %in% varycovariatesvar) & !allNA(varycovariatesvar) & !allNA(covariatesvar)){
+    stop("time-varying var and invariant var have overlaps.")
+  }
+  if(!boot & !allNA(clustervar)){
+    stop("clustering only available with bootstrap")
+  }
+  
+  p <- list(timevar = timevar,
+            cohortvar = cohortvar,
+            unitvar = unitvar,
+            outcomevar =  outcomevar,
+            weightvar = weightvar,
+            clustervar = clustervar,
+            covariatesvar = covariatesvar,
+            varycovariatesvar = varycovariatesvar,
+            control_option = control_option,
+            result_type = result_type,
+            balanced_event_time = balanced_event_time,
+            control_type = control_type,
+            allow_unbalance_panel = allow_unbalance_panel,
+            boot = boot, 
+            biters = biters,
+            max_control_cohort_diff = max_control_cohort_diff)
+  
   
-  setnames(dt, c(timevar, cohortvar, unitvar), c("time", "G", "unit"))
-
   # validate data -----------------------------------------------------
   
+  setnames(dt, c(timevar, cohortvar, unitvar), c("time", "G", "unit"))
   
   if(validate){
-    varnames <- c("time", "G", "unit", outcomevar,weightvar,clustervar,covariatesvar)
-    dt <- validate_did(dt, covariatesvar, varnames, balanced_event_time, allow_unbalance_panel)
+    varnames <- c("time", "G", "unit", outcomevar,weightvar,clustervar,covariatesvar,varycovariatesvar)
+    dt <- validate_did(dt, varnames, p)
   }
   
   # preprocess -----------------------------------------------------------
   
   #make dt conform to the WLOG assumptions of fastdid
+  coerce_result <- coerce_dt(dt, p) #also changed dt
+  dt <- coerce_result$dt
+  # get auxiliary data
+  aux <- get_auxdata(dt, p)
+  
+  # main part  -------------------------------------------------
+
+  # estimate attgt
+  gt_result_list <- estimate_gtatt(aux, p)
+  
+  # aggregate the result by outcome
+  all_result <- rbindlist(lapply(gt_result_list, function(x){
+    result <- aggregate_gt(x, aux, p)
+    #convert "targets" back to meaningful parameter identifiers like cohort 1 post, time 2 post 
+    result <- result |> convert_targets(result_type, coerce_result$time_change)
+    return(result)
+  }))
+
+  return(all_result)
+  
+}
+
+# small steps ----------------------------------------------------------------------
+
+coerce_dt <- function(dt, p){
   
   #change to int before sorting
   if(!is.numeric(dt[, G])){
@@ -698,7 +830,7 @@ fastdid <- function(data,
     dt[, time := as.numeric(time)] 
   }
   
-  if(allow_unbalance_panel){
+  if(p$allow_unbalance_panel){
     dt_inv_raw <- dt[dt[, .I[1], by = unit]$V1]
     setorder(dt_inv_raw, G)
     dt_inv_raw[, new_unit := 1:.N] #let unit start from 1 .... N, useful for knowing which unit is missing
@@ -707,11 +839,10 @@ fastdid <- function(data,
   }
   
   setorder(dt, time, G, unit) #sort the dataset essential for the sort-once-quick-access 
-
   #deal with time, coerice time to 1,2,3,4,5.......
   time_periods <- dt[, unique(time)]
   time_size <- length(time_periods)
-
+  
   time_offset <- min(time_periods) - 1 #assume time starts at 1, first is min after sort :)
   if(time_offset != 0){
     dt[, G := G-time_offset]
@@ -728,15 +859,27 @@ fastdid <- function(data,
     dt[time != 1, time := (time-1)/time_step+1]
   }
   
-  #construct the outcomes list for fast access later
+  return(list(dt = dt,
+              time_change = list(time_step = time_step,
+                            max_time = max(time_periods),
+                            time_offset = time_offset)))
+  
+}
+
+get_auxdata <- function(dt, p){
+
+  time_periods <- dt[, unique(time)]
   id_size <- dt[, uniqueN(unit)]
-  outcomes_list <- list()
   
+  setorder(dt, time, G, unit) #sort the dataset essential for the sort-once-quick-access 
+  
+  #construct the outcomes list for fast access later
   #loop for multiple outcome
-  for(outcol in outcomevar){
+  outcomes_list <- list()
+  for(outcol in p$outcomevar){
     outcomes <- list()
     
-    if(!allow_unbalance_panel){
+    if(!p$allow_unbalance_panel){
       for(i in time_periods){
         start <- (i-1)*id_size+1
         end <- i*id_size
@@ -754,78 +897,67 @@ fastdid <- function(data,
       }
       
     }
-
+    
     outcomes_list[[outcol]] <- outcomes
   }
-
+  
   #the time-invariant parts 
-  if(!allow_unbalance_panel){
+  if(!p$allow_unbalance_panel){
     dt_inv <- dt[1:id_size]
   } else {
-    dt_inv <- dt[dt[, .I[1], by = unit]$V1]
+    dt_inv <- dt[dt[, .I[1], by = unit]$V1] #the first observation
     setorder(dt_inv, unit) #can't move this outside
   }
   
-  
-
-  
   cohorts <- dt_inv[, unique(G)]
   cohort_sizes <- dt_inv[, .(cohort_size = .N) , by = G]
-
+  
   # the optional columns
-  if(!is.null(covariatesvar)){
-    covariates <- cbind(const = -1, dt_inv[,.SD, .SDcols = covariatesvar]) # const 
+  varycovariates <- list()
+  if(!allNA(p$varycovariatesvar)){
+    for(i in time_periods){
+      start <- (i-1)*id_size+1
+      end <- i*id_size
+      varycovariates[[i]] <- dt[seq(start,end), .SD, .SDcols = p$varycovariatesvar]
+    }
   } else {
-    covariates <- NULL
+    varycovariates <- NA
   }
   
-  if(!is.null(clustervar)){
-    cluster <- dt_inv[, .SD, .SDcols = clustervar] |> unlist()
-  } else {cluster <- NULL}
-  
-  if(!is.null(weightvar)){
-    weights <- dt_inv[, .SD, .SDcols = weightvar] |> unlist()
-  } else {weights <- rep(1,id_size)}
+  if(!allNA(p$covariatesvar)){
+    covariates <- cbind(const = -1, dt_inv[,.SD, .SDcols = p$covariatesvar])
+  } else {
+    covariates <- NA
+  }
   
-  # main part  -------------------------------------------------
-
-  # attgt
-  gt_result_list <- estimate_gtatt(outcomes_list, outcomevar, covariates, control_type, weights,
-                                   cohort_sizes,cohorts,id_size,time_periods, #info about the dt
-                                   control_option, allow_unbalance_panel)
+  if(!is.na(p$clustervar)){
+    cluster <- dt_inv[, .SD, .SDcols = p$clustervar] |> unlist()
+  } else {
+    cluster <- NA
+  }
   
-  all_result <- data.table()
-  for(outcol in outcomevar){
-    gt_result <- gt_result_list[[outcol]]
-    
-    # aggregate att and inf function
-    agg_result <- aggregate_gt(gt_result, cohort_sizes, 
-                               weights, dt_inv[, G],
-                               result_type, balanced_event_time)
-    
-    #get se from the influence function
-    agg_se <- get_se(agg_result$inf_matrix, boot, biters, cluster)
-    
-    # post process -----------------------------------------------
-    
-    result <- data.table(agg_result$targets, agg_result$agg_att, agg_se)
-    names(result) <- c("target", "att", "se")
-    
-    #convert "targets" back to meaningful parameter identifiers like cohort 1 post, time 2 post 
-    result <- result |> convert_targets(result_type, time_offset, time_step, max(time_periods))
-    result[, outcome := outcol]
-    all_result <- rbind(all_result, result)
-    
-    rm(result)
-    
+  if(!is.na(p$weightvar)){
+    weights <- dt_inv[, .SD, .SDcols = p$weightvar] |> unlist()
+  } else {
+    weights <- rep(1, id_size)
   }
-
-  return(all_result)
+  
+  aux <- list(time_periods = time_periods,
+              id_size = id_size,
+              outcomes_list = outcomes_list,
+              dt_inv= dt_inv,
+              cohorts = cohorts,
+              cohort_sizes = cohort_sizes, 
+              varycovariates = varycovariates,
+              covariates = covariates,
+              cluster = cluster,
+              weights = weights)
+  
+  return(aux)
   
 }
 
-convert_targets <- function(results, result_type, 
-                            time_offset, time_step, max_time){
+convert_targets <- function(results, result_type, t){
   
   if(result_type == "dynamic"){
     setnames(results, "target", "event_time")
@@ -833,23 +965,23 @@ convert_targets <- function(results, result_type,
   } else if (result_type == "cohort"){
     
     results[, type := ifelse(target >= 0, "post", "pre")]
-    results[, target := recover_time(abs(target), time_offset, time_step)]
+    results[, target := recover_time(abs(target), t$time_offset, t$time_step)]
     setnames(results, "target", "cohort")
     
   } else if (result_type == "calendar"){
     
     results[, type := ifelse(target >= 0, "post", "pre")]
-    results[, target := recover_time(abs(target), time_offset, time_step)]
+    results[, target := recover_time(abs(target), t$time_offset, t$time_step)]
     setnames(results, "target", "time")
     
   } else if (result_type == "group_time"){
     
-    results[, cohort := floor((target-1)/max_time)]
-    results[, time := (target-cohort*max_time)]
+    results[, cohort := floor((target-1)/t$max_time)]
+    results[, time := (target-cohort*t$max_time)]
     
     #recover the time
-    results[, cohort := recover_time(cohort, time_offset, time_step)]
-    results[, time := recover_time(time, time_offset, time_step)]
+    results[, cohort := recover_time(cohort, t$time_offset, t$time_step)]
+    results[, time := recover_time(time, t$time_offset, t$time_step)]
     
     results[, target := NULL]
     
@@ -864,38 +996,6 @@ recover_time <- function(time, time_offset, time_step){
   return(((time-1)*time_step)+1+time_offset)
 }
 
-get_se <- function(inf_matrix, boot, biters, cluster) {
-  
-  if(boot){
-
-    top_quant <- 0.75
-    bot_quant <- 0.25
-    if(!is.null(cluster)){
-      cluster_n <- stats::aggregate(cluster, by=list(cluster), length)[,2]
-      inf_matrix <- fsum(inf_matrix, cluster) / cluster_n #the mean without 0 for each cluster of each setting
-    }
-
-    boot_results <- BMisc::multiplier_bootstrap(inf_matrix, biters = biters) %>% as.data.table()
-    
-    boot_top <- boot_results[, lapply(.SD, function(x) stats::quantile(x, top_quant, type=1, na.rm = TRUE)),]
-    boot_bot <- boot_results[, lapply(.SD, function(x) stats::quantile(x, bot_quant, type=1, na.rm = TRUE)),]
-    
-    dt_se <- rbind(boot_bot, boot_top) %>% transpose()
-    names(dt_se) <- c("boot_bot", "boot_top")
-
-    se <- dt_se[,(boot_top-boot_bot)/(qnorm(top_quant) - qnorm(bot_quant))]
-    se[se < sqrt(.Machine$double.eps)*10] <- NA
-    
-  } else {
-    if(!is.null(cluster)){stop("clustering only available with bootstrap")}
-    inf_matrix <- inf_matrix  |> as.data.table()
-    se <- inf_matrix[, lapply(.SD, function(x) sqrt(sum(x^2, na.rm = TRUE)/length(x)^2))] %>% as.vector() #should maybe use n-1 but did use n
-    
-  }
-  return(unlist(se))
-}
-
-
 NULL
 
 if(FALSE){
@@ -1008,7 +1108,7 @@ plot_did_dynamics <-function(dt,
 #'
 #' @export
 sim_did <- function(sample_size, time_period, untreated_prop = 0.3, epsilon_size = 0.001,
-                    cov = "no", hetero = "all", second_outcome = FALSE, second_cov = FALSE, na = "none", 
+                    cov = "no", hetero = "all", second_outcome = FALSE, second_cov = FALSE, vary_cov = FALSE, na = "none", 
                     balanced = TRUE, seed = NA, stratify = FALSE, treatment_assign = "latent"){
   
   if(!is.na(seed)){set.seed(seed)}
@@ -1072,8 +1172,15 @@ sim_did <- function(sample_size, time_period, untreated_prop = 0.3, epsilon_size
   
   dt[, D := as.integer(time >= G)]
   
+  #add time_varying covariates
+  if(vary_cov){
+    dt[, xvar := pmin(G, time_period+4)*time+rnorm(sample_size*time_period, 0,30)] #should be confounding....?
+  } else {
+    dt[, xvar := 1]
+  }
+  
   #untreated potential outcomes
-  dt[, y0 := unit_fe + time_fe + x*x_trend + x2*x_trend + rnorm(sample_size*time_period, sd = epsilon_size)]
+  dt[, y0 := unit_fe + time_fe + (x+x2)*x_trend + xvar + rnorm(sample_size*time_period, sd = epsilon_size)]
   
   #generate gtatt
   att <- CJ(G = 1:time_period, time = 1:time_period)
@@ -1093,7 +1200,7 @@ sim_did <- function(sample_size, time_period, untreated_prop = 0.3, epsilon_size
   #potential outcome
   dt[, y1 := y0 + tau]
   dt[, y := y1*D + y0*(1-D)]
-  dt <- dt[, .SD, .SDcols = c("time", "G", "unit", "x", "x2", "y", "s")]
+  dt <- dt[, .SD, .SDcols = c("time", "G", "unit", "x", "x2", "y", "s", "xvar")]
   
   #additional -----------------
   
@@ -1120,9 +1227,6 @@ sim_did <- function(sample_size, time_period, untreated_prop = 0.3, epsilon_size
 
 
 
-
-
-
 set_max_thread <- function(){
   data.table::setDTthreads(0)
   options(kit.nThread = getDTthreads())
@@ -1132,18 +1236,18 @@ reverse_col <- function(x){
   return(x[,ncol(x):1])
 }
 
-validate_did <- function(dt,covariatesvar,varnames, balanced_event_time, allow_unbalance_panel){
+validate_did <- function(dt,varnames,p){
+
   raw_unit_size <- dt[, uniqueN(unit)]
   raw_time_size <- dt[, uniqueN(time)]
   
-  
-  if(!is.null(balanced_event_time)){
-    if(balanced_event_time > dt[, max(time-G)]){stop("balanced_event_time is larger than the max event time in the data")}
+  if(!is.na(p$balanced_event_time)){
+    if(p$balanced_event_time > dt[, max(time-G)]){stop("balanced_event_time is larger than the max event time in the data")}
   }
   
   #doesn't allow missing value for now
-  for(col in c(covariatesvar, varnames)){
-    if(is.null(col)){next}
+  for(col in varnames){
+    if(is.na(col)){next}
     na_obs <- whichNA(dt[[col]])
     if(length(na_obs) != 0){
       warning("missing values detected in ", col, ", removing ", length(na_obs), " observation.")
@@ -1151,17 +1255,18 @@ validate_did <- function(dt,covariatesvar,varnames, balanced_event_time, allow_u
     }
   }
   
-  if(!is.null(covariatesvar)){
-    
-    if(uniqueN(dt, by = c("unit", covariatesvar)) > raw_unit_size){
-      warning("some covariates is time-varying, fastdid only use the first observation for covariates.")
-    }
-    
-    for(cov in covariatesvar){
+  
+  if(!allNA(p$covariatesvar) && uniqueN(dt, by = c("unit", p$covariatesvar)) > raw_unit_size){
+    warning("some covariates is time-varying, fastdid only use the first observation for covariates.")
+  }
+  
+  
+  if(!allNA(p$covariatesvar)|!allNA(p$varycovariatesvar)){
+    for(cov in c(p$covariatesvar, p$varycovariatesvar)){
+      if(is.na(cov)){next}
       #check covaraites is not constant  
       if(fnunique(dt[, get(cov)[1], by = "unit"][, V1]) == 1)stop(cov, " have no variation")
     }
-
   }
   
   #check balanced panel
@@ -1172,7 +1277,7 @@ validate_did <- function(dt,covariatesvar,varnames, balanced_event_time, allow_u
   }
   
   #check if any is missing
-  if(!allow_unbalance_panel){
+  if(!p$allow_unbalance_panel){
     unit_count <- dt[, .(count = .N), by = unit]
     if(any(unit_count[, count < raw_time_size])){
       mis_unit <- unit_count[count < raw_time_size]

From 9e606d285a89f1539e2621c8e4b437312e96263f Mon Sep 17 00:00:00 2001
From: TsaiLintung <101155567+TsaiLintung@users.noreply.github.com>
Date: Tue, 7 May 2024 11:28:55 +0800
Subject: [PATCH 13/15] timing stuff mostly done

---
 R/estimate_gtatt.R                | 66 ++++++++++++++++++++-----------
 R/fastdid.R                       | 19 +++++++--
 inst/tinytest/test_1_fastdid.R    | 15 ++++++-
 inst/tinytest/test_2_compare_gt.R | 30 ++++++++++++++
 4 files changed, 102 insertions(+), 28 deletions(-)

diff --git a/R/estimate_gtatt.R b/R/estimate_gtatt.R
index eb165d5..93a56ac 100644
--- a/R/estimate_gtatt.R
+++ b/R/estimate_gtatt.R
@@ -1,17 +1,11 @@
 estimate_gtatt <- function(aux, p) {
 
+  #chcek if there is availble never-treated group
   if(!Inf %in% aux$cohorts & p$control_option != "notyet"){
     warning("no never-treated availble, switching to not-yet-treated control")
     p$control_option <- "notyet"
   }
   
-  if(!is.na(p$max_control_cohort_diff)){
-    warning("max_control_cohort_diff can only be used with not yet")
-    p$control_option <- "notyet"
-  }
-  
-  treated_cohorts <- aux$cohorts[!is.infinite(aux$cohorts)]
-  
   cache_ps_fit_list <- list() #for the first outcome won't be able to use cache, empty list returns null for call like cache_ps_fit[["1.3"]]
   cache_hess_list <- list()
   outcome_result_list <- list()
@@ -27,26 +21,14 @@ estimate_gtatt <- function(aux, p) {
       for(g in aux$cohorts){
         
         gt_name <- paste0(g, ".", t)
-        
-        #setup and checks
-        base_period <- g-1
-        min_control_cohort <- ifelse(p$control_option  == "never", Inf, max(t+1, base_period+1)) #not-yet treated / never treated in both base and "treated" period
-        
-        if(!is.na(p$max_control_cohort_diff)){
-          max_control_cohort <- min(g+p$max_control_cohort_diff, max(treated_cohorts))
+        if(p$base_period == "universal"){
+          base_period <- g-1-p$anticipation
         } else {
-          max_control_cohort <- ifelse(p$control_option == "notyet", max(treated_cohorts), Inf) 
+          base_period <- ifelse(t>=g, g-1-p$anticipation, t-1)
         }
         
-        if(t == base_period){next} #no treatment effect for the base period
-        if(base_period < min(aux$time_periods)){next} #no treatment effect for the first period, since base period is not observed
-        if(g >= max_control_cohort){next} #no treatment effect for never treated or the last treated cohort (for not yet notyet)
-        if(t >= max_control_cohort){next} #no control available if the last cohort is treated too
-        
-        #select the control and treated cohorts
-        did_setup <- rep(NA, aux$id_size)
-        did_setup[get_cohort_pos(aux$cohort_sizes, min_control_cohort, max_control_cohort)] <- 0
-        did_setup[get_cohort_pos(aux$cohort_sizes, g)] <- 1 #treated cannot be controls, assign treated after control to overwrite
+        did_setup <- get_did_setup(g, t, base_period, aux, p)
+        if(is.null(did_setup)){next} #no gtatt if no did setup
         
         #construct the covariates matrix
         covvars <- get_covvars(aux$covariates, aux$varycovariates, base_period, t)
@@ -87,6 +69,42 @@ estimate_gtatt <- function(aux, p) {
   return(outcome_result_list)
 }
 
+get_did_setup <- function(g, t, base_period, aux, p){
+  
+  treated_cohorts <- aux$cohorts[!is.infinite(aux$cohorts)]
+  
+  #setup and checks
+  
+  if(p$control_option == "never"){
+    min_control_cohort <- Inf
+  } else {
+    if(!is.infinite(p$min_control_cohort_diff)){
+      min_control_cohort <- max(g+p$min_control_cohort_diff, min(treated_cohorts))
+    } else {
+      min_control_cohort <- max(t, base_period)+p$anticipation+1
+    }
+  }
+
+  if(!is.infinite(p$max_control_cohort_diff)){
+    max_control_cohort <- min(g+p$max_control_cohort_diff, max(treated_cohorts))
+  } else {
+    max_control_cohort <- ifelse(p$control_option == "notyet", max(treated_cohorts), Inf) 
+  }
+  
+  if(t == base_period | #no treatment effect for the base period
+     base_period < min(aux$time_periods) | #no treatment effect for the first period, since base period is not observed
+     g >= max_control_cohort | #no treatment effect for never treated or the last treated cohort (for not yet notyet)
+     t >= max_control_cohort){ #no control available if the last cohort is treated too
+    return(NULL)
+  } else {
+    #select the control and treated cohorts
+    did_setup <- rep(NA, aux$id_size)
+    did_setup[get_cohort_pos(aux$cohort_sizes, min_control_cohort, max_control_cohort)] <- 0
+    did_setup[get_cohort_pos(aux$cohort_sizes, g)] <- 1 #treated cannot be controls, assign treated after control to overwrite
+    return(did_setup)
+  }
+}
+
 get_cohort_pos <- function(cohort_sizes, start_cohort, end_cohort = start_cohort){
   start <- cohort_sizes[G < start_cohort, sum(cohort_size)]+1
   end <- cohort_sizes[G <= end_cohort, sum(cohort_size)]
diff --git a/R/fastdid.R b/R/fastdid.R
index bd4a437..3a3d1ba 100644
--- a/R/fastdid.R
+++ b/R/fastdid.R
@@ -20,6 +20,7 @@
 #' @param varycovariatesvar A character vector containing the names of time-varying covariate variables (optional).
 #' @param copy whether to copy the dataset before processing, set to false to speed up the process, but the input data will be altered.
 #' @param validate whether to validate the dataset before processing.
+#' @param anticipation periods with aniticipation (delta in CS, default is 0, reference period is g - delta - 1).
 #' 
 #' @import data.table parglm stringr dreamerr BMisc 
 #' @importFrom stats quantile vcov sd binomial fitted qnorm rnorm as.formula
@@ -60,9 +61,9 @@ fastdid <- function(data,
                     timevar, cohortvar, unitvar, outcomevar, 
                     control_option="both",result_type="group_time", balanced_event_time = NA,
                     control_type = "ipw", allow_unbalance_panel = FALSE, boot=FALSE, biters = 1000,
-                    weightvar=NA,clustervar=NA,covariatesvar = NA, varycovariatesvar = NA,
+                    weightvar=NA,clustervar=NA, covariatesvar = NA, varycovariatesvar = NA, 
                     copy = TRUE, validate = TRUE,
-                    max_control_cohort_diff = NA
+                    max_control_cohort_diff = Inf, anticipation = 0, min_control_cohort_diff = -Inf, base_period = "universal"
                     ){
   
   # validate arguments --------------------------------------------------------
@@ -87,7 +88,9 @@ fastdid <- function(data,
   
   check_set_arg(control_option, "match", .choices = c("both", "never", "notyet")) #kinda bad names since did's notyet include both notyet and never
   check_set_arg(control_type, "match", .choices = c("ipw", "reg", "dr")) 
+  check_set_arg(base_period, "match", .choices = c("varying", "universal"))
   check_arg(copy, validate, boot, allow_unbalance_panel, "scalar logical")
+  check_arg(max_control_cohort_diff, min_control_cohort_diff, anticipation, "scalar numeric")
   
   if(!is.na(balanced_event_time)){
     if(result_type != "dynamic"){stop("balanced_event_time is only meaningful with result_type == 'dynamic'")}
@@ -106,6 +109,13 @@ fastdid <- function(data,
     stop("clustering only available with bootstrap")
   }
   
+  # coerce non-sensible option
+  
+  if((!is.infinite(max_control_cohort_diff) | !is.infinite(min_control_cohort_diff)) & control_option == "never"){
+    warning("control_cohort_diff can only be used with not yet")
+    p$control_option <- "notyet"
+  }
+  
   p <- list(timevar = timevar,
             cohortvar = cohortvar,
             unitvar = unitvar,
@@ -121,7 +131,10 @@ fastdid <- function(data,
             allow_unbalance_panel = allow_unbalance_panel,
             boot = boot, 
             biters = biters,
-            max_control_cohort_diff = max_control_cohort_diff)
+            max_control_cohort_diff = max_control_cohort_diff,
+            min_control_cohort_diff = min_control_cohort_diff,
+            anticipation = anticipation, 
+            base_period = base_period)
   
   
   # validate data -----------------------------------------------------
diff --git a/inst/tinytest/test_1_fastdid.R b/inst/tinytest/test_1_fastdid.R
index 7acb010..cf112ec 100644
--- a/inst/tinytest/test_1_fastdid.R
+++ b/inst/tinytest/test_1_fastdid.R
@@ -63,7 +63,20 @@ expect_silent(fastdid(dt, timevar = "time", cohortvar = "G", unitvar = "unit",ou
 
 expect_silent(fastdid(dt, timevar = "time", cohortvar = "G", unitvar = "unit",outcomevar = "y",  result_type = "group_time", allow_unbalance_panel = TRUE,
                       max_control_cohort_diff = 2),
-              info = "balance panel false but dt is balance")
+              info = "max control cohort diff")
+
+expect_silent(fastdid(dt, timevar = "time", cohortvar = "G", unitvar = "unit",outcomevar = "y",  result_type = "group_time", allow_unbalance_panel = TRUE,
+                      min_control_cohort_diff = 4),
+              info = "min control cohort diff")
+
+expect_silent(fastdid(dt, timevar = "time", cohortvar = "G", unitvar = "unit",outcomevar = "y",  result_type = "group_time", allow_unbalance_panel = TRUE,
+                      anticipation = 2),
+              info = "anticipation")
+
+expect_silent(fastdid(dt, timevar = "time", cohortvar = "G", unitvar = "unit",outcomevar = "y",  result_type = "group_time", allow_unbalance_panel = TRUE,
+                      base_period = "varying"),
+              info = "baseperiod vary")
+
 
 # dt that needs adjustment ---------------------------
 
diff --git a/inst/tinytest/test_2_compare_gt.R b/inst/tinytest/test_2_compare_gt.R
index 4106183..479a8b2 100644
--- a/inst/tinytest/test_2_compare_gt.R
+++ b/inst/tinytest/test_2_compare_gt.R
@@ -10,6 +10,7 @@ dt <- simdt$dt
 
 est_diff_ratio <- function(result, did_result){
   did_result_dt <- data.table(cohort = did_result$group, time = did_result$t, did_att = did_result$att, did_se = did_result$se)
+  did_result_dt <- did_result_dt[did_att != 0]
   compare <- did_result_dt |> merge(result, by = c("cohort", "time"), all = TRUE) 
   att_diff_per <- compare[, sum(abs(did_att-att), na.rm = TRUE)/sum(did_att, na.rm = TRUE)]
   se_diff_per <- compare[, sum(abs(did_se-se), na.rm = TRUE)/sum(did_se, na.rm = TRUE)]
@@ -85,6 +86,35 @@ expect_equal(est_diff_ratio(result, did_result), c(0,0), tolerance = tol,
              info = "covariates dr")
 rm(result, did_result)
 
+# anticipation -----------------
+
+result <- fastdid(dt, timevar = "time", cohortvar = "G", unitvar = "unit",outcomevar = "y",  result_type = "group_time",
+                  control_type = "dr",
+                  covariatesvar = c("x"), anticipation = 2)
+did_result <- did::att_gt(yname = "y",gname = "G",idname = "unit",tname = "time",data = dt,base_period = "universal",est_method = "dr",cband = FALSE,
+                          xformla = ~x,
+                          control_group = "notyettreated",
+                          clustervars = NULL,
+                          bstrap = FALSE, anticipation = 2)
+
+expect_equal(est_diff_ratio(result, did_result), c(0,0), tolerance = tol,
+             info = "anticipation")
+rm(result, did_result)
+
+# alternative base period -----------------
+
+result <- fastdid(dt, timevar = "time", cohortvar = "G", unitvar = "unit",outcomevar = "y",  result_type = "group_time",
+                  control_type = "dr",
+                  covariatesvar = c("x"), base_period = "varying")
+did_result <- did::att_gt(yname = "y",gname = "G",idname = "unit",tname = "time",data = dt,base_period = "varying",est_method = "dr",cband = FALSE,
+                          xformla = ~x,
+                          control_group = "notyettreated",
+                          clustervars = NULL,
+                          bstrap = FALSE)
+
+expect_equal(est_diff_ratio(result, did_result), c(0,0), tolerance = tol,
+             info = "varying base period")
+rm(result, did_result)
 
 # bootstrap --------------------
 

From 7237787631b8b569879c5a7030e4cfde6a3a18c1 Mon Sep 17 00:00:00 2001
From: TsaiLintung <101155567+TsaiLintung@users.noreply.github.com>
Date: Tue, 7 May 2024 11:41:56 +0800
Subject: [PATCH 14/15] add filtervar

---
 R/estimate_gtatt.R             |  5 +++++
 R/fastdid.R                    | 22 ++++++++++++++++++----
 R/validate_did.R               |  5 ++++-
 inst/tinytest/test_1_fastdid.R |  8 ++++++++
 4 files changed, 35 insertions(+), 5 deletions(-)

diff --git a/R/estimate_gtatt.R b/R/estimate_gtatt.R
index 93a56ac..ad0a531 100644
--- a/R/estimate_gtatt.R
+++ b/R/estimate_gtatt.R
@@ -101,6 +101,11 @@ get_did_setup <- function(g, t, base_period, aux, p){
     did_setup <- rep(NA, aux$id_size)
     did_setup[get_cohort_pos(aux$cohort_sizes, min_control_cohort, max_control_cohort)] <- 0
     did_setup[get_cohort_pos(aux$cohort_sizes, g)] <- 1 #treated cannot be controls, assign treated after control to overwrite
+    
+    if(!is.na(p$filtervar)){
+      did_setup[!aux$filters[[base_period]]] <- NA #only use units with filter == TRUE at base period
+    }
+    
     return(did_setup)
   }
 }
diff --git a/R/fastdid.R b/R/fastdid.R
index 3a3d1ba..d950f85 100644
--- a/R/fastdid.R
+++ b/R/fastdid.R
@@ -61,7 +61,7 @@ fastdid <- function(data,
                     timevar, cohortvar, unitvar, outcomevar, 
                     control_option="both",result_type="group_time", balanced_event_time = NA,
                     control_type = "ipw", allow_unbalance_panel = FALSE, boot=FALSE, biters = 1000,
-                    weightvar=NA,clustervar=NA, covariatesvar = NA, varycovariatesvar = NA, 
+                    weightvar=NA,clustervar=NA, covariatesvar = NA, varycovariatesvar = NA, filtervar = NA,
                     copy = TRUE, validate = TRUE,
                     max_control_cohort_diff = Inf, anticipation = 0, min_control_cohort_diff = -Inf, base_period = "universal"
                     ){
@@ -83,7 +83,7 @@ fastdid <- function(data,
             "NA | multi match", .choices = dt_names, .message = covariate_message)
   
   checkvar_message <- "__ARG__ must be NA or a character scalar if a name of columns from the dataset."
-  check_set_arg(weightvar, clustervar,
+  check_set_arg(weightvar, clustervar, filtervar,
             "NA | match", .choices = dt_names, .message = checkvar_message)
   
   check_set_arg(control_option, "match", .choices = c("both", "never", "notyet")) #kinda bad names since did's notyet include both notyet and never
@@ -122,6 +122,7 @@ fastdid <- function(data,
             outcomevar =  outcomevar,
             weightvar = weightvar,
             clustervar = clustervar,
+            filtervar = filtervar,
             covariatesvar = covariatesvar,
             varycovariatesvar = varycovariatesvar,
             control_option = control_option,
@@ -142,7 +143,7 @@ fastdid <- function(data,
   setnames(dt, c(timevar, cohortvar, unitvar), c("time", "G", "unit"))
   
   if(validate){
-    varnames <- c("time", "G", "unit", outcomevar,weightvar,clustervar,covariatesvar,varycovariatesvar)
+    varnames <- c("time", "G", "unit",outcomevar,weightvar,clustervar,covariatesvar,varycovariatesvar,filtervar)
     dt <- validate_did(dt, varnames, p)
   }
   
@@ -277,6 +278,18 @@ get_auxdata <- function(dt, p){
     varycovariates <- NA
   }
   
+  # filters
+  filters <- list()
+  if(!is.na(p$filtervar)){
+    for(i in time_periods){
+      start <- (i-1)*id_size+1
+      end <- i*id_size
+      filters[[i]] <- dt[seq(start,end), .SD, .SDcols = p$filtervar]
+    }
+  } else {
+    filters <- NA
+  }
+  
   if(!allNA(p$covariatesvar)){
     covariates <- cbind(const = -1, dt_inv[,.SD, .SDcols = p$covariatesvar])
   } else {
@@ -304,7 +317,8 @@ get_auxdata <- function(dt, p){
               varycovariates = varycovariates,
               covariates = covariates,
               cluster = cluster,
-              weights = weights)
+              weights = weights,
+              filters = filters)
   
   return(aux)
   
diff --git a/R/validate_did.R b/R/validate_did.R
index 839f751..c4a9844 100644
--- a/R/validate_did.R
+++ b/R/validate_did.R
@@ -7,6 +7,10 @@ validate_did <- function(dt,varnames,p){
     if(p$balanced_event_time > dt[, max(time-G)]){stop("balanced_event_time is larger than the max event time in the data")}
   }
   
+  if(!is.na(p$filtervar) && !is.logical(dt[[p$filtervar]])){
+    stop("filter var needs to be a logical column")
+  }
+  
   #doesn't allow missing value for now
   for(col in varnames){
     if(is.na(col)){next}
@@ -17,7 +21,6 @@ validate_did <- function(dt,varnames,p){
     }
   }
   
-  
   if(!allNA(p$covariatesvar) && uniqueN(dt, by = c("unit", p$covariatesvar)) > raw_unit_size){
     warning("some covariates is time-varying, fastdid only use the first observation for covariates.")
   }
diff --git a/inst/tinytest/test_1_fastdid.R b/inst/tinytest/test_1_fastdid.R
index cf112ec..207f497 100644
--- a/inst/tinytest/test_1_fastdid.R
+++ b/inst/tinytest/test_1_fastdid.R
@@ -77,6 +77,14 @@ expect_silent(fastdid(dt, timevar = "time", cohortvar = "G", unitvar = "unit",ou
                       base_period = "varying"),
               info = "baseperiod vary")
 
+# filtervar ------------------------------------------
+
+dt2 <- copy(dt)
+dt2[, f := x>0]
+
+expect_silent(fastdid(dt2, timevar = "time", cohortvar = "G", unitvar = "unit",outcomevar = "y",  result_type = "group_time", allow_unbalance_panel = TRUE,
+                      base_period = "varying", filtervar = "f"),
+              info = "filtervar")
 
 # dt that needs adjustment ---------------------------
 

From 3d1b82be8afdf8f4e766a02f9df84d22f470f811 Mon Sep 17 00:00:00 2001
From: TsaiLintung <101155567+TsaiLintung@users.noreply.github.com>
Date: Tue, 7 May 2024 13:42:24 +0800
Subject: [PATCH 15/15] 0.9.3

---
 R/fastdid.R                                   |   4 +
 R/global.R                                    |  16 +--
 R/sim_did.R                                   |   1 +
 README.md                                     |  29 ++--
 development/build_source.R                    |  18 +--
 .../{fastdid_0_9_2.R => fastdid_0_9_3.R}      | 135 +++++++++++++-----
 development/source_head.R                     |   3 -
 man/fastdid.Rd                                |  16 ++-
 man/sim_did.Rd                                |   2 +
 9 files changed, 148 insertions(+), 76 deletions(-)
 rename development/{fastdid_0_9_2.R => fastdid_0_9_3.R} (91%)
 delete mode 100644 development/source_head.R

diff --git a/R/fastdid.R b/R/fastdid.R
index d950f85..45a28fe 100644
--- a/R/fastdid.R
+++ b/R/fastdid.R
@@ -18,9 +18,13 @@
 #' @param clustervar The name of the cluster variable, can only be used when boot == TRUE (optional).
 #' @param covariatesvar A character vector containing the names of time-invariant covariate variables (optional).
 #' @param varycovariatesvar A character vector containing the names of time-varying covariate variables (optional).
+#' @param filtervar A logical vector that specifies which units to use (can be time varying, units will only be included in 2x2 when filtervar is TRUE is the base period)
 #' @param copy whether to copy the dataset before processing, set to false to speed up the process, but the input data will be altered.
 #' @param validate whether to validate the dataset before processing.
 #' @param anticipation periods with aniticipation (delta in CS, default is 0, reference period is g - delta - 1).
+#' @param min_control_cohort_diff the min cohort difference between treated and control group 
+#' @param max_control_cohort_diff the max cohort difference between treated and control group 
+#' @param base_period same as did
 #' 
 #' @import data.table parglm stringr dreamerr BMisc 
 #' @importFrom stats quantile vcov sd binomial fitted qnorm rnorm as.formula
diff --git a/R/global.R b/R/global.R
index 0985e52..76be0cc 100644
--- a/R/global.R
+++ b/R/global.R
@@ -1,24 +1,10 @@
 NULL
 
-if(FALSE){
-  text <- ". agg_weight att att_cont att_treat attgt cohort cohort_size
-    conf_lwb conf_upb const cont_ipw_weight count delta_y element_rect
-    element_text event_time pg placeholder post.y pre.y ps s se target
-    tau time_fe treat_ipw_weight treat_latent type unit unit_fe weight x
-    x2 x_trend y y0 y1 y2 time outcome V1 att_cont_post att_cont_pre att_treat_post att_treat_pre inpost
-     inpre max_et min_et new_unit or_delta or_delta_post or_delta_pre
-     targeted used"
-  text <- text |> str_remove_all("\\\n") |>str_split(" ") |> unlist()
-  text <- text[text!=""]
-  text <- text |> str_flatten(collapse = "','")
-  text <- paste0("c('", text, "')")
-}
-
 # quiets concerns of R CMD check re: the .'s that appear in pipelines and data.table variables
 utils::globalVariables(c('.','agg_weight','att','att_cont','att_treat','attgt','cohort','cohort_size','conf_lwb','conf_upb',
                          'const','cont_ipw_weight','count','delta_y','element_rect','element_text','event_time','pg','placeholder',
                          'post.y','pre.y','ps','s','se','target','tau','time_fe',
                          'treat_ipw_weight','treat_latent','type','unit','unit_fe','weight','x','x2',
-                         'x_trend','y','y0','y1','y2', 'time', 'weights', 'outcome', "G", "D",
+                         'x_trend','y','y0','y1','y2', 'time', 'weights', 'outcome', "G", "D", 'xvar',
                          'V1','att_cont_post','att_cont_pre','att_treat_post','att_treat_pre','inpost','inpre','max_et','min_et','new_unit','or_delta','or_delta_post','or_delta_pre','targeted','used'))
 
diff --git a/R/sim_did.R b/R/sim_did.R
index d8ba2e5..8a43e13 100644
--- a/R/sim_did.R
+++ b/R/sim_did.R
@@ -15,6 +15,7 @@
 #' @param seed Seed for random number generation.
 #' @param stratify Whether to stratify the dataset based on a binary covariate.
 #' @param treatment_assign The method for treatment assignment ("latent" or "uniform").
+#' @param vary_cov include time-varying covariates
 #'
 #' @return A list containing the simulated dataset (dt) and the treatment effect values (att).
 #'
diff --git a/README.md b/README.md
index 3c8fd85..f6bc369 100644
--- a/README.md
+++ b/README.md
@@ -65,7 +65,7 @@ result <- fastdid(data = dt,
                   clustervar = "x", boot = TRUE) #add clustering by using bootstrap
 ```
 
-Estimation for multiple outcomes can be done in one call by providing a vector of outcome column names (saves a lot of time when controlling for covariates since logit estimands can be recycled across outcomes). 
+Estimation for multiple outcomes can be done in one call by providing a vector of outcome column names (saves a lot of time when controlling for covariates since logit estimates can be recycled across outcomes). 
 
 ```
 #calling fastdid
@@ -122,9 +122,9 @@ Aggregated parameters: `fastdid` aggregates in the same function.
 
 ## Other
 
-1. **fastdid** currently only offers inverse probability weights estimators for controlling for covariates when allowing for unbalanced panels.
-2. **fastdid** only uses the time before the event as base periods ("universal" in `attgt`)
-4. **fastdid** currently only reports the pointwise confidence intervals, instead of the simultaneously valid confidence intervals (check section 4.1 of Callaway and Sant'Anna's (2021) for more detail.)
+1. **fastdid** only offers inverse probability weights estimators for controlling for covariates when allowing for unbalanced panels.
+2. **fastdid** use universal base periods as default. 
+4. **fastdid** only reports the pointwise confidence intervals, instead of the simultaneously valid confidence intervals (check section 4.1 of Callaway and Sant'Anna's (2021) for more detail.)
 
 # Roadmap
 
@@ -134,17 +134,26 @@ Aggregated parameters: `fastdid` aggregates in the same function.
 - Min/max event time and balanced composition :white_check_mark:
 - DR and OR estimators :white_check_mark:
 - Allowing for unbalanced panels :white_check_mark: (*well, not fully because DR and OR still need to be added*)
-- Larger-than-memory data support
+- Anticipation :white_check_mark:
+- Varying base periods :white_check_mark:
 - User-provided aggregation scheme
-- drop-in interface for did
-- Anticipation
-- Varying base periods
 - User-provided control formula
-- Simultaneously valid confidence bands
-- Further optimization!
+- simultaneously valid confidence bands
+- Further optimization
+
+# Source version
+
+Since **fastdid** is not on CRAN yet, it needs to be converted to R scripts to be used in some restricted environments. This can be done with `development/build_source.R`. After changing the working directory, the script will produce `development/fastdid_VERNAME.R`, which can be sourced to mimic the functionalities of the package.
 
 # Update
 
+## 0.9.3 (2024/5/7)
+
+- add anticipation and varying base period option
+- add min and max control cohort difference
+- add time-varying control ([reference](https://arxiv.org/abs/2202.02903))
+- add filtervar 
+
 ## 0.9.2 (2023/12/20)
 
 - add support to doubly robust and outcome regression estimators
diff --git a/development/build_source.R b/development/build_source.R
index 7d3bbc3..d29baf2 100644
--- a/development/build_source.R
+++ b/development/build_source.R
@@ -2,24 +2,20 @@ rm(list = ls())
 gc()
 
 library(stringr)
-library(here)
 
-source_files <- list.files(here("R"), include.dirs = FALSE, full.names = TRUE)
+setwd("~/GitHub/fastdid") #change this if necessary
+source_files <- list.files("R", include.dirs = FALSE, full.names = TRUE)
 
-ver <- "0.9.2"
-vername <- "unbalanced"
+ver <- "0.9.3"
+vername <- "timing"
 dep <- c("data.table", "stringr", "BMisc", "collapse", "dreamerr", "parglm")
 
-sink(here("development/source_head.R"))
+sink(paste0("development/fastdid_", str_replace_all(ver, "\\.", "_"), ".R"))
 cat(paste0("#", as.character(Sys.Date()), "\n"))
 cat(paste0("message('loading fastdid source ver. ver: ", ver, " (", vername ,"), date: " , as.character(Sys.Date()), "')\n"))
-cat(paste0("require(", dep, ");") |> str_flatten())
-sink()
-
-sink(paste0("development/fastdid_", str_replace_all(ver, "\\.", "_"), ".R"))
-cat(readLines(here("development/source_head.R")), sep ="\n")
+cat(paste0("require(", dep, ");" |> str_flatten(), "\n"))
 for(file in source_files){
-  current_file = readLines(file)
+  current_file = readLines(file, warn = FALSE)
   cat(current_file, sep ="\n")
   cat("\n")
 }
diff --git a/development/fastdid_0_9_2.R b/development/fastdid_0_9_3.R
similarity index 91%
rename from development/fastdid_0_9_2.R
rename to development/fastdid_0_9_3.R
index f9f4ae4..379a0de 100644
--- a/development/fastdid_0_9_2.R
+++ b/development/fastdid_0_9_3.R
@@ -1,6 +1,11 @@
-#2024-04-09
-message('loading fastdid source ver. ver: 0.9.2 (unbalanced), date: 2024-04-09')
-require(data.table);require(stringr);require(BMisc);require(collapse);require(dreamerr);require(parglm)
+#2024-05-07
+message('loading fastdid source ver. ver: 0.9.3 (timing), date: 2024-05-07')
+require(data.table);
+ require(stringr);
+ require(BMisc);
+ require(collapse);
+ require(dreamerr);
+ require(parglm);
 aggregate_gt <- function(gt_result, aux, p){
   
 
@@ -540,18 +545,12 @@ estimate_did_rc <- function(dt_did, covvars, control_type,
 
 estimate_gtatt <- function(aux, p) {
 
+  #chcek if there is availble never-treated group
   if(!Inf %in% aux$cohorts & p$control_option != "notyet"){
     warning("no never-treated availble, switching to not-yet-treated control")
     p$control_option <- "notyet"
   }
   
-  if(!is.na(p$max_control_cohort_diff)){
-    warning("max_control_cohort_diff can only be used with not yet")
-    p$control_option <- "notyet"
-  }
-  
-  treated_cohorts <- aux$cohorts[!is.infinite(aux$cohorts)]
-  
   cache_ps_fit_list <- list() #for the first outcome won't be able to use cache, empty list returns null for call like cache_ps_fit[["1.3"]]
   cache_hess_list <- list()
   outcome_result_list <- list()
@@ -567,26 +566,14 @@ estimate_gtatt <- function(aux, p) {
       for(g in aux$cohorts){
         
         gt_name <- paste0(g, ".", t)
-        
-        #setup and checks
-        base_period <- g-1
-        min_control_cohort <- ifelse(p$control_option  == "never", Inf, max(t+1, base_period+1)) #not-yet treated / never treated in both base and "treated" period
-        
-        if(!is.na(p$max_control_cohort_diff)){
-          max_control_cohort <- min(g+p$max_control_cohort_diff, max(treated_cohorts))
+        if(p$base_period == "universal"){
+          base_period <- g-1-p$anticipation
         } else {
-          max_control_cohort <- ifelse(p$control_option == "notyet", max(treated_cohorts), Inf) 
+          base_period <- ifelse(t>=g, g-1-p$anticipation, t-1)
         }
         
-        if(t == base_period){next} #no treatment effect for the base period
-        if(base_period < min(aux$time_periods)){next} #no treatment effect for the first period, since base period is not observed
-        if(g >= max_control_cohort){next} #no treatment effect for never treated or the last treated cohort (for not yet notyet)
-        if(t >= max_control_cohort){next} #no control available if the last cohort is treated too
-        
-        #select the control and treated cohorts
-        did_setup <- rep(NA, aux$id_size)
-        did_setup[get_cohort_pos(aux$cohort_sizes, min_control_cohort, max_control_cohort)] <- 0
-        did_setup[get_cohort_pos(aux$cohort_sizes, g)] <- 1 #treated cannot be controls, assign treated after control to overwrite
+        did_setup <- get_did_setup(g, t, base_period, aux, p)
+        if(is.null(did_setup)){next} #no gtatt if no did setup
         
         #construct the covariates matrix
         covvars <- get_covvars(aux$covariates, aux$varycovariates, base_period, t)
@@ -627,6 +614,47 @@ estimate_gtatt <- function(aux, p) {
   return(outcome_result_list)
 }
 
+get_did_setup <- function(g, t, base_period, aux, p){
+  
+  treated_cohorts <- aux$cohorts[!is.infinite(aux$cohorts)]
+  
+  #setup and checks
+  
+  if(p$control_option == "never"){
+    min_control_cohort <- Inf
+  } else {
+    if(!is.infinite(p$min_control_cohort_diff)){
+      min_control_cohort <- max(g+p$min_control_cohort_diff, min(treated_cohorts))
+    } else {
+      min_control_cohort <- max(t, base_period)+p$anticipation+1
+    }
+  }
+
+  if(!is.infinite(p$max_control_cohort_diff)){
+    max_control_cohort <- min(g+p$max_control_cohort_diff, max(treated_cohorts))
+  } else {
+    max_control_cohort <- ifelse(p$control_option == "notyet", max(treated_cohorts), Inf) 
+  }
+  
+  if(t == base_period | #no treatment effect for the base period
+     base_period < min(aux$time_periods) | #no treatment effect for the first period, since base period is not observed
+     g >= max_control_cohort | #no treatment effect for never treated or the last treated cohort (for not yet notyet)
+     t >= max_control_cohort){ #no control available if the last cohort is treated too
+    return(NULL)
+  } else {
+    #select the control and treated cohorts
+    did_setup <- rep(NA, aux$id_size)
+    did_setup[get_cohort_pos(aux$cohort_sizes, min_control_cohort, max_control_cohort)] <- 0
+    did_setup[get_cohort_pos(aux$cohort_sizes, g)] <- 1 #treated cannot be controls, assign treated after control to overwrite
+    
+    if(!is.na(p$filtervar)){
+      did_setup[!aux$filters[[base_period]]] <- NA #only use units with filter == TRUE at base period
+    }
+    
+    return(did_setup)
+  }
+}
+
 get_cohort_pos <- function(cohort_sizes, start_cohort, end_cohort = start_cohort){
   start <- cohort_sizes[G < start_cohort, sum(cohort_size)]+1
   end <- cohort_sizes[G <= end_cohort, sum(cohort_size)]
@@ -679,11 +707,17 @@ get_covvars <- function(covariates, varycovariates, base_period, t){
 #' @param clustervar The name of the cluster variable, can only be used when boot == TRUE (optional).
 #' @param covariatesvar A character vector containing the names of time-invariant covariate variables (optional).
 #' @param varycovariatesvar A character vector containing the names of time-varying covariate variables (optional).
+#' @param filtervar A logical vector that specifies which units to use (can be time varying, units will only be included in 2x2 when filtervar is TRUE is the base period)
 #' @param copy whether to copy the dataset before processing, set to false to speed up the process, but the input data will be altered.
 #' @param validate whether to validate the dataset before processing.
+#' @param anticipation periods with aniticipation (delta in CS, default is 0, reference period is g - delta - 1).
+#' @param min_control_cohort_diff the min cohort difference between treated and control group 
+#' @param max_control_cohort_diff the max cohort difference between treated and control group 
+#' @param base_period same as did
 #' 
-#' @import data.table parglm stringr collapse dreamerr BMisc 
+#' @import data.table parglm stringr dreamerr BMisc 
 #' @importFrom stats quantile vcov sd binomial fitted qnorm rnorm as.formula
+#' @importFrom collapse allNA fnrow whichNA fnunique fsum na_insert
 #' @return A data.table containing the estimated treatment effects and standard errors.
 #' @export
 #'
@@ -720,9 +754,9 @@ fastdid <- function(data,
                     timevar, cohortvar, unitvar, outcomevar, 
                     control_option="both",result_type="group_time", balanced_event_time = NA,
                     control_type = "ipw", allow_unbalance_panel = FALSE, boot=FALSE, biters = 1000,
-                    weightvar=NA,clustervar=NA,covariatesvar = NA, varycovariatesvar = NA,
+                    weightvar=NA,clustervar=NA, covariatesvar = NA, varycovariatesvar = NA, filtervar = NA,
                     copy = TRUE, validate = TRUE,
-                    max_control_cohort_diff = NA
+                    max_control_cohort_diff = Inf, anticipation = 0, min_control_cohort_diff = -Inf, base_period = "universal"
                     ){
   
   # validate arguments --------------------------------------------------------
@@ -742,12 +776,14 @@ fastdid <- function(data,
             "NA | multi match", .choices = dt_names, .message = covariate_message)
   
   checkvar_message <- "__ARG__ must be NA or a character scalar if a name of columns from the dataset."
-  check_set_arg(weightvar, clustervar,
+  check_set_arg(weightvar, clustervar, filtervar,
             "NA | match", .choices = dt_names, .message = checkvar_message)
   
   check_set_arg(control_option, "match", .choices = c("both", "never", "notyet")) #kinda bad names since did's notyet include both notyet and never
   check_set_arg(control_type, "match", .choices = c("ipw", "reg", "dr")) 
+  check_set_arg(base_period, "match", .choices = c("varying", "universal"))
   check_arg(copy, validate, boot, allow_unbalance_panel, "scalar logical")
+  check_arg(max_control_cohort_diff, min_control_cohort_diff, anticipation, "scalar numeric")
   
   if(!is.na(balanced_event_time)){
     if(result_type != "dynamic"){stop("balanced_event_time is only meaningful with result_type == 'dynamic'")}
@@ -766,12 +802,20 @@ fastdid <- function(data,
     stop("clustering only available with bootstrap")
   }
   
+  # coerce non-sensible option
+  
+  if((!is.infinite(max_control_cohort_diff) | !is.infinite(min_control_cohort_diff)) & control_option == "never"){
+    warning("control_cohort_diff can only be used with not yet")
+    p$control_option <- "notyet"
+  }
+  
   p <- list(timevar = timevar,
             cohortvar = cohortvar,
             unitvar = unitvar,
             outcomevar =  outcomevar,
             weightvar = weightvar,
             clustervar = clustervar,
+            filtervar = filtervar,
             covariatesvar = covariatesvar,
             varycovariatesvar = varycovariatesvar,
             control_option = control_option,
@@ -781,7 +825,10 @@ fastdid <- function(data,
             allow_unbalance_panel = allow_unbalance_panel,
             boot = boot, 
             biters = biters,
-            max_control_cohort_diff = max_control_cohort_diff)
+            max_control_cohort_diff = max_control_cohort_diff,
+            min_control_cohort_diff = min_control_cohort_diff,
+            anticipation = anticipation, 
+            base_period = base_period)
   
   
   # validate data -----------------------------------------------------
@@ -789,7 +836,7 @@ fastdid <- function(data,
   setnames(dt, c(timevar, cohortvar, unitvar), c("time", "G", "unit"))
   
   if(validate){
-    varnames <- c("time", "G", "unit", outcomevar,weightvar,clustervar,covariatesvar,varycovariatesvar)
+    varnames <- c("time", "G", "unit",outcomevar,weightvar,clustervar,covariatesvar,varycovariatesvar,filtervar)
     dt <- validate_did(dt, varnames, p)
   }
   
@@ -924,6 +971,18 @@ get_auxdata <- function(dt, p){
     varycovariates <- NA
   }
   
+  # filters
+  filters <- list()
+  if(!is.na(p$filtervar)){
+    for(i in time_periods){
+      start <- (i-1)*id_size+1
+      end <- i*id_size
+      filters[[i]] <- dt[seq(start,end), .SD, .SDcols = p$filtervar]
+    }
+  } else {
+    filters <- NA
+  }
+  
   if(!allNA(p$covariatesvar)){
     covariates <- cbind(const = -1, dt_inv[,.SD, .SDcols = p$covariatesvar])
   } else {
@@ -951,7 +1010,8 @@ get_auxdata <- function(dt, p){
               varycovariates = varycovariates,
               covariates = covariates,
               cluster = cluster,
-              weights = weights)
+              weights = weights,
+              filters = filters)
   
   return(aux)
   
@@ -1017,7 +1077,7 @@ utils::globalVariables(c('.','agg_weight','att','att_cont','att_treat','attgt','
                          'const','cont_ipw_weight','count','delta_y','element_rect','element_text','event_time','pg','placeholder',
                          'post.y','pre.y','ps','s','se','target','tau','time_fe',
                          'treat_ipw_weight','treat_latent','type','unit','unit_fe','weight','x','x2',
-                         'x_trend','y','y0','y1','y2', 'time', 'weights', 'outcome',
+                         'x_trend','y','y0','y1','y2', 'time', 'weights', 'outcome', "G", "D",
                          'V1','att_cont_post','att_cont_pre','att_treat_post','att_treat_pre','inpost','inpre','max_et','min_et','new_unit','or_delta','or_delta_post','or_delta_pre','targeted','used'))
 
 
@@ -1245,6 +1305,10 @@ validate_did <- function(dt,varnames,p){
     if(p$balanced_event_time > dt[, max(time-G)]){stop("balanced_event_time is larger than the max event time in the data")}
   }
   
+  if(!is.na(p$filtervar) && !is.logical(dt[[p$filtervar]])){
+    stop("filter var needs to be a logical column")
+  }
+  
   #doesn't allow missing value for now
   for(col in varnames){
     if(is.na(col)){next}
@@ -1255,7 +1319,6 @@ validate_did <- function(dt,varnames,p){
     }
   }
   
-  
   if(!allNA(p$covariatesvar) && uniqueN(dt, by = c("unit", p$covariatesvar)) > raw_unit_size){
     warning("some covariates is time-varying, fastdid only use the first observation for covariates.")
   }
diff --git a/development/source_head.R b/development/source_head.R
deleted file mode 100644
index ea6800f..0000000
--- a/development/source_head.R
+++ /dev/null
@@ -1,3 +0,0 @@
-#2024-04-17
-message('loading fastdid source ver. ver: 0.9.2 (unbalanced), date: 2024-04-17')
-require(data.table);require(stringr);require(BMisc);require(collapse);require(dreamerr);require(parglm);
\ No newline at end of file
diff --git a/man/fastdid.Rd b/man/fastdid.Rd
index 0ea6537..3ca9594 100644
--- a/man/fastdid.Rd
+++ b/man/fastdid.Rd
@@ -21,9 +21,13 @@ fastdid(
   clustervar = NA,
   covariatesvar = NA,
   varycovariatesvar = NA,
+  filtervar = NA,
   copy = TRUE,
   validate = TRUE,
-  max_control_cohort_diff = NA
+  max_control_cohort_diff = Inf,
+  anticipation = 0,
+  min_control_cohort_diff = -Inf,
+  base_period = "universal"
 )
 }
 \arguments{
@@ -59,9 +63,19 @@ fastdid(
 
 \item{varycovariatesvar}{A character vector containing the names of time-varying covariate variables (optional).}
 
+\item{filtervar}{A logical vector that specifies which units to use (can be time varying, units will only be included in 2x2 when filtervar is TRUE is the base period)}
+
 \item{copy}{whether to copy the dataset before processing, set to false to speed up the process, but the input data will be altered.}
 
 \item{validate}{whether to validate the dataset before processing.}
+
+\item{max_control_cohort_diff}{the max cohort difference between treated and control group}
+
+\item{anticipation}{periods with aniticipation (delta in CS, default is 0, reference period is g - delta - 1).}
+
+\item{min_control_cohort_diff}{the min cohort difference between treated and control group}
+
+\item{base_period}{same as did}
 }
 \value{
 A data.table containing the estimated treatment effects and standard errors.
diff --git a/man/sim_did.Rd b/man/sim_did.Rd
index 540022a..e7fbb3d 100644
--- a/man/sim_did.Rd
+++ b/man/sim_did.Rd
@@ -38,6 +38,8 @@ sim_did(
 
 \item{second_cov}{Whether to include a second covariate.}
 
+\item{vary_cov}{include time-varying covariates}
+
 \item{na}{Whether to generate missing data ("none", "y", "x", or "both").}
 
 \item{balanced}{Whether to balance the dataset by random sampling.}