-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathXTconfounding.R
134 lines (113 loc) · 4.8 KB
/
XTconfounding.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
# auxiliary function to find the most common item (returns the smallest if many)
# Credit: https://community.rstudio.com/t/replace-values-of-a-variable-by-the-most-frequent-value/11140/7
Mode <- function(x) {
ux <- unique(x)
ux[which.max(tabulate(match(x, ux)))]
}
## Need to run "mywindow.R" first !!!
# 2. H confounds (X) and T (XT-Confounding Model)
set.seed(7)
cutoff = 5
min_x = 0
max_x = 10
Y_error = 0.5
h_0 = 1
h_1 = 3
H_error = 0 # One can set the error for H to be positive, but then the estimation process becomes little unstable.
# Sample size
n_sample = 200
# Beta pool
betas = seq(0, 1, 0.05)
n_betas = length(betas)
# bias^2 is stored
bias.squared = rep(NA, n_betas)
# For each beta, we use 100 trials to choose a bandwidth
# and then use another 100 trials to calculate the average result
for (b in c(1:n_betas)){
beta = betas[b]
adj.cutoff = 5 - (beta*h_0)
# Bandwidth selection for ITT / delta.FRD
window.XT = rep(NA, 100)
window.delta = rep(NA, 100)
for (j in c(1:100)){
# Data Generation
assign_var = sort(rnorm(n_sample, mean=5, sd=1.5))
assign_var = ifelse(assign_var<0, 0,
ifelse(assign_var > 100, 100, assign_var))
H = ifelse(assign_var < cutoff, h_0, h_1)
H = H + rnorm(n_sample, mean=0, sd=H_error)
D = (assign_var + beta*H >= cutoff) # D: Treatment indicator
W = (assign_var >= adj.cutoff) # W: (adjusted) Threshold indicator
observed_outcome = ifelse(assign_var + beta*H < cutoff, 0.4*assign_var+1, 0.2*assign_var+3)
observed_outcome = observed_outcome + rnorm(n_sample, mean=0, sd=Y_error)
# Data Formatting
data = data.frame(assign_var, D, W, observed_outcome)
centered_data = data
centered_data["assign_var"] = centered_data["assign_var"] - adj.cutoff
# select a proper window : my.window()
w_pool = seq(from=0.2, to=1, by=0.05)
n_pool = length(w_pool)
# Bandwidth selected from IIT
my_loss = rep(NA, n_pool)
for (i in c(1:n_pool)){
w = w_pool[i]
index = which(((assign_var >= (adj.cutoff-w)) * (assign_var <= (adj.cutoff+w))) == TRUE)
window_data = centered_data[index,]
my_loss[i] = my.window(window_data, w, min_x, max_x, lambda=3)[1]
}
# Handling NaN
na.my = sum(is.na(my_loss))
m.mywindow = which(my_loss[!is.na(my_loss)] == min(my_loss[!is.na(my_loss)]))
window.XT[j] = w_pool[m.mywindow+na.my]
# Bandwidth selected from delta.FRD
delta_loss = rep(NA, n_pool)
for (i in c(1:n_pool)){
w = w_pool[i]
index = which(((assign_var >= (adj.cutoff-w)) * (assign_var <= (adj.cutoff+w))) == TRUE)
window_data = centered_data[index,]
window_data["observed_outcome"] = window_data["D"]
delta_loss[i] = my.window(window_data, w, min_x, max_x, lambda=3)[1]
}
# Handling NaN
na.delta = sum(is.na(delta_loss))
m.delta = which(my_loss[!is.na(my_loss)] == min(my_loss[!is.na(my_loss)]))
window.delta[j] = w_pool[m.delta+na.delta]
}
# bandwidth selected for gamma = g
w.delta = Mode(window.delta)
w.tau = Mode(window.XT)
w.beta = min(w.delta, w.tau)
# calculating bias^2 for gamma = g
bias.sq = rep(NA, 100)
for (k in c(1:100)){
assign_var = sort(rnorm(n_sample, mean=5, sd=1.5))
assign_var = ifelse(assign_var<0, 0,
ifelse(assign_var > 100, 100, assign_var))
H = ifelse(assign_var < cutoff, h_0, h_1)
H = H + rnorm(n_sample, mean=0, sd=H_error)
D = (assign_var + beta*H >= cutoff)
W = (assign_var >= adj.cutoff)
observed_outcome = ifelse(assign_var + beta*H < cutoff, 0.4*assign_var+1, 0.2*assign_var+3)
observed_outcome = observed_outcome + rnorm(n_sample, mean=0, sd=Y_error)
# Data Formatting
data = data.frame(assign_var, D, assign_var*D, W, assign_var*W, observed_outcome)
centered_data = data
centered_data["assign_var"] = centered_data["assign_var"] - adj.cutoff # Centering around the threshold
centered_data["assign_var...D"] = centered_data["assign_var"]*centered_data["D"]
centered_data["assign_var...W"] = centered_data["assign_var"]*centered_data["W"]
index = which(((assign_var >= (adj.cutoff-w.beta)) * (assign_var <= (adj.cutoff+w.beta))) == TRUE)
window_data = centered_data[index,]
estimated.ITT = lm(observed_outcome ~ assign_var * W, data=window_data)$coef[3]
estimated.delta = lm(D ~ assign_var * W, data=window_data)$coef[3]
estimated.effect = estimated.ITT / estimated.delta
# tau(beta) and bias squared
true.effect = 2 - 0.2*adj.cutoff
bias.sq[k] = (true.effect - estimated.effect)^2
}
# average of 100 trials are stored
bias.squared[b] = mean(bias.sq)
}
library(latex2exp)
plot(betas, bias.squared, ylim=c(0, 1), pch=19, xlab=TeX("$\\beta$"))
abline(h=mean(bias.squared), lty='solid', col='red', lwd=1.5)
print(mean(bias.squared))