library(tidyverse)
library(gamlr)
####### donohue and levitt 2001/2008: abortion and crime:
### https://www.nber.org/papers/w8004
## example of reading non-csv data: this is a dump from a Stata file
## skip=1 skips the first line of the file; sep="\t" means 'tab separated'
data = read.table("../data/abortion.dat", skip=1, sep="\t")
names(data) = c("state","year","pop","y_viol","y_prop","y_murd",
"a_murd","a_viol","a_prop",'prison','police',
'ur','inc','pov','afdc','gun','beer')
## the control variables that D & L use are:
## prison: log of lagged prisoners per capita
## police: the log of lagged police per capita
## ur: the unemployment rate
## inc: per-capita income
## pov: the poverty rate
## AFDC: state welfare generosity at year t-15
## gun: dummy for concealed weapons law
## beer: beer consumption per capita (capturing alcohol-related violence)
data = data[!(data$state%in%c(2,9,12)),] # drop AK, DC, HI: strange places, per D & L
data = data[data$year>84 & data$year<98,] # incomplete data outside these years
data$pop = log(data$pop)
t = data$year-85 # time index with t=0 at 1985
t2 = t^2 # adding a quadratic term
s = factor(data$state) ## the states are numbered alphabetically
controls = data.frame(data[,c(3,10:17)])
## y is de-trended log crime rate, a is as described below
## note we also have violent and property crime versions
y = data$y_murd # response variable
d = data$a_murd # "treatment" variable
## The abortion 'a_' variables are weighted average of abortion rates where
## weights are determined by the fraction of the type of crime committed by
## various age groups. For example, if 60% of violent crime were committed by 18
## year olds and 40% were committed by 19 year olds in state i, the abortion rate
## for violent crime at time t in state i would be constructed as .6 times the
## abortion rate in state i at time t-18 plus .4 times the abortion rate in
## state i at time t-19. See Donohue and Levitt (2001) for further detail.
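## A minimal toy sketch of that weighted-average construction (everything below
## is hypothetical and separate from the objects in this script):
abort_toy = matrix(runif(2*30), nrow=2, ncol=30) # toy raw rates: rows=states, cols=years
i_toy = 1; t_toy = 25 # an example state and year index
a_toy = 0.6*abort_toy[i_toy, t_toy-18] + 0.4*abort_toy[i_toy, t_toy-19]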
## we'll just look at murder
## note: for convenience I've made y, d, and t global variables; they are not in controls.
orig = glm(y ~ d+t+s+., data=controls)
summary(orig)$coef # tons of controls
summary(orig)$coef['d',] %>% round(3)
## this is the Donohue & Levitt result: higher abortion rates lead to lower crime
## Now the same analysis, but for cellphones rather than abortion
cell = read.csv("../data/us_cellphone.csv")
# rescale: subtract the 1985 level and divide by the 1985-to-1997 change
cellrate = (cell[,2]-cell[1,2])/(cell[13,2]-cell[1,2])
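## after that rescaling, cellrate should run from 0 (1985) to 1 (1997),
## since adoption only grew over the period
range(cellrate)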
## what if we're just fitting a quadratic trend?
## there are many things that increased with similar shapes over time
## (cellphone usage, yoga revenues, home prices, ...)
plot(1985:1997, tapply(d, t, mean), xlab="year", ylab="adjusted rate", pch=21, bg=2)
points(1985:1997, cellrate, bg=4, pch=21)
legend("topleft", fill=c(2,4), legend=c("abortions","cellphones"), bty="n")
phone = cellrate[t+1]
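## sanity check on the indexing: rows with t=0 should pick up the 1985 cellrate
stopifnot(all(phone[t==0] == cellrate[1]))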
## Hey, cellphones fight crime!
tech = glm(y ~ phone+t+s+., data=controls)
summary(tech)$coef['phone',] %>% round(3)
## what is happening here is that murder changed quadratically over the study period,
## and we have no other controls that do so. To control for the secular trend correctly,
## you need to allow quadratic trends that could be driven by other confounders (e.g. technology).
## each state should also get its own baseline linear and quadratic trend,
## and, at the very least, the controls should interact with each other.
## we also allow the control effects to change over time (interact with t+t2)
# (no intercept, since we've removed the reference level from state)
interact = glm(y ~ d + (s + .^2)*(t+t2), data=controls)
summary(interact)$coef['d',]
## Abortion is no longer significant.
dim(model.matrix(y ~ d + (s + .^2)*(t+t2), data=controls))
## but notice we have very few observations relative to number of parameters.
## so we need a way to select only important controls
## try using a lasso
## refactor state to have NA reference level
s = factor(s, levels=c(NA,levels(s)), exclude=NULL)
x = sparse.model.matrix(~ (s + .^2)*(t+t2), data=controls)[,-1]
dim(x)
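## with the NA reference level, every state gets its own dummy column,
## so all state effects are penalized symmetrically by the lasso
head(colnames(x), 3)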
## naive lasso regression
naive = gamlr(cbind(d,x),y)
coef(naive)["d",] # effect is AICc selected <0
## the problem here is that the lasso regularizes away the confounders!
## it can explain variation in y using one variable (d) or many variables correlated with d,
## and it picks the most parsimonious solution.
## see e.g. https://projecteuclid.org/euclid.ba/1484103680
## this is not a valid causal identification strategy.
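## A toy simulation of this failure mode (all objects below are made up and
## separate from the D&L data): z confounds both d_sim and y_sim, the true
## effect of d_sim on y_sim is zero, yet the lasso can keep d_sim as a
## one-variable proxy for the many z's.
set.seed(1)
z = matrix(rnorm(100*50), 100, 50); colnames(z) = paste0("z", 1:50)
d_sim = drop(z %*% rep(0.2, 50)) + rnorm(100)
y_sim = drop(z %*% rep(0.2, 50)) + rnorm(100) # d_sim enters with true coefficient 0
coef(gamlr(cbind(d_sim, z), y_sim))["d_sim",] # typically nonzero despite no true effect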
## now, what if we explicitly model the confounding in d?
## regress the "treatment" variable on all of our control variables:
treat = gamlr(x,d,lambda.min.ratio=1e-4)
# we needed to lower lambda.min.ratio because AICc wants a complex model;
# that indicates that abortion rates are highly correlated with the controls.
plot(treat)
coef(treat)
# now, grab the predicted treatment given the controls.
# type="response" is redundant here (gaussian family),
# but you'd want it if d were binary.
# note: drop converts the result to a vector rather than a 1-column sparse matrix
dhat = predict(treat, x, type="response") %>% drop
## basically the ugliest plot you can imagine for D&L...
## not much signal in d left after controlling via dhat, even with regularization...
plot(dhat,d,bty="n",pch=21,bg=8)
## that means we have little here that resembles an experiment...
## "independent signal" R^2 for treatment given controls?
cor(drop(dhat),d)^2
## Note: this R2 is what governs how much independent signal
## you have for estimating a treatment effect. lower is better!!
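## equivalently, the share of treatment variance left over for identification
1 - cor(dhat, d)^2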
# dres here is like our quasi-experimental variation in the treatment,
# i.e. the part that is independent of the controls
dres = drop(d - dhat)
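## quick check: the residual should carry little of the fitted signal
round(cor(dres, dhat), 2)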
# simple linear model: no significance to dres
causal = lm(y ~ dres + dhat)
summary(causal)
# could also re-run the lasso with dhat (2nd column) included unpenalized via free=2.
# this approach is more useful if you want to estimate effect modifiers
causal2 = gamlr(cbind(d, dhat, x), y, free=2, lambda.min.ratio=1e-4)
coef(causal2)["d",] # AICc says abortion has no causal effect.
## BOOTSTRAP
n = nrow(x)
## Bootstrapping our lasso causal estimator is easy
gamb = c() # empty vector to collect the bootstrap gamma estimates
for(b in 1:20){
## draw a resample of row indices
ib = sample(1:n, n, replace=TRUE)
## create the resampled data
xb = x[ib,]
db = d[ib]
yb = y[ib]
## run the treatment regression
treatb = gamlr(xb,db,lambda.min.ratio=1e-3)
dhatb = predict(treatb, xb, type="response")
fitb = gamlr(cbind(db,dhatb,xb),yb,free=2)
gamb = c(gamb,coef(fitb)["db",])
print(b)
}
## not very exciting though: all zeros
summary(gamb)
## it's saying there's near zero chance AICc selects gamma!=0
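## share of bootstrap draws in which AICc selected a nonzero gamma
mean(gamb != 0)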
#######################
## EXTRA
##
## FREQUENTIST OPTION: the Belloni, Chernozhukov, Hansen version of this
yonx = gamlr(x,y,lambda.min.ratio=0.001)
## get the union of controls that predict y or d
inthemodel = unique(c(which(coef(yonx)[-1]!=0), # -1 drops intercept
which(coef(treat)[-1]!=0))) # unique grabs union
selectdata = cbind(d, x[,inthemodel]) # cbind handles the sparse matrix
selectdata = as.data.frame(as.matrix(selectdata)) # make it a data.frame
dim(selectdata) ## p about half n
## run a glm
causal_glm = glm(y~., data=selectdata)
## The BCH theory says that the standard SE calc for gamma is correct
summary(causal_glm)$coef["d",] # BCH agrees with AICc: not significant.