library(tidyverse)
library(gamlr)
####### Donohue and Levitt 2001/2008: abortion and crime
### https://www.nber.org/papers/w8004
## example of reading non-CSV data: this is a dump from a Stata file
## skip=1 skips the first line of the file; sep="\t" means tab-separated
data = read.table("../data/abortion.dat", skip=1, sep="\t")
names(data) = c("state","year","pop","y_viol","y_prop","y_murd",
"a_murd","a_viol","a_prop",'prison','police',
'ur','inc','pov','afdc','gun','beer')
## the control variables that D & L use are:
## prison: log of lagged prisoners per capita
## police: the log of lagged police per capita
## ur: the unemployment rate
## inc: per-capita income
## pov: the poverty rate
## AFDC: state welfare generosity at year t-15
## gun: dummy for concealed weapons law
## beer: beer consumption per capita (capturing alcohol-related violence)
data = data[!(data$state%in%c(2,9,12)),] # AK, DC, HI are strange places, per D & L
data = data[data$year>84 & data$year<98,] # incomplete data outside these years
data$pop = log(data$pop)
t = data$year-85 # time trend, with 1985 as the notional t=0 year
t2 = t^2 # quadratic term in the trend
s = factor(data$state) ## the states are numbered alphabetically
controls = data.frame(data[,c(3,10:17)]) # log(pop) plus the eight D & L controls above
## y is the de-trended log crime rate; d is the abortion exposure ('a_' variable) described below
## note we also have violent and property crime versions
y = data$y_murd # response variable: murder
d = data$a_murd # "treatment" variable: abortion exposure for murder
## The abortion 'a_' variables are weighted averages of abortion rates, where the
## weights are the fraction of the given crime type committed by each age group.
## For example, if 60% of violent crime were committed by 18 year olds and 40% by
## 19 year olds in state i, the abortion rate for violent crime at time t in state
## i would be 0.6 times the abortion rate in state i at time t-18 plus 0.4 times
## the rate in state i at time t-19. See Donohue and Levitt (2001) for detail.
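## a toy sketch of that weighted average (all numbers here are made up for
## illustration; they are not from the data):
w = c(0.6, 0.4)      # hypothetical crime shares for ages 18 and 19
rates = c(1.2, 1.5)  # hypothetical abortion rates at t-18 and t-19
sum(w * rates)       # the weighted exposure: 0.6*1.2 + 0.4*1.5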
## we'll just look at murder here
## note: for convenience, y, d, t are global variables; they are not in controls
orig = glm(y ~ d+t+s+., data=controls)
summary(orig)$coef # tons of controls
summary(orig)$coef['d',] %>% round(3)
## this is the Donohue & Levitt finding: higher abortion exposure leads to lower crime
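## a rough 95% interval for the d coefficient (a quick sketch using the normal
## approximation; confint.default(orig, 'd') gives the same Wald interval).
## b is just a scratch name, not part of the original analysis.
b = summary(orig)$coef['d',]
b['Estimate'] + c(-1,1)*1.96*b['Std. Error']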
## Now the same analysis, but for cellphones rather than abortion
cell = read.csv("../data/us_cellphone.csv")
# rescale so the series runs from 0 in 1985 (row 1) to 1 in 1997 (row 13)
cellrate = (cell[,2]-cell[1,2])/(cell[13,2]-cell[1,2])
## what if we're just fitting a quadratic trend?
## there are many things that increased with similar shapes over time
## (cellphone usage, yoga revenues, home prices, ...)
plot(1985:1997, tapply(d, t, mean), xlab="year", ylab="adjusted rate", pch=21, bg=2)
points(1985:1997, cellrate, bg=4, pch=21)
legend("topleft", fill=c(2,4), legend=c("abortions","cellphones"), bty="n")
phone = cellrate[t+1] # align the cellphone series with each observation's year (t runs 0..12)
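## sanity check on that indexing (a sketch, assuming us_cellphone.csv covers
## 1985-1997 in rows 1 through 13):
stopifnot(length(cellrate) == 13, all(t+1 %in% seq_along(cellrate)))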
## Hey, cellphones fight crime!
tech = glm(y ~ phone+t+s+., data=controls)
summary(tech)$coef['phone',] %>% round(3)
## what is happening here is that murder changed quadratically over the study period,
## and we have no other controls that do so. to correctly control for the secular
## trend, you need to allow for quadratic trends that could be driven by other
## confounding variables (e.g. technology). each state should also have its own
## baseline linear and quadratic trend, and, at the very least, the controls should
## interact with each other. so below we let the controls interact pairwise (.^2)
## and allow the control effects to change over time (interact with t+t2)
interact = glm(y ~ d + (s + .^2)*(t+t2), data=controls)
summary(interact)$coef['d',]
## Abortion is no longer significant.
dim(model.matrix(y ~ d + (s + .^2)*(t+t2), data=controls))
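## a quick sketch of how little data is left over after all those interactions:
## residual degrees of freedom, plus any terms R could not estimate at all
## because of collinearity (possibly zero if the design happens to be full rank)
interact$df.residual # n minus the number of estimable coefficients
sum(is.na(coef(interact))) # inestimable (collinear) terms, if any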
## but notice we have very few observations relative to the number of parameters,
## so we need a way to select only the important controls.
## try a naive lasso first
## refactor state to have an NA reference level: every state then gets its own
## dummy, and we drop the intercept column instead of absorbing a baseline level
s = factor(s, levels=c(NA,levels(s)), exclude=NULL)
x = sparse.model.matrix(~ (s + .^2)*(t+t2), data=controls)[,-1] # [,-1] drops the intercept
dim(x)
## naive lasso regression
naive = gamlr(cbind(d,x),y)
coef(naive)["d",] # the AICc-selected coefficient on d is negative
## the problem here is that the lasso regularizes away the confounders!
## it can explain variation in y using either the single variable d or many
## variables correlated with d, and it picks the most parsimonious solution.
## see e.g. https://projecteuclid.org/euclid.ba/1484103680
## this is not a valid causal identification strategy
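## to see the parsimony at work, inspect the regularization path (a sketch using
## gamlr's plot method and AICc helper; the dashed line marks the AICc-selected
## lambda, assuming the default log-lambda axis)
plot(naive)
abline(v=log(naive$lambda[which.min(AICc(naive))]), lty=2)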
## so let's try the double lasso instead
lm_d = cv.gamlr(x,d) # lasso regression of the treatment on the controls
lm_y = cv.gamlr(x,y) # lasso regression of the response on the controls
include_from_d = which(coef(lm_d) != 0) - 1 # -1 maps coef rows (intercept first) to columns of x
include_from_y = which(coef(lm_y) != 0) - 1
include = union(include_from_d, include_from_y) # keep controls selected in either regression
combined = as.matrix(cbind(d, x[,include])) # d plus the union of selected controls
lm_double = lm(y ~ combined) # post-selection OLS
# the treatment effect is no longer significant
summary(lm_double)$coef['combinedd',] %>% round(3) # 'combinedd' is the d column of combined
## not much signal in d left after controlling via x, even with regularization...
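## an alternative sketch via partialling out (Frisch-Waugh style, in the spirit
## of double ML): regress y on the part of d that the controls cannot predict.
## dtil is a scratch name introduced here, not part of the original script.
dtil = d - drop(as.matrix(predict(lm_d, newdata=x)))
summary(lm(y ~ dtil))$coef['dtil',] %>% round(3)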
dhat = predict(lm_d, newdata=x) # first-stage fitted values for the treatment
plot(dhat,d,bty="n",pch=21,bg=8)
## if you're D & L, this is about the ugliest plot imaginable: the controls
## predict d almost perfectly, so there is little quasi-experimental variation
## left with which to identify a causal effect
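## to quantify that: the in-sample share of variation in d explained by the
## controls (a rough sketch; a value near 1 means almost no independent
## variation remains in the treatment)
cor(drop(as.matrix(dhat)), d)^2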