-
Notifications
You must be signed in to change notification settings - Fork 0
/
MA thesis reduced sample size regression.R
420 lines (345 loc) · 17.7 KB
/
MA thesis reduced sample size regression.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
#
#
# MA thesis R script
#
# David Klug
#
#
#
# Packages ----------------------------------------------------------------
library(renv)
library(here)
library(rio)
library(MASS)
library(stargazer) # for R >= 4.2.2 run this bugfix https://gist.github.com/alexeyknorre/b0780836f4cec04d41a863a683f91b53
library(tidyverse)
library(AER)
library(readtext)
library(quanteda)
library(ggplot2)
library(stringr)
library(scales)
library(lemon)
library(ggpattern)
library(flextable)
library(ggcorrplot)
library(devtools)
#devtools::install_github("caijun/ggcorrplot2")
library(ggcorrplot2)
library(ggforce) #implicit dependency of ggcorrplot2
library(performance)
library(mice)
library(pscl)
library(poissonreg)
library(conflicted)
conflicted::conflict_prefer("filter", "dplyr")
conflicted::conflict_prefer("select", "dplyr")
# Setting up the environment and data import-------------------------------
# Clear the R environment
rm(list=ls())
# Set the working directory (should be redundant thanks to .Rproj)
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
# Import data
base::load("regression_input_data.RData")
# Regression analysis ---------------------------------------------------
## Preparing the data ------------------------------------------------------
# a separate set of regression results was created based on data that excludes
# the somewhat extreme values of below -40
# d <- filter(d, HRP > -40)
# Formulas
fhd_max <- as.formula("hD ~ HRP + RL + ASSUR + HRR + lnA + ROA + mREG + sLAW")
fsd_max <- as.formula("sD ~ HRP + RL + ASSUR + HRR + lnA + ROA + mREG + sLAW")
# ####################### #
# !!! change input data !!!
# ####################### #
d <- d %>% filter(HRP > -40) # this excludes the somewhat extreme HRP values of below -40
m <- 30
d_imp <- mice::mice(data = d, m = m, maxit = 10, seed = 1, print = F)
## Soft disclosure models --------------------------------------------------
# Note: MICE imputation is only necessary for those models that contain report length (RL)
# create minimal sD model
model_sd_min <- glm(sD ~ HRP, data = d, family = "poisson")
# create sD model with firm-level control variables
model_sd_firmlvl <- glm(sD ~ HRP + lnA + ROA, data = d, family = "poisson")
# create sD model with country-level control variables
model_sd_cntrylvl <- glm(sD ~ HRP + mREG + sLAW, data = d, family = "poisson")
# create sD model with document-level control variables
models_sd_documentlvl <- lapply(1:m, function(i) {
glm(sD ~ HRP + RL + ASSUR + HRR,
data = mice::complete(data = d_imp, i), family = "poisson")
})
# create maximal sD model with imputed datasets
models_sd_max <- lapply(1:m, function(i) {
glm(sD ~ HRP + RL + ASSUR + HRR + lnA + ROA + mREG + sLAW,
data = mice::complete(data = d_imp, i), family = "poisson")
})
# create stepwise sD model with imputed datasets
stepwise_regression_sd <- function(data) {
step(glm(sD ~ HRP, data = data, family = "poisson"),
direction = "forward",
scope = fsd_max) }
models_sd_stepwise <- lapply(1:m, function(i) {
stepwise_regression_sd(mice::complete(data = d_imp, i))
})
## Hard disclosure models --------------------------------------------------
# create minimal hD model
model_hd_min <- glm.nb(hD ~ HRP, data = d)
# create hD model with firm-level control variables
model_hd_firmlvl <- glm.nb(hD ~ HRP + lnA + ROA, data = d)
# create hD model with country-level control variables
model_hd_cntrylvl <- glm.nb(hD ~ HRP + mREG + sLAW, data = d)
# create hD model with document-level control variables
models_hd_documentlvl <- lapply(1:m, function(i) {
glm.nb(hD ~ HRP + RL + ASSUR + HRR,
data = mice::complete(data = d_imp, i))
})
# create maximal hD model with imputed datasets
models_hd_max <- lapply(1:m, function(i) {
glm.nb(hD ~ HRP + RL + ASSUR + HRR + lnA + ROA + mREG + sLAW,
data = mice::complete(data = d_imp, i))
})
# create stepwise hD model with imputed datasets
stepwise_regression_hd <- function(data) {
step(glm.nb(hD ~ HRP, data = data),
direction = "forward",
scope = fhd_max) }
models_hd_stepwise <- lapply(1:m, function(i) {
stepwise_regression_hd(mice::complete(data = d_imp, i))
})
## Pooling the MICE regression results -------------------------------------
# extract MICE pooled regression results for soft disclosure models
models_sd_max_pooled <- mice::pool(models_sd_max)
models_sd_stepwise_pooled <- mice::pool(models_sd_stepwise)
models_sd_documentlvl_pooled <- mice::pool(models_sd_documentlvl)
# extract MICE pooled regression results for hard disclosure models
models_hd_max_pooled <- mice::pool(models_hd_max)
models_hd_stepwise_pooled <- mice::pool(models_hd_stepwise)
models_hd_documentlvl_pooled <- mice::pool(models_hd_documentlvl)
## Exporting regression model results -------------------------------------
# for R >= 4.2.2 run this bugfix https://gist.github.com/alexeyknorre/b0780836f4cec04d41a863a683f91b53
# create big regression table with soft and hard disclosure regression results
stargazer(
# specify models (these mostly serve as placeholders because the coefficents etc. need to be extracted manually for pooled regression)
model_sd_min, models_sd_documentlvl[[1]], model_sd_firmlvl, model_sd_cntrylvl, models_sd_max[[1]], models_sd_stepwise[[1]],
model_hd_min, models_hd_documentlvl[[1]], model_hd_firmlvl, model_hd_cntrylvl, models_hd_max[[1]], models_hd_stepwise[[1]],
dep.var.caption = "Dependent variable: human rights disclosure score",
dep.var.labels = c("soft disclosure", "hard disclosure"),
title = "Poisson and negative binomial GLM regression about the relationship between human rights disclosure and performance",
type = "html",
out = "Outputs/reg_table_main_HRP-40.html",
omit.stat = "theta",
coef = list(coef(model_sd_min),
summary(models_sd_documentlvl_pooled)$estimate,
coef(model_sd_firmlvl),
coef(model_sd_cntrylvl),
summary(models_sd_max_pooled)$estimate,
summary(models_sd_stepwise_pooled)$estimate,
coef(model_hd_min),
summary(models_hd_documentlvl_pooled)$estimate,
coef(model_hd_firmlvl),
coef(model_hd_cntrylvl),
summary(models_hd_max_pooled)$estimate,
summary(models_hd_stepwise_pooled)$estimate),
se = list(summary(model_sd_min)$coefficients[, "Std. Error"],
summary(models_sd_documentlvl_pooled)$std.error,
summary(model_sd_firmlvl)$coefficients[, "Std. Error"],
summary(model_sd_cntrylvl)$coefficients[, "Std. Error"],
summary(models_sd_max_pooled)$std.error,
summary(models_sd_stepwise_pooled)$std.error,
summary(model_hd_min)$coefficients[, "Std. Error"],
summary(models_hd_documentlvl_pooled)$std.error,
summary(model_hd_firmlvl)$coefficients[, "Std. Error"],
summary(model_hd_cntrylvl)$coefficients[, "Std. Error"],
summary(models_hd_max_pooled)$std.error,
summary(models_hd_stepwise_pooled)$std.error),
t = list(coef(model_sd_min)/summary(model_sd_min)$coefficients[, "Std. Error"],
summary(models_sd_documentlvl_pooled)$statistic,
coef(model_sd_firmlvl)/summary(model_sd_firmlvl)$coefficients[, "Std. Error"],
coef(model_sd_cntrylvl)/summary(model_sd_cntrylvl)$coefficients[, "Std. Error"],
summary(models_sd_max_pooled)$statistic,
summary(models_sd_stepwise_pooled)$statistic,
coef(model_hd_min)/summary(model_hd_min)$coefficients[, "Std. Error"],
summary(models_hd_documentlvl_pooled)$statistic,
coef(model_hd_firmlvl)/summary(model_hd_firmlvl)$coefficients[, "Std. Error"],
coef(model_hd_cntrylvl)/summary(model_hd_cntrylvl)$coefficients[, "Std. Error"],
summary(models_hd_max_pooled)$statistic,
summary(models_hd_stepwise_pooled)$statistic),
p = list(coef(summary(model_sd_min))[,4],
summary(models_sd_documentlvl_pooled)$p.value,
coef(summary(model_sd_firmlvl))[,4],
coef(summary(model_sd_cntrylvl))[,4],
summary(models_sd_max_pooled)$p.value,
summary(models_sd_stepwise_pooled)$p.value,
coef(summary(model_hd_min))[,4],
summary(models_hd_documentlvl_pooled)$p.value,
coef(summary(model_hd_firmlvl))[,4],
coef(summary(model_hd_cntrylvl))[,4],
summary(models_hd_max_pooled)$p.value,
summary(models_hd_stepwise_pooled)$p.value)
)
# Robustness tests --------------------------------------------------------
## Hurdle models for hard disclosure ---------------------------------------
# create minimal hD hurdle model
model_hurdle_min <- hurdle(hD ~ HRP, data = d, dist = "negbin", zero.dist = "negbin")
hurdletest(model_hurdle_min)
# create hD hurdle model with firm-level control variables
model_hurdle_firmlvl <- hurdle(hD ~ HRP + lnA + ROA, data = d, dist = "negbin", zero.dist = "negbin")
# create hD hurdle model with country-level control variables
model_hurdle_cntrylvl <- hurdle(hD ~ HRP + mREG + sLAW, data = d, dist = "negbin", zero.dist = "negbin")
# create hD hurdle model with document-level control variables
models_hurdle_documentlvl <- lapply(1:m, function(i) {
hurdle(hD ~ HRP + RL + ASSUR + HRR,
data = mice::complete(data = d_imp, i), dist = "negbin", zero.dist = "negbin")
})
# create maximal hD hurdle model with imputed datasets
models_hurdle_max <- lapply(1:m, function(i) {
hurdle(hD ~ HRP + RL + ASSUR + HRR + lnA + ROA + mREG + sLAW,
data = mice::complete(data = d_imp, i), dist = "negbin", zero.dist = "negbin")
})
# create stepwise hD hurdle model with imputed datasets
stepwise_regression_hurdle <- function(data) {
step(hurdle(hD ~ HRP, data = data),
direction = "forward",
scope = fhd_max) }
models_hurdle_stepwise <- lapply(1:m, function(i) {
stepwise_regression_hurdle(mice::complete(data = d_imp, i))
})
## Pooling the hurdle MICE models --------------------------------------------
# The approach for extracting hurdle model information is based on an SO post here:
# https://stackoverflow.com/questions/75717211/problem-with-pooling-estimates-from-multiple-imputed-datasets-using-mice-in-r-z
# Step 1
# Convert to MICE models to tidy format
models_hurdle_max_tidy <- mice::getfit(models_hurdle_max) %>%
lapply(function(fit) poissonreg::tidy(fit, type="all")) %>%
bind_rows()
models_hurdle_stepwise_tidy <- mice::getfit(models_hurdle_stepwise) %>%
lapply(function(fit) poissonreg::tidy(fit, type="all")) %>%
bind_rows()
models_hurdle_documentlvl_tidy <- mice::getfit(models_hurdle_documentlvl) %>%
lapply(function(fit) poissonreg::tidy(fit, type="all")) %>%
bind_rows()
# Step 2
# Convenience wrapper function around pool.scalar.
# pool.scalar also returns a "qhat" and "u" which are vectors,
# and we don't need them. Those vectors mess up the format of
# the summary that we want to compute later.
wrap.pool.scalar <- function(estimates, variances, n, k) {
pool_res <- mice::pool.scalar(estimates, variances, n = n, k = k)
return(as_tibble(list(
qbar = pool_res$qbar,
ubar = pool_res$ubar,
b = pool_res$b,
t = pool_res$t,
df = pool_res$df,
r = pool_res$r,
fmi = pool_res$fmi)))
}
# For each (term,type) pair, compute pooled univariate estimates using
# wrap.pool.scalar
models_hurdle_max_tidy_pooled <- models_hurdle_max_tidy %>%
group_by(term, type) %>%
reframe(wrap.pool.scalar(estimate, std.error^2, n=60, k=1)) %>%
mutate(estimate = qbar)
models_hurdle_stepwise_tidy_pooled <- models_hurdle_stepwise_tidy %>%
group_by(term, type) %>%
reframe(wrap.pool.scalar(estimate, std.error^2, n=60, k=1)) %>%
mutate(estimate = qbar)
models_hurdle_documentlvl_tidy_pooled <- models_hurdle_documentlvl_tidy %>%
group_by(term, type) %>%
reframe(wrap.pool.scalar(estimate, std.error^2, n=60, k=1)) %>%
mutate(estimate = qbar)
# Step 3
# Copy the pooled estimate calculations from the the MICE package
# https://github.com/amices/mice/blob/master/R/mipo.R#L69-L71
models_hurdle_max_tidy_smry <- models_hurdle_max_tidy_pooled %>% mutate(
std.error = sqrt(t),
statistic = estimate / std.error,
p.value = 2 * (pt(abs(statistic), pmax(df, 0.001), lower.tail = FALSE))) %>%
dplyr::select(term, type, estimate, std.error, statistic, df, p.value)
models_hurdle_stepwise_tidy_smry <- models_hurdle_stepwise_tidy_pooled %>% mutate(
std.error = sqrt(t),
statistic = estimate / std.error,
p.value = 2 * (pt(abs(statistic), pmax(df, 0.001), lower.tail = FALSE))) %>%
dplyr::select(term, type, estimate, std.error, statistic, df, p.value)
models_hurdle_documentlvl_tidy_smry <- models_hurdle_documentlvl_tidy_pooled %>% mutate(
std.error = sqrt(t),
statistic = estimate / std.error,
p.value = 2 * (pt(abs(statistic), pmax(df, 0.001), lower.tail = FALSE))) %>%
dplyr::select(term, type, estimate, std.error, statistic, df, p.value)
# extract zero or count part of the tidy hurdle models
models_hurdle_max_tidy_smry_zero <- models_hurdle_max_tidy_smry[models_hurdle_max_tidy_smry$type=="zero",]
models_hurdle_max_tidy_smry_count <- models_hurdle_max_tidy_smry[models_hurdle_max_tidy_smry$type=="count",]
models_hurdle_stepwise_tidy_smry_zero <- models_hurdle_stepwise_tidy_smry[models_hurdle_stepwise_tidy_smry$type=="zero",]
models_hurdle_stepwise_tidy_smry_count <- models_hurdle_stepwise_tidy_smry[models_hurdle_stepwise_tidy_smry$type=="count",]
models_hurdle_documentlvl_tidy_smry_zero <- models_hurdle_documentlvl_tidy_smry[models_hurdle_documentlvl_tidy_smry$type=="zero",]
models_hurdle_documentlvl_tidy_smry_count <- models_hurdle_documentlvl_tidy_smry[models_hurdle_documentlvl_tidy_smry$type=="count",]
# also tidy up non MICE models for consistency
model_hurdle_min_zero <- poissonreg::tidy(model_hurdle_min, type="all") %>% filter(type=="zero")
model_hurdle_min_count <- poissonreg::tidy(model_hurdle_min, type="all") %>% filter(type=="count")
model_hurdle_firmlvl_zero <- poissonreg::tidy(model_hurdle_firmlvl, type="all") %>% filter(type=="zero")
model_hurdle_firmlvl_count <- poissonreg::tidy(model_hurdle_firmlvl, type="all") %>% filter(type=="count")
model_hurdle_cntrylvl_zero <- poissonreg::tidy(model_hurdle_cntrylvl, type="all") %>% filter(type=="zero")
model_hurdle_cntrylvl_count <- poissonreg::tidy(model_hurdle_cntrylvl, type="all") %>% filter(type=="count")
## Exporting hurdle model results ------------------------------------------
# for R >= 4.2.2 run this bugfix https://gist.github.com/alexeyknorre/b0780836f4cec04d41a863a683f91b53
stargazer(
# specify models (these mostly serve as placeholders because the coefficents etc. need to be extracted manually for pooled regression)
model_hurdle_min, models_hurdle_documentlvl[[1]], model_hurdle_firmlvl,
model_hurdle_cntrylvl, models_hurdle_max[[1]], models_hurdle_stepwise[[1]],
model_hurdle_min, models_hurdle_documentlvl[[1]], model_hurdle_firmlvl,
model_hurdle_cntrylvl, models_hurdle_max[[1]], models_hurdle_stepwise[[1]],
dep.var.caption = "Dependent variable: hard human rights disclosure score",
dep.var.labels = "",
title = "Neg. binomial hurdle regression that models the relationship between human rights disclosure and performance<br>1-6 are zero models, 7-12 are count models",
type = "html",
out = "Outputs/reg_table_hurdle_HRP-40.html",
omit.stat = "theta",
coef = list(model_hurdle_min_zero$estimate,
models_hurdle_documentlvl_tidy_smry_zero$estimate,
model_hurdle_firmlvl_zero$estimate,
model_hurdle_cntrylvl_zero$estimate,
models_hurdle_max_tidy_smry_zero$estimate,
models_hurdle_stepwise_tidy_smry_zero$estimate,
model_hurdle_min_count$estimate,
models_hurdle_documentlvl_tidy_smry_count$estimate,
model_hurdle_firmlvl_count$estimate,
model_hurdle_cntrylvl_count$estimate,
models_hurdle_max_tidy_smry_count$estimate,
models_hurdle_stepwise_tidy_smry_count$estimate),
se = list(model_hurdle_min_zero$std.error,
models_hurdle_documentlvl_tidy_smry_zero$std.error,
model_hurdle_firmlvl_zero$std.error,
model_hurdle_cntrylvl_zero$std.error,
models_hurdle_max_tidy_smry_zero$std.error,
models_hurdle_stepwise_tidy_smry_zero$std.error,
model_hurdle_min_count$std.error,
models_hurdle_documentlvl_tidy_smry_count$std.error,
model_hurdle_firmlvl_count$std.error,
model_hurdle_cntrylvl_count$std.error,
models_hurdle_max_tidy_smry_count$std.error,
models_hurdle_stepwise_tidy_smry_count$std.error),
t = list(model_hurdle_min_zero$statistic,
models_hurdle_documentlvl_tidy_smry_zero$statistic,
model_hurdle_firmlvl_zero$statistic,
model_hurdle_cntrylvl_zero$statistic,
models_hurdle_max_tidy_smry_zero$statistic,
models_hurdle_stepwise_tidy_smry_zero$statistic,
model_hurdle_min_count$statistic,
models_hurdle_documentlvl_tidy_smry_count$statistic,
model_hurdle_firmlvl_count$statistic,
model_hurdle_cntrylvl_count$statistic,
models_hurdle_max_tidy_smry_count$statistic,
models_hurdle_stepwise_tidy_smry_count$statistic),
p = list(model_hurdle_min_zero$p.value,
models_hurdle_documentlvl_tidy_smry_zero$p.value,
model_hurdle_firmlvl_zero$p.value,
model_hurdle_cntrylvl_zero$p.value,
models_hurdle_max_tidy_smry_zero$p.value,
models_hurdle_stepwise_tidy_smry_zero$p.value,
model_hurdle_min_count$p.value,
models_hurdle_documentlvl_tidy_smry_count$p.value,
model_hurdle_firmlvl_count$p.value,
model_hurdle_cntrylvl_count$p.value,
models_hurdle_max_tidy_smry_count$p.value,
models_hurdle_stepwise_tidy_smry_count$p.value)
)