-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path1.6 prepare_consensus.R
153 lines (110 loc) · 5.63 KB
/
1.6 prepare_consensus.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
##### Script to prepare consensus forecast data ----
## Note1: Latin American countries are not available in the first week of Consensus.
# Likewise, advanced and Asian economies are not available in the second week.
# Overlapping in 2007/2008 for some countries (Switzerland, Taiwan, Thailand, US and Vietnam) with forecasts released both first and second week.
# Kept first week.
## Note2: for India, forecast are for the previous and current year instead of current and year-ahead in April 2008 and April 2010. We filter those occurences.
# This is included in the correction parameter of the function.
## Note3: for 2007 and 2008, we are missing some horizons given that previous report are not
# available in this format.
# Wrangling consensus forecasts function: ----
#' Wrangling consensus forecasts from excel sheet format for every issue (single variable) to a
#' more user-friendly database.
#'
#' @param path path to the xlsx workbook in the locale.
#' @param correction see Note2.
#' @param n_col numeric, number of columns before forecasts value.
#'
#' @return tibble with four identifiers (imf code, country name, year forecasted and name of forecaster) and
#' variable1 to 4 that correspond from current-year sept. to year-ahead apr. forecasts.
#'
#' @details Aggregate 'Euro Area' excluded.
#' @details Missing obs for first year forecasted because for first year no forecasts other than same year issues
#' and so on...
#'
wrangle_consensus_forecasts <- function(path = "../When_where_and_why_material/raw_data/consensus/gdp_2008_2019_eme.xlsx",
correction = FALSE,
n_col = 12){
path = path
sheets_name <- getSheetNames(path)
sheets_year <- getSheetNames(path) %>%
str_extract("\\d{4}")
forecasts <- sheets_name %>%
map(~ read_xlsx(path,sheet = .x)) %>%
map(~ .x %>% filter(Country != "Euro Area")) %>%
map(~ .x %>% remove_empty("cols"))
### Correct problematic features of dataframe (see Note1):
if(correction == TRUE){
forecasts <- forecasts %>%
map(~ if(any(.x$`Survey Date` == "2008 Apr 14" | .x$`Survey Date` == "2010 Apr 12")){
.x %>% filter(Country != "India")
} else{
.x
}) %>%
map(~ if(any(.x$`Survey Date` == "2015 Sep 07")){
.x %>%
mutate(iso3 = countrycode(Country, "country.name","iso3c")) %>%
select(EcDatabase, Series_code, Country, iso3, everything())
} else {
.x
})
###
}
forecasts <- forecasts %>%
map(~ .x %>% split(.$`Forecast Length`)) %>%
modify_depth(2, ~ .x %>% remove_empty("cols")) %>%
modify_depth(2, ~ .x %>% gather("year_forecasted","variable",n_col:ncol(.))) %>%
map(~ .x %>% bind_rows())
forecasts <- forecasts %>%
map(~ .x %>% mutate(Series_code = str_extract(Series_code, "\\d{3}"))) %>%
map(~ .x %>% select(Series_code, `Data Type/ Forecaster`,year_forecasted, variable)) %>%
map2(sheets_name, ~ .x %>% mutate(date_publication = .y)) %>%
map2(sheets_year, ~ .x %>% mutate(year_publication = .y)) %>%
bind_rows() %>%
group_split(year_forecasted) %>%
map(~ .x %>% select(-year_publication)) %>%
map(~ .x %>% spread(date_publication, variable)) %>%
map(~ .x %>% rename_at(vars(starts_with("apr")), ~ paste0(.,"apr"))) %>%
map(~ .x %>% rename_at(vars(starts_with("apr")), ~ str_remove(.,"^apr"))) %>%
map(~ .x %>% rename_at(vars(starts_with("sep")), ~ paste0(.,"sep"))) %>%
map(~ .x %>% rename_at(vars(starts_with("sep")), funs(str_remove(.,"^sep"))))
# Order names of the columns:
for(i in 1:length(forecasts)){
forecasts[[i]] <- forecasts[[i]][,order(names(forecasts[[i]]))]
}
# Discard years for which no actual value:
forecasts <- forecasts %>%
discard(~ unique(.x$year_forecasted) > 2019)
# Naming similar to Zidong and bind together:
final_forecasts <- forecasts %>%
map(~ if(length(names(.x)) == 7){
.x %>% setNames(c(rev(paste0("variable",seq(4:1))),"forecaster","country_code","year"))
} else if(length(names(.x)) == 6){
.x %>% setNames(c(rev(paste0("variable",seq(3:1))),"forecaster","country_code","year"))
} else if(length(names(.x)) == 4){
.x %>% setNames(c(paste0("variable",1),"forecaster","country_code","year"))
}) %>%
bind_rows() %>%
mutate(country = countrycode(country_code,"imf","country.name")) %>%
filter(complete.cases(country)) %>%
select(country_code, country, year, forecaster, everything()) %>%
arrange(country,year)
# Final step - remove unnecessary summary stats:
unnecessary_stat <- c("High","Low","Number of Forecasts","No of Forecasts","Standard Deviation",
"Total","Ref.")
final <- final_forecasts %>%
filter(!forecaster %in% unnecessary_stat)
return(final)
}
# Create and export final dataframe: -----
advanced <- wrangle_consensus_forecasts("../When_where_and_why_material/raw_data/consensus/gdp_2008_2019_firstweek.xlsx",T,12)
emerging <- wrangle_consensus_forecasts("../When_where_and_why_material/raw_data/consensus/gdp_2008_2019_secondweek.xlsx",F,11)
consensus_clean <- rbind(advanced, emerging) %>%
arrange(country, year)
# Problem of duplicates:
consensus_clean <- consensus_clean %>%
distinct(country, year, forecaster, .keep_all = T)
# Export:
saveRDS(consensus_clean, file = "../When_where_and_why_material/intermediate_data/consensus/gdp_consensus_cleaned.RDS")
cat(crayon::green(paste0("Consesus data succesfully cleaned.\nExported RData in directory:"
," ../When_where_and_why_material/intermediate_data/consensus")))