forked from kuprinaga/R-ladies-workshop
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpreprocess.R
65 lines (48 loc) · 1.66 KB
/
preprocess.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
library(readr)
library(tidyverse)
library(magrittr)
library(lettercase)
#### Data source: The World Bank
#### Original data: https://datacatalog.worldbank.org/dataset/gender-statistics
#### Used under CC license: https://datacatalog.worldbank.org/public-licenses#cc-by
#### Changes made to data: filtering for selected Nordic countries
# Load the raw World Bank gender statistics export.
df <- read_csv("Gender_StatsData.csv")
# The first four headers contain spaces; pass them through make_names() so
# they can be referred to as bare column names further down.
names(df)[seq_len(4)] <- make_names(names(df)[seq_len(4)])
# Clean the data by reshaping it twice: first stack every year column into
# (year, val) pairs, then spread the indicators out so that each row is one
# country-year and each indicator is its own column.
df_clean <- df %>%
  # Drop the column that read_csv() had to name itself. readr 1.x calls an
  # unnamed 64th column `X64`, readr >= 2.0 calls it `...64`; matching the
  # pattern keeps the script working under either naming scheme.
  # NOTE(review): assumes the unnamed column carries no data -- confirm
  # against the CSV.
  select(-matches("^(X|\\.\\.\\.)[0-9]+$", ignore.case = FALSE)) %>%
  gather(
    year, val,
    -Country_Name, -Country_Code, -Indicator_Name, -Indicator_Code
  ) %>%
  # Indicator_Code is not carried into the wide table; the indicator name
  # alone becomes the column header.
  select(-Indicator_Code) %>%
  spread(Indicator_Name, val)
# Make pretty, syntactic names for the indicator columns.
names(df_clean) <- make_names(names(df_clean))
# Fraction of missing values in every column of the reshaped table.
# Finding noted by the original author: many.
df_clean %>%
  is.na() %>%
  colMeans()
# Percentage of missing values per country and per metric, in long form
# (one row per Country_Name / metric pair, percentage in `val`).
nas <- df_clean %>%
  group_by(Country_Name) %>%
  # summarise_all() replaces the deprecated summarise_each(funs(...)) and
  # yields the same columns: every non-grouping column is reduced to its
  # NA percentage within the country.
  summarise_all(function(column) 100 * mean(is.na(column))) %>%
  # Country_Code and year were summarised as well, so here they hold NA
  # percentages rather than identifiers; they are kept out of the reshape so
  # that they do not appear as metrics.
  gather(metric, val, -Country_Name, -Country_Code, -year)
## Top countries: average NA percentage across all metrics for each country,
## sorted ascending so the countries with the least missing data come first.
nas %>%
  group_by(Country_Name) %>%
  # Name the summary column explicitly instead of relying on the
  # auto-generated `mean(val)` header.
  summarise(mean_na_pct = mean(val)) %>%
  arrange(mean_na_pct)
# Countries retained for the exported subset (candidates that could work).
countries_selected <- c(
  "Denmark",
  "Norway",
  "Sweden",
  "Finland"
)
# Names of the metrics whose NA percentage, averaged over the selected
# countries, is at most 5 -- ordered from least to most missing.
metrics_to_use <- nas %>%
  filter(Country_Name %in% !!countries_selected) %>%
  group_by(metric) %>%
  summarise(avg_na_pct = mean(val)) %>%
  filter(avg_na_pct <= 5) %>%
  arrange(avg_na_pct) %>%
  pull(metric)
# Build the export table: identifier columns first, then the retained
# metrics, restricted to the selected countries.
nordics <- df_clean %>%
  select(!!c("year", "Country_Name", "Country_Code", metrics_to_use)) %>%
  filter(Country_Name %in% !!countries_selected)
# Re-check the fraction of missing values per column in the export table.
# Finding noted by the original author: looks ok.
nordics %>%
  is.na() %>%
  colMeans()
# Write the Nordic subset to disk.
write_csv(nordics, "nordic_data.csv")