-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfunctions.R
113 lines (110 loc) · 3.99 KB
/
functions.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
create_dataframe<-function(xlsx_file){
# get sheet names
year<-str_replace(tail(strsplit(xlsx_file, " ")[[1]], n=1), ".xlsx", "")
sheet_names<-xlsx_sheet_names(xlsx_file)
if (substr(strsplit(xlsx_file, " Table ")[[1]][1], 1, 5)=="/PROV"){
file_breakdown<-strsplit(strsplit(xlsx_file, " Table ")[[1]][1], "/PROV - ")[[1]][2]
}
else {
file_breakdown<-strsplit(gsub("/", "", xlsx_file), " Table ")[[1]][1]
}
df<-data.frame()
# loop through sheets
for (sheet_name in sheet_names) {
if (sheet_name=="Notes") {}
else {
tidyxl_data<-xlsx_cells(xlsx_file, sheets=sheet_name)
tidyxl_format<-xlsx_formats(xlsx_file)
acceptable<-tidyxl_data[tidyxl_data$local_format_id %in%
which(tidyxl_format$local$fill$patternFill$fgColor$rgb == "FF33CCCC"), c("address")] %>%
mutate(comment="acceptable")
reasonably_precise<-tidyxl_data[tidyxl_data$local_format_id %in%
which(tidyxl_format$local$fill$patternFill$bgColor$rgb == "FF00FFFF"), c("address")] %>%
mutate(comment="reasonably precise")
comment<-bind_rows(acceptable, reasonably_precise)
readxl_data<-read_xlsx(xlsx_file, sheet=sheet_name, trim_ws=F, col_names = FALSE, .name_repair = ~ ifelse(nzchar(.x), .x, LETTERS[seq_along(.x)]))
readxl_data<-readxl_data %>%
select(A, D) %>%
tibble::rownames_to_column(var="rowname") %>%
filter(complete.cases(.)) %>%
mutate(address=paste0("D",rowname)) %>%
tibble::column_to_rownames("rowname") %>%
left_join(comment) %>%
select(-address) %>%
rename(!!file_breakdown :=A) %>%
rename(Value=D) %>%
slice(-1) %>%
mutate(Year=year) %>%
mutate(Units="Median")
readxl_data<-add_columns(readxl_data, sheet_name)
readxl_data[readxl_data$comment=="acceptable" & readxl_data$Value=="x","comment"]<-"unacceptable"
df<-bind_rows(df, readxl_data)
}
}
return(df)
}
add_columns<-function(readxl_data, sheet_name){
if (sheet_name=="Male"){
readxl_data<-readxl_data %>%
mutate(Sex="Male")
}
if (sheet_name=="Female"){
readxl_data<-readxl_data %>%
mutate(Sex="Female")
}
if (sheet_name=="Full-Time"){
readxl_data<-readxl_data %>%
mutate(WorkingPattern="Full-Time")
}
if (sheet_name=="Part-Time"){
readxl_data<-readxl_data %>%
mutate(WorkingPattern="Part-Time")
}
if (sheet_name=="Male Full-Time"){
readxl_data<-readxl_data %>%
mutate(Sex="Male") %>%
mutate(WorkingPattern="Full-Time")
}
if (sheet_name=="Male Part-Time"){
readxl_data<-readxl_data %>%
mutate(Sex="Male") %>%
mutate(WorkingPattern="Part-Time")
}
if (sheet_name=="Female Full-Time"){
readxl_data<-readxl_data %>%
mutate(Sex="Female") %>%
mutate(WorkingPattern="Full-Time")
}
if (sheet_name=="Female Part-Time"){
readxl_data<-readxl_data %>%
mutate(Sex="Female") %>%
mutate(WorkingPattern="Part-Time")
}
return(readxl_data)
}
download_zips<-function(dataframe, years) {
urlstart<-"https://www.ons.gov.uk/file?uri=/employmentandlabourmarket/peopleinwork/earningsandworkinghours/datasets/"
for (i in 1:nrow(dataframe)) {
tablename<-ashetables[i, "tablename"]
tablenumber<-ashetables[i, "tablenumber"]
for (year in years) {
if (year==2020) {
type<-"provisional"
}
else {
type<-"revised"
}
if (ashetables[i, "tablename"]=="industry2digitsic" & year > 2017) {
url<-paste0(urlstart,tablename,"ashetable",tablenumber,"/",year,type,"/sic2007table",tablenumber,year,type,".zip")
}
else {
url<-paste0(urlstart,tablename,"ashetable",tablenumber,"/",year,type,"/table",tablenumber,year,type,".zip")
}
tempfile <- tempfile()
download.file(url, tempfile)
unzipped_files<-unzip(tempfile, list=TRUE)
unzip(zipfile=tempfile, files = unzipped_files[grep(".7a", unzipped_files$Name), ][["Name"]])
unlink(tempfile)
}
}
}