-
Notifications
You must be signed in to change notification settings - Fork 0
/
01_wrangleDogs_Cars.R
358 lines (244 loc) · 11.8 KB
/
01_wrangleDogs_Cars.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
### This R script wrangles the data
### Plots and regression analysis is done in script 02.
#---------- read data ----------------
library(tidyverse)
library(tmap)
library(sf)
#--------------read datasets to be wrangled ------------------------
#dog data
dogs <- read_csv("data/DFEFAdogdata.csv")
postcode_oa <- read_csv("data/OA_postcode.csv")
lookup <- read_csv("data/OA11_LSOA11_MSOA11_LAD11_EW_LUv2.csv")
postcodepop <- read_csv("population_postcodes/Postcode_Estimates_Table_1.csv")
sum(postcodepop$Total)
#This is a set of various variables including rural urban classification
clusterdata <- read_csv("MOTdata/LSOAamasterEcc030418subset.csv")
clusterdata <- clusterdata %>% dplyr::select(LSOA11CD,income,green900,RUC11,ru5)
#This is the % of families with 1 or more dependent children
#from the 2011 census table QS118ew
child <- read_csv("data/QS118ew_dep_child.csv")
#MOT data (version 9) Data from the MOT project
#see ref to Cairns et al 2014 in paper table 1
mot <- read_csv("data/v9carsAndPop.csv")
#spatial data
#lsoa polygons
lsoa <- st_read(dsn = "LSOA11SG", layer = "LSOAEW11SG")
#English and Welsh LSOA population weighted centroids
pwc <- st_read("data/dist_to_stations.gpkg") # pop weighted cents with rail access info/
pwc <- pwc %>% dplyr::select(code,name,dist_to_nearest_station)
#geodemographic data
income <- read_csv("data/ExperianMedianHHIncome2011.csv")
census <- read_csv("data/selected_UKcensus2011.csv")
census <- census %>% dplyr::select(LSOA11CD,nocar_percent,
car1_percent,
car2_percent,
car3_percent,
car4_or_more_percent,
age_median,
white_percent,
dwel_detached_percent,
dwel_semi_detached_percent,
dwel_terraced_percent,
flat_percent,
econ_all_econ_active_percent)
female <- read_csv("data/urp_pc_female.csv")
female <- female %>% dplyr::select(llsoa11cd,pc_female)
census <- left_join(census,female,by = c("LSOA11CD" = "llsoa11cd" ))
#vul index I didn't use this in the analysis in the end
vul <- read_csv("data/Mattiolietal.csv")
#postcode districts
pcdist <- st_read("GB_Postcodes/PostalDistrict.shp")
#qtm(pcdist)
summary(census$pc_female)
##### wrangling data ###################
#We have data at 2 spatial resolutions, dog data at postcode district resolution which are larger areas
#than lsoas which is the resolution of the MOT data.
#WE can aggregate to postcode districts: this smooths out some variation in the mot vehicle use data
#or we can attribute the dog ownership level to each LSOA. This preserves variation in MOT vehicle use data,
#but there can be some ecological fallacy issues.
#----------- wrangle the dogs data so the spatial resolution is postcode outcode--------
#wrangle to postcodes to outcode (e.g. LS2 also known as the postcode district).
#Outcode and Postcode district are the same thing. #
#generate a human population estimate for the postcode districts, by joining data to the dogs data
#spatial join the postcode polygon attributes to points
#join the dog data.
#------------------ join postcode population to postcode district ----
#first aggregate postcode population to postcode district population
postcode_split2 <- postcodepop %>%
extract(Postcode, into = c('out', 'in'), '(\\S{2,4})\\s*(\\d\\w\\w)') %>%
mutate(Postcode = sprintf('% -4s%s', out, `in`))
names(postcodepop)
names(postcode_split2)
#group
groups <- group_by(postcode_split2,out)
#summarize
postcodedistpop <- dplyr::summarize(groups, totalhumanpop = sum(Total))
sum(postcodedistpop$totalhumanpop)
#This gives a population per postcode district. Next we can
#make a postcode area variable of dogs per person.
#this will eventually get compared to the MOT miles per person variable.
dogs <- left_join(dogs,postcodedistpop,by = c("PostcodeDistrict" = "out"))
names(dogs)
sum(dogs$totalhumanpop,na.rm=T)
sum(dogs$EstimatedDogPopulation,na.rm=T)
dogs$Dogs_pp <- dogs$EstimatedDogPopulation / dogs$totalhumanpop
summary(dogs$Dogs_pp)
summary(dogs$totalhumanpop)
plot(dogs$totalhumanpop,dogs$EstimatedDogPopulation,main = "Dog population vs Human Population")
cor(dogs$totalhumanpop,dogs$EstimatedDogPopulation,use= "pairwise.complete.obs",method = "pearson")
#0.437772
cor(dogs$totalhumanpop,dogs$EstimatedDogPopulation,use= "pairwise.complete.obs",method = "spearman")
#0.5213059
#there is a moderately strong correlation, th plot shows a cone.
#There is likely another factor / factors which might provide a stronger
#relationship with multiple regression
names(dogs)
#[1] "PostcodeDistrict" "EstimatedDogPopulation" "totalhumanpop" "Dogs_pp"
n = dogs %>% dplyr::select(PostcodeDistrict) %>% distinct()
duplicates = dogs %>%
group_by(PostcodeDistrict) %>%
filter(n()>1)
#------------- process LSOA data ------------------
#you can uncomment this and make a map if you want but it takes a while to render
# tm_shape(pcdist)+
# tm_polygons(alpha = 0.5)+
# tm_shape(pwc)+
# tm_dots(col = "red")
#------------ dplyr::select columns required from the mot data
names(mot)
mot2 <- mot %>% dplyr::select(LSOA,Pop,HH,N,km_pp,km_perAdult,emfac_co2Av,litresAv,sizeAv)
pwc <- left_join(pwc,mot2,by = c("code" = "LSOA"))
pwc <- pwc %>% filter(!is.na(km_pp)) # removes scottish lsoa/dz as we only have dogs data for England and Wales
pwc<- left_join(pwc,clusterdata,by = c("code" = "LSOA11CD"))
pwc<- left_join(pwc,income,by = c("code" = "LSOA11CD"))
#make an income quintile
pwc <- pwc %>% mutate(income_quintile = ntile(Median_Household_Income, 5))
#remove outlier lsoas where mean kmpp is over 20,000 pa
filtered <- pwc %>% filter(km_pp > 20000)
pwc <- pwc %>% filter(km_pp < 20000)
#join dplyr::selected census variables
pwc<- left_join(pwc,census,by = c("code" = "LSOA11CD"))
pwc$multicarpc <- pwc$car4_or_more_percent + pwc$car3_percent +pwc$car2_percent
names(child)
pwc <- left_join(pwc,child, by = c("code" = "lsoa11cd"))
#join vul index
pwc <- left_join(pwc,vul, by = c("code" = "LSOA11CD"))
#######---------- spatial joining -------########
#----- make data at postcode district resolution ---------
st_crs(pwc)
pcdist <- st_transform(pcdist,crs = 27700)
names(pwc)
pcdist2 <- st_join(pcdist, pwc %>% filter(!is.na(ru5))) %>% group_by(PostDist) %>%
summarise(meankm_pp = mean(km_pp,na.rm = T),
meankm_per_adult = mean(km_perAdult,na.rm = T),
mean_emfac_co2Av = mean(emfac_co2Av,na.rm = T),
mean_engine_size = mean(sizeAv,na.rm = T),
mean_fuelpp_l = mean(litresAv,na.rm = T),
tot_pop_mot = sum(Pop,na.rm = T),
num_cars = sum(N,na.rm = T),
HH_mot = sum(HH,na.rm = T),
dist_to_nearest_station = mean(dist_to_nearest_station,na.rm = T),
median_hh_income = mean(Median_Household_Income,na.rm = T),
ru5 = min(ru5,na.rm = T),
nocar_percent = mean(nocar_percent,na.rm = T),
multicarpc = mean(multicarpc,na.rm = T),
age_median = mean(age_median,na.rm = T),
white_percent = mean(white_percent,na.rm = T),
dwel_detached_percent = mean(dwel_detached_percent,na.rm = T),
dwel_semi_detached_percent = mean(dwel_semi_detached_percent,na.rm = T),
dwel_terraced_percent = mean(dwel_terraced_percent,na.rm = T),
flat_percent = mean(flat_percent,na.rm =T),
econ_all_econ_active_percent = mean(econ_all_econ_active_percent,na.rm =T),
vul = mean(vul,na.rm = T),
pc_female = mean(pc_female,na.rm = T),
pc_child = mean(pc_families_dep_children)
)
#---remove scotland unfortunately because *data* is a devolved issue and England and Scotland publish data separately --
#scotland postcodes will have na for km _pp because the MOT data I read in is for England and Wales
pcdist2 <- pcdist2 %>% filter(!is.na(meankm_pp))
#--- deal with RU5
summary(pwc$ru5)
summary(pcdist2$ru5)
test <- pcdist2 %>% filter(is.na(ru5))
pcdist2$ru5 <- as.factor(pcdist2$ru5)
pcdist2 <- pcdist2 %>% mutate(income_quintile = ntile(median_hh_income, 5))
summary(pcdist2$income_quintile)
#--------- check for duplicates ----
duplicates = pcdist2 %>%
group_by(PostDist) %>%
filter(n()>1) #no duplicates found
#note outliers of a mean of over 20000km per person pa per lsoa have been removed
#remove 9 outlier postcode districts #done above
#filtered <- pcdist2 %>% filter(meankm_pp > 20000) #done above
hist(pcdist2$meankm_pp)
#map the mot data mean km per person aggregated to postcode districts
#tm_shape(pcdist2)+
# tm_polygons(col = "meankm_pp",n=5,style = "quantile",
# alpha = 0.5)
glimpse(pcdist2)
#-----join to the dogs data ------
pcdist2 <- left_join(pcdist2,dogs,by = c("PostDist" = "PostcodeDistrict" ))
#there are no missing values for dog population
pcdist2 %>% filter(is.na(EstimatedDogPopulation))
#make the dogs per person metrics with both population figures
pcdist2$Dogs_ppmot <- pcdist2$EstimatedDogPopulation/pcdist2$tot_pop_mot
pcdist2$Dogs_pppcd <- pcdist2$EstimatedDogPopulation/pcdist2$totalhumanpop
#base r plot
plot(pcdist2$Dogs_ppmot,pcdist2$Dogs_pppcd)
#ggplot
pcdist2 %>%
ggplot()+
geom_point(aes(x = Dogs_ppmot, y = Dogs_pppcd))+
geom_smooth(aes(x = Dogs_ppmot, y = Dogs_pppcd))
cor(pcdist2$Dogs_ppmot,pcdist2$Dogs_pppcd,use= "pairwise.complete.obs",method = "pearson")
#0.96
cor(pcdist2$Dogs_ppmot,pcdist2$Dogs_pppcd,use= "pairwise.complete.obs",method = "spearman")
#0.99
#dissimilarity increases with high values - places
#with more than 1 dog per person mot and more than 1.5 pcd
summary(pcdist2$Dogs_ppmot)
summary(pcdist2$Dogs_pppcd)
#---- human population estimates seem consistent
#we have an estimate of postcode population, and also census population from LSOAs.
#when lsoa data is aggregated to postcode level it seems consistent
#correlation between human population estimates is very high and linear
pcdist2 %>%
ggplot()+
geom_point(aes(x = tot_pop_mot, y = totalhumanpop))+
geom_smooth(aes(x = tot_pop_mot, y = totalhumanpop))
#4 postcode districts have more than 1.5 dogs per person,
#DL11, Upper Swaledale
#LA20 duddon eskdale and wasdale,
#LL23, Bala N Wales
#and
#PO41 Isle of white Yarmouth
topdogs <- filter(pcdist2, Dogs_ppmot >1.5)
#use dogs postcode district in the analysis
pcdist2$Dogs_pp <- pcdist2$Dogs_pppcd
#------ map the dogs data --------------
# #tm_shape(pcdist2)+
# # tm_polygons(col = "Dogs_pp",n=5,style = "quantile",
# # alpha = 0.5)
#
# tm_shape(pcdist2)+
# tm_polygons(col = "Dogs_pp",n=5,style = "fisher",
# alpha = 0.5)
pcdist2$cars_pp_pcdist <- pcdist2$num_cars/pcdist2$totalhumanpop
#make a set of dummy variables for RU5
pcdist2$MajorConurbation <- 0
pcdist2$MajorConurbation[pcdist2$ru5 == 1] <- 1
pcdist2$MinorConurbation <- 0
pcdist2$MinorConurbation[pcdist2$ru5 == 2] <- 1
pcdist2$CityTown <- 0
pcdist2$CityTown[pcdist2$ru5 == 3] <- 1
pcdist2$RuralTownFringe <- 0
pcdist2$RuralTownFringe[pcdist2$ru5 == 4] <- 1
pcdist2$RuralVillageDispersed <- 0
pcdist2$RuralVillageDispersed[pcdist2$ru5 == 5] <- 1
glimpse(pcdist2)
#---------save key datasets -----------------
#if you want to rewrite these then un-comment here
#st_write(pcdist2,"output_postcode_focus/postcode_focussed_data281021.gpkg")
pcdist2csv <- pcdist2
st_geometry(pcdist2csv) <- NULL
#write_csv(pcdist2csv,"output_postcode_focus/postcode_focussed_data281021.csv")