diff --git a/analysis/01-design-base-map.Rmd b/analysis/01-design-base-map.Rmd
new file mode 100644
index 0000000..e1f5b8f
--- /dev/null
+++ b/analysis/01-design-base-map.Rmd
@@ -0,0 +1,155 @@
+---
+title: "Design base map"
+author: "Qingqing Chen"
+date: "Last compiled date: `r format(Sys.time(), '%d %B, %Y')`"
+output: github_document
+editor_options:
+ chunk_output_type: console
+---
+
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE, fig.align='center', fig.width = 13, fig.height = 10, warning = FALSE, message = FALSE)
+library(tidyverse)
+library(sf)
+library(here)
+library(tmap)
+library(tmaptools)
+library(osmdata)
+library(cartography)
+library(mapedit)
+library(magick)
+library(cowplot)
+library(grid)
+```
+
+
+## Prepare datasets
+```{r}
+# sg planning areas
+sg_planing_areas <- read_sf(here("data/raw_data/sg-subzone/MP14_SUBZONE_NO_SEA_PL.shp")) %>%
+ st_make_valid() %>%
+ st_transform(crs = 3414) %>%
+ group_by(REGION_N, PLN_AREA_N, SUBZONE_N) %>%
+ dplyr::summarise() %>%
+ ungroup()
+
+# sg boundary
+if(file.exists(here("data/derived_data/sg_boundary.rds"))){
+ sg_boundary <- readRDS(here("data/derived_data/sg_boundary.rds"))
+}else{
+ sg_boundary <- sg_planing_areas %>% summarise()
+ saveRDS(sg_boundary, file = here("data/derived_data/sg_boundary.rds"))
+}
+
+# get sg bounding box
+sg_bbox <- getbb("singapore") %>% as_tibble()
+if(file.exists(here("data/derived_data/streets.rds"))){
+ streets <- readRDS(here("data/derived_data/streets.rds"))
+}else{
+ streets <- opq(bbox = c(sg_bbox$min[1], sg_bbox$min[2], sg_bbox$max[1], sg_bbox$max[2])) %>%
+ add_osm_feature(key = "highway", value = c("primary", "trunk")) %>%
+ osmdata_sf()
+ streets <- streets$osm_lines %>%
+ filter(!is.na(name)) %>%
+ st_simplify(dTolerance = 0.0001) %>%
+ st_transform(crs = 3414) %>%
+ st_join(., sg_planing_areas, largest = T) %>%
+ filter(!is.na(REGION_N))
+ saveRDS(streets, file = here("data/derived_data/streets.rds"))
+}
+
+if(file.exists(here("data/derived_data/area_centers.rds"))){
+ area_centers <- readRDS(here("data/derived_data/area_centers.rds"))
+}else{
+ # reference area
+ area <- sg_planing_areas %>%
+ filter(SUBZONE_N %in% c("NATIONAL UNIVERSITY OF S'PORE", "SENTOSA", "EAST COAST", "WOODLANDS", "CENTRAL WATER CATCHMENT", "JURONG GATEWAY", "CITY HALL", "CHANGI AIRPORT", "TAMPINES EAST", "SERANGOON CENTRAL")) %>% # NORTHSHORE
+ mutate(SUBZONE_N = map_chr(SUBZONE_N, stringr::str_to_title))
+
+ # reference area centers
+ area_centers <- area %>%
+ st_centroid() %>% rownames_to_column(var = "id") %>%
+ mutate(label = paste0(id, ". ", SUBZONE_N)) %>%
+ mutate(label = factor(label, levels = label))
+ saveRDS(area_centers, file = here("data/derived_data/area_centers.rds"))
+}
+
+# hatched reference areas
+if(file.exists(here("data/derived_data/area_hatched.rds"))){
+ area_hatched <- readRDS(here("data/derived_data/area_hatched.rds"))
+}else{
+ area_hatched <- hatchedLayer(area, pattern = "right2left", mode = "sfc", density = 7)
+ saveRDS(area_hatched, file = here("data/derived_data/area_hatched.rds"))
+}
+```
+
+
+## Prepare image
+
+```{r}
+tune_img <- function(img_nm, img_title, geom_size){
+ base_path <- "data/photos/"
+ img <- image_read(here(paste0(base_path, img_nm))) %>%
+ image_resize(geometry = geom_size)
+ ggdraw() +
+ draw_image(img, x = 0.9, y = 0.9, hjust = 1, vjust = 1, height = 0.88) +
+ draw_plot_label(img_title, fontface = "plain", x = 0.0, y = 0.99, hjust = 0, vjust = 1, size = 10)
+}
+
+img1 <- tune_img(img_nm = "cityhall.jpeg", img_title = "1. City Hall", geom_size = "400x550")
+img2 <- tune_img(img_nm = "flickr-eastcoast.jpg", img_title = "2. East Coast", geom_size = "400x550")
+img3 <- tune_img(img_nm = "flickr-nus.jpg", img_title = "3. National University of Singapore", geom_size = "400x550")
+img4 <- tune_img(img_nm = "flickr-sentosa.jpg", img_title = "4. Sentosa", geom_size = "400x550")
+img5 <- tune_img(img_nm = "flickr-changiairport.jpg", img_title = "5. Changi Airport", geom_size = "400x550")
+img6 <- tune_img(img_nm = "flickr-tampines.jpg", img_title = "6. Tampines", geom_size = "400x550")
+img7 <- tune_img(img_nm = "flickr-central-water-catchment.jpg", img_title = "7. Central Water Catchment", geom_size = "400x550")
+img8 <- tune_img(img_nm = "flickr-woodlands.jpg", img_title = "8. Woodlands", geom_size = "400x550")
+# img8 <- tune_img(img_nm = "flickr-northshore.jpg", img_title = "8. Northshore", geom_size = "400x550")
+img9 <- tune_img(img_nm = "flickr-serangoon.jpg", img_title = "9. Serangoon Centre", geom_size = "400x550")
+img10 <- tune_img(img_nm = "flickr-jurongeast.jpg", img_title = "10. Jurong Gateway", geom_size = "400x550")
+```
+
+## Draw base map
+```{r}
+base_map <- tm_shape(sg_boundary) +
+ tm_borders(col = "grey") +
+ tm_shape(streets) +
+ tm_lines(col = rgb(80, 80, 80, maxColorValue = 255), lwd = 1, alpha = 1) +
+ tm_shape(area_hatched) +
+ tm_lines(col = rgb(80, 110, 120, maxColorValue = 255)) +
+ tm_shape(area_centers) +
+ tm_bubbles(col = rgb(80, 80, 80, maxColorValue = 255), size = 1) +
+ tm_text(text = "id", col = "white", size = 1) +
+ tm_layout(frame = F)
+
+grid.newpage()
+pushViewport(viewport(layout = grid.layout(ncol = 3, nrow = 5, widths = c(3.4, 0.9, 0.9))))
+print(base_map, vp = viewport(layout.pos.col = 1))
+grid.text("Labeled places are reference areas", x = 0.5, y = 0.1, just = c(1, 1), gp = gpar(fontsize = 12, fontface = "italic"), vp = viewport(layout.pos.col = 1))
+grid.text("Singapore map", x = 0.35, y = 0.06, just = c(1, 1), gp = gpar(fontsize = 12, fontface = "italic"), vp = viewport(layout.pos.col = 1))
+print(img1, vp = viewport(layout.pos.col = 2, layout.pos.row = 1))
+print(img2, vp = viewport(layout.pos.col = 2, layout.pos.row = 2))
+print(img3, vp = viewport(layout.pos.col = 2, layout.pos.row = 3))
+print(img4, vp = viewport(layout.pos.col = 2, layout.pos.row = 4))
+print(img5, vp = viewport(layout.pos.col = 2, layout.pos.row = 5))
+print(img6, vp = viewport(layout.pos.col = 3, layout.pos.row = 1))
+print(img7, vp = viewport(layout.pos.col = 3, layout.pos.row = 2))
+print(img8, vp = viewport(layout.pos.col = 3, layout.pos.row = 3))
+print(img9, vp = viewport(layout.pos.col = 3, layout.pos.row = 4))
+print(img10, vp = viewport(layout.pos.col =3, layout.pos.row = 5))
+```
+
+
+## Photo links
+
+- [1. City Hall photography by the author]()
+- [2. East Coast photography by Tamaki Hayashi](https://www.flickr.com/photos/hayashi28/16187474/in/photolist-2qXYo-qCvt7K-26wSwcJ-kdaVu7-27yncys-2eMRcjE-w4hvEx-24T2eJh-KuRxG5-pcnWhA-21cq2Ng-YXSYvY-6aXnGB-R75jxc-6gLWAK-6g9xTZ-6gLWAP-27CLs4P-fTqkGF-bAk7oS-2g4jTCz-2g4jYpG-R75kAp-YnLmvG-srHoHY-2eMQQib-2dLuWHw-V31HLx-2dtEcc6-2eMQMdy-spAc1Q-2eMQMC1-2dLuYfQ-GYKYLE-9evMBH-abgqEr-9ibhpX-8fm2wr-4FwJcy-5X6RJu-W9WWcK-arJG48-9eyTM5-9vbbes-6WAMiU-qCvVRr-gJBndV-B1aft-4CfEFn-dmJHhU)
+- [3. National University of Singapore photography by Melvin Yap](https://www.flickr.com/photos/mjmyap/12746332024/in/photolist-kqmhrf-kd1VQp-kd1WeR-56JJ3-6ajFyw)
+- [4. Sentosa photography by William Cho](https://www.flickr.com/photos/adforce1/5559170576/in/photolist-9tfcJy-yENBCs-a6dsbx-gvdAZA-gvfCpz-gvfh59-gv9Jdn-gvaVma-gvabFJ-gveNCH-gvafdy-gvakNP-gvdM3G-gv9wGL-gvf1rT-gvbAxj-gveNxp-gva7Er-gv9JAf-gvaaaB-gvbE4M-8hCRLa-gvbLuU-gvbHoo-gveK3D-gvcjEn-gveE5x-gveSRK-gveSk5-gvbcUT-gvbRz3-R8vPx6-gwRKmM-gvfd9v-gvcoiz-gvc1Td-gvccSe-gvbHgD-gv9XHD-gvbDRU-gv6XX1-gvdxZs-ejZX6j-gv84r6-gwR99w-gveGmV-gwSaTH-gwSjC4-gwRsrp-gv7RtD)
+- [5. Changi Airport photography by Geoff Whalan](https://www.flickr.com/photos/geoffwhalan/48355675096/in/photolist-2gF2tw9-2gF1XF7-2jdxvdU-2jjBSA9-2gHKZTv-2gPxibT-2gPwfCN-2gPwnKD-2fs2UAE-2e8NpV8-2ie9BdV-2jjzcQe-2gEY8uh-2iebWic-2jxGTU4-2hbfZA9-2h1kTU3-2jjz6rh-2jjyQST-2jbMbxg-24UyAXV-2g2i2TN-2e8NpUB-2e8NpUg-2jxGSRT-RLiBue-2jFRJqy-2jjBQLT-2iCLYF9-2jxCwhk-2hq6rfz-2hAJ5z7-2hAJ5X1-2hAFeDC-2hAJ5QH-2hAJ5Lu-2hAJYz5-2hHcTuw-2hAJ5pn-2gZQN1S-2jxGTBa-2gpdtGz-2gpcFV9-2gpd5KB-2gpciRt-2estN52-2gpcj5p-2gpdtCB-2gpd5RU-2gZR7LY)
+- [6. Tampines photography by Dickson Phua](https://www.flickr.com/photos/gunman47/37077267100/in/photolist-Xst9mG-JMYrh-YuoFKS-2iv6iDA-48jdNv-oCZhM6-opU96Z-7FwzH5-7FwzTm-o9UV7N-NtHhkd-9RQyvV)
+- [7. Central Water Catchment photography by CW Gan](https://www.flickr.com/photos/gancw1/41754717620/in/photolist-26BHQQu-27ZnxXM-2jMae6A-4oR778-cp4c1Q-2jMae5Z-2jM5NBQ-28gVPJ5-28h5Gcd-27Znyig-29nvBt4-29iecwo-26BHRuf-26BHQSy-4oXyNQ-4p8q41-4oTvq8-4oXyUs-Sysczd-4BJDiH-4oTze6-G5A7h-SB9Xe4-26BHRaN-26BHR2S-272S1B-26BHQWG-26BHReq-bkU6Zy-27Zny8g-27Zny5F-4oKwZB-4oKx6c-byNYg8-fdYdKs-4p4ms2-272RV6-26BHRZJ-277ijj-4pefyr-4oTvwK-4p4miP-4DFfFe-4p4mo8-4DKusW-26BHQYf-4oKx3H-4p4mhk-4BNKdN-FVEP5)
+- [8. Northshore photography by Steel Wool](https://www.flickr.com/photos/wynnie/7232205516/in/photolist-c25WJy-28d8aTw-c25TFU-fDeFiT-c25Wq3-NQniYf-Vt1gzM-wL6qfk-cBjohb-hyiq8i-cBjoyN-hygGJw-2i1Wv2x-c25Uoo-c25Tjs-fDeHEP-c25WWE-c25Vhw-c25Xx1-c25Vxf-c25Xfb-c25TWG-c25W5o-c25VN9-c25Uad-c25SH3-c25UPf-c25SVQ-c25V19-c25UAN-fDw9Fm-fDeEhB-fDf1dt-fDeJ8F-hyioiM-cBjnHo-hygTXo-2kJUSS-fDeDHa-fDwgXo-fDeWcc-fDeCx2-fDwhro-fDwaSd-fDwg3L-fDeJwt-hygUKf-hyitqR-fDwwus-hyhnyd)
+- [9. Serangoon Centre photography by Ellen Forsyth](https://www.flickr.com/photos/ellf/8409937430/in/photolist-dPa8pd-9MFzf7-dP4rHD-7bFYeH-7bFYeF-dP4tMa-78GKXb-dP4tzH-dP4pFp-dP4qpc-dP4sWF-dP4tjk-dPa6d3-dPa3wh-dPa5m3-dPa2wS-dP4u3P-dP4rQ4-dPa7M3-dPa6p9-dP9YzG-dPa4z9-dPa8as-dP9Zwm-dPa6PY-dP4syM-dP4vy4-dPa4NQ-8rSNck-dP4v9c-dPa3iY-2hJHQhg-dP4rgR-dPa3DS-dP4uAa-dPa4Yd-dP4ntR-eNSisJ-dP4rvr-dP4qAH-2iCnxqK-5S2zFs-8Ragv2-8419A8-5Yma9Q-8zKDPW-bJQ45B-b3ErW-4sVcWh-impgN5)
+- [10. Jurong East photography by Edsel Little](https://www.flickr.com/photos/edsel_/28009328562/in/photolist-JF68xY-HTJSDF-24TRubm-2coooYE-JQ85Mx-HTKfE4-JHdugK-HTKay2-uVauU-HTJL9r-JQ5gJD-cSjsx-JEZTnd-ef5yUz-ejtNDs-8cHrMz-7Zrhr6-7QXRYr-22uSBnK-22s6MX5-8azi5a-JM53WU-22s42Ys-JpjoRb-22uT9bD-4C7sx9-21gDFSG-JF66Fw-JHd2HK-JQ8za4-HTMgZW-JQ8wPx-8cH3t1-JQ3nxM-8CdUwk-JM4PWf-JHemmp-JQ3Vec-JM4Mfd-JQ8Eec-HTM3XU-24eAawG-8cHrMK-JQ8cua-4C3aoR-22uXQUe-6TsdYW-HTKRGt-8cHrMP-5tgHAH)
diff --git a/analysis/01-design-base-map.md b/analysis/01-design-base-map.md
new file mode 100644
index 0000000..44e8aab
--- /dev/null
+++ b/analysis/01-design-base-map.md
@@ -0,0 +1,145 @@
+Design base map
+================
+Qingqing Chen
+Last compiled date: 17 September, 2021
+
+## Prepare datasets
+
+``` r
+# sg planning areas
+sg_planing_areas <- read_sf(here("data/raw_data/sg-subzone/MP14_SUBZONE_NO_SEA_PL.shp")) %>%
+ st_make_valid() %>%
+ st_transform(crs = 3414) %>%
+ group_by(REGION_N, PLN_AREA_N, SUBZONE_N) %>%
+ dplyr::summarise() %>%
+ ungroup()
+
+# sg boundary
+if(file.exists(here("data/derived_data/sg_boundary.rds"))){
+ sg_boundary <- readRDS(here("data/derived_data/sg_boundary.rds"))
+}else{
+ sg_boundary <- sg_planing_areas %>% summarise()
+ saveRDS(sg_boundary, file = here("data/derived_data/sg_boundary.rds"))
+}
+
+# get sg bounding box
+sg_bbox <- getbb("singapore") %>% as_tibble()
+if(file.exists(here("data/derived_data/streets.rds"))){
+ streets <- readRDS(here("data/derived_data/streets.rds"))
+}else{
+ streets <- opq(bbox = c(sg_bbox$min[1], sg_bbox$min[2], sg_bbox$max[1], sg_bbox$max[2])) %>%
+ add_osm_feature(key = "highway", value = c("primary", "trunk")) %>%
+ osmdata_sf()
+ streets <- streets$osm_lines %>%
+ filter(!is.na(name)) %>%
+ st_simplify(dTolerance = 0.0001) %>%
+ st_transform(crs = 3414) %>%
+ st_join(., sg_planing_areas, largest = T) %>%
+ filter(!is.na(REGION_N))
+ saveRDS(streets, file = here("data/derived_data/streets.rds"))
+}
+
+if(file.exists(here("data/derived_data/area_centers.rds"))){
+ area_centers <- readRDS(here("data/derived_data/area_centers.rds"))
+}else{
+ # reference area
+ area <- sg_planing_areas %>%
+ filter(SUBZONE_N %in% c("NATIONAL UNIVERSITY OF S'PORE", "SENTOSA", "EAST COAST", "WOODLANDS", "CENTRAL WATER CATCHMENT", "JURONG GATEWAY", "CITY HALL", "CHANGI AIRPORT", "TAMPINES EAST", "SERANGOON CENTRAL")) %>% # NORTHSHORE
+ mutate(SUBZONE_N = map_chr(SUBZONE_N, stringr::str_to_title))
+
+ # reference area centers
+ area_centers <- area %>%
+ st_centroid() %>% rownames_to_column(var = "id") %>%
+ mutate(label = paste0(id, ". ", SUBZONE_N)) %>%
+ mutate(label = factor(label, levels = label))
+ saveRDS(area_centers, file = here("data/derived_data/area_centers.rds"))
+}
+
+# hatched reference areas
+if(file.exists(here("data/derived_data/area_hatched.rds"))){
+ area_hatched <- readRDS(here("data/derived_data/area_hatched.rds"))
+}else{
+ area_hatched <- hatchedLayer(area, pattern = "right2left", mode = "sfc", density = 7)
+ saveRDS(area_hatched, file = here("data/derived_data/area_hatched.rds"))
+}
+```
+
+## Prepare image
+
+``` r
+tune_img <- function(img_nm, img_title, geom_size){
+ base_path <- "data/photos/"
+ img <- image_read(here(paste0(base_path, img_nm))) %>%
+ image_resize(geometry = geom_size)
+ ggdraw() +
+ draw_image(img, x = 0.9, y = 0.9, hjust = 1, vjust = 1, height = 0.88) +
+ draw_plot_label(img_title, fontface = "plain", x = 0.0, y = 0.99, hjust = 0, vjust = 1, size = 10)
+}
+
+img1 <- tune_img(img_nm = "cityhall.jpeg", img_title = "1. City Hall", geom_size = "400x550")
+img2 <- tune_img(img_nm = "flickr-eastcoast.jpg", img_title = "2. East Coast", geom_size = "400x550")
+img3 <- tune_img(img_nm = "flickr-nus.jpg", img_title = "3. National University of Singapore", geom_size = "400x550")
+img4 <- tune_img(img_nm = "flickr-sentosa.jpg", img_title = "4. Sentosa", geom_size = "400x550")
+img5 <- tune_img(img_nm = "flickr-changiairport.jpg", img_title = "5. Changi Airport", geom_size = "400x550")
+img6 <- tune_img(img_nm = "flickr-tampines.jpg", img_title = "6. Tampines", geom_size = "400x550")
+img7 <- tune_img(img_nm = "flickr-central-water-catchment.jpg", img_title = "7. Central Water Catchment", geom_size = "400x550")
+img8 <- tune_img(img_nm = "flickr-woodlands.jpg", img_title = "8. Woodlands", geom_size = "400x550")
+# img8 <- tune_img(img_nm = "flickr-northshore.jpg", img_title = "8. Northshore", geom_size = "400x550")
+img9 <- tune_img(img_nm = "flickr-serangoon.jpg", img_title = "9. Serangoon Centre", geom_size = "400x550")
+img10 <- tune_img(img_nm = "flickr-jurongeast.jpg", img_title = "10. Jurong Gateway", geom_size = "400x550")
+```
+
+## Draw base map
+
+``` r
+base_map <- tm_shape(sg_boundary) +
+ tm_borders(col = "grey") +
+ tm_shape(streets) +
+ tm_lines(col = rgb(80, 80, 80, maxColorValue = 255), lwd = 1, alpha = 1) +
+ tm_shape(area_hatched) +
+ tm_lines(col = rgb(80, 110, 120, maxColorValue = 255)) +
+ tm_shape(area_centers) +
+ tm_bubbles(col = rgb(80, 80, 80, maxColorValue = 255), size = 1) +
+ tm_text(text = "id", col = "white", size = 1) +
+ tm_layout(frame = F)
+
+grid.newpage()
+pushViewport(viewport(layout = grid.layout(ncol = 3, nrow = 5, widths = c(3.4, 0.9, 0.9))))
+print(base_map, vp = viewport(layout.pos.col = 1))
+grid.text("Labeled places are reference areas", x = 0.5, y = 0.1, just = c(1, 1), gp = gpar(fontsize = 12, fontface = "italic"), vp = viewport(layout.pos.col = 1))
+grid.text("Singapore map", x = 0.35, y = 0.06, just = c(1, 1), gp = gpar(fontsize = 12, fontface = "italic"), vp = viewport(layout.pos.col = 1))
+print(img1, vp = viewport(layout.pos.col = 2, layout.pos.row = 1))
+print(img2, vp = viewport(layout.pos.col = 2, layout.pos.row = 2))
+print(img3, vp = viewport(layout.pos.col = 2, layout.pos.row = 3))
+print(img4, vp = viewport(layout.pos.col = 2, layout.pos.row = 4))
+print(img5, vp = viewport(layout.pos.col = 2, layout.pos.row = 5))
+print(img6, vp = viewport(layout.pos.col = 3, layout.pos.row = 1))
+print(img7, vp = viewport(layout.pos.col = 3, layout.pos.row = 2))
+print(img8, vp = viewport(layout.pos.col = 3, layout.pos.row = 3))
+print(img9, vp = viewport(layout.pos.col = 3, layout.pos.row = 4))
+print(img10, vp = viewport(layout.pos.col =3, layout.pos.row = 5))
+```
+
+
+
+## Photo links
+
+- [1. City Hall photography by the author]()
+- [2. East Coast photography by Tamaki
+ Hayashi](https://www.flickr.com/photos/hayashi28/16187474/in/photolist-2qXYo-qCvt7K-26wSwcJ-kdaVu7-27yncys-2eMRcjE-w4hvEx-24T2eJh-KuRxG5-pcnWhA-21cq2Ng-YXSYvY-6aXnGB-R75jxc-6gLWAK-6g9xTZ-6gLWAP-27CLs4P-fTqkGF-bAk7oS-2g4jTCz-2g4jYpG-R75kAp-YnLmvG-srHoHY-2eMQQib-2dLuWHw-V31HLx-2dtEcc6-2eMQMdy-spAc1Q-2eMQMC1-2dLuYfQ-GYKYLE-9evMBH-abgqEr-9ibhpX-8fm2wr-4FwJcy-5X6RJu-W9WWcK-arJG48-9eyTM5-9vbbes-6WAMiU-qCvVRr-gJBndV-B1aft-4CfEFn-dmJHhU)
+- [3. National University of Singapore photography by Melvin
+ Yap](https://www.flickr.com/photos/mjmyap/12746332024/in/photolist-kqmhrf-kd1VQp-kd1WeR-56JJ3-6ajFyw)
+- [4. Sentosa photography by William
+ Cho](https://www.flickr.com/photos/adforce1/5559170576/in/photolist-9tfcJy-yENBCs-a6dsbx-gvdAZA-gvfCpz-gvfh59-gv9Jdn-gvaVma-gvabFJ-gveNCH-gvafdy-gvakNP-gvdM3G-gv9wGL-gvf1rT-gvbAxj-gveNxp-gva7Er-gv9JAf-gvaaaB-gvbE4M-8hCRLa-gvbLuU-gvbHoo-gveK3D-gvcjEn-gveE5x-gveSRK-gveSk5-gvbcUT-gvbRz3-R8vPx6-gwRKmM-gvfd9v-gvcoiz-gvc1Td-gvccSe-gvbHgD-gv9XHD-gvbDRU-gv6XX1-gvdxZs-ejZX6j-gv84r6-gwR99w-gveGmV-gwSaTH-gwSjC4-gwRsrp-gv7RtD)
+- [5. Changi Airport photography by Geoff
+ Whalan](https://www.flickr.com/photos/geoffwhalan/48355675096/in/photolist-2gF2tw9-2gF1XF7-2jdxvdU-2jjBSA9-2gHKZTv-2gPxibT-2gPwfCN-2gPwnKD-2fs2UAE-2e8NpV8-2ie9BdV-2jjzcQe-2gEY8uh-2iebWic-2jxGTU4-2hbfZA9-2h1kTU3-2jjz6rh-2jjyQST-2jbMbxg-24UyAXV-2g2i2TN-2e8NpUB-2e8NpUg-2jxGSRT-RLiBue-2jFRJqy-2jjBQLT-2iCLYF9-2jxCwhk-2hq6rfz-2hAJ5z7-2hAJ5X1-2hAFeDC-2hAJ5QH-2hAJ5Lu-2hAJYz5-2hHcTuw-2hAJ5pn-2gZQN1S-2jxGTBa-2gpdtGz-2gpcFV9-2gpd5KB-2gpciRt-2estN52-2gpcj5p-2gpdtCB-2gpd5RU-2gZR7LY)
+- [6. Tampines photography by Dickson
+ Phua](https://www.flickr.com/photos/gunman47/37077267100/in/photolist-Xst9mG-JMYrh-YuoFKS-2iv6iDA-48jdNv-oCZhM6-opU96Z-7FwzH5-7FwzTm-o9UV7N-NtHhkd-9RQyvV)
+- [7. Central Water Catchment photography by CW
+ Gan](https://www.flickr.com/photos/gancw1/41754717620/in/photolist-26BHQQu-27ZnxXM-2jMae6A-4oR778-cp4c1Q-2jMae5Z-2jM5NBQ-28gVPJ5-28h5Gcd-27Znyig-29nvBt4-29iecwo-26BHRuf-26BHQSy-4oXyNQ-4p8q41-4oTvq8-4oXyUs-Sysczd-4BJDiH-4oTze6-G5A7h-SB9Xe4-26BHRaN-26BHR2S-272S1B-26BHQWG-26BHReq-bkU6Zy-27Zny8g-27Zny5F-4oKwZB-4oKx6c-byNYg8-fdYdKs-4p4ms2-272RV6-26BHRZJ-277ijj-4pefyr-4oTvwK-4p4miP-4DFfFe-4p4mo8-4DKusW-26BHQYf-4oKx3H-4p4mhk-4BNKdN-FVEP5)
+- [8. Northshore photography by Steel
+ Wool](https://www.flickr.com/photos/wynnie/7232205516/in/photolist-c25WJy-28d8aTw-c25TFU-fDeFiT-c25Wq3-NQniYf-Vt1gzM-wL6qfk-cBjohb-hyiq8i-cBjoyN-hygGJw-2i1Wv2x-c25Uoo-c25Tjs-fDeHEP-c25WWE-c25Vhw-c25Xx1-c25Vxf-c25Xfb-c25TWG-c25W5o-c25VN9-c25Uad-c25SH3-c25UPf-c25SVQ-c25V19-c25UAN-fDw9Fm-fDeEhB-fDf1dt-fDeJ8F-hyioiM-cBjnHo-hygTXo-2kJUSS-fDeDHa-fDwgXo-fDeWcc-fDeCx2-fDwhro-fDwaSd-fDwg3L-fDeJwt-hygUKf-hyitqR-fDwwus-hyhnyd)
+- [9. Serangoon Centre photography by Ellen
+ Forsyth](https://www.flickr.com/photos/ellf/8409937430/in/photolist-dPa8pd-9MFzf7-dP4rHD-7bFYeH-7bFYeF-dP4tMa-78GKXb-dP4tzH-dP4pFp-dP4qpc-dP4sWF-dP4tjk-dPa6d3-dPa3wh-dPa5m3-dPa2wS-dP4u3P-dP4rQ4-dPa7M3-dPa6p9-dP9YzG-dPa4z9-dPa8as-dP9Zwm-dPa6PY-dP4syM-dP4vy4-dPa4NQ-8rSNck-dP4v9c-dPa3iY-2hJHQhg-dP4rgR-dPa3DS-dP4uAa-dPa4Yd-dP4ntR-eNSisJ-dP4rvr-dP4qAH-2iCnxqK-5S2zFs-8Ragv2-8419A8-5Yma9Q-8zKDPW-bJQ45B-b3ErW-4sVcWh-impgN5)
+- [10. Jurong East photography by Edsel
+ Little](https://www.flickr.com/photos/edsel_/28009328562/in/photolist-JF68xY-HTJSDF-24TRubm-2coooYE-JQ85Mx-HTKfE4-JHdugK-HTKay2-uVauU-HTJL9r-JQ5gJD-cSjsx-JEZTnd-ef5yUz-ejtNDs-8cHrMz-7Zrhr6-7QXRYr-22uSBnK-22s6MX5-8azi5a-JM53WU-22s42Ys-JpjoRb-22uT9bD-4C7sx9-21gDFSG-JF66Fw-JHd2HK-JQ8za4-HTMgZW-JQ8wPx-8cH3t1-JQ3nxM-8CdUwk-JM4PWf-JHemmp-JQ3Vec-JM4Mfd-JQ8Eec-HTM3XU-24eAawG-8cHrMK-JQ8cua-4C3aoR-22uXQUe-6TsdYW-HTKRGt-8cHrMP-5tgHAH)
diff --git a/analysis/01-design-base-map_files/figure-gfm/unnamed-chunk-3-1.png b/analysis/01-design-base-map_files/figure-gfm/unnamed-chunk-3-1.png
new file mode 100644
index 0000000..d032ca2
Binary files /dev/null and b/analysis/01-design-base-map_files/figure-gfm/unnamed-chunk-3-1.png differ
diff --git a/analysis/02-dataset-information.Rmd b/analysis/02-dataset-information.Rmd
new file mode 100644
index 0000000..425cf6a
--- /dev/null
+++ b/analysis/02-dataset-information.Rmd
@@ -0,0 +1,130 @@
+---
+title: "Dataset information"
+author: "Qingqing Chen"
+date: "Last compiled date: `r format(Sys.time(), '%d %B, %Y')`"
+output: github_document
+editor_options:
+ chunk_output_type: console
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE, message = F, warning = F, fig.width = 10, fig.height = 10)
+library(tidyverse)
+library(dplyr)
+library(tmap)
+library(janitor)
+library(here)
+library(sf)
+library(lubridate)
+```
+
+## Load data
+
+```{r}
+df <- readRDS(here::here("data/derived_data/deidentified_sg_tweets_updated.rds"))
+sg_boundary <- readRDS(here("data/derived_data/sg_boundary.rds"))
+streets <- readRDS(here("data/derived_data/streets.rds"))
+area_hatched <- readRDS(here("data/derived_data/area_hatched.rds"))
+area_centers <- readRDS(here("data/derived_data/area_centers.rds"))
+grids <- read_sf(here("data/derived_data/spatial_hex_grid.shp"), quiet = T) %>%
+ st_transform(crs = 3414)
+```
+
+## Visualization
+
+```{r}
+sf_counts <- df %>%
+ group_by(grid_id) %>%
+ summarise(n_tweets = n(),
+ n_users = n_distinct(u_id)) %>%
+ left_join(., grids, by = c("grid_id" = "grid_id")) %>%
+ st_as_sf()
+```
+
+### Spatial distribution of tweets
+```{r}
+tm_shape(sg_boundary) +
+ tm_borders(col = "grey") +
+ tm_shape(sf_counts) +
+ tm_fill("n_tweets",
+ palette = "PuRd",
+ breaks = c(0, 500, 1000, 5000, 10000, 30000, 50000, 100000, 200000, 250000),
+ style = "fixed",
+ alpha = 0.8,
+ legend.is.portrait = F,
+ legend.hist = TRUE,
+ legend.format = list(text.align = "center"),
+ title = "Number of tweets") +
+ tm_shape(streets) +
+ tm_lines(col = rgb(80, 80, 80, maxColorValue = 255), lwd = 1, alpha = 0.8) +
+ tm_shape(area_hatched) +
+ tm_lines(col = rgb(80, 110, 120, maxColorValue = 255), alpha = 0.8) +
+ tm_layout(title.position = c("left", "top"),
+ main.title = "Spatial distribution of tweets",
+ main.title.size = 1.5,
+ frame = F,
+ legend.position = c("right", "bottom"),
+ legend.bg.color = "white",
+ legend.width = 0.4,
+ legend.title.size = 1.2,
+ legend.text.size = 0.5,
+ legend.hist.height = 0.27,
+ legend.hist.width = 0.4,
+ legend.hist.size = 0.6)
+```
+
+### Spatial distribution of users
+
+```{r}
+tm_shape(sg_boundary) +
+ tm_borders(col = "grey") +
+ tm_shape(sf_counts) +
+ tm_fill("n_users",
+ palette = "PuRd",
+ breaks = c(0, 100, 300, 500, 1000, 3000, 5000, 10000, 30000, 50000),
+ style = "fixed",
+ alpha = 0.8,
+ legend.is.portrait = F,
+ legend.hist = TRUE,
+ title = "Number of users") +
+ tm_shape(streets) +
+ tm_lines(col = rgb(80, 80, 80, maxColorValue = 255), lwd = 1, alpha = 0.8) +
+ tm_shape(area_hatched) +
+ tm_lines(col = rgb(80, 110, 120, maxColorValue = 255), alpha = 0.8) +
+ tm_layout(title.position = c("left", "top"),
+ main.title = "Spatial distribution of users",
+ main.title.size = 1.5,
+ frame = F,
+ legend.position = c("right", "bottom"),
+ legend.bg.color = "white",
+ legend.width = 0.4,
+ legend.title.size = 1.2,
+ legend.text.size = 0.5,
+ legend.hist.height = 0.27,
+ legend.hist.width = 0.4,
+ legend.hist.size = 0.6)
+```
+
+### Distribution of tweets and users over time
+
+```{r}
+counts_ym <- df %>%
+ mutate(year_month = format(df$created_at, "%Y-%m")) %>%
+ group_by(year_month) %>%
+ dplyr::summarise(n_tweets = n(),
+ n_users = n_distinct(u_id)) %>%
+ arrange(year_month)
+
+counts_ym %>%
+ arrange(year_month) %>%
+ mutate(year_month = factor(year_month, levels = year_month)) %>%
+ gather(key = "key", value = "value", -year_month) %>%
+ mutate(key = recode(key, "n_tweets" = "Number of Tweets", "n_users" = "Number of Users")) %>%
+ ggplot() +
+ geom_bar(aes(x = year_month, y = value, fill = key), stat = "identity") +
+ facet_wrap(~key, scales = "free_y", ncol = 1) +
+ theme_classic() +
+ theme(legend.position = "NULL",
+ axis.text.x = element_text(angle = 90)) +
+ labs(x = "", y = "Count", title = "Distribution of tweets and users from 2012 - 2016")
+```
diff --git a/analysis/02-dataset-information.md b/analysis/02-dataset-information.md
new file mode 100644
index 0000000..46055e8
--- /dev/null
+++ b/analysis/02-dataset-information.md
@@ -0,0 +1,122 @@
+Dataset information
+================
+Qingqing Chen
+Last compiled date: 17 September, 2021
+
+## Load data
+
+``` r
+df <- readRDS(here::here("data/derived_data/deidentified_sg_tweets_updated.rds"))
+sg_boundary <- readRDS(here("data/derived_data/sg_boundary.rds"))
+streets <- readRDS(here("data/derived_data/streets.rds"))
+area_hatched <- readRDS(here("data/derived_data/area_hatched.rds"))
+area_centers <- readRDS(here("data/derived_data/area_centers.rds"))
+grids <- read_sf(here("data/derived_data/spatial_hex_grid.shp"), quiet = T) %>%
+ st_transform(crs = 3414)
+```
+
+## Visualization
+
+``` r
+sf_counts <- df %>%
+ group_by(grid_id) %>%
+ summarise(n_tweets = n(),
+ n_users = n_distinct(u_id)) %>%
+ left_join(., grids, by = c("grid_id" = "grid_id")) %>%
+ st_as_sf()
+```
+
+### Spatial distribution of tweets
+
+``` r
+tm_shape(sg_boundary) +
+ tm_borders(col = "grey") +
+ tm_shape(sf_counts) +
+ tm_fill("n_tweets",
+ palette = "PuRd",
+ breaks = c(0, 500, 1000, 5000, 10000, 30000, 50000, 100000, 200000, 250000),
+ style = "fixed",
+ alpha = 0.8,
+ legend.is.portrait = F,
+ legend.hist = TRUE,
+ legend.format = list(text.align = "center"),
+ title = "Number of tweets") +
+ tm_shape(streets) +
+ tm_lines(col = rgb(80, 80, 80, maxColorValue = 255), lwd = 1, alpha = 0.8) +
+ tm_shape(area_hatched) +
+ tm_lines(col = rgb(80, 110, 120, maxColorValue = 255), alpha = 0.8) +
+ tm_layout(title.position = c("left", "top"),
+ main.title = "Spatial distribution of tweets",
+ main.title.size = 1.5,
+ frame = F,
+ legend.position = c("right", "bottom"),
+ legend.bg.color = "white",
+ legend.width = 0.4,
+ legend.title.size = 1.2,
+ legend.text.size = 0.5,
+ legend.hist.height = 0.27,
+ legend.hist.width = 0.4,
+ legend.hist.size = 0.6)
+```
+
+![](02-dataset-information_files/figure-gfm/unnamed-chunk-3-1.png)
+
+### Spatial distribution of users
+
+``` r
+tm_shape(sg_boundary) +
+ tm_borders(col = "grey") +
+ tm_shape(sf_counts) +
+ tm_fill("n_users",
+ palette = "PuRd",
+ breaks = c(0, 100, 300, 500, 1000, 3000, 5000, 10000, 30000, 50000),
+ style = "fixed",
+ alpha = 0.8,
+ legend.is.portrait = F,
+ legend.hist = TRUE,
+ title = "Number of users") +
+ tm_shape(streets) +
+ tm_lines(col = rgb(80, 80, 80, maxColorValue = 255), lwd = 1, alpha = 0.8) +
+ tm_shape(area_hatched) +
+ tm_lines(col = rgb(80, 110, 120, maxColorValue = 255), alpha = 0.8) +
+ tm_layout(title.position = c("left", "top"),
+ main.title = "Spatial distribution of users",
+ main.title.size = 1.5,
+ frame = F,
+ legend.position = c("right", "bottom"),
+ legend.bg.color = "white",
+ legend.width = 0.4,
+ legend.title.size = 1.2,
+ legend.text.size = 0.5,
+ legend.hist.height = 0.27,
+ legend.hist.width = 0.4,
+ legend.hist.size = 0.6)
+```
+
+![](02-dataset-information_files/figure-gfm/unnamed-chunk-4-1.png)
+
+### Distribution of tweets and users over time
+
+``` r
+counts_ym <- df %>%
+ mutate(year_month = format(df$created_at, "%Y-%m")) %>%
+ group_by(year_month) %>%
+ dplyr::summarise(n_tweets = n(),
+ n_users = n_distinct(u_id)) %>%
+ arrange(year_month)
+
+counts_ym %>%
+ arrange(year_month) %>%
+ mutate(year_month = factor(year_month, levels = year_month)) %>%
+ gather(key = "key", value = "value", -year_month) %>%
+ mutate(key = recode(key, "n_tweets" = "Number of Tweets", "n_users" = "Number of Users")) %>%
+ ggplot() +
+ geom_bar(aes(x = year_month, y = value, fill = key), stat = "identity") +
+ facet_wrap(~key, scales = "free_y", ncol = 1) +
+ theme_classic() +
+ theme(legend.position = "NULL",
+ axis.text.x = element_text(angle = 90)) +
+ labs(x = "", y = "Count", title = "Distribution of tweets and users from 2012 - 2016")
+```
+
+![](02-dataset-information_files/figure-gfm/unnamed-chunk-5-1.png)
diff --git a/analysis/02-dataset-information_files/figure-gfm/unnamed-chunk-3-1.png b/analysis/02-dataset-information_files/figure-gfm/unnamed-chunk-3-1.png
new file mode 100644
index 0000000..0866aef
Binary files /dev/null and b/analysis/02-dataset-information_files/figure-gfm/unnamed-chunk-3-1.png differ
diff --git a/analysis/02-dataset-information_files/figure-gfm/unnamed-chunk-4-1.png b/analysis/02-dataset-information_files/figure-gfm/unnamed-chunk-4-1.png
new file mode 100644
index 0000000..101c7df
Binary files /dev/null and b/analysis/02-dataset-information_files/figure-gfm/unnamed-chunk-4-1.png differ
diff --git a/analysis/02-dataset-information_files/figure-gfm/unnamed-chunk-5-1.png b/analysis/02-dataset-information_files/figure-gfm/unnamed-chunk-5-1.png
new file mode 100644
index 0000000..3607749
Binary files /dev/null and b/analysis/02-dataset-information_files/figure-gfm/unnamed-chunk-5-1.png differ
diff --git a/analysis/03-create-hexagonal-grids.Rmd b/analysis/03-create-hexagonal-grids.Rmd
new file mode 100644
index 0000000..a97008a
--- /dev/null
+++ b/analysis/03-create-hexagonal-grids.Rmd
@@ -0,0 +1,67 @@
+---
+title: "Spatial aggregation: create hexagonal grids"
+author: "Qingqing Chen"
+date: "Last compiled date: `r format(Sys.time(), '%d %B, %Y')`"
+output: github_document
+editor_options:
+ chunk_output_type: console
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE, fig.align='center', message = FALSE, warning = FALSE, fig.height = 10, fig.width = 10)
+library(tidyverse)
+library(sf)
+library(tmap)
+library(here)
+```
+
+### Load Singapore boundary
+
+```{r}
+sg_subzone <- read_sf(here("data/raw_data/sg-subzone/MP14_SUBZONE_NO_SEA_PL.shp"), quiet = T) %>%
+ st_transform(crs = 3414) %>%
+ st_make_valid()
+tm_shape(sg_subzone) +
+ tm_polygons(alpha = 0.2) +
+ tm_layout(frame = F)
+```
+
+### Aggregated to hexgonal grids
+
+```{r}
+# assign the spatial unit size to cellsize argument, here we use 750m
+# and transform coordinates of simple feature to Singapore: https://epsg.io/3414
+set.seed(1314)
+grids <- st_make_grid(sg_subzone, cellsize = 750, square = F) %>%
+ st_sf() %>%
+ rowid_to_column("grid_id") %>%
+ st_transform(crs = 3414) %>%
+ st_intersection(sg_subzone) %>%
+ select(grid_id) %>%
+ group_by(grid_id) %>%
+ summarise()
+tm_shape(grids) +
+ tm_polygons(alpha = 0.2) +
+ tm_layout(frame = F)
+```
+
+The created grid cells can be saved under `data/derived_data/` directory.
+
+```{r eval=FALSE}
+st_write(grids, here("data/derived_data/spatial_hex_grid.shp"), quiet = T)
+```
+
+```{r}
+sg_subzone %>%
+ group_by(REGION_N) %>%
+ summarise() %>%
+ mutate(REGION_N = gsub(" REGION", "", REGION_N)) %>%
+ dplyr::rename(Regions = REGION_N) %>%
+ tm_shape() +
+ tm_polygons(col = "Regions", alpha = 0.6) +
+ tm_shape(grids) +
+ tm_borders(col = "grey20") +
+ tm_layout(frame = F) +
+ tm_credits("Singapore map", position = c(0.85, 0.95))
+```
+
diff --git a/analysis/03-create-hexagonal-grids.md b/analysis/03-create-hexagonal-grids.md
new file mode 100644
index 0000000..8728c71
--- /dev/null
+++ b/analysis/03-create-hexagonal-grids.md
@@ -0,0 +1,61 @@
+Spatial aggregation: create hexagonal grids
+================
+Qingqing Chen
+Last compiled date: 17 September, 2021
+
+### Load Singapore boundary
+
+``` r
+sg_subzone <- read_sf(here("data/raw_data/sg-subzone/MP14_SUBZONE_NO_SEA_PL.shp"), quiet = T) %>%
+ st_transform(crs = 3414) %>%
+ st_make_valid()
+tm_shape(sg_subzone) +
+ tm_polygons(alpha = 0.2) +
+ tm_layout(frame = F)
+```
+
+
+
+### Aggregated to hexgonal grids
+
+``` r
+# assign the spatial unit size to cellsize argument, here we use 750m
+# and transform coordinates of simple feature to Singapore: https://epsg.io/3414
+set.seed(1314)
+grids <- st_make_grid(sg_subzone, cellsize = 750, square = F) %>%
+ st_sf() %>%
+ rowid_to_column("grid_id") %>%
+ st_transform(crs = 3414) %>%
+ st_intersection(sg_subzone) %>%
+ select(grid_id) %>%
+ group_by(grid_id) %>%
+ summarise()
+tm_shape(grids) +
+ tm_polygons(alpha = 0.2) +
+ tm_layout(frame = F)
+```
+
+
+
+The created grid cells can be saved under `data/derived_data/`
+directory.
+
+``` r
+st_write(grids, here("data/derived_data/spatial_hex_grid.shp"), quiet = T)
+```
+
+``` r
+sg_subzone %>%
+ group_by(REGION_N) %>%
+ summarise() %>%
+ mutate(REGION_N = gsub(" REGION", "", REGION_N)) %>%
+ dplyr::rename(Regions = REGION_N) %>%
+ tm_shape() +
+ tm_polygons(col = "Regions", alpha = 0.6) +
+ tm_shape(grids) +
+ tm_borders(col = "grey20") +
+ tm_layout(frame = F) +
+ tm_credits("Singapore map", position = c(0.85, 0.95))
+```
+
+
diff --git a/analysis/03-create-hexagonal-grids_files/figure-gfm/unnamed-chunk-1-1.png b/analysis/03-create-hexagonal-grids_files/figure-gfm/unnamed-chunk-1-1.png
new file mode 100644
index 0000000..c8a9fd6
Binary files /dev/null and b/analysis/03-create-hexagonal-grids_files/figure-gfm/unnamed-chunk-1-1.png differ
diff --git a/analysis/03-create-hexagonal-grids_files/figure-gfm/unnamed-chunk-2-1.png b/analysis/03-create-hexagonal-grids_files/figure-gfm/unnamed-chunk-2-1.png
new file mode 100644
index 0000000..292a17e
Binary files /dev/null and b/analysis/03-create-hexagonal-grids_files/figure-gfm/unnamed-chunk-2-1.png differ
diff --git a/analysis/03-create-hexagonal-grids_files/figure-gfm/unnamed-chunk-4-1.png b/analysis/03-create-hexagonal-grids_files/figure-gfm/unnamed-chunk-4-1.png
new file mode 100644
index 0000000..e5fa269
Binary files /dev/null and b/analysis/03-create-hexagonal-grids_files/figure-gfm/unnamed-chunk-4-1.png differ
diff --git a/analysis/04-identify-home-locations.Rmd b/analysis/04-identify-home-locations.Rmd
new file mode 100644
index 0000000..991c606
--- /dev/null
+++ b/analysis/04-identify-home-locations.Rmd
@@ -0,0 +1,240 @@
+---
+title: "Identifying home locations for users"
+author: "Qingqing Chen"
+date: "Last compiled date: `r format(Sys.time(), '%d %B, %Y')`"
+output: github_document
+editor_options:
+ chunk_output_type: console
+bibliography: bibliography.bib
+---
+
+
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE, fig.width = 10, fig.height = 10)
+library(tidyverse)
+library(sf)
+library(here)
+library(lubridate)
+library(ggpubr)
+library(tmap)
+```
+
+To identify home locations for users from their spatio-temporal footprints, we use an ensemble approach proposed by Chen and Poorthuis [-@homelocator] in their paper named ["_Identifying Home Locations in Human Mobility Data: An Open-Source r Package for Comparison and Reproducibility_"](https://www.tandfonline.com/doi/abs/10.1080/13658816.2021.1887489). The introduced open-source R package, [homelocator](https://github.com/spatialnetworkslab/homelocator), provides a consistent framework and interface for the adoption of different approaches to home location identification.
+
+First, we need to install and load the `homelocator` package with:
+
+```{r eval=FALSE}
+install_github("spatialnetworkslab/homelocator")
+library(homelocator)
+```
+
+
+## Load data
+
+```{r}
+# grid cells
+grids <- st_read(here("data/derived_data/spatial_hex_grid.shp"), quiet = T) %>%
+ st_transform(crs = 3414)
+```
+
+
+```{r eval=FALSE}
+# de-identified tweets
+df <- read_csv(here("data/raw_data/deidentified_sg_tweets.csv")) %>% #load de-identified dataset
+ mutate(created_at = with_tz(created_at, tzone = "Asia/Singapore")) # the tweets were sent in Singapore, so must convert the timezone to SGT, the default timezone is UTC!
+df_counts <- df %>%
+ group_by(u_id) %>%
+ nest() %>%
+ mutate(n_tweets = map_dbl(data, nrow),
+ n_locs = map_dbl(data, function(x) n_distinct(x$grid_id)))
+df <- df_counts %>%
+ filter(n_tweets >= 10 & n_locs > 1) %>% # remove users with less than 10 tweets or only tweet at a single place
+ unnest(data) %>%
+ dplyr::select(-c(n_tweets, n_locs)) %>%
+ ungroup()
+```
+
+The updated de-identified dataset is in `analysis/data/derived_data`.
+
+```{r eval=FALSE}
+saveRDS(df, file = here("data/derived_data/deidentified_sg_tweets_updated.rds"))
+```
+
+
+## Identify home locations
+
+Finally, we apply the four built-in recipes from `homelocator` package on the de-identified data set and only assign home location for users if four algorithms agree.
+
+
+### Recipe: APDM
+
+Recipe 'APDM' (Rein Ahas et al., 2010) calculates both the average and standard deviation timestamps in each location for each user.
+
+```{r eval=FALSE}
+#generate grid neighbors
+st_queen <- function(a, b = a) st_relate(a, b, pattern = "F***T****")
+neighbors <- st_queen(grids)
+
+#convert list to tibble
+list_to_tibble <- function(index, neighbors){
+ tibble(grid_id = as.character(index)) %>%
+ mutate(neighbor = list(neighbors[[index]]))
+}
+df_neighbors <- do.call(rbind, map(1:length(neighbors), function(x) list_to_tibble(x, neighbors)))
+
+#recipe: APDM
+hm_apdm <- homelocator::identify_location(df, user = "u_id", timestamp = "created_at", location = "grid_id",
+ tz = "Asia/Singapore", keep_score = F, recipe = "APDM")
+```
+
+
+### Recipe: FREQ
+
+Recipe 'FREQ' simply selects the most frequently ‘visited’ location as users' home locations.
+
+```{r eval=FALSE}
+hm_freq <- homelocator::identify_location(df, user = "u_id", timestamp = "created_at", location = "grid_id",
+ tz = "Asia/Singapore", show_n_loc = 1, recipe = "FREQ")
+```
+
+### Recipe: HMLC
+
+Recipe ‘HMLC’ weighs data points across multiple time frames to ‘score’ potentially meaningful locations.
+
+
+```{r eval=FALSE}
+hm_hmlc <- homelocator::identify_location(df, user = "u_id", timestamp = "created_at", location = "grid_id",
+ tz = "Asia/Singapore", show_n_loc = 1, keep_score = F, recipe = "HMLC")
+```
+
+### Recipe: OSNA
+
+Recipe 'OSNA' (Efstathiades et al., 2015), only considers data points sent on weekdays and divides a day into three time frames - ‘rest time’, ‘leisure time’ and ‘active time’. The algorithm finds the most ‘popular’ location during ‘rest’ and ‘leisure’ time as the home locations for users.
+
+
+```{r eval=FALSE}
+hm_osna <- homelocator::identify_location(df, user = "u_id", timestamp = "created_at", location = "grid_id",
+ tz = "Asia/Singapore", show_n_loc = 1, recipe = "OSNA")
+```
+
+
+The identified homes are in `data/derived_data`.
+
+```{r eval=FALSE}
+write_csv(hm_apdm, path = here("data/derived_data/hm_apdm.csv"))
+write_csv(hm_freq, path = here("data/derived_data/hm_freq.csv"))
+write_csv(hm_hmlc, path = here("data/derived_data/hm_hmlc.csv"))
+write_csv(hm_osna, path = here("data/derived_data/hm_osna.csv"))
+```
+
+### Assign agreed home locations for users
+
+```{r eval=FALSE}
+## find users with identified home locations that all four algorithms agree
+qualified_uses <- hm_full %>%
+ group_by(u_id) %>%
+ dplyr::summarise(method = dplyr::n_distinct(name),
+ homes = dplyr::n_distinct(home)) %>%
+ filter(method == 4) %>% ## all four algorithms can find the home for the user
+ filter(homes == 1) ## all four algorithms find the SAME home for the user
+
+identified_hms <- hm_full %>%
+ filter(u_id %in% qualified_uses$u_id) %>%
+ dplyr::select(-name) %>%
+ distinct(u_id, home, .keep_all = TRUE)
+```
+
+The identified home locations are saved under `data/derived_data/` directory.
+
+```{r eval=FALSE}
+write_csv(identified_hms, path = here("data/derived_data/identified_hms.csv"))
+```
+
+
+## Visualize identified home locations
+
+```{r}
+identified_hms <- read_csv(here("data/derived_data/identified_hms.csv")) %>%
+ group_by(home) %>%
+ dplyr::summarise(n_users_home = n_distinct(u_id)) %>%
+ left_join(., grids, by = c("home" = "grid_id")) %>%
+ replace(., is.na(.), 0) %>%
+ st_as_sf()
+```
+
+
+```{r}
+sg_boundary <- readRDS(here("data/derived_data/sg_boundary.rds"))
+streets <- readRDS(here("data/derived_data/streets.rds"))
+area_hatched <- readRDS(here("data/derived_data/area_hatched.rds"))
+area_centers <- readRDS(here("data/derived_data/area_centers.rds"))
+
+tm_shape(sg_boundary) +
+ tm_borders(col = "grey") +
+ tm_shape(identified_hms) +
+ tm_fill("n_users_home",
+ palette = "PuRd",
+ style = "quantile",
+ alpha = 0.8,
+ legend.is.portrait = F,
+ legend.format = list(digits = 0),
+ legend.hist = TRUE,
+ title = "Number of inferred homes") +
+ tm_shape(streets) +
+ tm_lines(col = rgb(80, 80, 80, maxColorValue = 255), lwd = 1, alpha = 0.8) +
+ tm_shape(area_hatched) +
+ tm_lines(col = rgb(80, 110, 120, maxColorValue = 255), alpha = 0.8) +
+ tm_layout(title.position = c("left", "top"),
+ title.size = 0.7,
+ main.title = "Spatial distribution of inferred home locations",
+ main.title.size = 1.2,
+ frame = F,
+ legend.position = c("right", "bottom"),
+ legend.bg.color = "white",
+ legend.width = 0.3,
+ legend.title.size = 0.8,
+ legend.text.size = 0.66,
+ legend.hist.height = 0.2,
+ legend.hist.width = 0.3,
+ legend.hist.size = 0.5)
+```
+
+
+```{r}
+#residents in Singapore 2015
+pop2015 <- st_read(here("data/raw_data/PLAN_BDY_DWELLING_TYPE_2015.shp"), quiet = T) %>%
+ st_transform(., crs = 3414) %>%
+ st_make_valid()
+
+inferred_residents <- pop2015 %>%
+ dplyr::select(PLN_AREA_N) %>%
+ st_join(identified_hms, ., largest = T) %>%
+ st_set_geometry(NULL) %>%
+ group_by(PLN_AREA_N) %>%
+ dplyr::summarise(n_inferred_residents = sum(n_users_home))
+
+actual_residents <- pop2015 %>%
+ st_set_geometry(NULL) %>%
+ select(c(PLN_AREA_N, TOTAL))
+
+norm_residents <- left_join(inferred_residents, actual_residents) %>%
+ mutate(norm_n_inferred_residents = n_inferred_residents/sum(n_inferred_residents),
+ norm_n_actual_residents = TOTAL/sum(TOTAL))
+
+
+ggscatter(norm_residents, x = "norm_n_inferred_residents", y = "norm_n_actual_residents",
+ add = "reg.line") +
+ stat_cor(label.y = 0.065) +
+ stat_regline_equation(label.y = 0.07) +
+ ggrepel::geom_text_repel(aes(label=PLN_AREA_N), size = 2.5) +
+ theme(panel.background=element_rect(fill = "white", colour = "black"),
+ title = element_text(size = 12)) +
+ labs(x = "Normalized number of inferred residents",
+ y = "Normalized number of actual residents",
+ title = "Correlation between inferred residents and actual residents")
+```
+
+## Reference
diff --git a/analysis/04-identify-home-locations.md b/analysis/04-identify-home-locations.md
new file mode 100644
index 0000000..5fafa52
--- /dev/null
+++ b/analysis/04-identify-home-locations.md
@@ -0,0 +1,250 @@
+Identifying home locations for users
+================
+Qingqing Chen
+Last compiled date: 18 September, 2021
+
+
+
+To identify home locations for users from their spatio-temporal
+footprints, we use an ensemble approach proposed by Chen and Poorthuis
+(2021) in their paper named [“*Identifying Home Locations in Human
+Mobility Data: An Open-Source r Package for Comparison and
+Reproducibility*”](https://www.tandfonline.com/doi/abs/10.1080/13658816.2021.1887489).
+The introduced open-source R package,
+[homelocator](https://github.com/spatialnetworkslab/homelocator),
+provides a consistent framework and interface for the adoption of
+different approaches to home location identification.
+
+First, we need to install and load the `homelocator` package with:
+
+``` r
+install_github("spatialnetworkslab/homelocator")
+library(homelocator)
+```
+
+## Load data
+
+``` r
+# grid cells
+grids <- st_read(here("data/derived_data/spatial_hex_grid.shp"), quiet = T) %>%
+ st_transform(crs = 3414)
+```
+
+``` r
+# de-identified tweets
+df <- read_csv(here("data/raw_data/deidentified_sg_tweets.csv")) %>% #load de-identified dataset
+ mutate(created_at = with_tz(created_at, tzone = "Asia/Singapore")) # the tweets were sent in Singapore, so must convert the timezone to SGT, the default timezone is UTC!
+df_counts <- df %>%
+ group_by(u_id) %>%
+ nest() %>%
+ mutate(n_tweets = map_dbl(data, nrow),
+ n_locs = map_dbl(data, function(x) n_distinct(x$grid_id)))
+df <- df_counts %>%
+ filter(n_tweets >= 10 & n_locs > 1) %>% # remove users with less than 10 tweets or only tweet at a single place
+ unnest(data) %>%
+ dplyr::select(-c(n_tweets, n_locs)) %>%
+ ungroup()
+```
+
+The updated de-identified dataset is in `analysis/data/derived_data`.
+
+``` r
+saveRDS(df, file = here("data/derived_data/deidentified_sg_tweets_updated.rds"))
+```
+
+## Identify home locations
+
+Finally, we apply the four built-in recipes from `homelocator` package
+on the de-identified data set and only assign home location for users if
+four algorithms agree.
+
+### Recipe: APDM
+
+Recipe ‘APDM’ (Rein Ahas et al., 2010) calculates both the average and
+standard deviation timestamps in each location for each user.
+
+``` r
+#generate grid neighbors
+st_queen <- function(a, b = a) st_relate(a, b, pattern = "F***T****")
+neighbors <- st_queen(grids)
+
+#convert list to tibble
+list_to_tibble <- function(index, neighbors){
+ tibble(grid_id = as.character(index)) %>%
+ mutate(neighbor = list(neighbors[[index]]))
+}
+df_neighbors <- do.call(rbind, map(1:length(neighbors), function(x) list_to_tibble(x, neighbors)))
+
+#recipe: APDM
+hm_apdm <- homelocator::identify_location(df, user = "u_id", timestamp = "created_at", location = "grid_id",
+ tz = "Asia/Singapore", keep_score = F, recipe = "APDM")
+```
+
+### Recipe: FREQ
+
+Recipe ‘FREQ’ simply selects the most frequently ‘visited’ location as
+users’ home locations.
+
+``` r
+hm_freq <- homelocator::identify_location(df, user = "u_id", timestamp = "created_at", location = "grid_id",
+ tz = "Asia/Singapore", show_n_loc = 1, recipe = "FREQ")
+```
+
+### Recipe: HMLC
+
+Recipe ‘HMLC’ weighs data points across multiple time frames to ‘score’
+potentially meaningful locations.
+
+``` r
+hm_hmlc <- homelocator::identify_location(df, user = "u_id", timestamp = "created_at", location = "grid_id",
+ tz = "Asia/Singapore", show_n_loc = 1, keep_score = F, recipe = "HMLC")
+```
+
+### Recipe: OSNA
+
+Recipe ‘OSNA’ (Efstathiades et al., 2015), only considers data points
+sent on weekdays and divides a day into three time frames - ‘rest time’,
+‘leisure time’ and ‘active time’. The algorithm finds the most ‘popular’
+location during ‘rest’ and ‘leisure’ time as the home locations for
+users.
+
+``` r
+hm_osna <- homelocator::identify_location(df, user = "u_id", timestamp = "created_at", location = "grid_id",
+ tz = "Asia/Singapore", show_n_loc = 1, recipe = "OSNA")
+```
+
+The identified homes are in `data/derived_data`.
+
+``` r
+write_csv(hm_apdm, path = here("data/derived_data/hm_apdm.csv"))
+write_csv(hm_freq, path = here("data/derived_data/hm_freq.csv"))
+write_csv(hm_hmlc, path = here("data/derived_data/hm_hmlc.csv"))
+write_csv(hm_osna, path = here("data/derived_data/hm_osna.csv"))
+```
+
+### Assign agreed home locations for users
+
+``` r
+## find users with identified home locations that all four algorithms agree
+qualified_uses <- hm_full %>%
+ group_by(u_id) %>%
+ dplyr::summarise(method = dplyr::n_distinct(name),
+ homes = dplyr::n_distinct(home)) %>%
+ filter(method == 4) %>% ## all four algorithms can find the home for the user
+ filter(homes == 1) ## all four algorithms find the SAME home for the user
+
+identified_hms <- hm_full %>%
+ filter(u_id %in% qualified_uses$u_id) %>%
+ dplyr::select(-name) %>%
+ distinct(u_id, home, .keep_all = TRUE)
+```
+
+The identified home locations are saved under `data/derived_data/`
+directory.
+
+``` r
+write_csv(identified_hms, path = here("data/derived_data/identified_hms.csv"))
+```
+
+## Visualize identified home locations
+
+``` r
+identified_hms <- read_csv(here("data/derived_data/identified_hms.csv")) %>%
+ group_by(home) %>%
+ dplyr::summarise(n_users_home = n_distinct(u_id)) %>%
+ left_join(., grids, by = c("home" = "grid_id")) %>%
+ replace(., is.na(.), 0) %>%
+ st_as_sf()
+```
+
+``` r
+sg_boundary <- readRDS(here("data/derived_data/sg_boundary.rds"))
+streets <- readRDS(here("data/derived_data/streets.rds"))
+area_hatched <- readRDS(here("data/derived_data/area_hatched.rds"))
+area_centers <- readRDS(here("data/derived_data/area_centers.rds"))
+
+tm_shape(sg_boundary) +
+ tm_borders(col = "grey") +
+ tm_shape(identified_hms) +
+ tm_fill("n_users_home",
+ palette = "PuRd",
+ style = "quantile",
+ alpha = 0.8,
+ legend.is.portrait = F,
+ legend.format = list(digits = 0),
+ legend.hist = TRUE,
+ title = "Number of inferred homes") +
+ tm_shape(streets) +
+ tm_lines(col = rgb(80, 80, 80, maxColorValue = 255), lwd = 1, alpha = 0.8) +
+ tm_shape(area_hatched) +
+ tm_lines(col = rgb(80, 110, 120, maxColorValue = 255), alpha = 0.8) +
+ tm_layout(title.position = c("left", "top"),
+ title.size = 0.7,
+ main.title = "Spatial distribution of inferred home locations",
+ main.title.size = 1.2,
+ frame = F,
+ legend.position = c("right", "bottom"),
+ legend.bg.color = "white",
+ legend.width = 0.3,
+ legend.title.size = 0.8,
+ legend.text.size = 0.66,
+ legend.hist.height = 0.2,
+ legend.hist.width = 0.3,
+ legend.hist.size = 0.5)
+```
+
+![](04-identify-home-locations_files/figure-gfm/unnamed-chunk-13-1.png)
+
+``` r
+#residents in Singapore 2015
+pop2015 <- st_read(here("data/raw_data/PLAN_BDY_DWELLING_TYPE_2015.shp"), quiet = T) %>%
+ st_transform(., crs = 3414) %>%
+ st_make_valid()
+
+inferred_residents <- pop2015 %>%
+ dplyr::select(PLN_AREA_N) %>%
+ st_join(identified_hms, ., largest = T) %>%
+ st_set_geometry(NULL) %>%
+ group_by(PLN_AREA_N) %>%
+ dplyr::summarise(n_inferred_residents = sum(n_users_home))
+
+actual_residents <- pop2015 %>%
+ st_set_geometry(NULL) %>%
+ select(c(PLN_AREA_N, TOTAL))
+
+norm_residents <- left_join(inferred_residents, actual_residents) %>%
+ mutate(norm_n_inferred_residents = n_inferred_residents/sum(n_inferred_residents),
+ norm_n_actual_residents = TOTAL/sum(TOTAL))
+
+
+ggscatter(norm_residents, x = "norm_n_inferred_residents", y = "norm_n_actual_residents",
+ add = "reg.line") +
+ stat_cor(label.y = 0.065) +
+ stat_regline_equation(label.y = 0.07) +
+ ggrepel::geom_text_repel(aes(label=PLN_AREA_N), size = 2.5) +
+ theme(panel.background=element_rect(fill = "white", colour = "black"),
+ title = element_text(size = 12)) +
+ labs(x = "Normalized number of inferred residents",
+ y = "Normalized number of actual residents",
+ title = "Correlation between inferred residents and actual residents")
+```
+
+![](04-identify-home-locations_files/figure-gfm/unnamed-chunk-14-1.png)
+
+## Reference
+
+
+
+
+
+Chen, Q. and Poorthuis, A. 2021. *Identifying Home Locations in Human
+Mobility Data: An Open-Source r Package for Comparison and
+Reproducibility*. *International Journal of Geographical Information
+Science, 0(0), Pp. 1–24.*
+.
+
+
+
+
diff --git a/analysis/04-identify-home-locations_files/figure-gfm/unnamed-chunk-13-1.png b/analysis/04-identify-home-locations_files/figure-gfm/unnamed-chunk-13-1.png
new file mode 100644
index 0000000..d3e8a2b
Binary files /dev/null and b/analysis/04-identify-home-locations_files/figure-gfm/unnamed-chunk-13-1.png differ
diff --git a/analysis/04-identify-home-locations_files/figure-gfm/unnamed-chunk-14-1.png b/analysis/04-identify-home-locations_files/figure-gfm/unnamed-chunk-14-1.png
new file mode 100644
index 0000000..7d40eb3
Binary files /dev/null and b/analysis/04-identify-home-locations_files/figure-gfm/unnamed-chunk-14-1.png differ
diff --git a/analysis/05-distinct-locals-and-visitors.Rmd b/analysis/05-distinct-locals-and-visitors.Rmd
new file mode 100644
index 0000000..328bbb9
--- /dev/null
+++ b/analysis/05-distinct-locals-and-visitors.Rmd
@@ -0,0 +1,72 @@
+---
+title: "Distinct locals and visitors"
+author: "Qingqing Chen"
+date: "Last compiled date: `r format(Sys.time(), '%d %B, %Y')`"
+output: github_document
+editor_options:
+ chunk_output_type: console
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = F)
+library(tidyverse)
+library(sf)
+library(here)
+library(purrrogress)
+```
+
+Locals are users with homes within that specific location or hexagonal cell. Conversely, visitors are all other users with homes outside of the cell.
+
+## Load data
+
+```{r}
+# load aggregated grids
+grids <- read_sf(here("data/derived_data/spatial_hex_grid.shp"), quiet = T) %>%
+ st_transform(crs = 3414)
+
+# load twitter dataset
+df <- readRDS(here("data/derived_data/deidentified_sg_tweets_updated.rds"))
+
+# load identified home locations
+identified_hms <- read_csv(here("data/derived_data/identified_hms.csv"))
+```
+
+
+## Distinct locals and visitors
+
+We only consider users with identified home locations. For each grid, a user is assigned as 'local' or 'visitor' according to his/her home location.
+
+- **Local**: If the visited grid is the same as the home location of the user, the user is assigned as 'local'.
+- **Visitor**: If the visited grid is different from the home location of the user, the user is assigned as 'visitor'.
+
+The distinct users in grids are saved under `data/derived_data/` directory.
+
+```{r}
+# users in each of grids
+users_in_grids <- df %>% dplyr::select(-created_at) %>% distinct(u_id, grid_id)
+
+# distinct users
+distinct_users_in_grid <- function(df_users_in_grids, df_home, grid_index){
+ output <- df_users_in_grids %>%
+ filter(grid_id == grid_index) %>%
+ left_join(., df_home) %>%
+ drop_na() %>% #remove users without identified home location
+ mutate(type = if_else(grid_id == home, "local", "visitor"))
+ return(output)
+}
+
+if(file.exists(here("data/derived_data/distinct_users.rds"))){
+ distinct_users <- readRDS(here("data/derived_data/distinct_users.rds"))
+}else{
+ distinct_users <- do.call(bind_rows, map(grids$grid_id, with_progress(function(x) distinct_users_in_grid(users_in_grids, identified_hms, x))))
+ saveRDS(distinct_users, file = here("data/derived_data/distinct_users.rds"))
+}
+
+head(distinct_users)
+```
+
+
+
+
+
+
diff --git a/analysis/05-distinct-locals-and-visitors.md b/analysis/05-distinct-locals-and-visitors.md
new file mode 100644
index 0000000..381e694
--- /dev/null
+++ b/analysis/05-distinct-locals-and-visitors.md
@@ -0,0 +1,70 @@
+Distinct locals and visitors
+================
+Qingqing Chen
+Last compiled date: 18 September, 2021
+
+Locals are users with homes within that specific location or hexagonal
+cell. Conversely, visitors are all other users with homes outside of the
+cell.
+
+## Load data
+
+``` r
+# load aggregated grids
+grids <- read_sf(here("data/derived_data/spatial_hex_grid.shp"), quiet = T) %>%
+ st_transform(crs = 3414)
+
+# load twitter dataset
+df <- readRDS(here("data/derived_data/deidentified_sg_tweets_updated.rds"))
+
+# load identified home locations
+identified_hms <- read_csv(here("data/derived_data/identified_hms.csv"))
+```
+
+## Distinct locals and visitors
+
+We only consider users with identified home locations. For each grid, a
+user is assigned as ‘local’ or ‘visitor’ according to his/her home
+location.
+
+- **Local**: If the visited grid is the same as the home location of
+ the user, the user is assigned as ‘local’.
+- **Visitor**: If the visited grid is different from the home location
+ of the user, the user is assigned as ‘visitor’.
+
+The distinct users in grids are saved under `data/derived_data/`
+directory.
+
+``` r
+# users in each of grids
+users_in_grids <- df %>% dplyr::select(-created_at) %>% distinct(u_id, grid_id)
+
+# distinct users
+distinct_users_in_grid <- function(df_users_in_grids, df_home, grid_index){
+ output <- df_users_in_grids %>%
+ filter(grid_id == grid_index) %>%
+ left_join(., df_home) %>%
+ drop_na() %>% #remove users without identified home location
+ mutate(type = if_else(grid_id == home, "local", "visitor"))
+ return(output)
+}
+
+if(file.exists(here("data/derived_data/distinct_users.rds"))){
+ distinct_users <- readRDS(here("data/derived_data/distinct_users.rds"))
+}else{
+ distinct_users <- do.call(bind_rows, map(grids$grid_id, with_progress(function(x) distinct_users_in_grid(users_in_grids, identified_hms, x))))
+ saveRDS(distinct_users, file = here("data/derived_data/distinct_users.rds"))
+}
+
+head(distinct_users)
+```
+
+ ## # A tibble: 6 × 4
+ ## u_id grid_id home type
+ ##
+ ## 1 82965004 3 1594 visitor
+ ## 2 90218329 3 626 visitor
+ ## 3 49010226 3 1461 visitor
+ ## 4 37768255 4 1158 visitor
+ ## 5 37716383 8 1387 visitor
+ ## 6 12470940 8 442 visitor
diff --git a/analysis/06-calculate-distance.Rmd b/analysis/06-calculate-distance.Rmd
new file mode 100644
index 0000000..d9c55af
--- /dev/null
+++ b/analysis/06-calculate-distance.Rmd
@@ -0,0 +1,230 @@
+---
+title: "Distance of flows"
+author: "Qingqing Chen"
+date: "Last compiled date: `r format(Sys.time(), '%d %B, %Y')`"
+output: github_document
+editor_options:
+ chunk_output_type: console
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE, fig.align='center', fig.width = 10, fig.height = 10, warning = FALSE, message = FALSE)
+library(tidyverse)
+library(dplyr)
+library(sf)
+library(tmap)
+library(RColorBrewer)
+library(purrrogress)
+library(here)
+source(here("R/viz.R"))
+```
+
+
+
+
+
+We calculate the spatial distance from the centroid of a home location to the centroid of each visited location. The mean distance across all users in a location is used for the visualizations and the spatial regression modeling in the paper.
+
+
+## Load data
+
+```{r}
+# full grids
+grids <- read_sf(here("data/derived_data/spatial_hex_grid.shp"), quiet = T) %>%
+ st_transform(crs = 3414)
+
+# grid centroids
+grid_centroids <- grids %>% st_centroid()
+
+# identified home users
+df_hms <- read_csv(here("data/derived_data/identified_hms.csv"))
+
+# sg tweets
+df_tweets <- readRDS(here("data/derived_data/deidentified_sg_tweets_updated.rds"))
+## sg tweets sent by users with identified home locations
+df_hm_tweets <- df_tweets %>% filter(u_id %in% df_hms$u_id)
+
+# distinct users
+distinct_users <- readRDS(here("data/derived_data/distinct_users.rds"))
+```
+
+
+## Distance calculation
+
+```{r}
+# function for distance calculation
+cal_distance <- function(distinct_users, df_hm_tweets, type, grids, grid_index){
+ # step1: get visitors/locals in the input grid
+ if(type == "inflow"){
+ # visitors
+ users_in_grids <- distinct_users %>%
+ filter(grid_id == grid_index) %>% # all users in the input grid
+ filter(type == "visitor")
+ } else if(type == "outflow"){
+ # locals
+ users_in_grids <- distinct_users %>%
+ filter(grid_id == grid_index) %>% # all users in the input grid
+ filter(type == "local")
+ }
+
+ # step2: remove grids with fewer than 5 visitors/locals
+ if(n_distinct(users_in_grids$u_id) < 5){
+ message("Less than 5 users in the grid, remove this grid!")
+ output <- tibble()
+ } else{
+ # step3: get tweets for visitors/locals
+ if(type == "inflow"){
+ df_tweets <- df_hm_tweets %>%
+ filter(grid_id == grid_index) %>% # filter all tweets in the target grid
+ left_join(users_in_grids, .) # get tweets send by visitors
+ }else if(type == "outflow"){
+ df_tweets <- df_hm_tweets %>%
+ filter(u_id %in% users_in_grids$u_id) %>% # tweets sent by locals
+ filter(grid_id != grid_index) %>% # remove tweets sent at home place
+ mutate(home = grid_index) # add home location
+ }
+ # step 4: calculate distance
+ # geometry of home grid
+ home_geometry <- df_tweets %>%
+ left_join(., grids, by = c("home" = "grid_id")) %>%
+ st_as_sf() %>%
+ st_centroid() %>%
+ st_geometry()
+ # geometry of outside home grids
+ grid_geometry <- df_tweets %>%
+ left_join(., grids, by = "grid_id") %>%
+ st_as_sf() %>%
+ st_centroid() %>%
+ st_geometry()
+ # calculate distance
+ output <- df_tweets %>%
+ mutate(dist_hm2grid = st_distance(home_geometry, grid_geometry, by_element = TRUE) %>% as.numeric(),
+ dist_hm2grid_km = round(dist_hm2grid/1000, 4))
+ }
+ return(output)
+}
+```
+
+### Inflow distance
+
+The inflow distance is saved under `data/derived_data/` directory.
+
+```{r}
+# inflow distance
+if(file.exists(here("data/derived_data/dist_visitor_points.rds"))){
+ dist_visitor_points <- readRDS(here("data/derived_data/dist_visitor_points.rds"))
+}else{
+ input_grids <- unique(distinct_users$grid_id)
+ dist_visitor_points <- do.call(bind_rows, map(input_grids, with_progress(function(x) cal_distance(distinct_users, df_hm_tweets, type = "inflow", grids, x))))
+ dist_visitor_points <- dist_visitor_points %>%
+ dplyr::select(grid_id, u_id, home, created_at, dist_hm2grid, dist_hm2grid_km)
+ saveRDS(dist_visitor_points, here("data/derived_data/dist_visitor_points.rds"))
+}
+```
+
+### Outflow distance
+
+The outflow distance is saved under `data/derived_data/` directory.
+
+```{r}
+# outflow distance
+if(file.exists(here("data/derived_data/dist_local_points.rds"))){
+ dist_local_points <- readRDS(here("data/derived_data/dist_local_points.rds"))
+}else{
+ input_grids <- unique(distinct_users$grid_id)
+ dist_local_points <- do.call(bind_rows, map(input_grids, with_progress(function(x) cal_distance(distinct_users, df_hm_tweets, type = "outflow", grids, x))))
+ dist_local_points <- dist_local_points %>%
+ dplyr::select(home, u_id, grid_id, created_at, dist_hm2grid, dist_hm2grid_km)
+ saveRDS(dist_local_points, here("data/derived_data/dist_local_points.rds"))
+}
+```
+
+
+## Distance visualizaiton
+
+### Inflow distance: spatial distribution
+
+```{r}
+mean_sd_inflow_distance <- dist_visitor_points %>%
+ group_by(grid_id, u_id) %>%
+ dplyr::summarise(mean_dist_user = mean(dist_hm2grid_km)) %>%
+ group_by(grid_id) %>%
+ dplyr::summarise(mean_dist_grid = mean(mean_dist_user, na.rm = T) %>% round(., 2),
+ sd_dist_grid = sd(mean_dist_user, na.rm = T) %>% round(., 2)) %>%
+ left_join(., grids) %>%
+ st_as_sf()
+
+spatial_viz(mean_sd_inflow_distance,
+ fill_var = "mean_dist_grid",
+ legend_title = "Avg.distance (km)",
+ main_title = "(a) Distance of Flows: average incoming distance",
+ main.title_size = 1.4,
+ legend.hist_height = 0.25, legend.hist_width = 0.5,
+ legend_width = 0.4, legend.hist_size = 0.5,
+ legend.title_size = 1.2, legend.text_size = 0.65)
+
+spatial_viz(mean_sd_inflow_distance, fill_var = "sd_dist_grid",
+ legend_title = "S.D.distance (km)",
+ main_title = "(b) Distance of Flows: S.D. of incoming distance",
+ main.title_size = 1.4,
+ legend.hist_height = 0.25, legend.hist_width = 0.5,
+ legend_width = 0.4, legend.hist_size = 0.5,
+ legend.title_size = 1.2, legend.text_size = 0.65)
+```
+
+### Inflow distance: violin plot
+
+```{r fig.height=4}
+violin_viz(mean_sd_inflow_distance, var = "mean_dist_grid",
+ labs.x = "", labs.y = "Avg.distance (km)", breaks = seq(0, 30, 2),
+ y.shift = 3.5, x.shift = 1.4, text.size = 4)
+
+violin_viz(mean_sd_inflow_distance, var = "sd_dist_grid",
+ labs.x = "", labs.y = "Avg.distance (km)", breaks = seq(0, 12, 2),
+ y.shift = 1.8, x.shift = 1.4, text.size = 4)
+```
+
+### Outflow distance: spatial distribution
+
+```{r}
+mean_sd_outflow_distance <- dist_local_points %>%
+ group_by(home, u_id) %>%
+ dplyr::summarise(mean_dist_user = mean(dist_hm2grid_km)) %>%
+ group_by(home) %>%
+ dplyr::summarise(mean_dist_grid = mean(mean_dist_user) %>% round(., 2),
+ sd_dist_grid = sd(mean_dist_user) %>% round(., 2)) %>%
+ left_join(., grids, by = c("home" = "grid_id")) %>%
+ st_as_sf()
+
+spatial_viz(mean_sd_outflow_distance, fill_var = "mean_dist_grid",
+ legend_title = "Avg.distance (km)",
+ main_title = "(c) Distance of Flows: average outgoing distance",
+ main.title_size = 1.4,
+ legend.hist_height = 0.25, legend.hist_width = 0.5,
+ legend_width = 0.4, legend.hist_size = 0.5,
+ legend.title_size = 1.2, legend.text_size = 0.65, palette = "Purples")
+
+spatial_viz(mean_sd_outflow_distance, fill_var = "sd_dist_grid",
+ legend_title = "S.D.distance (km)",
+ main_title = "(d) Distance of Flows: S.D. of outgoing distance",
+ main.title_size = 1.4,
+ legend.hist_height = 0.25, legend.hist_width = 0.5,
+ legend_width = 0.4, legend.hist_size = 0.5,
+ legend.title_size = 1.2, legend.text_size = 0.65, palette = "Purples")
+```
+
+
+### Outflow distance: violin plot
+
+```{r fig.height=4}
+violin_viz(mean_sd_outflow_distance, var = "mean_dist_grid",
+ labs.x = "", labs.y = "Avg.distance (km)", breaks = seq(0, 20, 2),
+ y.shift = 2, x.shift = 1.4, text.size = 4)
+
+violin_viz(mean_sd_outflow_distance, var = "sd_dist_grid",
+ labs.x = "", labs.y = "Avg.distance (km)", breaks = seq(0, 10, 2),
+ y.shift = 0.9, x.shift = 1.4, text.size = 4)
+```
+
diff --git a/analysis/06-calculate-distance.md b/analysis/06-calculate-distance.md
new file mode 100644
index 0000000..7924c52
--- /dev/null
+++ b/analysis/06-calculate-distance.md
@@ -0,0 +1,234 @@
+Distance of flows
+================
+Qingqing Chen
+Last compiled date: 18 September, 2021
+
+
+
+We calculate the spatial distance from the centroid of a home location
+to the centroid of each visited location. The mean distance across all
+users in a location is used for the visualizations and the spatial
+regression modeling in the paper.
+
+## Load data
+
+``` r
+# full grids
+grids <- read_sf(here("data/derived_data/spatial_hex_grid.shp"), quiet = T) %>%
+ st_transform(crs = 3414)
+
+# grid centroids
+grid_centroids <- grids %>% st_centroid()
+
+# identified home users
+df_hms <- read_csv(here("data/derived_data/identified_hms.csv"))
+
+# sg tweets
+df_tweets <- readRDS(here("data/derived_data/deidentified_sg_tweets_updated.rds"))
+## sg tweets sent by users with identified home locations
+df_hm_tweets <- df_tweets %>% filter(u_id %in% df_hms$u_id)
+
+# distinct users
+distinct_users <- readRDS(here("data/derived_data/distinct_users.rds"))
+```
+
+## Distance calculation
+
+``` r
+# function for distance calculation
+cal_distance <- function(distinct_users, df_hm_tweets, type, grids, grid_index){
+ # step1: get visitors/locals in the input grid
+ if(type == "inflow"){
+ # visitors
+ users_in_grids <- distinct_users %>%
+ filter(grid_id == grid_index) %>% # all users in the input grid
+ filter(type == "visitor")
+ } else if(type == "outflow"){
+ # locals
+ users_in_grids <- distinct_users %>%
+ filter(grid_id == grid_index) %>% # all users in the input grid
+ filter(type == "local")
+ }
+
+ # step2: remove grids with fewer than 5 visitors/locals
+ if(n_distinct(users_in_grids$u_id) < 5){
+ message("Less than 5 users in the grid, remove this grid!")
+ output <- tibble()
+ } else{
+ # step3: get tweets for visitors/locals
+ if(type == "inflow"){
+ df_tweets <- df_hm_tweets %>%
+ filter(grid_id == grid_index) %>% # filter all tweets in the target grid
+ left_join(users_in_grids, .) # get tweets send by visitors
+ }else if(type == "outflow"){
+ df_tweets <- df_hm_tweets %>%
+ filter(u_id %in% users_in_grids$u_id) %>% # tweets sent by locals
+ filter(grid_id != grid_index) %>% # remove tweets sent at home place
+ mutate(home = grid_index) # add home location
+ }
+ # step 4: calculate distance
+ # geometry of home grid
+ home_geometry <- df_tweets %>%
+ left_join(., grids, by = c("home" = "grid_id")) %>%
+ st_as_sf() %>%
+ st_centroid() %>%
+ st_geometry()
+ # geometry of outside home grids
+ grid_geometry <- df_tweets %>%
+ left_join(., grids, by = "grid_id") %>%
+ st_as_sf() %>%
+ st_centroid() %>%
+ st_geometry()
+ # calculate distance
+ output <- df_tweets %>%
+ mutate(dist_hm2grid = st_distance(home_geometry, grid_geometry, by_element = TRUE) %>% as.numeric(),
+ dist_hm2grid_km = round(dist_hm2grid/1000, 4))
+ }
+ return(output)
+}
+```
+
+### Inflow distance
+
+The inflow distance is saved under `data/derived_data/` directory.
+
+``` r
+# inflow distance
+if(file.exists(here("data/derived_data/dist_visitor_points.rds"))){
+ dist_visitor_points <- readRDS(here("data/derived_data/dist_visitor_points.rds"))
+}else{
+ input_grids <- unique(distinct_users$grid_id)
+ dist_visitor_points <- do.call(bind_rows, map(input_grids, with_progress(function(x) cal_distance(distinct_users, df_hm_tweets, type = "inflow", grids, x))))
+ dist_visitor_points <- dist_visitor_points %>%
+ dplyr::select(grid_id, u_id, home, created_at, dist_hm2grid, dist_hm2grid_km)
+ saveRDS(dist_visitor_points, here("data/derived_data/dist_visitor_points.rds"))
+}
+```
+
+### Outflow distance
+
+The outflow distance is saved under `data/derived_data/` directory.
+
+``` r
+# outflow distance
+if(file.exists(here("data/derived_data/dist_local_points.rds"))){
+ dist_local_points <- readRDS(here("data/derived_data/dist_local_points.rds"))
+}else{
+ input_grids <- unique(distinct_users$grid_id)
+ dist_local_points <- do.call(bind_rows, map(input_grids, with_progress(function(x) cal_distance(distinct_users, df_hm_tweets, type = "outflow", grids, x))))
+ dist_local_points <- dist_local_points %>%
+ dplyr::select(home, u_id, grid_id, created_at, dist_hm2grid, dist_hm2grid_km)
+ saveRDS(dist_local_points, here("data/derived_data/dist_local_points.rds"))
+}
+```
+
+## Distance visualizaiton
+
+### Inflow distance: spatial distribution
+
+``` r
+mean_sd_inflow_distance <- dist_visitor_points %>%
+ group_by(grid_id, u_id) %>%
+ dplyr::summarise(mean_dist_user = mean(dist_hm2grid_km)) %>%
+ group_by(grid_id) %>%
+ dplyr::summarise(mean_dist_grid = mean(mean_dist_user, na.rm = T) %>% round(., 2),
+ sd_dist_grid = sd(mean_dist_user, na.rm = T) %>% round(., 2)) %>%
+ left_join(., grids) %>%
+ st_as_sf()
+
+spatial_viz(mean_sd_inflow_distance,
+ fill_var = "mean_dist_grid",
+ legend_title = "Avg.distance (km)",
+ main_title = "(a) Distance of Flows: average incoming distance",
+ main.title_size = 1.4,
+ legend.hist_height = 0.25, legend.hist_width = 0.5,
+ legend_width = 0.4, legend.hist_size = 0.5,
+ legend.title_size = 1.2, legend.text_size = 0.65)
+```
+
+
+
+``` r
+spatial_viz(mean_sd_inflow_distance, fill_var = "sd_dist_grid",
+ legend_title = "S.D.distance (km)",
+ main_title = "(b) Distance of Flows: S.D. of incoming distance",
+ main.title_size = 1.4,
+ legend.hist_height = 0.25, legend.hist_width = 0.5,
+ legend_width = 0.4, legend.hist_size = 0.5,
+ legend.title_size = 1.2, legend.text_size = 0.65)
+```
+
+
+
+### Inflow distance: violin plot
+
+``` r
+violin_viz(mean_sd_inflow_distance, var = "mean_dist_grid",
+ labs.x = "", labs.y = "Avg.distance (km)", breaks = seq(0, 30, 2),
+ y.shift = 3.5, x.shift = 1.4, text.size = 4)
+```
+
+
+
+``` r
+violin_viz(mean_sd_inflow_distance, var = "sd_dist_grid",
+ labs.x = "", labs.y = "Avg.distance (km)", breaks = seq(0, 12, 2),
+ y.shift = 1.8, x.shift = 1.4, text.size = 4)
+```
+
+
+
+### Outflow distance: spatial distribution
+
+``` r
+mean_sd_outflow_distance <- dist_local_points %>%
+ group_by(home, u_id) %>%
+ dplyr::summarise(mean_dist_user = mean(dist_hm2grid_km)) %>%
+ group_by(home) %>%
+ dplyr::summarise(mean_dist_grid = mean(mean_dist_user) %>% round(., 2),
+ sd_dist_grid = sd(mean_dist_user) %>% round(., 2)) %>%
+ left_join(., grids, by = c("home" = "grid_id")) %>%
+ st_as_sf()
+
+spatial_viz(mean_sd_outflow_distance, fill_var = "mean_dist_grid",
+ legend_title = "Avg.distance (km)",
+ main_title = "(c) Distance of Flows: average outgoing distance",
+ main.title_size = 1.4,
+ legend.hist_height = 0.25, legend.hist_width = 0.5,
+ legend_width = 0.4, legend.hist_size = 0.5,
+ legend.title_size = 1.2, legend.text_size = 0.65, palette = "Purples")
+```
+
+
+
+``` r
+spatial_viz(mean_sd_outflow_distance, fill_var = "sd_dist_grid",
+ legend_title = "S.D.distance (km)",
+ main_title = "(d) Distance of Flows: S.D. of outgoing distance",
+ main.title_size = 1.4,
+ legend.hist_height = 0.25, legend.hist_width = 0.5,
+ legend_width = 0.4, legend.hist_size = 0.5,
+ legend.title_size = 1.2, legend.text_size = 0.65, palette = "Purples")
+```
+
+
+
+### Outflow distance: violin plot
+
+``` r
+violin_viz(mean_sd_outflow_distance, var = "mean_dist_grid",
+ labs.x = "", labs.y = "Avg.distance (km)", breaks = seq(0, 20, 2),
+ y.shift = 2, x.shift = 1.4, text.size = 4)
+```
+
+
+
+``` r
+violin_viz(mean_sd_outflow_distance, var = "sd_dist_grid",
+ labs.x = "", labs.y = "Avg.distance (km)", breaks = seq(0, 10, 2),
+ y.shift = 0.9, x.shift = 1.4, text.size = 4)
+```
+
+
diff --git a/analysis/06-calculate-distance_files/figure-gfm/unnamed-chunk-5-1.png b/analysis/06-calculate-distance_files/figure-gfm/unnamed-chunk-5-1.png
new file mode 100644
index 0000000..5ceaf8f
Binary files /dev/null and b/analysis/06-calculate-distance_files/figure-gfm/unnamed-chunk-5-1.png differ
diff --git a/analysis/06-calculate-distance_files/figure-gfm/unnamed-chunk-5-2.png b/analysis/06-calculate-distance_files/figure-gfm/unnamed-chunk-5-2.png
new file mode 100644
index 0000000..f10782b
Binary files /dev/null and b/analysis/06-calculate-distance_files/figure-gfm/unnamed-chunk-5-2.png differ
diff --git a/analysis/06-calculate-distance_files/figure-gfm/unnamed-chunk-6-1.png b/analysis/06-calculate-distance_files/figure-gfm/unnamed-chunk-6-1.png
new file mode 100644
index 0000000..d77740a
Binary files /dev/null and b/analysis/06-calculate-distance_files/figure-gfm/unnamed-chunk-6-1.png differ
diff --git a/analysis/06-calculate-distance_files/figure-gfm/unnamed-chunk-6-2.png b/analysis/06-calculate-distance_files/figure-gfm/unnamed-chunk-6-2.png
new file mode 100644
index 0000000..b87f593
Binary files /dev/null and b/analysis/06-calculate-distance_files/figure-gfm/unnamed-chunk-6-2.png differ
diff --git a/analysis/06-calculate-distance_files/figure-gfm/unnamed-chunk-7-1.png b/analysis/06-calculate-distance_files/figure-gfm/unnamed-chunk-7-1.png
new file mode 100644
index 0000000..09efe88
Binary files /dev/null and b/analysis/06-calculate-distance_files/figure-gfm/unnamed-chunk-7-1.png differ
diff --git a/analysis/06-calculate-distance_files/figure-gfm/unnamed-chunk-7-2.png b/analysis/06-calculate-distance_files/figure-gfm/unnamed-chunk-7-2.png
new file mode 100644
index 0000000..2dd828c
Binary files /dev/null and b/analysis/06-calculate-distance_files/figure-gfm/unnamed-chunk-7-2.png differ
diff --git a/analysis/06-calculate-distance_files/figure-gfm/unnamed-chunk-8-1.png b/analysis/06-calculate-distance_files/figure-gfm/unnamed-chunk-8-1.png
new file mode 100644
index 0000000..9418b83
Binary files /dev/null and b/analysis/06-calculate-distance_files/figure-gfm/unnamed-chunk-8-1.png differ
diff --git a/analysis/06-calculate-distance_files/figure-gfm/unnamed-chunk-8-2.png b/analysis/06-calculate-distance_files/figure-gfm/unnamed-chunk-8-2.png
new file mode 100644
index 0000000..20a0f7d
Binary files /dev/null and b/analysis/06-calculate-distance_files/figure-gfm/unnamed-chunk-8-2.png differ
diff --git a/analysis/07-define-sectors.Rmd b/analysis/07-define-sectors.Rmd
new file mode 100644
index 0000000..dca0993
--- /dev/null
+++ b/analysis/07-define-sectors.Rmd
@@ -0,0 +1,251 @@
+---
+title: "Creating concentric sectors"
+author: "Qingqing Chen"
+date: "Last compiled date: `r format(Sys.time(), '%d %B, %Y')`"
+output: github_document
+editor_options:
+ chunk_output_type: console
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE, fig.align='center', fig.width = 10, fig.height = 10)
+library(tidyverse)
+library(sf)
+library(here)
+library(tmap)
+library(purrrogress)
+library(RColorBrewer)
+```
+
+
+
+## Calculate centroids of hexgonal grids
+
+The grid centroids are saved under `data/derived_data/` directory.
+
+```{r}
+#aggregated grids
+grids <- read_sf(here("data/derived_data/spatial_hex_grid.shp"), quiet = T) %>%
+ st_transform(crs = 3414)
+
+#aggregated grid centroids
+grid_centroids <- grids %>% st_centroid()
+```
+
+
+```{r}
+# visualize the centroids
+tm_shape(grids) +
+ tm_borders(col = "grey") +
+ tm_shape(grids %>% filter(grid_id == 1594)) +
+ tm_polygons(col = "red") +
+ tm_shape(grid_centroids) +
+ tm_dots(col = "black") +
+ tm_layout(frame = FALSE) +
+ tm_compass(type = "8star", size = 2, position = c(0, 0.05)) +
+ tm_scale_bar(position = c(0, 0.001))
+```
+
+
+## Create buffers
+
+To create sectors around the centroid of a grid cell, we first draw eight buffers with a radius of 1km, 3km, 5km, 7km, 10km, 20km, 30km, and 60km. The starting point - 1km - is chosen to roughly correspond to the walking distance of a neighborhood and the final radius - 60km - is chosen to cover the entirety of Singapore.
+
+The created grid buffers are saved under `data/derived_data/` directory.
+
+
+```{r}
+# buffer radius
+radius <- c(1000, 3000, 5000, 7000, 10000, 20000, 30000, 60000)
+
+## draw buffers
+draw_buffers <- function(df_centroids, radius, grid_index){
+ grid_centroid <- df_centroids %>% filter(grid_id == grid_index)
+ buffers <- list()
+ for (i in 1:length(radius)){
+ if(i == 1){
+ buffers[[i]] <- grid_centroid %>%
+ st_buffer(., dist = radius[1]) %>%
+ mutate(radius = radius[1])
+ } else{
+ buffers[[i]] <- st_difference(
+ grid_centroid %>% st_buffer(., dist = radius[i]),
+ grid_centroid %>% st_buffer(., dist = radius[i-1])) %>%
+ dplyr::select(-grid_id.1) %>%
+ mutate(radius = radius[i])
+ }
+ }
+ do.call(rbind, buffers)
+}
+
+if(file.exists(here("data/derived_data/grid_buffers.rds"))){
+ grid_buffers <- readRDS(here("data/derived_data/grid_buffers.rds"))
+}else{
+ grid_buffers <- do.call(rbind, map(grid_centroids$grid_id, with_progress(function(x) draw_buffers(grid_centroids, radius, x))))
+ saveRDS(grid_buffers, file = here("data/derived_data/grid_buffers.rds"))
+}
+```
+
+
+```{r}
+## plot one grid buffer as an example
+buffer_example <- grid_buffers %>%
+ filter(grid_id == 1594) %>%
+ mutate(radius = radius/1000,
+ radius = paste0(radius, "km"),
+ radius = factor(radius, levels = c("1km", "2km", "3km", "5km", "7km", "10km", "20km", "30km", "60km")))
+
+tm_shape(grids) +
+ tm_polygons(col = "white", alpha = 0.5, border.col = "grey") +
+ tm_shape(buffer_example) +
+ tm_borders(col = "purple") +
+ # tm_polygons(col = "radius", title = "Radius", alpha = 0.8) +
+ tm_shape(grids %>% filter(grid_id == 1594)) +
+ tm_polygons(col = "red") +
+ tm_shape(grids %>% filter(grid_id == 1594)) +
+ tm_dots(col = "black") +
+ tm_layout(legend.outside = T)
+```
+
+## Create cirlce sectors
+
+Apart from distance alone, users coming from different directions (e.g., north, south, east, west, or center) in the same distance band may represent distinct demographic or neighborhood backgrounds. As such, we cut each buffer into four directions (‘top-left’, ‘top-right’, ‘bottom-left’, and ‘bottom-right’) by two lines that meet at a right angle (i.e., 90 degrees) at the center of the grid cell. We consider users within the walking distance buffer (1km) to all belong to the same neighborhood, so refrain from cutting the first inner buffer. As such, the total number of created sectors for each grid cell can be calculated as _4 * (n-1) + 1_, where n is the number of buffers used. In our case, we create a total of 29 circular sectors for a single grid cell and do so for all grid cells.
+
+The created grid sectors are saved under `data/derived_data/` directory.
+
+```{r}
+##cut single buffer
+cut_buffer <- function(buffer, buffer_id, blades, grid_index){
+ lwgeom::st_split(st_geometry(buffer[buffer_id, ]), blades) %>%
+ st_collection_extract("POLYGON") %>%
+ st_sf() %>%
+ mutate(grid_id = grid_index) %>%
+ dplyr::select(grid_id)
+}
+
+get_cut_buffer <- function(df_centroids, df_buffers, shift, grid_index){
+ # get input grid centroid
+ centroid <- df_centroids %>%
+ filter(grid_id == grid_index) %>%
+ st_coordinates() %>%
+ as_tibble() %>%
+ set_names(c("lon", "lat")) # convert geometry to lon and lat
+ # create blades
+ blades <- st_linestring(
+ rbind(c(centroid$lon+shift, centroid$lat),
+ c(centroid$lon-shift, centroid$lat),
+ c(centroid$lon, centroid$lat),
+ c(centroid$lon, centroid$lat+shift),
+ c(centroid$lon, centroid$lat-shift))) %>%
+ st_sfc(., crs = 3414)
+ # get buffer for input grid
+ buffer <- df_buffers %>% filter(grid_id == grid_index)
+ buffer1 <- buffer[1, ] %>% dplyr::select(grid_id)
+ buffer <- buffer[-1, ] ## do not cut the first inner buffer
+ buffer_ids <- 1:nrow(buffer)
+ # process all buffers
+ rbind(buffer1, do.call(rbind, map(buffer_ids, function(x) cut_buffer(buffer, x, blades, grid_index)))) %>%
+ rowid_to_column(var = "sector_id")
+}
+
+# process all grids
+if(file.exists(here::here("data/derived_data/grid_sectors.rds"))){
+ grid_sectors <- readRDS(here::here("data/derived_data/grid_sectors.rds"))
+}else{
+ grid_sectors <- do.call(rbind, map(grid_centroids$grid_id, with_progress(function(x) get_cut_buffer(grid_centroids, grid_buffers, shift = 60000, x))))
+ saveRDS(grid_sectors, file = here::here("data/derived_data/grid_sectors.rds"))
+}
+```
+
+
+## Visualize sectors
+
+### Show case: sectors of grid 1594
+```{r}
+## plot one cut buffer as an example
+## users with home locations in the same circle sector are identified as the same species
+grid_sectors_example <- grid_sectors %>%
+ filter(grid_id == 1594) %>%
+ mutate(sector_id = factor(sector_id))
+
+distinct_users <- readRDS(here("data/derived_data/distinct_users.rds"))
+users_grid1594 <- distinct_users %>%
+ filter(grid_id == 1594) %>%
+ filter(type == "visitor") %>%
+ left_join(., grids, by =c("home" = "grid_id")) %>%
+ st_as_sf() %>%
+ st_transform(crs = 3414) %>%
+ st_centroid() %>%
+ st_join(., grid_sectors_example, largest = T)
+
+
+tm_shape(grids) +
+ tm_polygons(col = "white", alpha = 0.5, border.col = "grey") +
+ tm_shape(grids %>% filter(grid_id == 1594)) +
+ tm_polygons(col = "red") + ## target grid
+ tm_shape(users_grid1594) +
+ tm_dots(col = "sector_id", jitter = 0.08, size = 0.015, palette = "Dark2") +
+ tm_shape(grid_sectors_example) +
+ tm_borders(col = "purple") +
+ tm_text(text = "sector_id", size = 0.6, col = "black") +
+ tm_layout(legend.show = FALSE)
+```
+
+### Show case: sectors within Singapore boundary of grid 1594
+
+```{r}
+sectors_inSG <- grid_sectors_example %>%
+ st_intersection(grids, .) %>%
+ group_by(sector_id) %>%
+ summarise()
+
+
+tm_shape(grids) +
+ tm_polygons(col = "white", alpha = 0.1, border.col = "grey") +
+ tm_shape(sectors_inSG) +
+ tm_polygons(col = "sector_id", border.col = "purple", alpha = 0.9) +
+ tm_shape(grids %>% filter(grid_id == 1594)) +
+ tm_polygons(col = "red") + ## target grid
+ tm_shape(users_grid1594) +
+ tm_dots(col = "sector_id", jitter = 0.08, size = 0.015, palette = "Dark2") +
+ # tm_dots(col = "black", jitter = 0.08, size = 0.015) +
+ tm_shape(grid_sectors_example) +
+ tm_borders(col = "purple", lty = 2) +
+ tm_text(text = "sector_id", size = 0.6, col = "black") +
+ tm_layout(legend.show = FALSE)
+```
+
+
+
+### Show case: circle sector 1 of grid 1594
+
+```{r fig.width=5, fig.height=5}
+tm_shape(grid_sectors_example %>% filter(sector_id %in% seq(1))) +
+ tm_polygons(col = "purple", alpha = 0.1) +
+ tm_shape(users_grid1594 %>% filter(sector_id == 1)) +
+ tm_dots(col = "#1B9E77", jitter = 0.2, size = 0.2) +
+ tm_shape(grids %>% filter(grid_id == 1594)) +
+ tm_polygons(col = "red") +
+ tm_shape(grids) +
+ tm_borders(col = "grey") +
+ tm_layout(frame = F)
+```
+
+### Show case: circle sector 5 of grid 1594
+```{r fig.height=5, fig.width=5}
+tm_shape(grid_sectors_example %>% filter(sector_id %in% c(5))) +
+ tm_polygons(col = "purple", alpha = 0.1, border.col = "purple") +
+ tm_shape(users_grid1594 %>% filter(sector_id == 5)) +
+ tm_dots(col = "#D95F02", jitter = 0.1, size = 0.2) +
+ tm_shape(grids) +
+ tm_borders(col = "grey") +
+ tm_shape(grids %>% filter(grid_id == 1594)) +
+ tm_polygons(col = "red") +
+ tm_layout(frame = F)
+```
+
+
+
+
diff --git a/analysis/07-define-sectors.md b/analysis/07-define-sectors.md
new file mode 100644
index 0000000..51d378a
--- /dev/null
+++ b/analysis/07-define-sectors.md
@@ -0,0 +1,255 @@
+Creating concentric sectors
+================
+Qingqing Chen
+Last compiled date: 18 September, 2021
+
+
+
+## Calculate centroids of hexgonal grids
+
+The grid centroids are saved under `data/derived_data/` directory.
+
+``` r
+#aggregated grids
+grids <- read_sf(here("data/derived_data/spatial_hex_grid.shp"), quiet = T) %>%
+ st_transform(crs = 3414)
+
+#aggregated grid centroids
+grid_centroids <- grids %>% st_centroid()
+```
+
+``` r
+# visualize the centroids
+tm_shape(grids) +
+ tm_borders(col = "grey") +
+ tm_shape(grids %>% filter(grid_id == 1594)) +
+ tm_polygons(col = "red") +
+ tm_shape(grid_centroids) +
+ tm_dots(col = "black") +
+ tm_layout(frame = FALSE) +
+ tm_compass(type = "8star", size = 2, position = c(0, 0.05)) +
+ tm_scale_bar(position = c(0, 0.001))
+```
+
+
+
+## Create buffers
+
+To create sectors around the centroid of a grid cell, we first draw
+eight buffers with a radius of 1km, 3km, 5km, 7km, 10km, 20km, 30km, and
+60km. The starting point - 1km - is chosen to roughly correspond to the
+walking distance of a neighborhood and the final radius - 60km - is
+chosen to cover the entirety of Singapore.
+
+The created grid buffers are saved under `data/derived_data/` directory.
+
+``` r
+# buffer radius
+radius <- c(1000, 3000, 5000, 7000, 10000, 20000, 30000, 60000)
+
+## draw buffers
+draw_buffers <- function(df_centroids, radius, grid_index){
+ grid_centroid <- df_centroids %>% filter(grid_id == grid_index)
+ buffers <- list()
+ for (i in 1:length(radius)){
+ if(i == 1){
+ buffers[[i]] <- grid_centroid %>%
+ st_buffer(., dist = radius[1]) %>%
+ mutate(radius = radius[1])
+ } else{
+ buffers[[i]] <- st_difference(
+ grid_centroid %>% st_buffer(., dist = radius[i]),
+ grid_centroid %>% st_buffer(., dist = radius[i-1])) %>%
+ dplyr::select(-grid_id.1) %>%
+ mutate(radius = radius[i])
+ }
+ }
+ do.call(rbind, buffers)
+}
+
+if(file.exists(here("data/derived_data/grid_buffers.rds"))){
+ grid_buffers <- readRDS(here("data/derived_data/grid_buffers.rds"))
+}else{
+ grid_buffers <- do.call(rbind, map(grid_centroids$grid_id, with_progress(function(x) draw_buffers(grid_centroids, radius, x))))
+ saveRDS(grid_buffers, file = here("data/derived_data/grid_buffers.rds"))
+}
+```
+
+``` r
+## plot one grid buffer as an example
+buffer_example <- grid_buffers %>%
+ filter(grid_id == 1594) %>%
+ mutate(radius = radius/1000,
+ radius = paste0(radius, "km"),
+ radius = factor(radius, levels = c("1km", "2km", "3km", "5km", "7km", "10km", "20km", "30km", "60km")))
+
+tm_shape(grids) +
+ tm_polygons(col = "white", alpha = 0.5, border.col = "grey") +
+ tm_shape(buffer_example) +
+ tm_borders(col = "purple") +
+ # tm_polygons(col = "radius", title = "Radius", alpha = 0.8) +
+ tm_shape(grids %>% filter(grid_id == 1594)) +
+ tm_polygons(col = "red") +
+ tm_shape(grids %>% filter(grid_id == 1594)) +
+ tm_dots(col = "black") +
+ tm_layout(legend.outside = T)
+```
+
+
+
+## Create cirlce sectors
+
+Apart from distance alone, users coming from different directions (e.g.,
+north, south, east, west, or center) in the same distance band may
+represent distinct demographic or neighborhood backgrounds. As such, we
+cut each buffer into four directions (‘top-left’, ‘top-right’,
+‘bottom-left’, and ‘bottom-right’) by two lines that meet at a right
+angle (i.e., 90 degrees) at the center of the grid cell. We consider
+users within the walking distance buffer (1km) to all belong to the same
+neighborhood, so refrain from cutting the first inner buffer. As such,
+the total number of created sectors for each grid cell can be calculated
+as *4 \* (n-1) + 1*, where n is the number of buffers used. In our case,
+we create a total of 29 circular sectors for a single grid cell and do
+so for all grid cells.
+
+The created grid sectors are saved under `data/derived_data/` directory.
+
+``` r
+##cut single buffer
+cut_buffer <- function(buffer, buffer_id, blades, grid_index){
+ lwgeom::st_split(st_geometry(buffer[buffer_id, ]), blades) %>%
+ st_collection_extract("POLYGON") %>%
+ st_sf() %>%
+ mutate(grid_id = grid_index) %>%
+ dplyr::select(grid_id)
+}
+
+get_cut_buffer <- function(df_centroids, df_buffers, shift, grid_index){
+ # get input grid centroid
+ centroid <- df_centroids %>%
+ filter(grid_id == grid_index) %>%
+ st_coordinates() %>%
+ as_tibble() %>%
+ set_names(c("lon", "lat")) # convert geometry to lon and lat
+ # create blades
+ blades <- st_linestring(
+ rbind(c(centroid$lon+shift, centroid$lat),
+ c(centroid$lon-shift, centroid$lat),
+ c(centroid$lon, centroid$lat),
+ c(centroid$lon, centroid$lat+shift),
+ c(centroid$lon, centroid$lat-shift))) %>%
+ st_sfc(., crs = 3414)
+ # get buffer for input grid
+ buffer <- df_buffers %>% filter(grid_id == grid_index)
+ buffer1 <- buffer[1, ] %>% dplyr::select(grid_id)
+ buffer <- buffer[-1, ] ## do not cut the first inner buffer
+ buffer_ids <- 1:nrow(buffer)
+ # process all buffers
+ rbind(buffer1, do.call(rbind, map(buffer_ids, function(x) cut_buffer(buffer, x, blades, grid_index)))) %>%
+ rowid_to_column(var = "sector_id")
+}
+
+# process all grids
+if(file.exists(here::here("data/derived_data/grid_sectors.rds"))){
+ grid_sectors <- readRDS(here::here("data/derived_data/grid_sectors.rds"))
+}else{
+ grid_sectors <- do.call(rbind, map(grid_centroids$grid_id, with_progress(function(x) get_cut_buffer(grid_centroids, grid_buffers, shift = 60000, x))))
+ saveRDS(grid_sectors, file = here::here("data/derived_data/grid_sectors.rds"))
+}
+```
+
+## Visualize sectors
+
+### Show case: sectors of grid 1594
+
+``` r
+## plot one cut buffer as an example
+## users with home locations in the same circle sector are identified as the same species
+grid_sectors_example <- grid_sectors %>%
+ filter(grid_id == 1594) %>%
+ mutate(sector_id = factor(sector_id))
+
+distinct_users <- readRDS(here("data/derived_data/distinct_users.rds"))
+users_grid1594 <- distinct_users %>%
+ filter(grid_id == 1594) %>%
+ filter(type == "visitor") %>%
+ left_join(., grids, by =c("home" = "grid_id")) %>%
+ st_as_sf() %>%
+ st_transform(crs = 3414) %>%
+ st_centroid() %>%
+ st_join(., grid_sectors_example, largest = T)
+
+
+tm_shape(grids) +
+ tm_polygons(col = "white", alpha = 0.5, border.col = "grey") +
+ tm_shape(grids %>% filter(grid_id == 1594)) +
+ tm_polygons(col = "red") + ## target grid
+ tm_shape(users_grid1594) +
+ tm_dots(col = "sector_id", jitter = 0.08, size = 0.015, palette = "Dark2") +
+ tm_shape(grid_sectors_example) +
+ tm_borders(col = "purple") +
+ tm_text(text = "sector_id", size = 0.6, col = "black") +
+ tm_layout(legend.show = FALSE)
+```
+
+
+
+### Show case: sectors within Singapore boundary of grid 1594
+
+``` r
+sectors_inSG <- grid_sectors_example %>%
+ st_intersection(grids, .) %>%
+ group_by(sector_id) %>%
+ summarise()
+
+
+tm_shape(grids) +
+ tm_polygons(col = "white", alpha = 0.1, border.col = "grey") +
+ tm_shape(sectors_inSG) +
+ tm_polygons(col = "sector_id", border.col = "purple", alpha = 0.9) +
+ tm_shape(grids %>% filter(grid_id == 1594)) +
+ tm_polygons(col = "red") + ## target grid
+ tm_shape(users_grid1594) +
+ tm_dots(col = "sector_id", jitter = 0.08, size = 0.015, palette = "Dark2") +
+ # tm_dots(col = "black", jitter = 0.08, size = 0.015) +
+ tm_shape(grid_sectors_example) +
+ tm_borders(col = "purple", lty = 2) +
+ tm_text(text = "sector_id", size = 0.6, col = "black") +
+ tm_layout(legend.show = FALSE)
+```
+
+
+
+### Show case: circle sector 1 of grid 1594
+
+``` r
+tm_shape(grid_sectors_example %>% filter(sector_id %in% seq(1))) +
+ tm_polygons(col = "purple", alpha = 0.1) +
+ tm_shape(users_grid1594 %>% filter(sector_id == 1)) +
+ tm_dots(col = "#1B9E77", jitter = 0.2, size = 0.2) +
+ tm_shape(grids %>% filter(grid_id == 1594)) +
+ tm_polygons(col = "red") +
+ tm_shape(grids) +
+ tm_borders(col = "grey") +
+ tm_layout(frame = F)
+```
+
+
+
+### Show case: circle sector 5 of grid 1594
+
+``` r
+tm_shape(grid_sectors_example %>% filter(sector_id %in% c(5))) +
+ tm_polygons(col = "purple", alpha = 0.1, border.col = "purple") +
+ tm_shape(users_grid1594 %>% filter(sector_id == 5)) +
+ tm_dots(col = "#D95F02", jitter = 0.1, size = 0.2) +
+ tm_shape(grids) +
+ tm_borders(col = "grey") +
+ tm_shape(grids %>% filter(grid_id == 1594)) +
+ tm_polygons(col = "red") +
+ tm_layout(frame = F)
+```
+
+
diff --git a/analysis/07-define-sectors_files/figure-gfm/unnamed-chunk-2-1.png b/analysis/07-define-sectors_files/figure-gfm/unnamed-chunk-2-1.png
new file mode 100644
index 0000000..0a75abd
Binary files /dev/null and b/analysis/07-define-sectors_files/figure-gfm/unnamed-chunk-2-1.png differ
diff --git a/analysis/07-define-sectors_files/figure-gfm/unnamed-chunk-4-1.png b/analysis/07-define-sectors_files/figure-gfm/unnamed-chunk-4-1.png
new file mode 100644
index 0000000..bef2934
Binary files /dev/null and b/analysis/07-define-sectors_files/figure-gfm/unnamed-chunk-4-1.png differ
diff --git a/analysis/07-define-sectors_files/figure-gfm/unnamed-chunk-6-1.png b/analysis/07-define-sectors_files/figure-gfm/unnamed-chunk-6-1.png
new file mode 100644
index 0000000..3eb4e46
Binary files /dev/null and b/analysis/07-define-sectors_files/figure-gfm/unnamed-chunk-6-1.png differ
diff --git a/analysis/07-define-sectors_files/figure-gfm/unnamed-chunk-7-1.png b/analysis/07-define-sectors_files/figure-gfm/unnamed-chunk-7-1.png
new file mode 100644
index 0000000..75ed439
Binary files /dev/null and b/analysis/07-define-sectors_files/figure-gfm/unnamed-chunk-7-1.png differ
diff --git a/analysis/07-define-sectors_files/figure-gfm/unnamed-chunk-8-1.png b/analysis/07-define-sectors_files/figure-gfm/unnamed-chunk-8-1.png
new file mode 100644
index 0000000..7abd4a3
Binary files /dev/null and b/analysis/07-define-sectors_files/figure-gfm/unnamed-chunk-8-1.png differ
diff --git a/analysis/07-define-sectors_files/figure-gfm/unnamed-chunk-9-1.png b/analysis/07-define-sectors_files/figure-gfm/unnamed-chunk-9-1.png
new file mode 100644
index 0000000..37d9f80
Binary files /dev/null and b/analysis/07-define-sectors_files/figure-gfm/unnamed-chunk-9-1.png differ
diff --git a/analysis/08-calculate-diversity.Rmd b/analysis/08-calculate-diversity.Rmd
new file mode 100644
index 0000000..e374a91
--- /dev/null
+++ b/analysis/08-calculate-diversity.Rmd
@@ -0,0 +1,260 @@
+---
+title: "Social diversity"
+author: "Qingqing Chen"
+date: "Last compiled date: `r format(Sys.time(), '%d %B, %Y')`"
+output: github_document
+editor_options:
+ chunk_output_type: console
+---
+
+
+
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE, fig.align='center', fig.width = 10, fig.height = 10, warning = FALSE, message = FALSE)
+library(tidyverse)
+library(dplyr)
+library(sf)
+library(tmap)
+library(RColorBrewer)
+library(vegan) ##calculate diversity
+library(purrrogress)
+library(here)
+source(here("R/viz.R"))
+```
+
+To measure diversity, we use Shannon's diversity index (H) as it is commonly used for species diversity and accounts for both richness and evenness of the species present. The higher the Shannon's index (H) value is, the more diverse the origins of visitors are for inflow diversity. Conversely, for outflow diversity a higher H means that locals visit a more diverse set of neighbourhoods throughout the city.
+
+
+## Load necessary data
+
+```{r}
+# aggregated grids
+grids <- read_sf(here("data/derived_data/spatial_hex_grid.shp"), quiet = T) %>%
+ st_transform(crs = 3414)
+
+# Singapore planning areas
+sg_subzone <- read_sf(here("data/raw_data/sg-subzone/MP14_SUBZONE_NO_SEA_PL.shp")) %>%
+ st_transform(crs = 3414) %>%
+ st_make_valid() %>%
+ group_by(PLN_AREA_N) %>%
+ dplyr::summarise()
+
+# grid sectors
+grid_sectors <- readRDS(here("data/derived_data/grid_sectors.rds"))
+
+# tweets
+df_tweets <- readRDS(here("data/derived_data/deidentified_sg_tweets_updated.rds"))
+
+# qualified grid cells
+considered_grids <- df_tweets %>%
+ group_by(grid_id) %>%
+ dplyr::summarise(n_tweets = n(), n_users = n_distinct(u_id)) %>%
+ filter(n_tweets >= 100 & n_users >= 5)
+
+# usrs in each grid cell
+users_in_grids <- df_tweets %>% distinct(u_id, grid_id)
+
+# identified home location of users
+identified_hms <- read_csv(here("data/derived_data/identified_hms.csv"))
+
+# distinct users in grids: local or visitor
+distinct_users <- readRDS(here("data/derived_data/distinct_users.rds"))
+```
+
+
+## Diversity analysis with Shannon's index
+
+```{r}
+# function used for diversity analysis
+cal_diversity <- function(grid_index, data, grid_sectors, grids, type = "inflow"){
+ # get sectors of input grid
+ sectors <- grid_sectors %>% filter(grid_id == grid_index)
+ # get sector areas intersect with SG boundary
+ sectors_corssSG <- st_intersection(sectors, sg_subzone) %>%
+ dplyr::select(-PLN_AREA_N) %>%
+ unique() %>%
+ group_by(sector_id, grid_id) %>%
+ dplyr::summarise() %>%
+ ungroup()
+
+ # assign users to sectors
+ if(type == "inflow"){
+ # assign visitors' home locations to sectors
+ users_in_sectors <- st_join(data, sectors_corssSG, largest = TRUE) %>%
+ distinct(u_id, home, .keep_all = TRUE) %>%
+ dplyr::select(sector_id, grid_id, u_id, home) %>%
+ st_set_geometry(NULL) %>%
+ left_join(., sectors_corssSG) %>% # change the geometry to sectors' geometry
+ st_sf() # convert to sf object
+ }else if(type == "outflow"){
+ # assign locals visited locations to sectors
+ users_in_sectors <- st_join(data, sectors_corssSG, largest = T) %>%
+ distinct(u_id, out_to_grid, .keep_all = TRUE) %>%
+ dplyr::select(sector_id, grid_id, u_id, out_to_grid) %>%
+ st_set_geometry(NULL) %>%
+ left_join(., sectors_corssSG) %>% # change the geometry to sectors' geometry
+ st_sf()
+ }
+
+ # calculate Shannon's index
+ output <- users_in_sectors %>%
+ group_by(sector_id) %>%
+ dplyr::summarise(n_user = n_distinct(u_id)) %>%
+ ungroup() %>%
+ mutate(area_km_square = as.numeric(st_area(.)/1000000)) %>%
+ mutate(user_density = n_user/area_km_square) %>%
+ st_set_geometry(NULL) %>%
+ distinct(sector_id, user_density) %>%
+ spread(sector_id, user_density) %>%
+ diversity(index = "shannon")
+ return(output)
+}
+```
+
+### Inflow diversity
+
+The inflow diversity is saved under `data/derived_data/` directory.
+```{r}
+# Get visitors in each grid
+visitors_in_grid <- distinct_users %>%
+ filter(type == "visitor") %>%
+ left_join(., grids, by = c("home" = "grid_id")) %>%
+ st_as_sf(crs = 3414) %>%
+ nest(data = c(u_id, home, type, geometry))
+
+# visited grid
+visitors_in_grid[1, ]
+
+# visitors with home locations
+visitors_in_grid[1, ]$data[[1]]
+```
+
+
+```{r}
+# calculate diversity
+if(file.exists(here("data/derived_data/inflow_diversity.rds"))){
+ inflow_diversity <- readRDS(here("data/derived_data/inflow_diversity.rds"))
+}else{
+ inflow_diversity <- visitors_in_grid %>%
+ filter(grid_id %in% considered_grids$grid_id) %>%
+ mutate(div_shannon = map2_dbl(grid_id, data, with_progress(function(x, y) cal_diversity(x, y, grid_sectors, grids, type = "inflow")))) %>%
+ dplyr::select(-data)
+
+ inflow_diversity <- inflow_diversity %>%
+ mutate(norm_div_shannon = (div_shannon - min(div_shannon))/(max(div_shannon) - min(div_shannon))) %>%
+ left_join(., grids, by = c("grid_id" = "grid_id")) %>%
+ st_sf(crs = 3414)
+ saveRDS(inflow_diversity, file = here("data/derived_data/inflow_diversity.rds"))
+}
+head(inflow_diversity)
+```
+
+### Outflow diversity
+
+The grids except home that each local visited are saved under `data/derived_data/` directory.
+
+```{r}
+# Get other grids except home that locals visited
+# locals in home grids
+locals_in_grid <- distinct_users %>%
+ filter(type == "local") %>%
+ nest(data = c(u_id, home, type))
+
+# grids outside home grid that each local visited
+get_locals_out_grids <- function(grid_id, data, users_in_grids, grids){
+ # locals in home grid
+ locals <- data$u_id %>% unique()
+ # home grid id
+ locals_home <- grid_id
+ output <- users_in_grids %>%
+ filter(u_id %in% locals) %>% ## get grids that locals sent tweets
+ filter(!grid_id %in% locals_home) %>% ## remove grid that is the same as the home
+ left_join(., grids) %>% ## get out to grid geometry
+ rename(out_to_grid = grid_id) %>%
+ st_sf()
+ return(output)
+}
+
+if(file.exists(here("data/derived_data/locals_out_grids.rds"))){
+ locals_out_grids <- readRDS(here("data/derived_data/locals_out_grids.rds"))
+}else{
+ locals_out_grids <- locals_in_grid %>%
+ filter(grid_id %in% considered_grids$grid_id) %>%
+ mutate(data = map2(grid_id, data, with_progress(function(x, y) get_locals_out_grids(x, y, users_in_grids, grids))))
+ saveRDS(locals_out_grids, file = here("data/derived_data/locals_out_grids.rds"))
+}
+
+# home grid
+locals_out_grids[1, ]
+
+# locals in home grid with their visited grids outside home
+locals_out_grids[1, ]$data[[1]]
+```
+
+The outflow diversity is saved under `data/derived_data/` directory.
+```{r}
+# calculate outflow diversity
+if(file.exists(here("data/derived_data/outflow_diversity.rds"))){
+ outflow_diversity <- readRDS(here("data/derived_data/outflow_diversity.rds"))
+}else{
+ outflow_diversity <- locals_out_grids %>%
+ filter(grid_id %in% considered_grids$grid_id) %>%
+ mutate(div_shannon = map2_dbl(grid_id, data, with_progress(function(x, y) cal_diversity(x, y, grid_sectors, grids, type = "outflow")))) %>%
+ dplyr::select(-data)
+
+ outflow_diversity <- outflow_diversity %>%
+ mutate(norm_div_shannon = (div_shannon - min(div_shannon))/(max(div_shannon) - min(div_shannon)),
+ norm_div_shannon = round(norm_div_shannon, 2)) %>%
+ left_join(., grids, by = c("grid_id" = "grid_id")) %>%
+ st_sf(crs = 3414)
+
+ saveRDS(outflow_diversity, file = here("data/derived_data/outflow_diversity.rds"))
+}
+
+head(outflow_diversity)
+```
+
+
+## Diversity spatial distribution
+
+### Inflow diversity
+
+```{r}
+spatial_viz(inflow_diversity, fill_var = "norm_div_shannon",
+ legend_title = "Norm.diversity",
+ main_title = "(a) Diversity of Flows: normalized inflow diversity",
+ main.title_size = 1.4,
+ legend.hist_height = 0.25, legend.hist_width = 0.5,
+ legend_width = 0.4, legend.hist_size = 0.5,
+ legend.title_size = 1.2, legend.text_size = 0.65)
+```
+
+
+```{r fig.height=4}
+violin_viz(inflow_diversity, var = "norm_div_shannon",
+ labs.x = "", labs.y = "Norm.diversity", breaks = seq(0, 1, 0.1),
+ y.shift = 0.17, x.shift = 1.52, text.size = 4)
+```
+
+### Outflow diversity
+
+```{r}
+spatial_viz(outflow_diversity, fill_var = "norm_div_shannon",
+ legend_title = "Norm.diversity",
+ main_title = "(b) Diversity of Flows: normalized outflow diversity",
+ main.title_size = 1.4,
+ legend.hist_height = 0.25, legend.hist_width = 0.5,
+ legend_width = 0.4, legend.hist_size = 0.5,
+ legend.title_size = 1.2, legend.text_size = 0.65,
+ palette = "Purples")
+```
+
+
+```{r fig.height=4}
+violin_viz(outflow_diversity, var = "norm_div_shannon",
+ labs.x = "", labs.y = "Norm.diversity", breaks = seq(0, 1, 0.1),
+ y.shift = 0.17, x.shift = 1.52, text.size = 4)
+```
diff --git a/analysis/08-calculate-diversity.md b/analysis/08-calculate-diversity.md
new file mode 100644
index 0000000..308f314
--- /dev/null
+++ b/analysis/08-calculate-diversity.md
@@ -0,0 +1,328 @@
+Social diversity
+================
+Qingqing Chen
+Last compiled date: 18 September, 2021
+
+
+
+To measure diversity, we use Shannon’s diversity index (H) as it is
+commonly used for species diversity and accounts for both richness and
+evenness of the species present. The higher the Shannon’s index (H)
+value is, the more diverse the origins of visitors are for inflow
+diversity. Conversely, for outflow diversity a higher H means that
+locals visit a more diverse set of neighbourhoods throughout the city.
+
+## Load necessary data
+
+``` r
+# aggregated grids
+grids <- read_sf(here("data/derived_data/spatial_hex_grid.shp"), quiet = T) %>%
+ st_transform(crs = 3414)
+
+# Singapore planning areas
+sg_subzone <- read_sf(here("data/raw_data/sg-subzone/MP14_SUBZONE_NO_SEA_PL.shp")) %>%
+ st_transform(crs = 3414) %>%
+ st_make_valid() %>%
+ group_by(PLN_AREA_N) %>%
+ dplyr::summarise()
+
+# grid sectors
+grid_sectors <- readRDS(here("data/derived_data/grid_sectors.rds"))
+
+# tweets
+df_tweets <- readRDS(here("data/derived_data/deidentified_sg_tweets_updated.rds"))
+
+# qualified grid cells
+considered_grids <- df_tweets %>%
+ group_by(grid_id) %>%
+ dplyr::summarise(n_tweets = n(), n_users = n_distinct(u_id)) %>%
+ filter(n_tweets >= 100 & n_users >= 5)
+
+# usrs in each grid cell
+users_in_grids <- df_tweets %>% distinct(u_id, grid_id)
+
+# identified home location of users
+identified_hms <- read_csv(here("data/derived_data/identified_hms.csv"))
+
+# distinct users in grids: local or visitor
+distinct_users <- readRDS(here("data/derived_data/distinct_users.rds"))
+```
+
+## Diversity analysis with Shannon’s index
+
+``` r
+# function used for diversity analysis
+cal_diversity <- function(grid_index, data, grid_sectors, grids, type = "inflow"){
+ # get sectors of input grid
+ sectors <- grid_sectors %>% filter(grid_id == grid_index)
+ # get sector areas intersect with SG boundary
+ sectors_corssSG <- st_intersection(sectors, sg_subzone) %>%
+ dplyr::select(-PLN_AREA_N) %>%
+ unique() %>%
+ group_by(sector_id, grid_id) %>%
+ dplyr::summarise() %>%
+ ungroup()
+
+ # assign users to sectors
+ if(type == "inflow"){
+ # assign visitors' home locations to sectors
+ users_in_sectors <- st_join(data, sectors_corssSG, largest = TRUE) %>%
+ distinct(u_id, home, .keep_all = TRUE) %>%
+ dplyr::select(sector_id, grid_id, u_id, home) %>%
+ st_set_geometry(NULL) %>%
+ left_join(., sectors_corssSG) %>% # change the geometry to sectors' geometry
+ st_sf() # convert to sf object
+ }else if(type == "outflow"){
+ # assign locals visited locations to sectors
+ users_in_sectors <- st_join(data, sectors_corssSG, largest = T) %>%
+ distinct(u_id, out_to_grid, .keep_all = TRUE) %>%
+ dplyr::select(sector_id, grid_id, u_id, out_to_grid) %>%
+ st_set_geometry(NULL) %>%
+ left_join(., sectors_corssSG) %>% # change the geometry to sectors' geometry
+ st_sf()
+ }
+
+ # calculate Shannon's index
+ output <- users_in_sectors %>%
+ group_by(sector_id) %>%
+ dplyr::summarise(n_user = n_distinct(u_id)) %>%
+ ungroup() %>%
+ mutate(area_km_square = as.numeric(st_area(.)/1000000)) %>%
+ mutate(user_density = n_user/area_km_square) %>%
+ st_set_geometry(NULL) %>%
+ distinct(sector_id, user_density) %>%
+ spread(sector_id, user_density) %>%
+ diversity(index = "shannon")
+ return(output)
+}
+```
+
+### Inflow diversity
+
+The inflow diversity is saved under `data/derived_data/` directory.
+
+``` r
+# Get visitors in each grid
+visitors_in_grid <- distinct_users %>%
+ filter(type == "visitor") %>%
+ left_join(., grids, by = c("home" = "grid_id")) %>%
+ st_as_sf(crs = 3414) %>%
+ nest(data = c(u_id, home, type, geometry))
+
+# visited grid
+visitors_in_grid[1, ]
+```
+
+ ## # A tibble: 1 × 2
+ ## grid_id data
+ ##
+ ## 1 3
+
+``` r
+# visitors with home locations
+visitors_in_grid[1, ]$data[[1]]
+```
+
+ ## Simple feature collection with 3 features and 3 fields
+ ## Geometry type: POLYGON
+ ## Dimension: XY
+ ## Bounding box: xmin: 18042.54 ymin: 36100.32 xmax: 40917.54 ymax: 42162.5
+ ## Projected CRS: SVY21 / Singapore TM
+ ## # A tibble: 3 × 4
+ ## u_id home type geometry
+ ##
+ ## 1 82965004 1594 visitor ((40542.54 36749.84, 40167.54 36966.34, 40167.54 37399…
+ ## 2 90218329 626 visitor ((18417.54 36100.32, 18042.54 36316.82, 18042.54 36749…
+ ## 3 49010226 1461 visitor ((35667.54 41296.47, 35292.54 41512.98, 35292.54 41945…
+
+``` r
+# calculate diversity
+if(file.exists(here("data/derived_data/inflow_diversity.rds"))){
+ inflow_diversity <- readRDS(here("data/derived_data/inflow_diversity.rds"))
+}else{
+ inflow_diversity <- visitors_in_grid %>%
+ filter(grid_id %in% considered_grids$grid_id) %>%
+ mutate(div_shannon = map2_dbl(grid_id, data, with_progress(function(x, y) cal_diversity(x, y, grid_sectors, grids, type = "inflow")))) %>%
+ dplyr::select(-data)
+
+ inflow_diversity <- inflow_diversity %>%
+ mutate(norm_div_shannon = (div_shannon - min(div_shannon))/(max(div_shannon) - min(div_shannon))) %>%
+ left_join(., grids, by = c("grid_id" = "grid_id")) %>%
+ st_sf(crs = 3414)
+ saveRDS(inflow_diversity, file = here("data/derived_data/inflow_diversity.rds"))
+}
+head(inflow_diversity)
+```
+
+ ## Simple feature collection with 6 features and 3 fields
+ ## Geometry type: POLYGON
+ ## Dimension: XY
+ ## Bounding box: xmin: 2292.538 ymin: 24408.98 xmax: 7917.538 ymax: 35017.79
+ ## Projected CRS: SVY21 / Singapore TM
+ ## # A tibble: 6 × 4
+ ## grid_id div_shannon norm_div_shannon geometry
+ ##
+ ## 1 3 0.691 0.166 ((2667.538 24408.98, 2292.538 24625.48, …
+ ## 2 8 1.00 0.289 ((3042.538 25058.49, 2667.538 25275, 266…
+ ## 3 14 1.01 0.291 ((3417.538 25708.01, 3042.538 25924.52, …
+ ## 4 19 0.693 0.167 ((3792.538 25058.49, 3417.538 25275, 341…
+ ## 5 123 1.36 0.430 ((7167.538 33502.24, 6792.538 33718.75, …
+ ## 6 135 1.51 0.488 ((7542.538 34151.76, 7167.538 34368.27, …
+
+### Outflow diversity
+
+The grids except home that each local visited are saved under
+`data/derived_data/` directory.
+
+``` r
+# Get other grids except home that locals visited
+# locals in home grids
+locals_in_grid <- distinct_users %>%
+ filter(type == "local") %>%
+ nest(data = c(u_id, home, type))
+
+# grids outside home grid that each local visited
+get_locals_out_grids <- function(grid_id, data, users_in_grids, grids){
+ # locals in home grid
+ locals <- data$u_id %>% unique()
+ # home grid id
+ locals_home <- grid_id
+ output <- users_in_grids %>%
+ filter(u_id %in% locals) %>% ## get grids that locals sent tweets
+ filter(!grid_id %in% locals_home) %>% ## remove grid that is the same as the home
+ left_join(., grids) %>% ## get out to grid geometry
+ rename(out_to_grid = grid_id) %>%
+ st_sf()
+ return(output)
+}
+
+if(file.exists(here("data/derived_data/locals_out_grids.rds"))){
+ locals_out_grids <- readRDS(here("data/derived_data/locals_out_grids.rds"))
+}else{
+ locals_out_grids <- locals_in_grid %>%
+ filter(grid_id %in% considered_grids$grid_id) %>%
+ mutate(data = map2(grid_id, data, with_progress(function(x, y) get_locals_out_grids(x, y, users_in_grids, grids))))
+ saveRDS(locals_out_grids, file = here("data/derived_data/locals_out_grids.rds"))
+}
+
+# home grid
+locals_out_grids[1, ]
+```
+
+ ## # A tibble: 1 × 2
+ ## grid_id data
+ ##
+ ## 1 234
+
+``` r
+# locals in home grid with their visited grids outside home
+locals_out_grids[1, ]$data[[1]]
+```
+
+ ## Simple feature collection with 923 features and 2 fields
+ ## Geometry type: POLYGON
+ ## Dimension: XY
+ ## Bounding box: xmin: 7917.538 ymin: 24408.98 xmax: 46167.54 ymax: 48657.69
+ ## Projected CRS: SVY21 / Singapore TM
+ ## # A tibble: 923 × 3
+ ## u_id out_to_grid geometry
+ ##
+ ## 1 81473111 613 ((18042.54 39347.91, 17667.54 39564.42, 17667.54 39997.…
+ ## 2 81473111 1187 ((29292.54 28955.61, 28917.54 29172.11, 28917.54 29605.…
+ ## 3 81473111 1210 ((29667.54 30904.17, 29292.54 31120.67, 29292.54 31553.…
+ ## 4 81473111 1216 ((29667.54 38698.39, 29292.54 38914.9, 29292.54 39347.9…
+ ## 5 81473111 579 ((17292.54 39347.91, 16917.54 39564.42, 16917.54 39997.…
+ ## 6 81473111 1233 ((30042.54 31553.68, 29667.54 31770.19, 29667.54 32203.…
+ ## 7 81473111 1271 ((30792.54 30254.65, 30417.54 30471.15, 30417.54 30904.…
+ ## 8 81473111 715 ((20292.54 31553.68, 19917.54 31770.19, 19917.54 32203.…
+ ## 9 81473111 1125 ((28167.54 30904.17, 27792.54 31120.67, 27792.54 31553.…
+ ## 10 81473111 1709 ((45417.54 37399.36, 45042.54 37615.86, 45042.54 38048.…
+ ## # … with 913 more rows
+
+The outflow diversity is saved under `data/derived_data/` directory.
+
+``` r
+# calculate outflow diversity
+if(file.exists(here("data/derived_data/outflow_diversity.rds"))){
+ outflow_diversity <- readRDS(here("data/derived_data/outflow_diversity.rds"))
+}else{
+ outflow_diversity <- locals_out_grids %>%
+ filter(grid_id %in% considered_grids$grid_id) %>%
+ mutate(div_shannon = map2_dbl(grid_id, data, with_progress(function(x, y) cal_diversity(x, y, grid_sectors, grids, type = "outflow")))) %>%
+ dplyr::select(-data)
+
+ outflow_diversity <- outflow_diversity %>%
+ mutate(norm_div_shannon = (div_shannon - min(div_shannon))/(max(div_shannon) - min(div_shannon)),
+ norm_div_shannon = round(norm_div_shannon, 2)) %>%
+ left_join(., grids, by = c("grid_id" = "grid_id")) %>%
+ st_sf(crs = 3414)
+
+ saveRDS(outflow_diversity, file = here("data/derived_data/outflow_diversity.rds"))
+}
+
+head(outflow_diversity)
+```
+
+ ## Simple feature collection with 6 features and 3 fields
+ ## Geometry type: POLYGON
+ ## Dimension: XY
+ ## Bounding box: xmin: 9792.538 ymin: 34801.28 xmax: 12417.54 ymax: 38914.9
+ ## Projected CRS: SVY21 / Singapore TM
+ ## # A tibble: 6 × 4
+ ## grid_id div_shannon norm_div_shannon geometry
+ ##
+ ## 1 234 2.15 0.367 ((10167.54 34801.28, 9792.538 35017.79, …
+ ## 2 286 2.24 0.469 ((11292.54 35450.8, 10917.54 35667.31, 1…
+ ## 3 287 1.99 0.179 ((11292.54 36749.84, 10917.54 36966.34, …
+ ## 4 304 2.06 0.260 ((11667.54 36100.32, 11292.54 36316.82, …
+ ## 5 321 2.27 0.508 ((12042.54 35450.8, 11667.54 35667.31, 1…
+ ## 6 323 2.45 0.712 ((12042.54 38048.88, 11667.54 38265.38, …
+
+## Diversity spatial distribution
+
+### Inflow diversity
+
+``` r
+spatial_viz(inflow_diversity, fill_var = "norm_div_shannon",
+ legend_title = "Norm.diversity",
+ main_title = "(a) Diversity of Flows: normalized inflow diversity",
+ main.title_size = 1.4,
+ legend.hist_height = 0.25, legend.hist_width = 0.5,
+ legend_width = 0.4, legend.hist_size = 0.5,
+ legend.title_size = 1.2, legend.text_size = 0.65)
+```
+
+
+
+``` r
+violin_viz(inflow_diversity, var = "norm_div_shannon",
+ labs.x = "", labs.y = "Norm.diversity", breaks = seq(0, 1, 0.1),
+ y.shift = 0.17, x.shift = 1.52, text.size = 4)
+```
+
+
+
+### Outflow diversity
+
+``` r
+spatial_viz(outflow_diversity, fill_var = "norm_div_shannon",
+ legend_title = "Norm.diversity",
+ main_title = "(b) Diversity of Flows: normalized outflow diversity",
+ main.title_size = 1.4,
+ legend.hist_height = 0.25, legend.hist_width = 0.5,
+ legend_width = 0.4, legend.hist_size = 0.5,
+ legend.title_size = 1.2, legend.text_size = 0.65,
+ palette = "Purples")
+```
+
+
+
+``` r
+violin_viz(outflow_diversity, var = "norm_div_shannon",
+ labs.x = "", labs.y = "Norm.diversity", breaks = seq(0, 1, 0.1),
+ y.shift = 0.17, x.shift = 1.52, text.size = 4)
+```
+
+
diff --git a/analysis/08-calculate-diversity_files/figure-gfm/unnamed-chunk-10-1.png b/analysis/08-calculate-diversity_files/figure-gfm/unnamed-chunk-10-1.png
new file mode 100644
index 0000000..f53f4db
Binary files /dev/null and b/analysis/08-calculate-diversity_files/figure-gfm/unnamed-chunk-10-1.png differ
diff --git a/analysis/08-calculate-diversity_files/figure-gfm/unnamed-chunk-7-1.png b/analysis/08-calculate-diversity_files/figure-gfm/unnamed-chunk-7-1.png
new file mode 100644
index 0000000..dbde53d
Binary files /dev/null and b/analysis/08-calculate-diversity_files/figure-gfm/unnamed-chunk-7-1.png differ
diff --git a/analysis/08-calculate-diversity_files/figure-gfm/unnamed-chunk-8-1.png b/analysis/08-calculate-diversity_files/figure-gfm/unnamed-chunk-8-1.png
new file mode 100644
index 0000000..ad8ab0a
Binary files /dev/null and b/analysis/08-calculate-diversity_files/figure-gfm/unnamed-chunk-8-1.png differ
diff --git a/analysis/08-calculate-diversity_files/figure-gfm/unnamed-chunk-9-1.png b/analysis/08-calculate-diversity_files/figure-gfm/unnamed-chunk-9-1.png
new file mode 100644
index 0000000..11a974b
Binary files /dev/null and b/analysis/08-calculate-diversity_files/figure-gfm/unnamed-chunk-9-1.png differ
diff --git a/analysis/09-draw-flow-maps.Rmd b/analysis/09-draw-flow-maps.Rmd
new file mode 100644
index 0000000..0deafb1
--- /dev/null
+++ b/analysis/09-draw-flow-maps.Rmd
@@ -0,0 +1,294 @@
+---
+title: "Flow Mapping"
+author: "Qingqing Chen"
+date: "Last compiled date: `r format(Sys.time(), '%d %B, %Y')`"
+output: github_document
+editor_options:
+ chunk_output_type: console
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE, fig.align='center', fig.width = 10, fig.height = 10, warning = FALSE, message = FALSE)
+library(tidyverse)
+library(sf)
+library(tmap)
+library(purrrogress)
+library(ggspatial)
+library(here)
+source(here("R/flow_mapping.R"))
+```
+
+
+
+Flows into or out of a neighbourhood can happen in all cardinal directions and with different intensities and distances. Visualizing the entire network of these connections is not very insightful for making neighbourhood-level inferences. This is why we create a single metric that summarizes the overall direction and strength of a flow to or from a neighbourhood, inspired by Tobler's (1981) work on vector fields to display flow patterns.
+
+We first aggregate the sectors of a grid into four zones, with each zone corresponding to a specific angle and vector, relative to ‘East’ (0°): the North East (NE) zone (45°), North West (NW) zone (135°), South West (SW) zone (225°), and South East (SE) zone (315°). This simplification alleviates computational complexity while maintaining the overall flow orientation of a neighbourhood. For each vector, its direction represents where visitors come from or where locals go out to, and its magnitude indicates the number of people in that flow. The resultant vector - the sum of the four vectors - is used to represent the overall flow direction of each grid. To visualize all resulting vectors uniformly on a single map, we log-transform the length of each vector (to prevent some arrows from being much longer than others). Furthermore, we also represent the number of users active in the grid with the width of the resultant vector arrows.
+
+## Load data
+```{r}
+# sg boundary
+sg_boundary <- readRDS(here("data/derived_data/sg_boundary.rds"))
+# hexagonal grids
+grids <- read_sf(here("data/derived_data/spatial_hex_grid.shp"), quiet = T) %>%
+ st_transform(crs = 3414)
+# grid centroids
+grid_centroids <- grids %>% st_centroid()
+
+# divided sectors
+grid_sectors <- readRDS(here("data/derived_data/grid_sectors.rds"))
+
+# inflow and outflow distance
+dist_visitors <- readRDS(here("data/derived_data/dist_visitor_points.rds"))
+dist_locals <- readRDS(here("data/derived_data/dist_local_points.rds"))
+
+# inflow and outflow diversity
+inflow_diversity <- readRDS(here("data/derived_data/inflow_diversity.rds"))
+outflow_diversity <- readRDS(here("data/derived_data/outflow_diversity.rds"))
+```
+
+## Vector flow analysis
+
+### Inflow vectors
+
+```{r}
+if(file.exists(here("data/derived_data/inflow_vectors.rds"))){
+ inflow_vectors <- readRDS(here("data/derived_data/inflow_vectors.rds"))
+}else{
+ dist_visitors_nest <- dist_visitors %>%
+ nest(data = c(u_id, home, created_at, dist_hm2grid, dist_hm2grid_km))
+ inflow_vectors <- do.call(bind_rows, map2(dist_visitors_nest$grid_id, dist_visitors_nest$data, with_progress(cal_inflow_vectors)))
+ saveRDS(inflow_vectors, file = here("data/derived_data/inflow_vectors.rds"))
+}
+
+# scale inflow vectors
+inflow_vectors_scaled <- inflow_vectors %>%
+ mutate(magnitude_rescaled = scales::rescale(log(magnitude))) %>%
+ mutate(start_x = X + 1 + magnitude_rescaled * 1000 * cos(angle),
+ start_y = Y + 1 + magnitude_rescaled * 1000 * sin(angle),
+ total_user = scales::rescale(log(total_user)))
+head(inflow_vectors_scaled)
+```
+
+
+### Outflow vectors
+
+```{r}
+if(file.exists(here("data/derived_data/outflow_vectors.rds"))){
+ outflow_vectors <- readRDS(here("data/derived_data/outflow_vectors.rds"))
+}else{
+ dist_locals_nest <- dist_locals %>%
+ nest(data = c(u_id, grid_id, created_at, dist_hm2grid, dist_hm2grid_km))
+ outflow_vectors <- do.call(bind_rows, map2(dist_locals_nest$home, dist_locals_nest$data, with_progress(cal_outflow_vectors)))
+ saveRDS(outflow_vectors, file = here("data/derived_data/outflow_vectors.rds"))
+}
+
+# scale outflow vectors
+outflow_vectors_scaled <- outflow_vectors %>%
+ mutate(magnitude_rescaled = scales::rescale(log(magnitude)),
+ end_x = X + 1 + magnitude_rescaled * 1000 * cos(angle),
+ end_y = Y + 1 + magnitude_rescaled * 1000 * sin(angle),
+ total_user = scales::rescale(log(total_user)))
+head(outflow_vectors_scaled)
+```
+
+
+## Flow map visualization
+
+### Background preparation
+
+#### Background: Travel distance
+
+```{r}
+# mean inflow distance
+bg_dist_inflow <- dist_visitors %>%
+ group_by(grid_id, u_id) %>%
+ dplyr::summarise(mean_dist_user = mean(dist_hm2grid_km)) %>%
+ group_by(grid_id) %>%
+ dplyr::summarise(mean_dist_grid = mean(mean_dist_user),
+ sd_dist_grid = sd(mean_dist_user)) %>%
+ mutate(cut_mean_dist_grid = cut(mean_dist_grid,
+ breaks=c(quantile(mean_dist_grid, probs = seq(0, 1, by = 0.20))), include.lowest=TRUE),
+ cut_sd_dist_grid = cut(sd_dist_grid,
+ breaks = c(quantile(sd_dist_grid, probs = seq(0, 1, by = 0.2))), include.lowest = TRUE)) %>%
+ left_join(., grids) %>%
+ st_as_sf()
+
+# mean outflow distance
+bg_dist_outflow <- dist_locals %>%
+ group_by(home, u_id) %>%
+ dplyr::summarise(mean_dist_user = mean(dist_hm2grid_km)) %>%
+ group_by(home) %>%
+ dplyr::summarise(mean_dist_grid = mean(mean_dist_user),
+ sd_dist_grid = sd(mean_dist_user)) %>%
+ mutate(cut_mean_dist_grid = cut(mean_dist_grid,
+ breaks=c(quantile(mean_dist_grid, probs = seq(0, 1, by = 0.20))), include.lowest=TRUE),
+ cut_sd_dist_grid = cut(sd_dist_grid,
+ breaks = c(quantile(sd_dist_grid, probs = seq(0, 1, by = 0.2))), include.lowest = TRUE)) %>%
+ left_join(., grids, by = c("home" = "grid_id")) %>%
+ st_as_sf()
+```
+
+
+#### Background: Neighbourhood ratio
+```{r}
+if(file.exists(here("data/derived_data/neigh_ratio_inflow.rds"))){
+ bg_neighRatio_inflow <- readRDS(here("data/derived_data/neigh_ratio_inflow.rds")) %>%
+ dplyr::select(grid_id, type, ratio_divide_sumWeigh) %>%
+ na.omit() %>%
+ spread(key = "type", value = "ratio_divide_sumWeigh") %>%
+ replace(., is.na(.), 0) %>%
+ left_join(., grids) %>%
+ st_as_sf(crs = 3414) %>%
+ mutate(ratio = cut(inner, breaks=c(quantile(inner, probs = seq(0, 1, by = 0.20))), include.lowest=TRUE))
+}else{
+ dist_visitors_weight <- dist_visitors %>%
+ dplyr::select(grid_id, u_id, home, created_at) %>%
+ group_by(u_id) %>%
+ mutate(total_tweets = n(), weight = 1/total_tweets) %>%
+ ungroup()
+
+ neigh_ratio_inflow <- do.call(rbind, map(dist_visitors_nest$grid_id, with_progress(function(x) cal_neigh_ratio(dist_visitors_weight, x, user_type = "visitor"))))
+ saveRDS(neigh_ratio_inflow, file = here("data/derived_data/neigh_ratio_inflow.rds"))
+}
+
+if(file.exists(here("data/derived_data/neigh_ratio_outflow.rds"))){
+ bg_neighRatio_outflow <- readRDS(here("data/derived_data/neigh_ratio_outflow.rds")) %>%
+ dplyr::select(home, type, ratio_divide_sumWeigh) %>%
+ na.omit() %>%
+ spread(key = "type", value = "ratio_divide_sumWeigh") %>%
+ replace(., is.na(.), 0) %>%
+ left_join(., grids, by = c("home" = "grid_id")) %>%
+ st_as_sf(crs = 3414) %>%
+ mutate(ratio = cut(inner, breaks=c(quantile(inner, probs = seq(0, 1, by = 0.20))), include.lowest=TRUE))
+}else{
+ dist_locals_weight <- dist_locals %>%
+ dplyr::select(home, u_id, grid_id, created_at) %>%
+ group_by(u_id) %>%
+ mutate(total_tweets = n(), weight = 1/total_tweets) %>%
+ ungroup()
+
+ neigh_ratio_outflow <- do.call(rbind, map(dist_locals_nest$home, with_progress(function(x) cal_neigh_ratio(dist_locals_weight, x, user_type = "local"))))
+ saveRDS(neigh_ratio_outflow, file = here("data/derived_data/neigh_ratio_outflow.rds"))
+}
+```
+
+#### Background: Diversity
+
+```{r}
+bg_div_inflow <- inflow_diversity %>%
+ dplyr::select(grid_id, norm_div_shannon) %>%
+ mutate(cut_div = cut(norm_div_shannon, breaks=c(quantile(norm_div_shannon, probs = seq(0, 1, by = 0.20))), include.lowest=TRUE))
+
+bg_div_outflow <- outflow_diversity %>%
+ dplyr::select(grid_id, norm_div_shannon) %>%
+ mutate(cut_div = cut(norm_div_shannon, breaks=c(quantile(norm_div_shannon, probs = seq(0, 1, by = 0.20))), include.lowest=TRUE))
+```
+
+
+
+### Flow maps
+
+#### Inflow with neighbourhood ratio background
+
+```{r}
+viz_flows(bg_neighRatio_inflow, inflow_vectors_scaled,
+ bg.fill.var = "ratio",
+ quantile.var = "inner",
+ palette = "OrRd",
+ start.x = "start_x", start.y = "start_y",
+ end.x = "X", end.y = "Y",
+ title = expression(paste("(a) ", bold("Direction of Flows: "), "background is neighbourhood ratio")),
+ legend.nm = "Neighborhood ratio")
+```
+
+#### Outflow with neighbourhood ratio background
+```{r}
+viz_flows(bg_neighRatio_outflow, outflow_vectors_scaled,
+ quantile.var = "inner",
+ palette = "Purples",
+ bg.fill.var = "ratio",
+ start.x = "X", start.y = "Y",
+ end.x = "end_x", end.y = "end_y",
+ title = expression(paste("(a) ", bold("Direction of Flows: "), "background is neighbourhood ratio")),
+ legend.nm = "Neighborhood ratio")
+```
+
+
+#### Inflow with average distance background
+
+```{r}
+viz_flows(bg_dist_inflow, inflow_vectors_scaled,
+ bg.fill.var = "cut_mean_dist_grid",
+ quantile.var = "mean_dist_grid",
+ palette = "OrRd",
+ start.x = "start_x", start.y = "start_y",
+ end.x = "X", end.y = "Y",
+ title = expression(paste("(b) ", bold("Direction of Flows: "), "background is average incoming distance")),
+ legend.nm = "Avg.distance (km)")
+```
+
+### Outflow average distance background
+```{r}
+viz_flows(bg_dist_outflow, outflow_vectors_scaled,
+ bg.fill.var = "cut_mean_dist_grid",
+ quantile.var = "mean_dist_grid",
+ palette = "Purples",
+ start.x = "X", start.y = "Y",
+ end.x = "end_x", end.y = "end_y",
+ title = expression(paste("(b) ", bold("Direction of Flows: "), "background is average outgoing distance")),
+ legend.nm = "Avg.distance (km)")
+```
+
+
+#### Inflow with S.D.distance background
+```{r}
+viz_flows(bg_dist_inflow, inflow_vectors_scaled,
+ bg.fill.var = "cut_sd_dist_grid",
+ quantile.var = "sd_dist_grid",
+ palette = "OrRd",
+ start.x = "start_x", start.y = "start_y",
+ end.x = "X", end.y = "Y",
+ title = expression(paste("(c) ", bold("Direction of Flows: "), "background is S.D. of incoming distance")),
+ legend.nm = "S.D.distance(km)")
+```
+
+
+#### Outflow with S.D.distance background
+```{r}
+viz_flows(bg_dist_outflow, outflow_vectors_scaled,
+ bg.fill.var = "cut_sd_dist_grid",
+ quantile.var = "sd_dist_grid",
+ palette = "Purples",
+ start.x = "X", start.y = "Y",
+ end.x = "end_x", end.y = "end_y",
+ title = expression(paste("(c) ", bold("Direction of Flows: "), "background is S.D. of outgoing distance")),
+ legend.nm = "S.D.distance(km)")
+```
+
+#### Inflow with diversity background
+
+```{r}
+viz_flows(bg_div_inflow %>% filter(grid_id %in% inflow_vectors_scaled$grid_id), inflow_vectors_scaled,
+ bg.fill.var = "cut_div",
+ quantile.var = "norm_div_shannon",
+ palette = "OrRd",
+ start.x = "start_x", start.y = "start_y",
+ end.x = "X", end.y = "Y",
+ title = expression(paste("(d) ", bold("Direction of Flows: "), "background is normalized inflow diversity")),
+ legend.nm = "Norm.diversity")
+```
+
+#### Outflow with diversity background
+
+```{r}
+viz_flows(bg_div_outflow %>% filter(grid_id %in% outflow_vectors_scaled$home_id), outflow_vectors_scaled,
+ bg.fill.var = "cut_div",
+ quantile.var = "norm_div_shannon",
+ palette = "Purples",
+ start.x = "X", start.y = "Y",
+ end.x = "end_x", end.y = "end_y",
+ title = expression(paste("(d) ", bold("Direction of Flows: "), "background is normalized outflow diversity")),
+ legend.nm = "Norm.diversity")
+```
diff --git a/analysis/09-draw-flow-maps.md b/analysis/09-draw-flow-maps.md
new file mode 100644
index 0000000..225b64a
--- /dev/null
+++ b/analysis/09-draw-flow-maps.md
@@ -0,0 +1,334 @@
+Flow Mapping
+================
+Qingqing Chen
+Last compiled date: 18 September, 2021
+
+
+
+Flows into or out of a neighbourhood can happen in all cardinal
+directions and with different intensities and distances. Visualizing the
+entire network of these connections is not very insightful for making
+neighbourhood-level inferences. This is why we create a single metric
+that summarizes the overall direction and strength of a flow to or from
+a neighbourhood, inspired by Tobler’s (1981) work on vector fields to
+display flow patterns.
+
+We first aggregate the sectors of a grid into four zones, with each zone
+corresponding to a specific angle and vector, relative to ‘East’ (0°):
+the North East (NE) zone (45°), North West (NW) zone (135°), South West
+(SW) zone (225°), and South East (SE) zone (315°). This simplification
+alleviates computational complexity while maintaining the overall flow
+orientation of a neighbourhood. For each vector, its direction
+represents where visitors come from or where locals go out to, and its
+magnitude indicates the number of people in that flow. The resultant
+vector - the sum of the four vectors - is used to represent the overall
+flow direction of each grid. To visualize all resulting vectors
+uniformly on a single map, we log-transform the length of each vector
+(to prevent some arrows from being much longer than others).
+Furthermore, we also represent the number of users active in the grid
+with the width of the resultant vector arrows.
+
+## Load data
+
+``` r
+# sg boundary
+sg_boundary <- readRDS(here("data/derived_data/sg_boundary.rds"))
+# hexagonal grids
+grids <- read_sf(here("data/derived_data/spatial_hex_grid.shp"), quiet = T) %>%
+ st_transform(crs = 3414)
+# grid centroids
+grid_centroids <- grids %>% st_centroid()
+
+# divided sectors
+grid_sectors <- readRDS(here("data/derived_data/grid_sectors.rds"))
+
+# inflow and outflow distance
+dist_visitors <- readRDS(here("data/derived_data/dist_visitor_points.rds"))
+dist_locals <- readRDS(here("data/derived_data/dist_local_points.rds"))
+
+# inflow and outflow diversity
+inflow_diversity <- readRDS(here("data/derived_data/inflow_diversity.rds"))
+outflow_diversity <- readRDS(here("data/derived_data/outflow_diversity.rds"))
+```
+
+## Vector flow analysis
+
+### Inflow vectors
+
+``` r
+if(file.exists(here("data/derived_data/inflow_vectors.rds"))){
+ inflow_vectors <- readRDS(here("data/derived_data/inflow_vectors.rds"))
+}else{
+ dist_visitors_nest <- dist_visitors %>%
+ nest(data = c(u_id, home, created_at, dist_hm2grid, dist_hm2grid_km))
+ inflow_vectors <- do.call(bind_rows, map2(dist_visitors_nest$grid_id, dist_visitors_nest$data, with_progress(cal_inflow_vectors)))
+ saveRDS(inflow_vectors, file = here("data/derived_data/inflow_vectors.rds"))
+}
+
+# scale inflow vectors
+inflow_vectors_scaled <- inflow_vectors %>%
+ mutate(magnitude_rescaled = scales::rescale(log(magnitude))) %>%
+ mutate(start_x = X + 1 + magnitude_rescaled * 1000 * cos(angle),
+ start_y = Y + 1 + magnitude_rescaled * 1000 * sin(angle),
+ total_user = scales::rescale(log(total_user)))
+head(inflow_vectors_scaled)
+```
+
+ ## # A tibble: 6 × 9
+ ## X Y grid_id angle magnitude total_user magnitude_rescaled start_x
+ ##
+ ## 1 3043. 25492. 8 0.785 12 0.140 0.284 3244.
+ ## 2 3418. 26141. 14 0.785 15 0.168 0.309 3637.
+ ## 3 7168. 33935. 123 0.620 12.2 0.159 0.285 7401.
+ ## 4 7543. 34585. 135 0.574 28.6 0.272 0.383 7865.
+ ## 5 7918. 33935. 147 0.700 35.1 0.286 0.406 8229.
+ ## 6 7918. 35234. 148 0.537 81.5 0.407 0.502 8350.
+ ## # … with 1 more variable: start_y
+
+### Outflow vectors
+
+``` r
+if(file.exists(here("data/derived_data/outflow_vectors.rds"))){
+ outflow_vectors <- readRDS(here("data/derived_data/outflow_vectors.rds"))
+}else{
+ dist_locals_nest <- dist_locals %>%
+ nest(data = c(u_id, grid_id, created_at, dist_hm2grid, dist_hm2grid_km))
+ outflow_vectors <- do.call(bind_rows, map2(dist_locals_nest$home, dist_locals_nest$data, with_progress(cal_outflow_vectors)))
+ saveRDS(outflow_vectors, file = here("data/derived_data/outflow_vectors.rds"))
+}
+
+# scale outflow vectors
+outflow_vectors_scaled <- outflow_vectors %>%
+ mutate(magnitude_rescaled = scales::rescale(log(magnitude)),
+ end_x = X + 1 + magnitude_rescaled * 1000 * cos(angle),
+ end_y = Y + 1 + magnitude_rescaled * 1000 * sin(angle),
+ total_user = scales::rescale(log(total_user)))
+head(outflow_vectors_scaled)
+```
+
+ ## # A tibble: 6 × 9
+ ## X Y home_id angle magnitude total_user magnitude_rescaled end_x
+ ##
+ ## 1 10168. 35234. 234 0.0891 667. 0.477 0.592 10758.
+ ## 2 11293. 35884. 286 -0.0182 777. 0.515 0.618 11912.
+ ## 3 11293. 37183. 287 -0.374 1020. 0.560 0.666 11914.
+ ## 4 11668. 36533. 304 -0.214 567. 0.437 0.563 12219.
+ ## 5 12043. 35884. 321 -0.103 4729. 0.914 0.937 12976.
+ ## 6 12043. 38482. 323 -0.201 491. 0.414 0.538 12570.
+ ## # … with 1 more variable: end_y
+
+## Flow map visualization
+
+### Background preparation
+
+#### Background: Travel distance
+
+``` r
+# mean inflow distance
+bg_dist_inflow <- dist_visitors %>%
+ group_by(grid_id, u_id) %>%
+ dplyr::summarise(mean_dist_user = mean(dist_hm2grid_km)) %>%
+ group_by(grid_id) %>%
+ dplyr::summarise(mean_dist_grid = mean(mean_dist_user),
+ sd_dist_grid = sd(mean_dist_user)) %>%
+ mutate(cut_mean_dist_grid = cut(mean_dist_grid,
+ breaks=c(quantile(mean_dist_grid, probs = seq(0, 1, by = 0.20))), include.lowest=TRUE),
+ cut_sd_dist_grid = cut(sd_dist_grid,
+ breaks = c(quantile(sd_dist_grid, probs = seq(0, 1, by = 0.2))), include.lowest = TRUE)) %>%
+ left_join(., grids) %>%
+ st_as_sf()
+
+# mean outflow distance
+bg_dist_outflow <- dist_locals %>%
+ group_by(home, u_id) %>%
+ dplyr::summarise(mean_dist_user = mean(dist_hm2grid_km)) %>%
+ group_by(home) %>%
+ dplyr::summarise(mean_dist_grid = mean(mean_dist_user),
+ sd_dist_grid = sd(mean_dist_user)) %>%
+ mutate(cut_mean_dist_grid = cut(mean_dist_grid,
+ breaks=c(quantile(mean_dist_grid, probs = seq(0, 1, by = 0.20))), include.lowest=TRUE),
+ cut_sd_dist_grid = cut(sd_dist_grid,
+ breaks = c(quantile(sd_dist_grid, probs = seq(0, 1, by = 0.2))), include.lowest = TRUE)) %>%
+ left_join(., grids, by = c("home" = "grid_id")) %>%
+ st_as_sf()
+```
+
+#### Background: Neighbourhood ratio
+
+``` r
+if(file.exists(here("data/derived_data/neigh_ratio_inflow.rds"))){
+ bg_neighRatio_inflow <- readRDS(here("data/derived_data/neigh_ratio_inflow.rds")) %>%
+ dplyr::select(grid_id, type, ratio_divide_sumWeigh) %>%
+ na.omit() %>%
+ spread(key = "type", value = "ratio_divide_sumWeigh") %>%
+ replace(., is.na(.), 0) %>%
+ left_join(., grids) %>%
+ st_as_sf(crs = 3414) %>%
+ mutate(ratio = cut(inner, breaks=c(quantile(inner, probs = seq(0, 1, by = 0.20))), include.lowest=TRUE))
+}else{
+ dist_visitors_weight <- dist_visitors %>%
+ dplyr::select(grid_id, u_id, home, created_at) %>%
+ group_by(u_id) %>%
+ mutate(total_tweets = n(), weight = 1/total_tweets) %>%
+ ungroup()
+
+ neigh_ratio_inflow <- do.call(rbind, map(dist_visitors_nest$grid_id, with_progress(function(x) cal_neigh_ratio(dist_visitors_weight, x, user_type = "visitor"))))
+ saveRDS(neigh_ratio_inflow, file = here("data/derived_data/neigh_ratio_inflow.rds"))
+}
+
+if(file.exists(here("data/derived_data/neigh_ratio_outflow.rds"))){
+ bg_neighRatio_outflow <- readRDS(here("data/derived_data/neigh_ratio_outflow.rds")) %>%
+ dplyr::select(home, type, ratio_divide_sumWeigh) %>%
+ na.omit() %>%
+ spread(key = "type", value = "ratio_divide_sumWeigh") %>%
+ replace(., is.na(.), 0) %>%
+ left_join(., grids, by = c("home" = "grid_id")) %>%
+ st_as_sf(crs = 3414) %>%
+ mutate(ratio = cut(inner, breaks=c(quantile(inner, probs = seq(0, 1, by = 0.20))), include.lowest=TRUE))
+}else{
+ dist_locals_weight <- dist_locals %>%
+ dplyr::select(home, u_id, grid_id, created_at) %>%
+ group_by(u_id) %>%
+ mutate(total_tweets = n(), weight = 1/total_tweets) %>%
+ ungroup()
+
+ neigh_ratio_outflow <- do.call(rbind, map(dist_locals_nest$home, with_progress(function(x) cal_neigh_ratio(dist_locals_weight, x, user_type = "local"))))
+ saveRDS(neigh_ratio_outflow, file = here("data/derived_data/neigh_ratio_outflow.rds"))
+}
+```
+
+#### Background: Diversity
+
+``` r
+bg_div_inflow <- inflow_diversity %>%
+ dplyr::select(grid_id, norm_div_shannon) %>%
+ mutate(cut_div = cut(norm_div_shannon, breaks=c(quantile(norm_div_shannon, probs = seq(0, 1, by = 0.20))), include.lowest=TRUE))
+
+bg_div_outflow <- outflow_diversity %>%
+ dplyr::select(grid_id, norm_div_shannon) %>%
+ mutate(cut_div = cut(norm_div_shannon, breaks=c(quantile(norm_div_shannon, probs = seq(0, 1, by = 0.20))), include.lowest=TRUE))
+```
+
+### Flow maps
+
+#### Inflow with neighbourhood ratio background
+
+``` r
+viz_flows(bg_neighRatio_inflow, inflow_vectors_scaled,
+ bg.fill.var = "ratio",
+ quantile.var = "inner",
+ palette = "OrRd",
+ start.x = "start_x", start.y = "start_y",
+ end.x = "X", end.y = "Y",
+ title = expression(paste("(a) ", bold("Direction of Flows: "), "background is neighbourhood ratio")),
+ legend.nm = "Neighborhood ratio")
+```
+
+
+
+#### Outflow with neighbourhood ratio background
+
+``` r
+viz_flows(bg_neighRatio_outflow, outflow_vectors_scaled,
+ quantile.var = "inner",
+ palette = "Purples",
+ bg.fill.var = "ratio",
+ start.x = "X", start.y = "Y",
+ end.x = "end_x", end.y = "end_y",
+ title = expression(paste("(a) ", bold("Direction of Flows: "), "background is neighbourhood ratio")),
+ legend.nm = "Neighborhood ratio")
+```
+
+
+
+#### Inflow with average distance background
+
+``` r
+viz_flows(bg_dist_inflow, inflow_vectors_scaled,
+ bg.fill.var = "cut_mean_dist_grid",
+ quantile.var = "mean_dist_grid",
+ palette = "OrRd",
+ start.x = "start_x", start.y = "start_y",
+ end.x = "X", end.y = "Y",
+ title = expression(paste("(b) ", bold("Direction of Flows: "), "background is average incoming distance")),
+ legend.nm = "Avg.distance (km)")
+```
+
+
+
+### Outflow average distance background
+
+``` r
+viz_flows(bg_dist_outflow, outflow_vectors_scaled,
+ bg.fill.var = "cut_mean_dist_grid",
+ quantile.var = "mean_dist_grid",
+ palette = "Purples",
+ start.x = "X", start.y = "Y",
+ end.x = "end_x", end.y = "end_y",
+ title = expression(paste("(b) ", bold("Direction of Flows: "), "background is average outgoing distance")),
+ legend.nm = "Avg.distance (km)")
+```
+
+
+
+#### Inflow with S.D.distance background
+
+``` r
+viz_flows(bg_dist_inflow, inflow_vectors_scaled,
+ bg.fill.var = "cut_sd_dist_grid",
+ quantile.var = "sd_dist_grid",
+ palette = "OrRd",
+ start.x = "start_x", start.y = "start_y",
+ end.x = "X", end.y = "Y",
+ title = expression(paste("(c) ", bold("Direction of Flows: "), "background is S.D. of incoming distance")),
+ legend.nm = "S.D.distance(km)")
+```
+
+
+
+#### Outflow with S.D.distance background
+
+``` r
+viz_flows(bg_dist_outflow, outflow_vectors_scaled,
+ bg.fill.var = "cut_sd_dist_grid",
+ quantile.var = "sd_dist_grid",
+ palette = "Purples",
+ start.x = "X", start.y = "Y",
+ end.x = "end_x", end.y = "end_y",
+ title = expression(paste("(c) ", bold("Direction of Flows: "), "background is S.D. of outgoing distance")),
+ legend.nm = "S.D.distance(km)")
+```
+
+
+
+#### Inflow with diversity background
+
+``` r
+viz_flows(bg_div_inflow %>% filter(grid_id %in% inflow_vectors_scaled$grid_id), inflow_vectors_scaled,
+ bg.fill.var = "cut_div",
+ quantile.var = "norm_div_shannon",
+ palette = "OrRd",
+ start.x = "start_x", start.y = "start_y",
+ end.x = "X", end.y = "Y",
+ title = expression(paste("(d) ", bold("Direction of Flows: "), "background is normalized inflow diversity")),
+ legend.nm = "Norm.diversity")
+```
+
+
+
+#### Outflow with diversity background
+
+``` r
+viz_flows(bg_div_outflow %>% filter(grid_id %in% outflow_vectors_scaled$home_id), outflow_vectors_scaled,
+ bg.fill.var = "cut_div",
+ quantile.var = "norm_div_shannon",
+ palette = "Purples",
+ start.x = "X", start.y = "Y",
+ end.x = "end_x", end.y = "end_y",
+ title = expression(paste("(d) ", bold("Direction of Flows: "), "background is normalized outflow diversity")),
+ legend.nm = "Norm.diversity")
+```
+
+
diff --git a/analysis/09-draw-flow-maps_files/figure-gfm/unnamed-chunk-10-1.png b/analysis/09-draw-flow-maps_files/figure-gfm/unnamed-chunk-10-1.png
new file mode 100644
index 0000000..0e074f7
Binary files /dev/null and b/analysis/09-draw-flow-maps_files/figure-gfm/unnamed-chunk-10-1.png differ
diff --git a/analysis/09-draw-flow-maps_files/figure-gfm/unnamed-chunk-11-1.png b/analysis/09-draw-flow-maps_files/figure-gfm/unnamed-chunk-11-1.png
new file mode 100644
index 0000000..38c552f
Binary files /dev/null and b/analysis/09-draw-flow-maps_files/figure-gfm/unnamed-chunk-11-1.png differ
diff --git a/analysis/09-draw-flow-maps_files/figure-gfm/unnamed-chunk-12-1.png b/analysis/09-draw-flow-maps_files/figure-gfm/unnamed-chunk-12-1.png
new file mode 100644
index 0000000..467135f
Binary files /dev/null and b/analysis/09-draw-flow-maps_files/figure-gfm/unnamed-chunk-12-1.png differ
diff --git a/analysis/09-draw-flow-maps_files/figure-gfm/unnamed-chunk-13-1.png b/analysis/09-draw-flow-maps_files/figure-gfm/unnamed-chunk-13-1.png
new file mode 100644
index 0000000..f8fb56e
Binary files /dev/null and b/analysis/09-draw-flow-maps_files/figure-gfm/unnamed-chunk-13-1.png differ
diff --git a/analysis/09-draw-flow-maps_files/figure-gfm/unnamed-chunk-14-1.png b/analysis/09-draw-flow-maps_files/figure-gfm/unnamed-chunk-14-1.png
new file mode 100644
index 0000000..971dc23
Binary files /dev/null and b/analysis/09-draw-flow-maps_files/figure-gfm/unnamed-chunk-14-1.png differ
diff --git a/analysis/09-draw-flow-maps_files/figure-gfm/unnamed-chunk-7-1.png b/analysis/09-draw-flow-maps_files/figure-gfm/unnamed-chunk-7-1.png
new file mode 100644
index 0000000..07b00ed
Binary files /dev/null and b/analysis/09-draw-flow-maps_files/figure-gfm/unnamed-chunk-7-1.png differ
diff --git a/analysis/09-draw-flow-maps_files/figure-gfm/unnamed-chunk-8-1.png b/analysis/09-draw-flow-maps_files/figure-gfm/unnamed-chunk-8-1.png
new file mode 100644
index 0000000..fd27022
Binary files /dev/null and b/analysis/09-draw-flow-maps_files/figure-gfm/unnamed-chunk-8-1.png differ
diff --git a/analysis/09-draw-flow-maps_files/figure-gfm/unnamed-chunk-9-1.png b/analysis/09-draw-flow-maps_files/figure-gfm/unnamed-chunk-9-1.png
new file mode 100644
index 0000000..700a712
Binary files /dev/null and b/analysis/09-draw-flow-maps_files/figure-gfm/unnamed-chunk-9-1.png differ
diff --git a/analysis/10-regression-analysis.Rmd b/analysis/10-regression-analysis.Rmd
new file mode 100644
index 0000000..17453a9
--- /dev/null
+++ b/analysis/10-regression-analysis.Rmd
@@ -0,0 +1,946 @@
+---
+title: "Regression analysis"
+author: "Qingqing Chen"
+date: "Last compiled date: `r format(Sys.time(), '%d %B, %Y')`"
+output: github_document
+editor_options:
+ chunk_output_type: console
+---
+
+
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE, fig.align='center', fig.width = 10, fig.height = 10, warning = FALSE, message = FALSE)
+library(tidyverse)
+library(sf)
+library(lubridate)
+library(tmap)
+library(stars)
+library(RColorBrewer)
+library(purrrogress)
+library(here)
+library(broom)
+library(spdep)
+library(PerformanceAnalytics)
+library(stargazer)
+source(here("R/viz.R"))
+```
+
+To contextualize the mobility patterns in our analysis, we compare two of the aforementioned metrics – distance and diversity – with the socio-economic and built environment characteristics of each neighbourhood through a spatial regression analysis. We do so by drawing on a set of publicly available datasets, including the HDB Resale Flat Prices data, HDB Property Information data, SLA Street Directory data, and LTA Train Station data. These datasets allow us to create six independent variables shown as follows:
+
+ - Mean resale price per square meter;
+ - Percentage of 1-Room and 2-Room rental flats;
+ - Percentage of residential building area;
+ - Percentage of industrial building area;
+ - Distance to the nearest MRT station;
+ - Distance to the central grid cell.
+
+We aggregate the variables to individual neighbourhoods (i.e. with the same spatial resolution as neighbourhoods) and only keep neighbourhoods that have public housing. It is important to stress that we employ this regression model not to predict travel distance or mobility diversity based on these independent variables but rather to provide an initial quantitative exploration of the potential underlying covariates that influence mobility patterns in Singapore.
+
+## Load data
+
+```{r}
+# hexagonal grids
+grids <- read_sf(here("data/derived_data/spatial_hex_grid.shp"), quiet = T) %>%
+ st_transform(crs = 3414)
+
+# grid centroids
+grid_centroids <- grids %>% st_centroid()
+
+# HDB buildings
+hdb_sf <- read_sf(here("data/derived_data/spatial_hdb_building.shp"))
+
+# inflow and outflow distance
+dist_inflow <- readRDS(here("data/derived_data/dist_visitor_points.rds"))
+dist_outflow <- readRDS(here("data/derived_data/dist_local_points.rds"))
+
+# inflow and outflow diversity
+div_inflow <- readRDS(here("data/derived_data/inflow_diversity.rds"))
+div_outflow <- readRDS(here("data/derived_data/outflow_diversity.rds"))
+```
+
+
+## Dependent variables
+
+### HDB resale price
+```{r}
+if(file.exists(here("data/derived_data/sf_resale.rds"))){
+ sf_resale <- readRDS(here("data/derived_data/sf_resale.rds"))
+}else{
+ df_resale_12to14 <- read_csv(here("data/raw_data/resale-flat-prices/resale-flat-prices-based-on-registration-date-from-mar-2012-to-dec-2014.csv"))
+ df_resale_15to16 <- read_csv(here("data/raw_data/resale-flat-prices/resale-flat-prices-based-on-registration-date-from-jan-2015-to-dec-2016.csv"))
+ df_resale <- bind_rows(df_resale_12to14, df_resale_15to16) %>%
+ mutate(month = ymd(month, truncated = 1)) %>%
+ filter(month >= '2012-07-01' & month < '2016-11-01')
+ # format street name
+ df_resale <- df_resale %>%
+ mutate(ROAD_NAME = gsub("\\bAVE\\b", "AVENUE", street_name),
+ ROAD_NAME = gsub("\\bBT\\b", "BUKIT", ROAD_NAME),
+ ROAD_NAME = gsub("\\bCL\\b", "CLOSE", ROAD_NAME),
+ ROAD_NAME = gsub("\\bCRES\\b", "CRESCENT", ROAD_NAME),
+ ROAD_NAME = gsub("\\bCTRL\\b", "CENTRAL", ROAD_NAME),
+ ROAD_NAME = gsub("\\bC'WEALTH\\b", "COMMONWEALTH", ROAD_NAME),
+ ROAD_NAME = gsub("\\bCTR\\b", "CENTRE", ROAD_NAME),
+ ROAD_NAME = gsub("\\bDR\\b", "DRIVE", ROAD_NAME),
+ ROAD_NAME = gsub("\\bGDNS\\b", "GARDENS", ROAD_NAME),
+ ROAD_NAME = gsub("\\bHTS\\b", "HEIGHTS", ROAD_NAME),
+ ROAD_NAME = gsub("\\bJLN\\b", "JALAN", ROAD_NAME),
+ ROAD_NAME = gsub("\\bKG\\b", "KAMPONG", ROAD_NAME),
+ ROAD_NAME = gsub("\\bLOR\\b", "LORONG", ROAD_NAME),
+ ROAD_NAME = gsub("\\bMKT\\b", "MARKET", ROAD_NAME),
+ ROAD_NAME = gsub("\\bNTH\\b", "NORTH", ROAD_NAME),
+ ROAD_NAME = gsub("\\bPL\\b", "PLACE", ROAD_NAME),
+ ROAD_NAME = gsub("\\bPK\\b", "PARK", ROAD_NAME),
+ ROAD_NAME = gsub("\\bRD\\b", "ROAD", ROAD_NAME),
+ ROAD_NAME = gsub("\\bST\\b", "STREET", ROAD_NAME),
+ ROAD_NAME = gsub("\\bSTH\\b", "SOUTH", ROAD_NAME),
+ ROAD_NAME = gsub("\\bTER\\b", "TERRACE", ROAD_NAME),
+ ROAD_NAME = gsub("\\bTG\\b", "TANJONG", ROAD_NAME),
+ ROAD_NAME = gsub("\\bUPP\\b", "UPPER", ROAD_NAME)) %>%
+ mutate(ROAD_NAME = case_when(
+ ROAD_NAME == "STREET. GEORGE'S LANE" ~ "SAINT GEORGE'S LANE",
+ ROAD_NAME == "STREET. GEORGE'S ROAD" ~ "SAINT GEORGE'S ROAD",
+ TRUE ~ ROAD_NAME
+ )) %>%
+ dplyr::rename(HOUSE_BLK_ = block) %>%
+ unite(ROAD_BLK, c("ROAD_NAME", "HOUSE_BLK_")) %>%
+ mutate(ROAD_BLK = case_when(
+ ROAD_BLK == "CHOA CHU KANG AVENUE 2_297A" ~ "CHOA CHU KANG AVENUE 2_297",
+ ROAD_BLK == "CHOA CHU KANG AVENUE 2_297B" ~ "CHOA CHU KANG AVENUE 2_297",
+ TRUE ~ ROAD_BLK
+ ))
+
+ # add geometry to each resale transaction
+ sf_resale <- hdb_sf %>%
+ left_join(df_resale, ., by = c("ROAD_BLK" = "ROAD_BLK")) %>%
+ st_as_sf() %>%
+ st_transform(crs = 3414) %>%
+ st_join(., grids, largest = T) %>%
+ filter(!is.na(grid_id))
+ saveRDS(sf_resale, file = here("data/derived_data/sf_resale.rds"))
+}
+
+head(sf_resale)
+```
+
+```{r}
+mean_resale <- sf_resale %>%
+ st_set_geometry(NULL) %>%
+ dplyr::select(grid_id, floor_area_sqm, resale_price) %>%
+ mutate(resale_price_sqm = resale_price/floor_area_sqm) %>%
+ group_by(grid_id) %>%
+ dplyr::summarise(mean_resale_price_sqm = mean(resale_price_sqm),
+ sd_resale_price_sqm = sd(resale_price_sqm))
+
+## grid cells with available socio-economic variables and human mobility indicators
+qualified_grids_inflow <- intersect(mean_resale$grid_id, div_inflow$grid_id) %>% intersect(dist_inflow$grid_id)
+qualified_grids_outflow <- intersect(mean_resale$grid_id, div_outflow$grid_id) %>% intersect(dist_outflow$grid_id)
+
+mean_resale_sf <- mean_resale %>%
+ filter(grid_id %in% qualified_grids_inflow) %>%
+ left_join(., grids, by = c("grid_id" = "grid_id")) %>%
+ st_sf()
+```
+
+
+```{r}
+spatial_viz(mean_resale_sf,
+ fill_var = "mean_resale_price_sqm",
+ legend_title = "Avg.resale price/sqm",
+ main_title = "(a) Spatial distribution of average resale price",
+ main.title_size = 1.4,
+ legend.hist_height = 0.25, legend.hist_width = 0.5,
+ legend_width = 0.4, legend.hist_size = 0.5,
+ legend.title_size = 1.2, legend.text_size = 0.65,
+ digits = 0, palette = "PuRd")
+
+# spatial_viz(mean_resale_sf,
+# fill_var = "sd_resale_price_sqm",
+# legend_title = "S.D.resale price/sqm",
+# main_title = "(a) Spatial distribution of S.D of resale price",
+# main.title_size = 1.4,
+# legend.hist_height = 0.25, legend.hist_width = 0.5,
+# legend_width = 0.4, legend.hist_size = 0.5,
+# legend.title_size = 1.2, legend.text_size = 0.65,
+# digits = 0, palette = "PuRd")
+```
+
+```{r fig.height=5}
+violin_viz(mean_resale_sf, var = "mean_resale_price_sqm",
+ labs.x = "", labs.y = "Avg.resale price/sqm", breaks = seq(0, 9000, 1000),
+ y.shift = 1050, x.shift = 1.4, text.size = 4, digits = 0)
+```
+
+### Percentage of 1-ROOM and 2-Room rental flats
+
+```{r}
+#HDB Property Information: https://data.gov.sg/dataset/hdb-property-information
+if(file.exists(here("data/derived_data/df_pct_1_2room_rental.rds"))){
+ df_pct_1_2room_rental <- readRDS(here("data/derived_data/df_pct_1_2room_rental.rds"))
+}else{
+ df_hdb_property <- read_csv(here("data/raw_data/hdb-property-information.csv"))
+ df_hdb_property_updated <- df_hdb_property %>%
+ mutate(ROAD_NAME = gsub("\\bDR\\b", "DRIVE", street),
+ ROAD_NAME = gsub("\\bRD\\b", "ROAD", ROAD_NAME),
+ ROAD_NAME = gsub("\\bCRES\\b", "CRESCENT", ROAD_NAME),
+ ROAD_NAME = gsub("\\bAVE\\b", "AVENUE", ROAD_NAME),
+ ROAD_NAME = gsub("\\bST\\b", "STREET", ROAD_NAME),
+ ROAD_NAME = gsub("\\bCTRL\\b", "CENTRAL", ROAD_NAME),
+ ROAD_NAME = gsub("\\bNTH\\b", "NORTH", ROAD_NAME),
+ ROAD_NAME = gsub("\\bSTH\\b", "SOUTH", ROAD_NAME),
+ ROAD_NAME = gsub("\\bBT\\b", "BUKIT", ROAD_NAME),
+ ROAD_NAME = gsub("\\bC'WEALTH\\b", "COMMONWEALTH", ROAD_NAME),
+ ROAD_NAME = gsub("\\bCL\\b", "CLOSE", ROAD_NAME),
+ ROAD_NAME = gsub("\\bPK\\b", "PARK", ROAD_NAME),
+ ROAD_NAME = gsub("\\bJLN\\b", "JALAN", ROAD_NAME),
+ ROAD_NAME = gsub("\\bKG\\b", "KAMPONG", ROAD_NAME),
+ ROAD_NAME = gsub("\\bPL\\b", "PLACE", ROAD_NAME),
+ ROAD_NAME = gsub("\\bLOR\\b", "LORONG", ROAD_NAME),
+ ROAD_NAME = gsub("\\bTER\\b", "TERRACE", ROAD_NAME),
+ ROAD_NAME = gsub("\\bMKT\\b", "MARKET", ROAD_NAME),
+ ROAD_NAME = gsub("\\bUPP\\b", "UPPER", ROAD_NAME),
+ ROAD_NAME = gsub("\\bGDNS\\b", "GARDENS", ROAD_NAME),
+ ROAD_NAME = gsub("\\bTG\\b", "TANJONG", ROAD_NAME),
+ ROAD_NAME = gsub("\\bHTS\\b", "HEIGHTS", ROAD_NAME),
+ ROAD_NAME = gsub("\\bCTR\\b", "CENTRE", ROAD_NAME)) %>%
+ mutate(ROAD_NAME = case_when(
+ ROAD_NAME == "STREET. GEORGE'S LANE" ~ "SAINT GEORGE'S LANE",
+ ROAD_NAME == "STREET. GEORGE'S ROAD" ~ "SAINT GEORGE'S ROAD",
+ ROAD_NAME == "SECTOR A SIN MING IND EST" ~ "SECTOR A SIN MING INDUSTRIAL ESTATE",
+ TRUE ~ ROAD_NAME
+ )) %>%
+ dplyr::rename(HOUSE_BLK_ = blk_no) %>%
+ unite(ROAD_BLK, c("ROAD_NAME", "HOUSE_BLK_")) %>%
+ dplyr::select(ROAD_BLK, `1room_rental`, `2room_rental`, total_dwelling_units) %>%
+ mutate(ROAD_BLK = case_when(
+ ROAD_BLK == "CHOA CHU KANG AVENUE 2_297A" ~ "CHOA CHU KANG AVENUE 2_297",
+ ROAD_BLK == "CHOA CHU KANG AVENUE 2_297B" ~ "CHOA CHU KANG AVENUE 2_297",
+ TRUE ~ ROAD_BLK
+ ))
+
+ # get extra two Blks that are under 'S' and 'K' building type
+ extra_building_sf <- df_building %>%
+ filter(ROAD_NAME %in% c("NEW MARKET ROAD", "JALAN KUKOH") & HOUSE_BLK_ %in% c(1, 32)) %>%
+ unite(ROAD_BLK, c("ROAD_NAME", "HOUSE_BLK_")) %>%
+ st_transform(crs = 3414) %>%
+ dplyr::select(names(hdb_sf))
+
+ df_1_2room_rental <- df_hdb_property_updated %>%
+ filter(total_dwelling_units != 0) %>%
+ left_join(., rbind(hdb_sf, extra_building_sf)) %>%
+ st_as_sf() %>%
+ st_make_valid() %>%
+ st_join(., grids, largest = T) %>% # join grids
+ filter(!is.na(grid_id)) %>%
+ group_by(grid_id) %>%
+ dplyr::summarise(`1room_rental` = sum(`1room_rental`),
+ `2room_rental` = sum(`2room_rental`),
+ total_dwelling_units = sum(total_dwelling_units))
+
+ df_pct_1_2room_rental <- df_1_2room_rental %>%
+ st_set_geometry(NULL) %>%
+ mutate(pct_1_2room_rental = (`1room_rental` + `2room_rental`)/total_dwelling_units)
+ saveRDS(df_pct_1_2room_rental, file = here("data/derived_data/df_pct_1_2room_rental.rds"))
+}
+head(df_pct_1_2room_rental)
+```
+
+
+```{r}
+df_pct_1_2room_rental_sf <- df_pct_1_2room_rental %>%
+ filter(grid_id %in% qualified_grids_inflow) %>%
+ left_join(., grids, by = c("grid_id" = "grid_id")) %>%
+ st_sf()
+
+spatial_viz(df_pct_1_2room_rental_sf,
+ fill_var = "pct_1_2room_rental",
+ legend_title = "1&2 Room rentals (%)",
+ main_title = "(b) Spatial distribution of 1-Room and 2-Room rentals",
+ main.title_size = 1.4,
+ legend.hist_height = 0.25, legend.hist_width = 0.5,
+ legend_width = 0.4, legend.hist_size = 0.5,
+ legend.title_size = 1.2, legend.text_size = 0.65, palette = "PuRd",
+ style = "fixed",
+ breaks = c(0, 0.05, 0.1, 0.3, 0.5, 0.8))
+```
+
+```{r fig.height=5}
+violin_viz(df_pct_1_2room_rental_sf,
+ var = "pct_1_2room_rental",
+ labs.x = "", labs.y = "1-Room & 2-Room Rentals (%)",
+ breaks = seq(0, 0.8, 0.1),
+ y.shift = 0.13, x.shift = 1.4, text.size = 4)
+```
+
+### Landuse: Percentage of industrial building
+
+```{r}
+cal_building_area <- function(index, building_sf){
+ # intersect with grids
+ intersect_grid <- building_sf %>%
+ filter(FEA_ID == index) %>%
+ st_make_valid() %>%
+ st_intersection(., grids)
+ # intersect area in grids
+ intersect_grid %>%
+ st_set_geometry(NULL) %>%
+ dplyr::mutate(area_sqm = st_area(intersect_grid) %>% as.numeric()) %>%
+ dplyr::select(FEA_ID, grid_id, area_sqm)
+}
+
+if(file.exists(here("data/derived_data/industrial_areas_grids.rds"))){
+ industrial_areas_grids <- readRDS(here("data/derived_data/industrial_areas_grids.rds"))
+}else{
+ #building type: I
+ industrial_sf <- df_building %>%
+ filter(BLDG_TYPE_ == "I") %>%
+ filter(DATA_TYPE_ == "extg") %>% # Existing building outline
+ st_transform(crs = 3414) %>%
+ unite(ROAD_BLK, c("ROAD_NAME", "HOUSE_BLK_"), sep = " ")
+
+ FEA_indexes <- industrial_sf$FEA_ID
+ # industrial building areas in grids
+ industrial_areas_grids <- do.call(bind_rows, map(FEA_indexes, with_progress(function(x) cal_building_area(x, industrial_sf))))
+ # percentage of commercial areas in grids
+ industrial_areas_grids <- industrial_areas_grids %>%
+ group_by(grid_id) %>%
+ dplyr::summarise(industrial_area_sqm = sum(area_sqm)) %>%
+ mutate(grid_area = st_area(grids[1, ]) %>% as.numeric(),
+ pct_industrial_area = round(industrial_area_sqm/grid_area, 4))
+ saveRDS(industrial_areas_grids, file = here("data/derived_data/industrial_areas_grids.rds"))
+}
+
+head(industrial_areas_grids)
+```
+
+```{r}
+industrial_areas_grids_sf <- industrial_areas_grids %>%
+ filter(grid_id %in% qualified_grids_inflow) %>%
+ left_join(grids %>% filter(grid_id %in% qualified_grids_inflow), ., by = c("grid_id" = "grid_id")) %>%
+ replace(., is.na(.), 0) %>%
+ st_sf()
+
+spatial_viz(industrial_areas_grids_sf,
+ fill_var = "pct_industrial_area",
+ legend_title = "Industrial area (%)",
+ main_title = "(c) Spatial distribution of industrial area",
+ main.title_size = 1.4,
+ legend.hist_height = 0.25, legend.hist_width = 0.5,
+ legend_width = 0.4, legend.hist_size = 0.5,
+ legend.title_size = 1.2, legend.text_size = 0.65, palette = "PuRd",
+ style = "fixed",
+ breaks = c(0, 0.05, 0.1, 0.15, 0.2, 0.25))
+```
+
+
+```{r fig.height=5}
+violin_viz(industrial_areas_grids_sf,
+ var = "pct_industrial_area",
+ labs.x = "", labs.y = "Industrial area (%)",
+ breaks = seq(0, 0.25, 0.05), digits = 3,
+ y.shift = 0.04, x.shift = 1.4, text.size = 4)
+```
+
+### Landuse: Percentage of residential building
+
+```{r}
+if(file.exists(here("data/derived_data/residential_areas_grids.rds"))){
+ residential_areas_grids <- readRDS(here("data/derived_data/residential_areas_grids.rds"))
+}else{
+ #building type: C-Condominium, H-HDB Building, A-Apartment, E-Executive Condominium, R-Residential
+ residential_sf <- df_building %>%
+ filter(BLDG_TYPE_ %in% c("C", "H", "A", "E", "R")) %>%
+ filter(DATA_TYPE_ == "extg") %>% # Existing building outline
+ st_transform(crs = 3414) %>%
+ unite(ROAD_BLK, c("ROAD_NAME", "HOUSE_BLK_"), sep = " ")
+
+ FEA_indexes <- residential_sf$FEA_ID
+ # residential areas in grids
+ residential_areas_grids <- do.call(bind_rows, map(FEA_indexes, with_progress(function(x) cal_building_area(x, residential_sf))))
+ # percentage of residential areas in grids
+ residential_areas_grids <- residential_areas_grids %>%
+ group_by(grid_id) %>%
+ dplyr::summarise(residential_area_sqm = sum(area_sqm)) %>%
+ mutate(grid_area = st_area(grids[1, ]) %>% as.numeric(),
+ pct_residential_area = round(residential_area_sqm/grid_area, 4))
+ saveRDS(residential_areas_grids, file = here("data/derived_data/residential_areas_grids.rds"))
+}
+
+head(residential_areas_grids)
+```
+
+
+```{r}
+residential_areas_grids_sf <- residential_areas_grids %>%
+ filter(grid_id %in% qualified_grids_inflow) %>%
+ left_join(., grids, by = c("grid_id" = "grid_id")) %>%
+ st_sf()
+
+spatial_viz(residential_areas_grids_sf,
+ fill_var = "pct_residential_area",
+ legend_title = "Residential area (%)",
+ main_title = "(d) Spatial distribution of residential area",
+ main.title_size = 1.4,
+ legend.hist_height = 0.25, legend.hist_width = 0.5,
+ legend_width = 0.4, legend.hist_size = 0.5,
+ legend.title_size = 1.2, legend.text_size = 0.65,
+ palette = "PuRd")
+```
+
+
+```{r fig.height=5}
+violin_viz(residential_areas_grids_sf,
+ var = "pct_residential_area",
+ labs.x = "", labs.y = "Residential area (%)",
+ breaks = seq(0, 0.32, 0.05),
+ y.shift = 0.06, x.shift = 1.55, text.size = 4)
+```
+
+### Distance to the nearest MRT station
+
+```{r}
+cal_dist2nearestMRT <- function(grid_index, grids, mrt){
+ grid_centroid <- grids[grid_index, ] %>% st_centroid()
+ # create 20k buffer
+ grid_buffer <- grid_centroid %>% st_buffer(20000)
+ # get MRT stations within the buffer
+ mrt_station_in_buffer <- grid_buffer %>%
+ st_join(mrt, .) %>%
+ filter(!is.na(grid_id))
+
+ # distance from grid centroid to MRT stations
+ dist2MRT <- st_distance(st_geometry(grid_centroid), st_geometry(mrt_station_in_buffer), by_element = TRUE) %>% as.numeric()
+
+ # get the nearest distance
+ mrt_station_in_buffer %>%
+ st_set_geometry(NULL) %>%
+ mutate(dist2MRT_km = round(dist2MRT/1000, 4)) %>%
+ arrange(dist2MRT_km) %>%
+ slice(1)
+}
+
+if(file.exists(here("data/derived_data/dist2MRT_nearest.rds"))){
+ dist2MRT_nearest <- readRDS(here("data/derived_data/dist2MRT_nearest.rds"))
+}else{
+ # MRT station
+ mrt <- read_sf(here("data/raw_data/TrainStation_Jan2020/MRTLRTStnPtt.Shp"), quiet = T) %>%
+ st_transform(crs = 3414)
+ dist2MRT_nearest <- do.call(bind_rows, map(grids$grid_id, with_progress(function(x) cal_dist2nearestMRT(x, grids, mrt))))
+ saveRDS(dist2MRT_nearest, file = here("data/derived_data/dist2MRT_nearest.rds"))
+}
+
+head(dist2MRT_nearest)
+```
+
+
+```{r}
+dist2MRT_nearest_sf <- dist2MRT_nearest %>%
+ filter(grid_id %in% qualified_grids_inflow) %>%
+ left_join(., grids, by = c("grid_id" = "grid_id")) %>%
+ st_sf()
+
+spatial_viz(dist2MRT_nearest_sf,
+ fill_var = "dist2MRT_km",
+ legend_title = "Distance (km)",
+ main_title = "(e) Spatial distribution of distance to the nearest MRT station",
+ main.title_size = 1.4,
+ legend.hist_height = 0.25, legend.hist_width = 0.5,
+ legend_width = 0.4, legend.hist_size = 0.5,
+ legend.title_size = 1.2, legend.text_size = 0.65,
+ palette = "PuRd")
+```
+
+
+```{r fig.height=5}
+violin_viz(dist2MRT_nearest_sf,
+ var = "dist2MRT_km",
+ labs.x = "", labs.y = "Distance (km)",
+ breaks = seq(0, 2.5, 0.5),
+ y.shift = 0.5, x.shift = 1.4, text.size = 4)
+```
+
+### Distance from grid to 'center' grid 1232 - City Hall
+
+```{r}
+center_grid <- grid_centroids %>% filter(grid_id == 1232)
+dist2_center_grid <- grid_centroids %>%
+ mutate(dist_grid2center = st_distance(., center_grid) %>% as.numeric(),
+ dist_grid2center_km = dist_grid2center/1000) %>%
+ st_set_geometry(NULL)
+head(dist2_center_grid)
+```
+
+
+```{r}
+dist2_center_grid_sf <- dist2_center_grid %>%
+ filter(grid_id %in% qualified_grids_inflow) %>%
+ left_join(., grids, by = c("grid_id" = "grid_id")) %>% st_sf()
+
+spatial_viz(dist2_center_grid_sf,
+ fill_var = "dist_grid2center_km",
+ legend_title = "Distance (km)",
+ main_title = "(f) Spatial distribution of distance to the central location",
+ main.title_size = 1.4,
+ legend.hist_height = 0.25, legend.hist_width = 0.5,
+ legend_width = 0.4, legend.hist_size = 0.5,
+ legend.title_size = 1.2, legend.text_size = 0.65,
+ palette = "PuRd")
+```
+
+
+```{r fig.height=5}
+violin_viz(dist2_center_grid_sf,
+ var = "dist_grid2center_km",
+ labs.x = "", labs.y = "Distance (km)",
+ breaks = seq(0, 20, 5),
+ y.shift = 5, x.shift = 1.55, text.size = 4)
+```
+
+## Dependent variables
+### Travel distance
+
+```{r}
+# visitors travel distance
+mean_dist_inflow <- dist_inflow %>%
+ group_by(grid_id, u_id) %>%
+ dplyr::summarise(mean_dist_user = mean(dist_hm2grid_km)) %>%
+ group_by(grid_id) %>%
+ dplyr::summarise(mean_dist_km = mean(mean_dist_user)) %>%
+ mutate(user_type = "visitor")
+
+# locals travel distance
+mean_dist_outflow <- dist_outflow %>%
+ group_by(home, u_id) %>%
+ dplyr::summarise(mean_dist_user = mean(dist_hm2grid_km)) %>%
+ group_by(home) %>%
+ dplyr::summarise(mean_dist_km = mean(mean_dist_user)) %>%
+ dplyr::rename(grid_id = home) %>% mutate(user_type = "local")
+```
+
+
+```{r fig.height=8}
+ggplot(bind_rows(mean_dist_inflow, mean_dist_outflow)) +
+ geom_density(aes(mean_dist_km, fill = user_type, color = user_type), alpha = 0.5) +
+ geom_vline(data = bind_rows(mean_dist_inflow, mean_dist_outflow) %>% group_by(user_type) %>% dplyr::summarise(mean = mean(mean_dist_km)), aes(xintercept = mean, color = user_type), size=1.5) +
+ theme_bw() +
+ theme(legend.position = c(0.9, 0.9)) +
+ labs(x = "Avg.distance (km)", y = "Density", fill = "User type", color = "User type")
+```
+
+
+### Mobility diversity
+
+```{r fig.height=8}
+combinded_div <- div_inflow %>%
+ st_set_geometry(NULL) %>%
+ mutate(div_type = "Inflow") %>%
+ bind_rows(.,
+ div_outflow %>%
+ st_set_geometry(NULL) %>%
+ mutate(div_type = "Outflow")) %>%
+ mutate(div_type = factor(div_type, levels = c("Outflow", "Inflow")))
+
+combinded_div_mean <- combinded_div %>%
+ group_by(div_type) %>%
+ dplyr::summarise(mean = mean(norm_div_shannon)) %>%
+ mutate(div_type = factor(div_type, levels = c("Outflow", "Inflow")))
+
+ggplot(combinded_div, aes(norm_div_shannon, color = div_type, fill = div_type)) +
+ geom_density(alpha = 0.5) +
+ geom_vline(data = combinded_div_mean, aes(xintercept = mean, color = div_type), size = 1.5) +
+ scale_x_continuous(breaks = seq(0, 1, 0.2)) +
+ labs(x = "Normalized diversity", y = "Density", title = "Density distribution of normalized diversity", fill = "Diversity type", color = "Diversity type") +
+ theme_bw() +
+ theme(legend.position = c(0.1, 0.9))
+```
+
+
+## Regression analysis
+
+### Inflow travel distance
+
+```{r}
+## gather independent and dependent variables
+reg_inflow_dist <- mean_resale %>%
+ filter(grid_id %in% qualified_grids_inflow) %>%
+ left_join(., mean_dist_inflow %>% dplyr::select(-user_type)) %>%
+ left_join(., industrial_areas_grids %>% dplyr::select(grid_id, pct_industrial_area)) %>%
+ left_join(., residential_areas_grids %>% dplyr::select(grid_id, pct_residential_area)) %>%
+ left_join(., dist2MRT_nearest %>% dplyr::select(grid_id, dist2MRT_km)) %>%
+ left_join(., df_pct_1_2room_rental %>% dplyr::select(grid_id, pct_1_2room_rental)) %>%
+ left_join(., dist2_center_grid %>% dplyr::select(grid_id, dist_grid2center_km)) %>%
+ replace(., is.na(.), 0)
+head(reg_inflow_dist)
+```
+
+
+#### OLS
+
+Formula: `Inflow distance ~ HDB resale price + Percentage of 1 ROOM and 2 Room rentals + Percentage of industrial building + Percentage of residential building + Distance to the nearest MRT station + Distance to the central grid)`
+
+```{r}
+ols_inflow_dist <- lm(mean_dist_km ~ mean_resale_price_sqm + pct_1_2room_rental + pct_industrial_area + pct_residential_area + dist2MRT_km + dist_grid2center_km, data = reg_inflow_dist)
+
+summary(ols_inflow_dist)
+
+spatial_viz(augment(ols_inflow_dist, data = reg_inflow_dist) %>% left_join(., grids) %>% st_as_sf(),
+ fill_var = ".resid", palette = "RdBu",
+ legend_title = "Residual (OLS)",
+ main_title = "(a) Residuals of average incoming distance (Moran I statistic: 0.78)",
+ main.title_size = 1.6,
+ legend.hist_height = 0.25, legend.hist_width = 0.5,
+ legend_width = 0.4, legend.hist_size = 0.5,
+ legend.title_size = 1.2, legend.text_size = 0.65)
+```
+
+
+
+#### Spatial Error Model (SEM)
+```{r}
+sf_reg_inflow_dist <- reg_inflow_dist %>%
+ left_join(., grids) %>%
+ st_as_sf() %>%
+ st_transform(crs = 4326)
+
+dist_inflow_sp <- as(sf_reg_inflow_dist, 'Spatial')
+dist_inflow_neighbors <- poly2nb(dist_inflow_sp)
+summary(dist_inflow_neighbors)
+
+dist_inflow_weights <- nb2listw(dist_inflow_neighbors, style="W", zero.policy=TRUE)
+```
+
+
+```{r}
+moran.test(dist_inflow_sp$mean_dist_km, dist_inflow_weights)
+```
+
+
+```{r}
+sem_inflow_dist <- spatialreg::errorsarlm(mean_dist_km ~ mean_resale_price_sqm + pct_1_2room_rental + pct_industrial_area + pct_residential_area + pct_industrial_area + dist2MRT_km + dist_grid2center_km, data = reg_inflow_dist, listw = dist_inflow_weights)
+
+summary(sem_inflow_dist)
+sf_reg_inflow_dist$resid_error <- residuals(sem_inflow_dist)
+
+spatial_viz(sf_reg_inflow_dist,
+ fill_var = "resid_error",
+ legend_title = "Residual (SEM)", palette = "RdBu",
+ main_title = "(a) Residuals of average incoming distance (Moran I statistic: - 0.09)",
+ main.title_size = 1.6,
+ legend.hist_height = 0.25, legend.hist_width = 0.5,
+ legend_width = 0.4, legend.hist_size = 0.5,
+ legend.title_size = 1.2, legend.text_size = 0.65)
+moran.test(sf_reg_inflow_dist$resid_error, dist_inflow_weights)
+```
+
+
+#### Spatial Lag Model (SLM)
+```{r}
+slm_inflow_dist <- spatialreg::lagsarlm(mean_dist_km ~ mean_resale_price_sqm + pct_1_2room_rental + pct_industrial_area + pct_residential_area + dist2MRT_km + dist_grid2center_km, data = reg_inflow_dist, listw = dist_inflow_weights)
+summary(slm_inflow_dist)
+
+sf_reg_inflow_dist$resid_lagsarlm <- residuals(slm_inflow_dist)
+spatial_viz(sf_reg_inflow_dist,
+ fill_var = "resid_lagsarlm",
+ legend_title = "Residual (SLM)",
+ palette = "RdBu",
+ main_title = "(a) Residuals of average incoming distance (Moran I statistic: - 0.08)",
+ main.title_size = 1.6,
+ legend.hist_height = 0.25, legend.hist_width = 0.5,
+ legend_width = 0.4, legend.hist_size = 0.5,
+ legend.title_size = 1.2, legend.text_size = 0.65)
+moran.test(sf_reg_inflow_dist$resid_lagsarlm, dist_inflow_weights)
+```
+
+
+
+```{r}
+# compare three models
+AIC(ols_inflow_dist, slm_inflow_dist, sem_inflow_dist)
+```
+
+
+
+### Outflow travel distance
+
+```{r}
+reg_outflow_dist <- mean_resale %>%
+ filter(grid_id %in% qualified_grids_outflow) %>%
+ left_join(., mean_dist_inflow %>% dplyr::select(-user_type)) %>%
+ left_join(., industrial_areas_grids %>% dplyr::select(grid_id, pct_industrial_area)) %>%
+ left_join(., residential_areas_grids %>% dplyr::select(grid_id, pct_residential_area)) %>%
+ left_join(., dist2MRT_nearest %>% dplyr::select(grid_id, dist2MRT_km)) %>%
+ # left_join(., pop_grids_2016) %>%
+ left_join(., df_pct_1_2room_rental %>% dplyr::select(grid_id, pct_1_2room_rental)) %>%
+ left_join(., dist2_center_grid %>% dplyr::select(grid_id, dist_grid2center_km)) %>%
+ replace(., is.na(.), 0)
+head(reg_outflow_dist)
+```
+
+
+#### OLS
+
+Formula: `Outflow distance ~ HDB resale price + Percentage of 1 ROOM and 2 Room rentals + Percentage of industrial building + Percentage of residential building + Distance to the nearest MRT station + Distance to the central grid)`
+
+```{r}
+ols_outflow_dist <- lm(mean_dist_km ~ mean_resale_price_sqm + pct_1_2room_rental + pct_industrial_area + pct_residential_area + dist2MRT_km + dist_grid2center_km, data = reg_outflow_dist)
+
+summary(ols_outflow_dist)
+
+spatial_viz(augment(ols_outflow_dist, data = reg_outflow_dist) %>% left_join(., grids) %>% st_as_sf(),
+ fill_var = ".resid", palette = "RdBu",
+ legend_title = "Residual (OLS)",
+ main_title = "(b) Residuals of average outgoing distance (Moran I statistic: 0.79) ",
+ main.title_size = 1.6,
+ legend.hist_height = 0.25, legend.hist_width = 0.5,
+ legend_width = 0.4, legend.hist_size = 0.5,
+ legend.title_size = 1.2, legend.text_size = 0.65)
+```
+
+
+#### Spatial Error Model (SEM)
+```{r}
+sf_reg_outflow_dist <- reg_outflow_dist %>%
+ left_join(., grids) %>%
+ st_as_sf() %>%
+ st_transform(crs = 4326)
+
+dist_outflow_sp <- as(sf_reg_outflow_dist, 'Spatial')
+dist_outflow_neighbors <- poly2nb(dist_outflow_sp)
+summary(dist_outflow_neighbors)
+dist_outflow_weights <- nb2listw(dist_outflow_neighbors, style="W", zero.policy=TRUE)
+```
+
+
+```{r}
+moran.test(dist_outflow_sp$mean_dist_km, dist_outflow_weights)
+```
+
+```{r}
+sem_outflow_dist <- spatialreg::errorsarlm(mean_dist_km ~ mean_resale_price_sqm + pct_1_2room_rental + pct_industrial_area + pct_residential_area + dist2MRT_km + dist_grid2center_km, data = reg_outflow_dist, listw = dist_outflow_weights)
+
+summary(sem_outflow_dist)
+
+sf_reg_outflow_dist$resid_error <- residuals(sem_outflow_dist)
+
+spatial_viz(sf_reg_outflow_dist,
+ fill_var = "resid_error",
+ legend_title = "Residual (SEM)", palette = "RdBu",
+ main_title = "(b) Residuals of average outgoing distance (Moran I statistic: -0.09)",
+ main.title_size = 1.6,
+ legend.hist_height = 0.25, legend.hist_width = 0.5,
+ legend_width = 0.4, legend.hist_size = 0.5,
+ legend.title_size = 1.2, legend.text_size = 0.65)
+moran.test(sf_reg_outflow_dist$resid_error, dist_outflow_weights)
+```
+
+#### Spatial Lag Model (SLM)
+```{r}
+slm_outflow_dist <- spatialreg::lagsarlm(mean_dist_km ~ mean_resale_price_sqm + pct_1_2room_rental + pct_industrial_area + pct_residential_area + dist2MRT_km + dist_grid2center_km, data = reg_outflow_dist, listw = dist_outflow_weights)
+summary(slm_outflow_dist)
+
+spatialreg::impacts(slm_outflow_dist, listw = dist_outflow_weights)
+
+sf_reg_outflow_dist$resid_lagsarlm <- residuals(slm_outflow_dist)
+spatial_viz(sf_reg_outflow_dist,
+ fill_var = "resid_lagsarlm",
+ legend_title = "Residual (SLM)", palette = "RdBu",
+ main_title = "(b) Residuals of average outgoing distance (Moran I statistic: -0.08)",
+ main.title_size = 1.6,
+ legend.hist_height = 0.25, legend.hist_width = 0.5,
+ legend_width = 0.4, legend.hist_size = 0.5,
+ legend.title_size = 1.2, legend.text_size = 0.65)
+moran.test(sf_reg_outflow_dist$resid_lagsarlm, dist_outflow_weights)
+```
+
+```{r}
+# compare three models
+AIC(ols_outflow_dist, slm_outflow_dist, sem_outflow_dist)
+```
+
+
+
+### Inflow diversity
+```{r}
+reg_inflow_div <- mean_resale %>%
+ filter(grid_id %in% qualified_grids_inflow) %>%
+ left_join(., div_inflow %>% st_set_geometry(NULL) %>% dplyr::select(grid_id, norm_div_shannon)) %>%
+ left_join(., industrial_areas_grids %>% dplyr::select(grid_id, pct_industrial_area)) %>%
+ left_join(., residential_areas_grids %>% dplyr::select(grid_id, pct_residential_area)) %>%
+ left_join(., dist2MRT_nearest %>% dplyr::select(grid_id, dist2MRT_km)) %>%
+ left_join(., df_pct_1_2room_rental %>% dplyr::select(grid_id, pct_1_2room_rental)) %>%
+ left_join(., dist2_center_grid %>% dplyr::select(grid_id, dist_grid2center_km)) %>%
+ replace(., is.na(.), 0)
+
+head(reg_inflow_div)
+```
+
+#### OLS
+
+```{r}
+ols_inflow_div <- lm(norm_div_shannon ~ mean_resale_price_sqm + pct_1_2room_rental + pct_industrial_area + pct_residential_area + dist2MRT_km + dist_grid2center_km, data = reg_inflow_div)
+
+summary(ols_inflow_div)
+
+spatial_viz(augment(ols_inflow_div, data = reg_inflow_div) %>% left_join(., grids) %>% st_as_sf(),
+ fill_var = ".resid", palette = "RdBu",
+ legend_title = "Residual (OLS)",
+ main_title = "(c) Residuals of inflow diversity (Moran I statistic: 0.69)",
+ main.title_size = 1.6,
+ legend.hist_height = 0.25, legend.hist_width = 0.5,
+ legend_width = 0.4, legend.hist_size = 0.5,
+ legend.title_size = 1.2, legend.text_size = 0.65)
+```
+
+
+#### Spatial Error (SEM) Models
+
+```{r}
+sf_reg_inflow_div <- reg_inflow_div %>%
+ left_join(., grids) %>%
+ st_as_sf() %>%
+ st_transform(crs = 4326)
+
+div_inflow_sp <- as(sf_reg_inflow_div, 'Spatial')
+div_inflow_neighbors <- poly2nb(div_inflow_sp)
+summary(div_inflow_neighbors)
+div_inflow_weights <- nb2listw(div_inflow_neighbors, style="W", zero.policy=TRUE)
+```
+
+
+```{r}
+moran.test(div_inflow_sp$norm_div_shannon, div_inflow_weights)
+```
+
+```{r}
+sem_inflow_div <- spatialreg::errorsarlm(norm_div_shannon ~ mean_resale_price_sqm + pct_1_2room_rental + pct_industrial_area + pct_residential_area + dist2MRT_km + dist_grid2center_km, data = reg_inflow_div, listw = div_inflow_weights)
+summary(sem_inflow_div)
+
+sf_reg_inflow_div$resid_error <- residuals(sem_inflow_div)
+spatial_viz(sf_reg_inflow_div,
+ fill_var = "resid_error",
+ legend_title = "Residual (SEM)", palette = "RdBu",
+ main_title = "(c) Residuals of inflow diversity (Moran I statistic: -0.05)",
+ main.title_size = 1.6,
+ legend.hist_height = 0.25, legend.hist_width = 0.5,
+ legend_width = 0.4, legend.hist_size = 0.5,
+ legend.title_size = 1.2, legend.text_size = 0.65)
+moran.test(sf_reg_inflow_div$resid_error, div_inflow_weights)
+```
+
+#### Spatial Lag Models (SLM):
+```{r}
+slm_inflow_div <- spatialreg::lagsarlm(norm_div_shannon ~ mean_resale_price_sqm + pct_1_2room_rental + pct_industrial_area + pct_residential_area + dist2MRT_km +dist_grid2center_km, data = reg_inflow_div, listw = div_inflow_weights)
+summary(slm_inflow_div)
+spatialreg::impacts(slm_inflow_div, listw = div_inflow_weights)
+
+sf_reg_inflow_div$resid_lagsarlm <- residuals(slm_inflow_div)
+
+spatial_viz(sf_reg_inflow_div,
+ fill_var = "resid_lagsarlm",
+ legend_title = "Residual (SLM)", palette = "RdBu",
+ main_title = "(c) Residuals of inflow diversity (Moran I statistic: -0.03)",
+ main.title_size = 1.6,
+ legend.hist_height = 0.25, legend.hist_width = 0.5,
+ legend_width = 0.4, legend.hist_size = 0.5,
+ legend.title_size = 1.2, legend.text_size = 0.65)
+moran.test(sf_reg_inflow_div$resid_lagsarlm, div_inflow_weights)
+```
+
+
+```{r}
+AIC(ols_inflow_div, slm_inflow_div, sem_inflow_div)
+```
+
+
+
+### Outflow diversity
+
+```{r}
+reg_outflow_div <- mean_resale %>%
+ filter(grid_id %in% qualified_grids_outflow) %>%
+ left_join(., div_outflow %>% st_set_geometry(NULL) %>% dplyr::select(grid_id, norm_div_shannon)) %>%
+ left_join(., industrial_areas_grids %>% dplyr::select(grid_id, pct_industrial_area)) %>%
+ left_join(., residential_areas_grids %>% dplyr::select(grid_id, pct_residential_area)) %>%
+ left_join(., dist2MRT_nearest %>% dplyr::select(grid_id, dist2MRT_km)) %>%
+ left_join(., df_pct_1_2room_rental %>% dplyr::select(grid_id, pct_1_2room_rental)) %>%
+ left_join(., dist2_center_grid %>% dplyr::select(grid_id, dist_grid2center_km)) %>%
+ replace(., is.na(.), 0)
+head(reg_inflow_div)
+```
+
+#### OLS
+
+```{r}
+ols_outflow_div <- lm(norm_div_shannon ~ mean_resale_price_sqm + pct_1_2room_rental + pct_industrial_area + pct_residential_area + dist2MRT_km + dist_grid2center_km, data = reg_outflow_div)
+summary(ols_outflow_div)
+
+spatial_viz(augment(ols_outflow_div, data = reg_outflow_div) %>% left_join(., grids) %>% st_as_sf(),
+ fill_var = ".resid", palette = "RdBu",
+ legend_title = "Residual (OLS)",
+ main_title = "(d) Residuals of outflow diversity (Moran I statistic: 0.83)",
+ main.title_size = 1.6,
+ legend.hist_height = 0.25, legend.hist_width = 0.5,
+ legend_width = 0.4, legend.hist_size = 0.5,
+ legend.title_size = 1.2, legend.text_size = 0.65)
+```
+
+
+```{r}
+sf_reg_outflow_div <- reg_outflow_div %>%
+ left_join(., grids) %>%
+ st_as_sf() %>%
+ st_transform(crs = 4326)
+
+div_outflow_sp <- as(sf_reg_outflow_div, 'Spatial')
+div_outflow_neighbors <- poly2nb(div_outflow_sp)
+summary(div_outflow_neighbors)
+div_outflow_weights <- nb2listw(div_outflow_neighbors, style="W", zero.policy=TRUE)
+```
+
+
+```{r}
+moran.test(div_outflow_sp$norm_div_shannon, div_outflow_weights)
+```
+
+
+#### Spatial Error (SEM) Models
+```{r}
+sem_outflow_div <- spatialreg::errorsarlm(norm_div_shannon ~ mean_resale_price_sqm + pct_1_2room_rental + pct_industrial_area + pct_residential_area + dist2MRT_km + dist_grid2center_km, data = reg_outflow_div, listw = div_outflow_weights)
+summary(sem_outflow_div)
+
+sf_reg_outflow_div$resid_error <- residuals(sem_outflow_div)
+spatial_viz(sf_reg_outflow_div,
+ fill_var = "resid_error",
+ legend_title = "Residual (SEM)", palette = "RdBu",
+ main_title = "(d) Residuals of outflow diversity (Moran I statistic: -0.08)",
+ main.title_size = 1.6,
+ legend.hist_height = 0.25, legend.hist_width = 0.5,
+ legend_width = 0.4, legend.hist_size = 0.5,
+ legend.title_size = 1.2, legend.text_size = 0.65)
+moran.test(sf_reg_outflow_div$resid_error, div_outflow_weights)
+```
+
+
+#### Spatial Lag Models (SLM):
+```{r}
+slm_outflow_div <- spatialreg::lagsarlm(norm_div_shannon ~ mean_resale_price_sqm + pct_1_2room_rental + pct_industrial_area + pct_residential_area + dist2MRT_km + dist_grid2center_km, data = reg_outflow_div, listw = div_outflow_weights)
+summary(slm_outflow_div)
+spatialreg::impacts(slm_outflow_div, listw = div_outflow_weights)
+
+sf_reg_outflow_div$resid_lagsarlm <- residuals(slm_outflow_div)
+spatial_viz(sf_reg_outflow_div,
+ fill_var = "resid_lagsarlm",
+ legend_title = "Residual (SLM)", palette = "-RdBu",
+ main_title = "(d) Residuals of outflow diversity (Moran I statistic: -0.06)",
+ main.title_size = 1.6,
+ legend.hist_height = 0.25, legend.hist_width = 0.5,
+ legend_width = 0.4, legend.hist_size = 0.5,
+ legend.title_size = 1.2, legend.text_size = 0.65)
+moran.test(sf_reg_outflow_div$resid_lagsarlm, div_outflow_weights)
+```
+
+```{r}
+AIC(ols_outflow_div, slm_outflow_div, sem_outflow_div)
+```
diff --git a/analysis/10-regression-analysis.md b/analysis/10-regression-analysis.md
new file mode 100644
index 0000000..164aa5a
--- /dev/null
+++ b/analysis/10-regression-analysis.md
@@ -0,0 +1,1775 @@
+Regression analysis
+================
+Qingqing Chen
+Last compiled date: 18 September, 2021
+
+
+
+To contextualize the mobility patterns in our analysis, we compare two
+of the aforementioned metrics – distance and diversity – with the
+socio-economic and built environment characteristics of each
+neighbourhood through a spatial regression analysis. We do so by drawing
+on a set of publicly available datasets, including the HDB Resale Flat
+Prices data, HDB Property Information data, SLA Street Directory data,
+and LTA Train Station data. These datasets allow us to create six
+independent variables shown as follows:
+
+- Mean resale price per square meter;
+- Percentage of 1-Room and 2-Room rental flats;
+- Percentage of residential building area;
+- Percentage of industrial building area;
+- Distance to the nearest MRT station;
+- Distance to the central grid cell.
+
+We aggregate the variables to individual neighbourhoods (i.e. with the
+same spatial resolution as neighbourhoods) and only keep neighbourhoods
+that have public housing. It is important to stress that we employ this
+regression model not to predict travel distance or mobility diversity
+based on these independent variables but rather to provide an initial
+quantitative exploration of the potential underlying covariates that
+influence mobility patterns in Singapore.
+
+## Load data
+
+``` r
+# hexagonal grids
+grids <- read_sf(here("data/derived_data/spatial_hex_grid.shp"), quiet = T) %>%
+ st_transform(crs = 3414)
+
+# grid centroids
+grid_centroids <- grids %>% st_centroid()
+
+# HDB buildings
+hdb_sf <- read_sf(here("data/derived_data/spatial_hdb_building.shp"))
+
+# inflow and outflow distance
+dist_inflow <- readRDS(here("data/derived_data/dist_visitor_points.rds"))
+dist_outflow <- readRDS(here("data/derived_data/dist_local_points.rds"))
+
+# inflow and outflow diversity
+div_inflow <- readRDS(here("data/derived_data/inflow_diversity.rds"))
+div_outflow <- readRDS(here("data/derived_data/outflow_diversity.rds"))
+```
+
+## Dependent variables
+
+### HDB resale price
+
+``` r
+if(file.exists(here("data/derived_data/sf_resale.rds"))){
+ sf_resale <- readRDS(here("data/derived_data/sf_resale.rds"))
+}else{
+ df_resale_12to14 <- read_csv(here("data/raw_data/resale-flat-prices/resale-flat-prices-based-on-registration-date-from-mar-2012-to-dec-2014.csv"))
+ df_resale_15to16 <- read_csv(here("data/raw_data/resale-flat-prices/resale-flat-prices-based-on-registration-date-from-jan-2015-to-dec-2016.csv"))
+ df_resale <- bind_rows(df_resale_12to14, df_resale_15to16) %>%
+ mutate(month = ymd(month, truncated = 1)) %>%
+ filter(month >= '2012-07-01' & month < '2016-11-01')
+ # format street name
+ df_resale <- df_resale %>%
+ mutate(ROAD_NAME = gsub("\\bAVE\\b", "AVENUE", street_name),
+ ROAD_NAME = gsub("\\bBT\\b", "BUKIT", ROAD_NAME),
+ ROAD_NAME = gsub("\\bCL\\b", "CLOSE", ROAD_NAME),
+ ROAD_NAME = gsub("\\bCRES\\b", "CRESCENT", ROAD_NAME),
+ ROAD_NAME = gsub("\\bCTRL\\b", "CENTRAL", ROAD_NAME),
+ ROAD_NAME = gsub("\\bC'WEALTH\\b", "COMMONWEALTH", ROAD_NAME),
+ ROAD_NAME = gsub("\\bCTR\\b", "CENTRE", ROAD_NAME),
+ ROAD_NAME = gsub("\\bDR\\b", "DRIVE", ROAD_NAME),
+ ROAD_NAME = gsub("\\bGDNS\\b", "GARDENS", ROAD_NAME),
+ ROAD_NAME = gsub("\\bHTS\\b", "HEIGHTS", ROAD_NAME),
+ ROAD_NAME = gsub("\\bJLN\\b", "JALAN", ROAD_NAME),
+ ROAD_NAME = gsub("\\bKG\\b", "KAMPONG", ROAD_NAME),
+ ROAD_NAME = gsub("\\bLOR\\b", "LORONG", ROAD_NAME),
+ ROAD_NAME = gsub("\\bMKT\\b", "MARKET", ROAD_NAME),
+ ROAD_NAME = gsub("\\bNTH\\b", "NORTH", ROAD_NAME),
+ ROAD_NAME = gsub("\\bPL\\b", "PLACE", ROAD_NAME),
+ ROAD_NAME = gsub("\\bPK\\b", "PARK", ROAD_NAME),
+ ROAD_NAME = gsub("\\bRD\\b", "ROAD", ROAD_NAME),
+ ROAD_NAME = gsub("\\bST\\b", "STREET", ROAD_NAME),
+ ROAD_NAME = gsub("\\bSTH\\b", "SOUTH", ROAD_NAME),
+ ROAD_NAME = gsub("\\bTER\\b", "TERRACE", ROAD_NAME),
+ ROAD_NAME = gsub("\\bTG\\b", "TANJONG", ROAD_NAME),
+ ROAD_NAME = gsub("\\bUPP\\b", "UPPER", ROAD_NAME)) %>%
+ mutate(ROAD_NAME = case_when(
+ ROAD_NAME == "STREET. GEORGE'S LANE" ~ "SAINT GEORGE'S LANE",
+ ROAD_NAME == "STREET. GEORGE'S ROAD" ~ "SAINT GEORGE'S ROAD",
+ TRUE ~ ROAD_NAME
+ )) %>%
+ dplyr::rename(HOUSE_BLK_ = block) %>%
+ unite(ROAD_BLK, c("ROAD_NAME", "HOUSE_BLK_")) %>%
+ mutate(ROAD_BLK = case_when(
+ ROAD_BLK == "CHOA CHU KANG AVENUE 2_297A" ~ "CHOA CHU KANG AVENUE 2_297",
+ ROAD_BLK == "CHOA CHU KANG AVENUE 2_297B" ~ "CHOA CHU KANG AVENUE 2_297",
+ TRUE ~ ROAD_BLK
+ ))
+
+ # add geometry to each resale transaction
+ sf_resale <- hdb_sf %>%
+ left_join(df_resale, ., by = c("ROAD_BLK" = "ROAD_BLK")) %>%
+ st_as_sf() %>%
+ st_transform(crs = 3414) %>%
+ st_join(., grids, largest = T) %>%
+ filter(!is.na(grid_id))
+ saveRDS(sf_resale, file = here("data/derived_data/sf_resale.rds"))
+}
+
+head(sf_resale)
+```
+
+ ## Simple feature collection with 6 features and 12 fields
+ ## Geometry type: GEOMETRY
+ ## Dimension: XY
+ ## Bounding box: xmin: 28319.87 ymin: 38111.72 xmax: 30095.91 ymax: 39879.56
+ ## Projected CRS: SVY21 / Singapore TM
+ ## month town flat_type ROAD_BLK street_name
+ ## 1 2012-07-01 ANG MO KIO 2 ROOM ANG MO KIO AVENUE 4_174 ANG MO KIO AVE 4
+ ## 2 2012-07-01 ANG MO KIO 3 ROOM ANG MO KIO AVENUE 5_154 ANG MO KIO AVE 5
+ ## 3 2012-07-01 ANG MO KIO 3 ROOM ANG MO KIO AVENUE 1_226B ANG MO KIO AVE 1
+ ## 4 2012-07-01 ANG MO KIO 3 ROOM ANG MO KIO AVENUE 1_333 ANG MO KIO AVE 1
+ ## 5 2012-07-01 ANG MO KIO 3 ROOM ANG MO KIO AVENUE 5_150 ANG MO KIO AVE 5
+ ## 6 2012-07-01 ANG MO KIO 3 ROOM ANG MO KIO AVENUE 4_170 ANG MO KIO AVE 4
+ ## storey_range floor_area_sqm flat_model lease_commence_date resale_price
+ ## 1 10 TO 12 45 Improved 1986 260000
+ ## 2 01 TO 03 68 New Generation 1981 303000
+ ## 3 01 TO 03 66 Improved 1994 303000
+ ## 4 01 TO 03 68 New Generation 1981 315000
+ ## 5 01 TO 03 68 New Generation 1981 315000
+ ## 6 10 TO 12 61 Improved 1986 315000
+ ## remaining_lease grid_id geometry
+ ## 1 NA 1153 MULTIPOLYGON (((28487.98 39...
+ ## 2 NA 1153 MULTIPOLYGON (((28840.93 39...
+ ## 3 NA 1152 POLYGON ((28685.74 38805.59...
+ ## 4 NA 1238 MULTIPOLYGON (((30012.41 38...
+ ## 5 NA 1195 MULTIPOLYGON (((29007.03 39...
+ ## 6 NA 1153 MULTIPOLYGON (((28397.28 39...
+
+``` r
+mean_resale <- sf_resale %>%
+ st_set_geometry(NULL) %>%
+ dplyr::select(grid_id, floor_area_sqm, resale_price) %>%
+ mutate(resale_price_sqm = resale_price/floor_area_sqm) %>%
+ group_by(grid_id) %>%
+ dplyr::summarise(mean_resale_price_sqm = mean(resale_price_sqm),
+ sd_resale_price_sqm = sd(resale_price_sqm))
+
+## grid cells with available socio-economic variables and human mobility indicators
+qualified_grids_inflow <- intersect(mean_resale$grid_id, div_inflow$grid_id) %>% intersect(dist_inflow$grid_id)
+qualified_grids_outflow <- intersect(mean_resale$grid_id, div_outflow$grid_id) %>% intersect(dist_outflow$grid_id)
+
+mean_resale_sf <- mean_resale %>%
+ filter(grid_id %in% qualified_grids_inflow) %>%
+ left_join(., grids, by = c("grid_id" = "grid_id")) %>%
+ st_sf()
+```
+
+``` r
+spatial_viz(mean_resale_sf,
+ fill_var = "mean_resale_price_sqm",
+ legend_title = "Avg.resale price/sqm",
+ main_title = "(a) Spatial distribution of average resale price",
+ main.title_size = 1.4,
+ legend.hist_height = 0.25, legend.hist_width = 0.5,
+ legend_width = 0.4, legend.hist_size = 0.5,
+ legend.title_size = 1.2, legend.text_size = 0.65,
+ digits = 0, palette = "PuRd")
+```
+
+
+
+``` r
+# spatial_viz(mean_resale_sf,
+# fill_var = "sd_resale_price_sqm",
+# legend_title = "S.D.resale price/sqm",
+# main_title = "(a) Spatial distribution of S.D of resale price",
+# main.title_size = 1.4,
+# legend.hist_height = 0.25, legend.hist_width = 0.5,
+# legend_width = 0.4, legend.hist_size = 0.5,
+# legend.title_size = 1.2, legend.text_size = 0.65,
+# digits = 0, palette = "PuRd")
+```
+
+``` r
+violin_viz(mean_resale_sf, var = "mean_resale_price_sqm",
+ labs.x = "", labs.y = "Avg.resale price/sqm", breaks = seq(0, 9000, 1000),
+ y.shift = 1050, x.shift = 1.4, text.size = 4, digits = 0)
+```
+
+
+
+### Percentage of 1-ROOM and 2-Room rental flats
+
+``` r
+#HDB Property Information: https://data.gov.sg/dataset/hdb-property-information
+if(file.exists(here("data/derived_data/df_pct_1_2room_rental.rds"))){
+ df_pct_1_2room_rental <- readRDS(here("data/derived_data/df_pct_1_2room_rental.rds"))
+}else{
+ df_hdb_property <- read_csv(here("data/raw_data/hdb-property-information.csv"))
+ df_hdb_property_updated <- df_hdb_property %>%
+ mutate(ROAD_NAME = gsub("\\bDR\\b", "DRIVE", street),
+ ROAD_NAME = gsub("\\bRD\\b", "ROAD", ROAD_NAME),
+ ROAD_NAME = gsub("\\bCRES\\b", "CRESCENT", ROAD_NAME),
+ ROAD_NAME = gsub("\\bAVE\\b", "AVENUE", ROAD_NAME),
+ ROAD_NAME = gsub("\\bST\\b", "STREET", ROAD_NAME),
+ ROAD_NAME = gsub("\\bCTRL\\b", "CENTRAL", ROAD_NAME),
+ ROAD_NAME = gsub("\\bNTH\\b", "NORTH", ROAD_NAME),
+ ROAD_NAME = gsub("\\bSTH\\b", "SOUTH", ROAD_NAME),
+ ROAD_NAME = gsub("\\bBT\\b", "BUKIT", ROAD_NAME),
+ ROAD_NAME = gsub("\\bC'WEALTH\\b", "COMMONWEALTH", ROAD_NAME),
+ ROAD_NAME = gsub("\\bCL\\b", "CLOSE", ROAD_NAME),
+ ROAD_NAME = gsub("\\bPK\\b", "PARK", ROAD_NAME),
+ ROAD_NAME = gsub("\\bJLN\\b", "JALAN", ROAD_NAME),
+ ROAD_NAME = gsub("\\bKG\\b", "KAMPONG", ROAD_NAME),
+ ROAD_NAME = gsub("\\bPL\\b", "PLACE", ROAD_NAME),
+ ROAD_NAME = gsub("\\bLOR\\b", "LORONG", ROAD_NAME),
+ ROAD_NAME = gsub("\\bTER\\b", "TERRACE", ROAD_NAME),
+ ROAD_NAME = gsub("\\bMKT\\b", "MARKET", ROAD_NAME),
+ ROAD_NAME = gsub("\\bUPP\\b", "UPPER", ROAD_NAME),
+ ROAD_NAME = gsub("\\bGDNS\\b", "GARDENS", ROAD_NAME),
+ ROAD_NAME = gsub("\\bTG\\b", "TANJONG", ROAD_NAME),
+ ROAD_NAME = gsub("\\bHTS\\b", "HEIGHTS", ROAD_NAME),
+ ROAD_NAME = gsub("\\bCTR\\b", "CENTRE", ROAD_NAME)) %>%
+ mutate(ROAD_NAME = case_when(
+ ROAD_NAME == "STREET. GEORGE'S LANE" ~ "SAINT GEORGE'S LANE",
+ ROAD_NAME == "STREET. GEORGE'S ROAD" ~ "SAINT GEORGE'S ROAD",
+ ROAD_NAME == "SECTOR A SIN MING IND EST" ~ "SECTOR A SIN MING INDUSTRIAL ESTATE",
+ TRUE ~ ROAD_NAME
+ )) %>%
+ dplyr::rename(HOUSE_BLK_ = blk_no) %>%
+ unite(ROAD_BLK, c("ROAD_NAME", "HOUSE_BLK_")) %>%
+ dplyr::select(ROAD_BLK, `1room_rental`, `2room_rental`, total_dwelling_units) %>%
+ mutate(ROAD_BLK = case_when(
+ ROAD_BLK == "CHOA CHU KANG AVENUE 2_297A" ~ "CHOA CHU KANG AVENUE 2_297",
+ ROAD_BLK == "CHOA CHU KANG AVENUE 2_297B" ~ "CHOA CHU KANG AVENUE 2_297",
+ TRUE ~ ROAD_BLK
+ ))
+
+ # get extra two Blks that are under 'S' and 'K' building type
+ extra_building_sf <- df_building %>%
+ filter(ROAD_NAME %in% c("NEW MARKET ROAD", "JALAN KUKOH") & HOUSE_BLK_ %in% c(1, 32)) %>%
+ unite(ROAD_BLK, c("ROAD_NAME", "HOUSE_BLK_")) %>%
+ st_transform(crs = 3414) %>%
+ dplyr::select(names(hdb_sf))
+
+ df_1_2room_rental <- df_hdb_property_updated %>%
+ filter(total_dwelling_units != 0) %>%
+ left_join(., rbind(hdb_sf, extra_building_sf)) %>%
+ st_as_sf() %>%
+ st_make_valid() %>%
+ st_join(., grids, largest = T) %>% # join grids
+ filter(!is.na(grid_id)) %>%
+ group_by(grid_id) %>%
+ dplyr::summarise(`1room_rental` = sum(`1room_rental`),
+ `2room_rental` = sum(`2room_rental`),
+ total_dwelling_units = sum(total_dwelling_units))
+
+ df_pct_1_2room_rental <- df_1_2room_rental %>%
+ st_set_geometry(NULL) %>%
+ mutate(pct_1_2room_rental = (`1room_rental` + `2room_rental`)/total_dwelling_units)
+ saveRDS(df_pct_1_2room_rental, file = here("data/derived_data/df_pct_1_2room_rental.rds"))
+}
+head(df_pct_1_2room_rental)
+```
+
+ ## # A tibble: 6 × 5
+ ## grid_id `1room_rental` `2room_rental` total_dwelling_units pct_1_2room_rental
+ ##
+ ## 1 286 0 0 795 0
+ ## 2 304 0 0 99 0
+ ## 3 321 273 273 4208 0.130
+ ## 4 338 0 0 1372 0
+ ## 5 339 0 0 4079 0
+ ## 6 354 0 0 6653 0
+
+``` r
+df_pct_1_2room_rental_sf <- df_pct_1_2room_rental %>%
+ filter(grid_id %in% qualified_grids_inflow) %>%
+ left_join(., grids, by = c("grid_id" = "grid_id")) %>%
+ st_sf()
+
+spatial_viz(df_pct_1_2room_rental_sf,
+ fill_var = "pct_1_2room_rental",
+ legend_title = "1&2 Room rentals (%)",
+ main_title = "(b) Spatial distribution of 1-Room and 2-Room rentals",
+ main.title_size = 1.4,
+ legend.hist_height = 0.25, legend.hist_width = 0.5,
+ legend_width = 0.4, legend.hist_size = 0.5,
+ legend.title_size = 1.2, legend.text_size = 0.65, palette = "PuRd",
+ style = "fixed",
+ breaks = c(0, 0.05, 0.1, 0.3, 0.5, 0.8))
+```
+
+
+
+``` r
+violin_viz(df_pct_1_2room_rental_sf,
+ var = "pct_1_2room_rental",
+ labs.x = "", labs.y = "1-Room & 2-Room Rentals (%)",
+ breaks = seq(0, 0.8, 0.1),
+ y.shift = 0.13, x.shift = 1.4, text.size = 4)
+```
+
+
+
+### Landuse: Percentage of industrial building
+
+``` r
+cal_building_area <- function(index, building_sf){
+ # intersect with grids
+ intersect_grid <- building_sf %>%
+ filter(FEA_ID == index) %>%
+ st_make_valid() %>%
+ st_intersection(., grids)
+ # intersect area in grids
+ intersect_grid %>%
+ st_set_geometry(NULL) %>%
+ dplyr::mutate(area_sqm = st_area(intersect_grid) %>% as.numeric()) %>%
+ dplyr::select(FEA_ID, grid_id, area_sqm)
+}
+
+if(file.exists(here("data/derived_data/industrial_areas_grids.rds"))){
+ industrial_areas_grids <- readRDS(here("data/derived_data/industrial_areas_grids.rds"))
+}else{
+ #building type: I
+ industrial_sf <- df_building %>%
+ filter(BLDG_TYPE_ == "I") %>%
+ filter(DATA_TYPE_ == "extg") %>% # Existing building outline
+ st_transform(crs = 3414) %>%
+ unite(ROAD_BLK, c("ROAD_NAME", "HOUSE_BLK_"), sep = " ")
+
+ FEA_indexes <- industrial_sf$FEA_ID
+ # industrial building areas in grids
+ industrial_areas_grids <- do.call(bind_rows, map(FEA_indexes, with_progress(function(x) cal_building_area(x, industrial_sf))))
+ # percentage of commercial areas in grids
+ industrial_areas_grids <- industrial_areas_grids %>%
+ group_by(grid_id) %>%
+ dplyr::summarise(industrial_area_sqm = sum(area_sqm)) %>%
+ mutate(grid_area = st_area(grids[1, ]) %>% as.numeric(),
+ pct_industrial_area = round(industrial_area_sqm/grid_area, 4))
+ saveRDS(industrial_areas_grids, file = here("data/derived_data/industrial_areas_grids.rds"))
+}
+
+head(industrial_areas_grids)
+```
+
+ ## # A tibble: 6 × 4
+ ## grid_id industrial_area_sqm grid_area pct_industrial_area
+ ##
+ ## 1 8 1044. 487139. 0.0021
+ ## 2 9 7940. 487139. 0.0163
+ ## 3 13 7004. 487139. 0.0144
+ ## 4 14 25922. 487139. 0.0532
+ ## 5 16 209. 487139. 0.0004
+ ## 6 19 29082. 487139. 0.0597
+
+``` r
+industrial_areas_grids_sf <- industrial_areas_grids %>%
+ filter(grid_id %in% qualified_grids_inflow) %>%
+ left_join(grids %>% filter(grid_id %in% qualified_grids_inflow), ., by = c("grid_id" = "grid_id")) %>%
+ replace(., is.na(.), 0) %>%
+ st_sf()
+
+spatial_viz(industrial_areas_grids_sf,
+ fill_var = "pct_industrial_area",
+ legend_title = "Industrial area (%)",
+ main_title = "(c) Spatial distribution of industrial area",
+ main.title_size = 1.4,
+ legend.hist_height = 0.25, legend.hist_width = 0.5,
+ legend_width = 0.4, legend.hist_size = 0.5,
+ legend.title_size = 1.2, legend.text_size = 0.65, palette = "PuRd",
+ style = "fixed",
+ breaks = c(0, 0.05, 0.1, 0.15, 0.2, 0.25))
+```
+
+
+
+``` r
+violin_viz(industrial_areas_grids_sf,
+ var = "pct_industrial_area",
+ labs.x = "", labs.y = "Industrial area (%)",
+ breaks = seq(0, 0.25, 0.05), digits = 3,
+ y.shift = 0.04, x.shift = 1.4, text.size = 4)
+```
+
+
+
+### Landuse: Percentage of residential building
+
+``` r
+if(file.exists(here("data/derived_data/residential_areas_grids.rds"))){
+ residential_areas_grids <- readRDS(here("data/derived_data/residential_areas_grids.rds"))
+}else{
+ #building type: C-Condominium, H-HDB Building, A-Apartment, E-Executive Condominium, R-Residential
+ residential_sf <- df_building %>%
+ filter(BLDG_TYPE_ %in% c("C", "H", "A", "E", "R")) %>%
+ filter(DATA_TYPE_ == "extg") %>% # Existing building outline
+ st_transform(crs = 3414) %>%
+ unite(ROAD_BLK, c("ROAD_NAME", "HOUSE_BLK_"), sep = " ")
+
+ FEA_indexes <- residential_sf$FEA_ID
+ # residential areas in grids
+ residential_areas_grids <- do.call(bind_rows, map(FEA_indexes, with_progress(function(x) cal_building_area(x, residential_sf))))
+ # percentage of residential areas in grids
+ residential_areas_grids <- residential_areas_grids %>%
+ group_by(grid_id) %>%
+ dplyr::summarise(residential_area_sqm = sum(area_sqm)) %>%
+ mutate(grid_area = st_area(grids[1, ]) %>% as.numeric(),
+ pct_residential_area = round(residential_area_sqm/grid_area, 4))
+ saveRDS(residential_areas_grids, file = here("data/derived_data/residential_areas_grids.rds"))
+}
+
+head(residential_areas_grids)
+```
+
+ ## # A tibble: 6 × 4
+ ## grid_id residential_area_sqm grid_area pct_residential_area
+ ##
+ ## 1 286 9461. 487139. 0.0194
+ ## 2 304 563. 487139. 0.0012
+ ## 3 321 72846. 487139. 0.150
+ ## 4 338 13626. 487139. 0.028
+ ## 5 339 57370. 487139. 0.118
+ ## 6 354 75537. 487139. 0.155
+
+``` r
+residential_areas_grids_sf <- residential_areas_grids %>%
+ filter(grid_id %in% qualified_grids_inflow) %>%
+ left_join(., grids, by = c("grid_id" = "grid_id")) %>%
+ st_sf()
+
+spatial_viz(residential_areas_grids_sf,
+ fill_var = "pct_residential_area",
+ legend_title = "Residential area (%)",
+ main_title = "(d) Spatial distribution of residential area",
+ main.title_size = 1.4,
+ legend.hist_height = 0.25, legend.hist_width = 0.5,
+ legend_width = 0.4, legend.hist_size = 0.5,
+ legend.title_size = 1.2, legend.text_size = 0.65,
+ palette = "PuRd")
+```
+
+
+
+``` r
+violin_viz(residential_areas_grids_sf,
+ var = "pct_residential_area",
+ labs.x = "", labs.y = "Residential area (%)",
+ breaks = seq(0, 0.32, 0.05),
+ y.shift = 0.06, x.shift = 1.55, text.size = 4)
+```
+
+
+
+### Distance to the nearest MRT station
+
+``` r
+cal_dist2nearestMRT <- function(grid_index, grids, mrt){
+ grid_centroid <- grids[grid_index, ] %>% st_centroid()
+ # create 20k buffer
+ grid_buffer <- grid_centroid %>% st_buffer(20000)
+ # get MRT stations within the buffer
+ mrt_station_in_buffer <- grid_buffer %>%
+ st_join(mrt, .) %>%
+ filter(!is.na(grid_id))
+
+ # distance from grid centroid to MRT stations
+ dist2MRT <- st_distance(st_geometry(grid_centroid), st_geometry(mrt_station_in_buffer), by_element = TRUE) %>% as.numeric()
+
+ # get the nearest distance
+ mrt_station_in_buffer %>%
+ st_set_geometry(NULL) %>%
+ mutate(dist2MRT_km = round(dist2MRT/1000, 4)) %>%
+ arrange(dist2MRT_km) %>%
+ slice(1)
+}
+
+if(file.exists(here("data/derived_data/dist2MRT_nearest.rds"))){
+ dist2MRT_nearest <- readRDS(here("data/derived_data/dist2MRT_nearest.rds"))
+}else{
+ # MRT station
+ mrt <- read_sf(here("data/raw_data/TrainStation_Jan2020/MRTLRTStnPtt.Shp"), quiet = T) %>%
+ st_transform(crs = 3414)
+ dist2MRT_nearest <- do.call(bind_rows, map(grids$grid_id, with_progress(function(x) cal_dist2nearestMRT(x, grids, mrt))))
+ saveRDS(dist2MRT_nearest, file = here("data/derived_data/dist2MRT_nearest.rds"))
+}
+
+head(dist2MRT_nearest)
+```
+
+ ## OBJECTID STN_NAME STN_NO grid_id dist2MRT_km
+ ## 1 25 TUAS CRESCENT MRT STATION EW31 1 12.4307
+ ## 2 25 TUAS CRESCENT MRT STATION EW31 2 11.2449
+ ## 3 25 TUAS CRESCENT MRT STATION EW31 3 10.0871
+ ## 4 25 TUAS CRESCENT MRT STATION EW31 4 8.9681
+ ## 5 25 TUAS CRESCENT MRT STATION EW31 5 7.9044
+ ## 6 25 TUAS CRESCENT MRT STATION EW31 6 11.6869
+
+``` r
+dist2MRT_nearest_sf <- dist2MRT_nearest %>%
+ filter(grid_id %in% qualified_grids_inflow) %>%
+ left_join(., grids, by = c("grid_id" = "grid_id")) %>%
+ st_sf()
+
+spatial_viz(dist2MRT_nearest_sf,
+ fill_var = "dist2MRT_km",
+ legend_title = "Distance (km)",
+ main_title = "(e) Spatial distribution of distance to the nearest MRT station",
+ main.title_size = 1.4,
+ legend.hist_height = 0.25, legend.hist_width = 0.5,
+ legend_width = 0.4, legend.hist_size = 0.5,
+ legend.title_size = 1.2, legend.text_size = 0.65,
+ palette = "PuRd")
+```
+
+
+
+``` r
+violin_viz(dist2MRT_nearest_sf,
+ var = "dist2MRT_km",
+ labs.x = "", labs.y = "Distance (km)",
+ breaks = seq(0, 2.5, 0.5),
+ y.shift = 0.5, x.shift = 1.4, text.size = 4)
+```
+
+
+
+### Distance from grid to ‘center’ grid 1232 - City Hall
+
+``` r
+center_grid <- grid_centroids %>% filter(grid_id == 1232)
+dist2_center_grid <- grid_centroids %>%
+ mutate(dist_grid2center = st_distance(., center_grid) %>% as.numeric(),
+ dist_grid2center_km = dist_grid2center/1000) %>%
+ st_set_geometry(NULL)
+head(dist2_center_grid)
+```
+
+ ## # A tibble: 6 × 3
+ ## grid_id dist_grid2center dist_grid2center_km
+ ##
+ ## 1 1 28648. 28.6
+ ## 2 2 28292. 28.3
+ ## 3 3 27992. 28.0
+ ## 4 4 27750 27.8
+ ## 5 5 27567. 27.6
+ ## 6 6 28102. 28.1
+
+``` r
+dist2_center_grid_sf <- dist2_center_grid %>%
+ filter(grid_id %in% qualified_grids_inflow) %>%
+ left_join(., grids, by = c("grid_id" = "grid_id")) %>% st_sf()
+
+spatial_viz(dist2_center_grid_sf,
+ fill_var = "dist_grid2center_km",
+ legend_title = "Distance (km)",
+ main_title = "(f) Spatial distribution of distance to the central location",
+ main.title_size = 1.4,
+ legend.hist_height = 0.25, legend.hist_width = 0.5,
+ legend_width = 0.4, legend.hist_size = 0.5,
+ legend.title_size = 1.2, legend.text_size = 0.65,
+ palette = "PuRd")
+```
+
+
+
+``` r
+violin_viz(dist2_center_grid_sf,
+ var = "dist_grid2center_km",
+ labs.x = "", labs.y = "Distance (km)",
+ breaks = seq(0, 20, 5),
+ y.shift = 5, x.shift = 1.55, text.size = 4)
+```
+
+
+
+## Dependent variables
+
+### Travel distance
+
+``` r
+# visitors travel distance
+mean_dist_inflow <- dist_inflow %>%
+ group_by(grid_id, u_id) %>%
+ dplyr::summarise(mean_dist_user = mean(dist_hm2grid_km)) %>%
+ group_by(grid_id) %>%
+ dplyr::summarise(mean_dist_km = mean(mean_dist_user)) %>%
+ mutate(user_type = "visitor")
+
+# locals travel distance
+mean_dist_outflow <- dist_outflow %>%
+ group_by(home, u_id) %>%
+ dplyr::summarise(mean_dist_user = mean(dist_hm2grid_km)) %>%
+ group_by(home) %>%
+ dplyr::summarise(mean_dist_km = mean(mean_dist_user)) %>%
+ dplyr::rename(grid_id = home) %>% mutate(user_type = "local")
+```
+
+``` r
+ggplot(bind_rows(mean_dist_inflow, mean_dist_outflow)) +
+ geom_density(aes(mean_dist_km, fill = user_type, color = user_type), alpha = 0.5) +
+ geom_vline(data = bind_rows(mean_dist_inflow, mean_dist_outflow) %>% group_by(user_type) %>% dplyr::summarise(mean = mean(mean_dist_km)), aes(xintercept = mean, color = user_type), size=1.5) +
+ theme_bw() +
+ theme(legend.position = c(0.9, 0.9)) +
+ labs(x = "Avg.distance (km)", y = "Density", fill = "User type", color = "User type")
+```
+
+
+
+### Mobility diversity
+
+``` r
+combinded_div <- div_inflow %>%
+ st_set_geometry(NULL) %>%
+ mutate(div_type = "Inflow") %>%
+ bind_rows(.,
+ div_outflow %>%
+ st_set_geometry(NULL) %>%
+ mutate(div_type = "Outflow")) %>%
+ mutate(div_type = factor(div_type, levels = c("Outflow", "Inflow")))
+
+combinded_div_mean <- combinded_div %>%
+ group_by(div_type) %>%
+ dplyr::summarise(mean = mean(norm_div_shannon)) %>%
+ mutate(div_type = factor(div_type, levels = c("Outflow", "Inflow")))
+
+ggplot(combinded_div, aes(norm_div_shannon, color = div_type, fill = div_type)) +
+ geom_density(alpha = 0.5) +
+ geom_vline(data = combinded_div_mean, aes(xintercept = mean, color = div_type), size = 1.5) +
+ scale_x_continuous(breaks = seq(0, 1, 0.2)) +
+ labs(x = "Normalized diversity", y = "Density", title = "Density distribution of normalized diversity", fill = "Diversity type", color = "Diversity type") +
+ theme_bw() +
+ theme(legend.position = c(0.1, 0.9))
+```
+
+
+
+## Regression analysis
+
+### Inflow travel distance
+
+``` r
+## gather independent and dependent variables
+reg_inflow_dist <- mean_resale %>%
+ filter(grid_id %in% qualified_grids_inflow) %>%
+ left_join(., mean_dist_inflow %>% dplyr::select(-user_type)) %>%
+ left_join(., industrial_areas_grids %>% dplyr::select(grid_id, pct_industrial_area)) %>%
+ left_join(., residential_areas_grids %>% dplyr::select(grid_id, pct_residential_area)) %>%
+ left_join(., dist2MRT_nearest %>% dplyr::select(grid_id, dist2MRT_km)) %>%
+ left_join(., df_pct_1_2room_rental %>% dplyr::select(grid_id, pct_1_2room_rental)) %>%
+ left_join(., dist2_center_grid %>% dplyr::select(grid_id, dist_grid2center_km)) %>%
+ replace(., is.na(.), 0)
+head(reg_inflow_dist)
+```
+
+ ## # A tibble: 6 × 9
+ ## grid_id mean_resale_price_sqm sd_resale_price_sqm mean_dist_km pct_industrial_…
+ ##
+ ## 1 286 3685. 396. 14.7 0
+ ## 2 304 3571. 323. 12.6 0
+ ## 3 321 3808. 445. 14.0 0
+ ## 4 338 4775. 464. 11.2 0.0936
+ ## 5 339 3518. 253. 13.9 0
+ ## 6 354 4186. 461. 13.1 0
+ ## # … with 4 more variables: pct_residential_area , dist2MRT_km ,
+ ## # pct_1_2room_rental , dist_grid2center_km
+
+#### OLS
+
+Formula:
+`Inflow distance ~ HDB resale price + Percentage of 1 ROOM and 2 Room rentals + Percentage of industrial building + Percentage of residential building + Distance to the nearest MRT station + Distance to the central grid)`
+
+``` r
+ols_inflow_dist <- lm(mean_dist_km ~ mean_resale_price_sqm + pct_1_2room_rental + pct_industrial_area + pct_residential_area + dist2MRT_km + dist_grid2center_km, data = reg_inflow_dist)
+
+summary(ols_inflow_dist)
+```
+
+ ##
+ ## Call:
+ ## lm(formula = mean_dist_km ~ mean_resale_price_sqm + pct_1_2room_rental +
+ ## pct_industrial_area + pct_residential_area + dist2MRT_km +
+ ## dist_grid2center_km, data = reg_inflow_dist)
+ ##
+ ## Residuals:
+ ## Min 1Q Median 3Q Max
+ ## -2.8272 -0.9806 -0.0888 0.8180 4.5572
+ ##
+ ## Coefficients:
+ ## Estimate Std. Error t value Pr(>|t|)
+ ## (Intercept) 2.1725265 0.9489659 2.289 0.02266 *
+ ## mean_resale_price_sqm 0.0009083 0.0001338 6.787 4.98e-11 ***
+ ## pct_1_2room_rental 1.9795891 0.7186578 2.755 0.00619 **
+ ## pct_industrial_area -4.2909212 2.0240122 -2.120 0.03472 *
+ ## pct_residential_area -2.4026791 1.3876470 -1.731 0.08426 .
+ ## dist2MRT_km 0.3202552 0.1755298 1.825 0.06894 .
+ ## dist_grid2center_km 0.2124682 0.0259493 8.188 5.18e-15 ***
+ ## ---
+ ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+ ##
+ ## Residual standard error: 1.36 on 346 degrees of freedom
+ ## Multiple R-squared: 0.1888, Adjusted R-squared: 0.1747
+ ## F-statistic: 13.42 on 6 and 346 DF, p-value: 1.091e-13
+
+``` r
+spatial_viz(augment(ols_inflow_dist, data = reg_inflow_dist) %>% left_join(., grids) %>% st_as_sf(),
+ fill_var = ".resid", palette = "RdBu",
+ legend_title = "Residual (OLS)",
+ main_title = "(a) Residuals of average incoming distance (Moran I statistic: 0.78)",
+ main.title_size = 1.6,
+ legend.hist_height = 0.25, legend.hist_width = 0.5,
+ legend_width = 0.4, legend.hist_size = 0.5,
+ legend.title_size = 1.2, legend.text_size = 0.65)
+```
+
+
+
+#### Spatial Error Model (SEM)
+
+``` r
+sf_reg_inflow_dist <- reg_inflow_dist %>%
+ left_join(., grids) %>%
+ st_as_sf() %>%
+ st_transform(crs = 4326)
+
+dist_inflow_sp <- as(sf_reg_inflow_dist, 'Spatial')
+dist_inflow_neighbors <- poly2nb(dist_inflow_sp)
+summary(dist_inflow_neighbors)
+```
+
+ ## Neighbour list object:
+ ## Number of regions: 353
+ ## Number of nonzero links: 1466
+ ## Percentage nonzero weights: 1.17648
+ ## Average number of links: 4.152975
+ ## Link number distribution:
+ ##
+ ## 1 2 3 4 5 6
+ ## 4 39 76 86 76 72
+ ## 4 least connected regions:
+ ## 128 244 247 302 with 1 link
+ ## 72 most connected regions:
+ ## 6 9 10 19 38 46 53 57 70 75 77 93 107 109 115 123 126 130 131 134 137 138 142 147 163 164 167 170 171 175 176 179 185 190 192 193 198 200 204 206 209 211 212 216 217 221 231 257 258 262 263 264 268 269 270 273 274 281 286 291 307 308 314 317 324 332 338 341 342 343 345 346 with 6 links
+
+``` r
+dist_inflow_weights <- nb2listw(dist_inflow_neighbors, style="W", zero.policy=TRUE)
+```
+
+``` r
+moran.test(dist_inflow_sp$mean_dist_km, dist_inflow_weights)
+```
+
+ ##
+ ## Moran I test under randomisation
+ ##
+ ## data: dist_inflow_sp$mean_dist_km
+ ## weights: dist_inflow_weights
+ ##
+ ## Moran I statistic standard deviate = 20.366, p-value < 2.2e-16
+ ## alternative hypothesis: greater
+ ## sample estimates:
+ ## Moran I statistic Expectation Variance
+ ## 0.777633909 -0.002840909 0.001468666
+
+``` r
+sem_inflow_dist <- spatialreg::errorsarlm(mean_dist_km ~ mean_resale_price_sqm + pct_1_2room_rental + pct_industrial_area + pct_residential_area + pct_industrial_area + dist2MRT_km + dist_grid2center_km, data = reg_inflow_dist, listw = dist_inflow_weights)
+
+summary(sem_inflow_dist)
+```
+
+ ##
+ ## Call:spatialreg::errorsarlm(formula = mean_dist_km ~ mean_resale_price_sqm +
+ ## pct_1_2room_rental + pct_industrial_area + pct_residential_area +
+ ## pct_industrial_area + dist2MRT_km + dist_grid2center_km,
+ ## data = reg_inflow_dist, listw = dist_inflow_weights)
+ ##
+ ## Residuals:
+ ## Min 1Q Median 3Q Max
+ ## -1.882582 -0.461242 -0.059318 0.468540 2.693693
+ ##
+ ## Type: error
+ ## Coefficients: (asymptotic standard errors)
+ ## Estimate Std. Error z value Pr(>|z|)
+ ## (Intercept) 7.6813e+00 8.6433e-01 8.8870 < 2.2e-16
+ ## mean_resale_price_sqm 4.9152e-05 1.0561e-04 0.4654 0.641649
+ ## pct_1_2room_rental 5.1959e-01 4.1127e-01 1.2634 0.206444
+ ## pct_industrial_area -3.2302e+00 1.2701e+00 -2.5433 0.010982
+ ## pct_residential_area 2.4437e+00 8.8488e-01 2.7616 0.005752
+ ## dist2MRT_km -2.3525e-01 1.5047e-01 -1.5634 0.117956
+ ## dist_grid2center_km 7.9053e-02 4.4728e-02 1.7674 0.077161
+ ##
+ ## Lambda: 0.80697, LR test value: 318.77, p-value: < 2.22e-16
+ ## Asymptotic standard error: 0.026271
+ ## z-value: 30.717, p-value: < 2.22e-16
+ ## Wald statistic: 943.52, p-value: < 2.22e-16
+ ##
+ ## Log likelihood: -446.3775 for error model
+ ## ML residual variance (sigma squared): 0.56934, (sigma: 0.75455)
+ ## Number of observations: 353
+ ## Number of parameters estimated: 9
+ ## AIC: 910.76, (AIC for lm: 1227.5)
+
+``` r
+sf_reg_inflow_dist$resid_error <- residuals(sem_inflow_dist)
+
+spatial_viz(sf_reg_inflow_dist,
+ fill_var = "resid_error",
+ legend_title = "Residual (SEM)", palette = "RdBu",
+ main_title = "(a) Residuals of average incoming distance (Moran I statistic: - 0.09)",
+ main.title_size = 1.6,
+ legend.hist_height = 0.25, legend.hist_width = 0.5,
+ legend_width = 0.4, legend.hist_size = 0.5,
+ legend.title_size = 1.2, legend.text_size = 0.65)
+```
+
+
+
+``` r
+moran.test(sf_reg_inflow_dist$resid_error, dist_inflow_weights)
+```
+
+ ##
+ ## Moran I test under randomisation
+ ##
+ ## data: sf_reg_inflow_dist$resid_error
+ ## weights: dist_inflow_weights
+ ##
+ ## Moran I statistic standard deviate = -2.344, p-value = 0.9905
+ ## alternative hypothesis: greater
+ ## sample estimates:
+ ## Moran I statistic Expectation Variance
+ ## -0.092696925 -0.002840909 0.001469509
+
+#### Spatial Lag Model (SLM)
+
+``` r
+slm_inflow_dist <- spatialreg::lagsarlm(mean_dist_km ~ mean_resale_price_sqm + pct_1_2room_rental + pct_industrial_area + pct_residential_area + dist2MRT_km + dist_grid2center_km, data = reg_inflow_dist, listw = dist_inflow_weights)
+summary(slm_inflow_dist)
+```
+
+ ##
+ ## Call:spatialreg::lagsarlm(formula = mean_dist_km ~ mean_resale_price_sqm +
+ ## pct_1_2room_rental + pct_industrial_area + pct_residential_area +
+ ## dist2MRT_km + dist_grid2center_km, data = reg_inflow_dist,
+ ## listw = dist_inflow_weights)
+ ##
+ ## Residuals:
+ ## Min 1Q Median 3Q Max
+ ## -1.95644 -0.45995 -0.07117 0.48707 3.07624
+ ##
+ ## Type: lag
+ ## Coefficients: (asymptotic standard errors)
+ ## Estimate Std. Error z value Pr(>|z|)
+ ## (Intercept) 4.0175e-01 5.5647e-01 0.7220 0.470315
+ ## mean_resale_price_sqm 1.9368e-04 7.7345e-05 2.5041 0.012276
+ ## pct_1_2room_rental 9.2312e-01 4.0807e-01 2.2622 0.023686
+ ## pct_industrial_area -3.3336e+00 1.1510e+00 -2.8962 0.003777
+ ## pct_residential_area 9.4708e-01 7.8746e-01 1.2027 0.229090
+ ## dist2MRT_km 2.6737e-02 9.9614e-02 0.2684 0.788384
+ ## dist_grid2center_km 4.8355e-02 1.5185e-02 3.1843 0.001451
+ ##
+ ## Rho: 0.77437, LR test value: 313.67, p-value: < 2.22e-16
+ ## Asymptotic standard error: 0.028889
+ ## z-value: 26.805, p-value: < 2.22e-16
+ ## Wald statistic: 718.51, p-value: < 2.22e-16
+ ##
+ ## Log likelihood: -448.9292 for lag model
+ ## ML residual variance (sigma squared): 0.59504, (sigma: 0.77139)
+ ## Number of observations: 353
+ ## Number of parameters estimated: 9
+ ## AIC: 915.86, (AIC for lm: 1227.5)
+ ## LM test for residual autocorrelation
+ ## test value: 18.762, p-value: 1.4813e-05
+
+``` r
+sf_reg_inflow_dist$resid_lagsarlm <- residuals(slm_inflow_dist)
+spatial_viz(sf_reg_inflow_dist,
+ fill_var = "resid_lagsarlm",
+ legend_title = "Residual (SLM)",
+ palette = "RdBu",
+ main_title = "(a) Residuals of average incoming distance (Moran I statistic: - 0.08)",
+ main.title_size = 1.6,
+ legend.hist_height = 0.25, legend.hist_width = 0.5,
+ legend_width = 0.4, legend.hist_size = 0.5,
+ legend.title_size = 1.2, legend.text_size = 0.65)
+```
+
+
+
+``` r
+moran.test(sf_reg_inflow_dist$resid_lagsarlm, dist_inflow_weights)
+```
+
+ ##
+ ## Moran I test under randomisation
+ ##
+ ## data: sf_reg_inflow_dist$resid_lagsarlm
+ ## weights: dist_inflow_weights
+ ##
+ ## Moran I statistic standard deviate = -2.1227, p-value = 0.9831
+ ## alternative hypothesis: greater
+ ## sample estimates:
+ ## Moran I statistic Expectation Variance
+ ## -0.084194423 -0.002840909 0.001468820
+
+``` r
+# compare three models
+AIC(ols_inflow_dist, slm_inflow_dist, sem_inflow_dist)
+```
+
+ ## df AIC
+ ## ols_inflow_dist 8 1227.5269
+ ## slm_inflow_dist 9 915.8584
+ ## sem_inflow_dist 9 910.7551
+
+### Outflow travel distance
+
+``` r
+reg_outflow_dist <- mean_resale %>%
+ filter(grid_id %in% qualified_grids_outflow) %>%
+ left_join(., mean_dist_inflow %>% dplyr::select(-user_type)) %>%
+ left_join(., industrial_areas_grids %>% dplyr::select(grid_id, pct_industrial_area)) %>%
+ left_join(., residential_areas_grids %>% dplyr::select(grid_id, pct_residential_area)) %>%
+ left_join(., dist2MRT_nearest %>% dplyr::select(grid_id, dist2MRT_km)) %>%
+ # left_join(., pop_grids_2016) %>%
+ left_join(., df_pct_1_2room_rental %>% dplyr::select(grid_id, pct_1_2room_rental)) %>%
+ left_join(., dist2_center_grid %>% dplyr::select(grid_id, dist_grid2center_km)) %>%
+ replace(., is.na(.), 0)
+head(reg_outflow_dist)
+```
+
+ ## # A tibble: 6 × 9
+ ## grid_id mean_resale_price_sqm sd_resale_price_sqm mean_dist_km pct_industrial_…
+ ##
+ ## 1 286 3685. 396. 14.7 0
+ ## 2 304 3571. 323. 12.6 0
+ ## 3 321 3808. 445. 14.0 0
+ ## 4 338 4775. 464. 11.2 0.0936
+ ## 5 339 3518. 253. 13.9 0
+ ## 6 354 4186. 461. 13.1 0
+ ## # … with 4 more variables: pct_residential_area , dist2MRT_km ,
+ ## # pct_1_2room_rental , dist_grid2center_km
+
+#### OLS
+
+Formula:
+`Outflow distance ~ HDB resale price + Percentage of 1 ROOM and 2 Room rentals + Percentage of industrial building + Percentage of residential building + Distance to the nearest MRT station + Distance to the central grid)`
+
+``` r
+ols_outflow_dist <- lm(mean_dist_km ~ mean_resale_price_sqm + pct_1_2room_rental + pct_industrial_area + pct_residential_area + dist2MRT_km + dist_grid2center_km, data = reg_outflow_dist)
+
+summary(ols_outflow_dist)
+```
+
+ ##
+ ## Call:
+ ## lm(formula = mean_dist_km ~ mean_resale_price_sqm + pct_1_2room_rental +
+ ## pct_industrial_area + pct_residential_area + dist2MRT_km +
+ ## dist_grid2center_km, data = reg_outflow_dist)
+ ##
+ ## Residuals:
+ ## Min 1Q Median 3Q Max
+ ## -2.7995 -0.9898 -0.0371 0.8362 4.4627
+ ##
+ ## Coefficients:
+ ## Estimate Std. Error t value Pr(>|t|)
+ ## (Intercept) 2.0263419 0.9628799 2.104 0.03609 *
+ ## mean_resale_price_sqm 0.0009127 0.0001356 6.729 7.45e-11 ***
+ ## pct_1_2room_rental 2.2303314 0.7328078 3.044 0.00252 **
+ ## pct_industrial_area -4.0521222 2.0681184 -1.959 0.05091 .
+ ## pct_residential_area -2.6512445 1.4351617 -1.847 0.06558 .
+ ## dist2MRT_km 0.3842153 0.1799655 2.135 0.03350 *
+ ## dist_grid2center_km 0.2223886 0.0263955 8.425 1.09e-15 ***
+ ## ---
+ ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+ ##
+ ## Residual standard error: 1.349 on 333 degrees of freedom
+ ## Multiple R-squared: 0.2072, Adjusted R-squared: 0.1929
+ ## F-statistic: 14.5 on 6 and 333 DF, p-value: 1.035e-14
+
+``` r
+spatial_viz(augment(ols_outflow_dist, data = reg_outflow_dist) %>% left_join(., grids) %>% st_as_sf(),
+ fill_var = ".resid", palette = "RdBu",
+ legend_title = "Residual (OLS)",
+ main_title = "(b) Residuals of average outgoing distance (Moran I statistic: 0.79) ",
+ main.title_size = 1.6,
+ legend.hist_height = 0.25, legend.hist_width = 0.5,
+ legend_width = 0.4, legend.hist_size = 0.5,
+ legend.title_size = 1.2, legend.text_size = 0.65)
+```
+
+
+
+#### Spatial Error Model (SEM)
+
+``` r
+sf_reg_outflow_dist <- reg_outflow_dist %>%
+ left_join(., grids) %>%
+ st_as_sf() %>%
+ st_transform(crs = 4326)
+
+dist_outflow_sp <- as(sf_reg_outflow_dist, 'Spatial')
+dist_outflow_neighbors <- poly2nb(dist_outflow_sp)
+summary(dist_outflow_neighbors)
+```
+
+ ## Neighbour list object:
+ ## Number of regions: 340
+ ## Number of nonzero links: 1398
+ ## Percentage nonzero weights: 1.209343
+ ## Average number of links: 4.111765
+ ## Link number distribution:
+ ##
+ ## 1 2 3 4 5 6
+ ## 6 37 75 82 75 65
+ ## 6 least connected regions:
+ ## 30 47 113 233 236 290 with 1 link
+ ## 65 most connected regions:
+ ## 6 9 10 19 42 49 53 65 70 72 88 101 103 109 117 120 124 126 130 134 139 154 155 158 161 162 166 167 170 176 181 184 189 195 197 199 201 202 207 220 246 247 250 251 252 256 257 258 261 262 269 274 279 295 296 302 305 312 320 326 329 330 331 333 334 with 6 links
+
+``` r
+dist_outflow_weights <- nb2listw(dist_outflow_neighbors, style="W", zero.policy=TRUE)
+```
+
+``` r
+moran.test(dist_outflow_sp$mean_dist_km, dist_outflow_weights)
+```
+
+ ##
+ ## Moran I test under randomisation
+ ##
+ ## data: dist_outflow_sp$mean_dist_km
+ ## weights: dist_outflow_weights
+ ##
+ ## Moran I statistic standard deviate = 20.103, p-value < 2.2e-16
+ ## alternative hypothesis: greater
+ ## sample estimates:
+ ## Moran I statistic Expectation Variance
+ ## 0.788295402 -0.002949853 0.001549173
+
+``` r
+sem_outflow_dist <- spatialreg::errorsarlm(mean_dist_km ~ mean_resale_price_sqm + pct_1_2room_rental + pct_industrial_area + pct_residential_area + dist2MRT_km + dist_grid2center_km, data = reg_outflow_dist, listw = dist_outflow_weights)
+
+summary(sem_outflow_dist)
+```
+
+ ##
+ ## Call:spatialreg::errorsarlm(formula = mean_dist_km ~ mean_resale_price_sqm +
+ ## pct_1_2room_rental + pct_industrial_area + pct_residential_area +
+ ## dist2MRT_km + dist_grid2center_km, data = reg_outflow_dist,
+ ## listw = dist_outflow_weights)
+ ##
+ ## Residuals:
+ ## Min 1Q Median 3Q Max
+ ## -1.851638 -0.481379 -0.038598 0.474132 2.543005
+ ##
+ ## Type: error
+ ## Coefficients: (asymptotic standard errors)
+ ## Estimate Std. Error z value Pr(>|z|)
+ ## (Intercept) 7.12260427 0.86987035 8.1881 2.22e-16
+ ## mean_resale_price_sqm 0.00010133 0.00010368 0.9773 0.32841
+ ## pct_1_2room_rental 0.59249114 0.42007068 1.4105 0.15841
+ ## pct_industrial_area -3.33092235 1.31126440 -2.5402 0.01108
+ ## pct_residential_area 1.71113222 0.88193695 1.9402 0.05236
+ ## dist2MRT_km -0.08804982 0.14890994 -0.5913 0.55432
+ ## dist_grid2center_km 0.11128362 0.04594109 2.4223 0.01542
+ ##
+ ## Lambda: 0.81384, LR test value: 317.35, p-value: < 2.22e-16
+ ## Asymptotic standard error: 0.025714
+ ## z-value: 31.65, p-value: < 2.22e-16
+ ## Wald statistic: 1001.7, p-value: < 2.22e-16
+ ##
+ ## Log likelihood: -422.1293 for error model
+ ## ML residual variance (sigma squared): 0.53831, (sigma: 0.7337)
+ ## Number of observations: 340
+ ## Number of parameters estimated: 9
+ ## AIC: 862.26, (AIC for lm: 1177.6)
+
+``` r
+sf_reg_outflow_dist$resid_error <- residuals(sem_outflow_dist)
+
+spatial_viz(sf_reg_outflow_dist,
+ fill_var = "resid_error",
+ legend_title = "Residual (SEM)", palette = "RdBu",
+ main_title = "(b) Residuals of average outgoing distance (Moran I statistic: -0.09)",
+ main.title_size = 1.6,
+ legend.hist_height = 0.25, legend.hist_width = 0.5,
+ legend_width = 0.4, legend.hist_size = 0.5,
+ legend.title_size = 1.2, legend.text_size = 0.65)
+```
+
+
+
+``` r
+moran.test(sf_reg_outflow_dist$resid_error, dist_outflow_weights)
+```
+
+ ##
+ ## Moran I test under randomisation
+ ##
+ ## data: sf_reg_outflow_dist$resid_error
+ ## weights: dist_outflow_weights
+ ##
+ ## Moran I statistic standard deviate = -2.1469, p-value = 0.9841
+ ## alternative hypothesis: greater
+ ## sample estimates:
+ ## Moran I statistic Expectation Variance
+ ## -0.087498288 -0.002949853 0.001550940
+
+#### Spatial Lag Model (SLM)
+
+``` r
+slm_outflow_dist <- spatialreg::lagsarlm(mean_dist_km ~ mean_resale_price_sqm + pct_1_2room_rental + pct_industrial_area + pct_residential_area + dist2MRT_km + dist_grid2center_km, data = reg_outflow_dist, listw = dist_outflow_weights)
+summary(slm_outflow_dist)
+```
+
+ ##
+ ## Call:spatialreg::lagsarlm(formula = mean_dist_km ~ mean_resale_price_sqm +
+ ## pct_1_2room_rental + pct_industrial_area + pct_residential_area +
+ ## dist2MRT_km + dist_grid2center_km, data = reg_outflow_dist,
+ ## listw = dist_outflow_weights)
+ ##
+ ## Residuals:
+ ## Min 1Q Median 3Q Max
+ ## -1.861709 -0.472194 -0.041032 0.468667 2.671780
+ ##
+ ## Type: lag
+ ## Coefficients: (asymptotic standard errors)
+ ## Estimate Std. Error z value Pr(>|z|)
+ ## (Intercept) 2.8485e-01 5.4756e-01 0.5202 0.602906
+ ## mean_resale_price_sqm 1.9463e-04 7.6661e-05 2.5389 0.011122
+ ## pct_1_2room_rental 9.4911e-01 4.0605e-01 2.3374 0.019419
+ ## pct_industrial_area -2.9316e+00 1.1476e+00 -2.5545 0.010635
+ ## pct_residential_area 6.0660e-01 7.9433e-01 0.7637 0.445072
+ ## dist2MRT_km 1.0160e-01 9.9916e-02 1.0169 0.309214
+ ## dist_grid2center_km 5.1147e-02 1.5201e-02 3.3648 0.000766
+ ##
+ ## Rho: 0.78256, LR test value: 315.41, p-value: < 2.22e-16
+ ## Asymptotic standard error: 0.028324
+ ## z-value: 27.629, p-value: < 2.22e-16
+ ## Wald statistic: 763.34, p-value: < 2.22e-16
+ ##
+ ## Log likelihood: -423.0978 for lag model
+ ## ML residual variance (sigma squared): 0.5578, (sigma: 0.74686)
+ ## Number of observations: 340
+ ## Number of parameters estimated: 9
+ ## AIC: 864.2, (AIC for lm: 1177.6)
+ ## LM test for residual autocorrelation
+ ## test value: 16.49, p-value: 4.8904e-05
+
+``` r
+spatialreg::impacts(slm_outflow_dist, listw = dist_outflow_weights)
+```
+
+ ## Impact measures (lag, exact):
+ ## Direct Indirect Total
+ ## mean_resale_price_sqm 0.0002625584 0.0006325316 0.00089509
+ ## pct_1_2room_rental 1.2803532205 3.0845087888 4.36486201
+ ## pct_industrial_area -3.9546677409 -9.5272204643 -13.48188821
+ ## pct_residential_area 0.8183019825 1.9713775985 2.78967958
+ ## dist2MRT_km 0.1370616717 0.3301963273 0.46725800
+ ## dist_grid2center_km 0.0689973217 0.1662219783 0.23521930
+
+``` r
+sf_reg_outflow_dist$resid_lagsarlm <- residuals(slm_outflow_dist)
+spatial_viz(sf_reg_outflow_dist,
+ fill_var = "resid_lagsarlm",
+ legend_title = "Residual (SLM)", palette = "RdBu",
+ main_title = "(b) Residuals of average outgoing distance (Moran I statistic: -0.08)",
+ main.title_size = 1.6,
+ legend.hist_height = 0.25, legend.hist_width = 0.5,
+ legend_width = 0.4, legend.hist_size = 0.5,
+ legend.title_size = 1.2, legend.text_size = 0.65)
+```
+
+
+
+``` r
+moran.test(sf_reg_outflow_dist$resid_lagsarlm, dist_outflow_weights)
+```
+
+ ##
+ ## Moran I test under randomisation
+ ##
+ ## data: sf_reg_outflow_dist$resid_lagsarlm
+ ## weights: dist_outflow_weights
+ ##
+ ## Moran I statistic standard deviate = -2.0151, p-value = 0.9781
+ ## alternative hypothesis: greater
+ ## sample estimates:
+ ## Moran I statistic Expectation Variance
+ ## -0.082296705 -0.002949853 0.001550404
+
+``` r
+# compare three models
+AIC(ols_outflow_dist, slm_outflow_dist, sem_outflow_dist)
+```
+
+ ## df AIC
+ ## ols_outflow_dist 8 1177.6092
+ ## slm_outflow_dist 9 864.1956
+ ## sem_outflow_dist 9 862.2585
+
+### Inflow diversity
+
+``` r
+reg_inflow_div <- mean_resale %>%
+ filter(grid_id %in% qualified_grids_inflow) %>%
+ left_join(., div_inflow %>% st_set_geometry(NULL) %>% dplyr::select(grid_id, norm_div_shannon)) %>%
+ left_join(., industrial_areas_grids %>% dplyr::select(grid_id, pct_industrial_area)) %>%
+ left_join(., residential_areas_grids %>% dplyr::select(grid_id, pct_residential_area)) %>%
+ left_join(., dist2MRT_nearest %>% dplyr::select(grid_id, dist2MRT_km)) %>%
+ left_join(., df_pct_1_2room_rental %>% dplyr::select(grid_id, pct_1_2room_rental)) %>%
+ left_join(., dist2_center_grid %>% dplyr::select(grid_id, dist_grid2center_km)) %>%
+ replace(., is.na(.), 0)
+
+head(reg_inflow_div)
+```
+
+ ## # A tibble: 6 × 9
+ ## grid_id mean_resale_price… sd_resale_price_… norm_div_shannon pct_industrial_…
+ ##
+ ## 1 286 3685. 396. 0.720 0
+ ## 2 304 3571. 323. 0.449 0
+ ## 3 321 3808. 445. 0.602 0
+ ## 4 338 4775. 464. 0.485 0.0936
+ ## 5 339 3518. 253. 0.547 0
+ ## 6 354 4186. 461. 0.507 0
+ ## # … with 4 more variables: pct_residential_area , dist2MRT_km ,
+ ## # pct_1_2room_rental , dist_grid2center_km
+
+#### OLS
+
+``` r
+ols_inflow_div <- lm(norm_div_shannon ~ mean_resale_price_sqm + pct_1_2room_rental + pct_industrial_area + pct_residential_area + dist2MRT_km + dist_grid2center_km, data = reg_inflow_div)
+
+summary(ols_inflow_div)
+```
+
+ ##
+ ## Call:
+ ## lm(formula = norm_div_shannon ~ mean_resale_price_sqm + pct_1_2room_rental +
+ ## pct_industrial_area + pct_residential_area + dist2MRT_km +
+ ## dist_grid2center_km, data = reg_inflow_div)
+ ##
+ ## Residuals:
+ ## Min 1Q Median 3Q Max
+ ## -0.282743 -0.050921 0.004627 0.055192 0.230125
+ ##
+ ## Coefficients:
+ ## Estimate Std. Error t value Pr(>|t|)
+ ## (Intercept) 8.797e-01 5.747e-02 15.307 <2e-16 ***
+ ## mean_resale_price_sqm 1.046e-05 8.105e-06 1.291 0.1977
+ ## pct_1_2room_rental -6.107e-02 4.352e-02 -1.403 0.1615
+ ## pct_industrial_area 7.263e-02 1.226e-01 0.593 0.5539
+ ## pct_residential_area 7.651e-02 8.403e-02 0.910 0.3632
+ ## dist2MRT_km -2.050e-02 1.063e-02 -1.928 0.0546 .
+ ## dist_grid2center_km -1.627e-02 1.571e-03 -10.351 <2e-16 ***
+ ## ---
+ ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+ ##
+ ## Residual standard error: 0.08233 on 346 degrees of freedom
+ ## Multiple R-squared: 0.5152, Adjusted R-squared: 0.5068
+ ## F-statistic: 61.27 on 6 and 346 DF, p-value: < 2.2e-16
+
+``` r
+spatial_viz(augment(ols_inflow_div, data = reg_inflow_div) %>% left_join(., grids) %>% st_as_sf(),
+ fill_var = ".resid", palette = "RdBu",
+ legend_title = "Residual (OLS)",
+ main_title = "(c) Residuals of inflow diversity (Moran I statistic: 0.69)",
+ main.title_size = 1.6,
+ legend.hist_height = 0.25, legend.hist_width = 0.5,
+ legend_width = 0.4, legend.hist_size = 0.5,
+ legend.title_size = 1.2, legend.text_size = 0.65)
+```
+
+
+
+#### Spatial Error (SEM) Models
+
+``` r
+sf_reg_inflow_div <- reg_inflow_div %>%
+ left_join(., grids) %>%
+ st_as_sf() %>%
+ st_transform(crs = 4326)
+
+div_inflow_sp <- as(sf_reg_inflow_div, 'Spatial')
+div_inflow_neighbors <- poly2nb(div_inflow_sp)
+summary(div_inflow_neighbors)
+```
+
+ ## Neighbour list object:
+ ## Number of regions: 353
+ ## Number of nonzero links: 1466
+ ## Percentage nonzero weights: 1.17648
+ ## Average number of links: 4.152975
+ ## Link number distribution:
+ ##
+ ## 1 2 3 4 5 6
+ ## 4 39 76 86 76 72
+ ## 4 least connected regions:
+ ## 128 244 247 302 with 1 link
+ ## 72 most connected regions:
+ ## 6 9 10 19 38 46 53 57 70 75 77 93 107 109 115 123 126 130 131 134 137 138 142 147 163 164 167 170 171 175 176 179 185 190 192 193 198 200 204 206 209 211 212 216 217 221 231 257 258 262 263 264 268 269 270 273 274 281 286 291 307 308 314 317 324 332 338 341 342 343 345 346 with 6 links
+
+``` r
+div_inflow_weights <- nb2listw(div_inflow_neighbors, style="W", zero.policy=TRUE)
+```
+
+``` r
+moran.test(div_inflow_sp$norm_div_shannon, div_inflow_weights)
+```
+
+ ##
+ ## Moran I test under randomisation
+ ##
+ ## data: div_inflow_sp$norm_div_shannon
+ ## weights: div_inflow_weights
+ ##
+ ## Moran I statistic standard deviate = 18.155, p-value < 2.2e-16
+ ## alternative hypothesis: greater
+ ## sample estimates:
+ ## Moran I statistic Expectation Variance
+ ## 0.693791308 -0.002840909 0.001472379
+
+``` r
+sem_inflow_div <- spatialreg::errorsarlm(norm_div_shannon ~ mean_resale_price_sqm + pct_1_2room_rental + pct_industrial_area + pct_residential_area + dist2MRT_km + dist_grid2center_km, data = reg_inflow_div, listw = div_inflow_weights)
+summary(sem_inflow_div)
+```
+
+ ##
+ ## Call:
+ ## spatialreg::errorsarlm(formula = norm_div_shannon ~ mean_resale_price_sqm +
+ ## pct_1_2room_rental + pct_industrial_area + pct_residential_area +
+ ## dist2MRT_km + dist_grid2center_km, data = reg_inflow_div,
+ ## listw = div_inflow_weights)
+ ##
+ ## Residuals:
+ ## Min 1Q Median 3Q Max
+ ## -0.3017621 -0.0425634 0.0031422 0.0441823 0.2316963
+ ##
+ ## Type: error
+ ## Coefficients: (asymptotic standard errors)
+ ## Estimate Std. Error z value Pr(>|z|)
+ ## (Intercept) 8.7745e-01 6.3804e-02 13.7522 < 2.2e-16
+ ## mean_resale_price_sqm 1.3020e-05 8.7738e-06 1.4840 0.137807
+ ## pct_1_2room_rental -1.0698e-02 3.6666e-02 -0.2918 0.770460
+ ## pct_industrial_area -1.2187e-01 1.1054e-01 -1.1025 0.270251
+ ## pct_residential_area 5.4874e-02 7.7424e-02 0.7087 0.478486
+ ## dist2MRT_km -3.7637e-02 1.2030e-02 -3.1287 0.001756
+ ## dist_grid2center_km -1.5665e-02 2.2159e-03 -7.0694 1.556e-12
+ ##
+ ## Lambda: 0.57378, LR test value: 104.21, p-value: < 2.22e-16
+ ## Asymptotic standard error: 0.046713
+ ## z-value: 12.283, p-value: < 2.22e-16
+ ## Wald statistic: 150.87, p-value: < 2.22e-16
+ ##
+ ## Log likelihood: 436.2011 for error model
+ ## ML residual variance (sigma squared): 0.0044611, (sigma: 0.066792)
+ ## Number of observations: 353
+ ## Number of parameters estimated: 9
+ ## AIC: -854.4, (AIC for lm: -752.19)
+
+``` r
+sf_reg_inflow_div$resid_error <- residuals(sem_inflow_div)
+spatial_viz(sf_reg_inflow_div,
+ fill_var = "resid_error",
+ legend_title = "Residual (SEM)", palette = "RdBu",
+ main_title = "(c) Residuals of inflow diversity (Moran I statistic: -0.05)",
+ main.title_size = 1.6,
+ legend.hist_height = 0.25, legend.hist_width = 0.5,
+ legend_width = 0.4, legend.hist_size = 0.5,
+ legend.title_size = 1.2, legend.text_size = 0.65)
+```
+
+
+
+``` r
+moran.test(sf_reg_inflow_div$resid_error, div_inflow_weights)
+```
+
+ ##
+ ## Moran I test under randomisation
+ ##
+ ## data: sf_reg_inflow_div$resid_error
+ ## weights: div_inflow_weights
+ ##
+ ## Moran I statistic standard deviate = -1.2831, p-value = 0.9003
+ ## alternative hypothesis: greater
+ ## sample estimates:
+ ## Moran I statistic Expectation Variance
+ ## -0.051965537 -0.002840909 0.001465702
+
+#### Spatial Lag Models (SLM):
+
+``` r
+slm_inflow_div <- spatialreg::lagsarlm(norm_div_shannon ~ mean_resale_price_sqm + pct_1_2room_rental + pct_industrial_area + pct_residential_area + dist2MRT_km +dist_grid2center_km, data = reg_inflow_div, listw = div_inflow_weights)
+summary(slm_inflow_div)
+```
+
+ ##
+ ## Call:spatialreg::lagsarlm(formula = norm_div_shannon ~ mean_resale_price_sqm +
+ ## pct_1_2room_rental + pct_industrial_area + pct_residential_area +
+ ## dist2MRT_km + dist_grid2center_km, data = reg_inflow_div,
+ ## listw = div_inflow_weights)
+ ##
+ ## Residuals:
+ ## Min 1Q Median 3Q Max
+ ## -0.3015421 -0.0418065 0.0038327 0.0436594 0.2440237
+ ##
+ ## Type: lag
+ ## Coefficients: (asymptotic standard errors)
+ ## Estimate Std. Error z value Pr(>|z|)
+ ## (Intercept) 3.9600e-01 6.5163e-02 6.0770 1.224e-09
+ ## mean_resale_price_sqm 8.3093e-06 6.7330e-06 1.2341 0.21716
+ ## pct_1_2room_rental -3.4324e-02 3.6112e-02 -0.9505 0.34187
+ ## pct_industrial_area -2.2847e-02 1.0171e-01 -0.2246 0.82226
+ ## pct_residential_area 5.4756e-02 6.9728e-02 0.7853 0.43229
+ ## dist2MRT_km -2.0264e-02 8.8795e-03 -2.2821 0.02249
+ ## dist_grid2center_km -7.2208e-03 1.5370e-03 -4.6979 2.629e-06
+ ##
+ ## Rho: 0.53395, LR test value: 94.032, p-value: < 2.22e-16
+ ## Asymptotic standard error: 0.04895
+ ## z-value: 10.908, p-value: < 2.22e-16
+ ## Wald statistic: 118.99, p-value: < 2.22e-16
+ ##
+ ## Log likelihood: 431.1132 for lag model
+ ## ML residual variance (sigma squared): 0.0046661, (sigma: 0.068309)
+ ## Number of observations: 353
+ ## Number of parameters estimated: 9
+ ## AIC: -844.23, (AIC for lm: -752.19)
+ ## LM test for residual autocorrelation
+ ## test value: 7.4079, p-value: 0.0064937
+
+``` r
+spatialreg::impacts(slm_inflow_div, listw = div_inflow_weights)
+```
+
+ ## Impact measures (lag, exact):
+ ## Direct Indirect Total
+ ## mean_resale_price_sqm 9.152345e-06 8.677103e-06 1.782945e-05
+ ## pct_1_2room_rental -3.780603e-02 -3.584292e-02 -7.364895e-02
+ ## pct_industrial_area -2.516523e-02 -2.385850e-02 -4.902373e-02
+ ## pct_residential_area 6.031150e-02 5.717978e-02 1.174913e-01
+ ## dist2MRT_km -2.231941e-02 -2.116046e-02 -4.347987e-02
+ ## dist_grid2center_km -7.953358e-03 -7.540374e-03 -1.549373e-02
+
+``` r
+sf_reg_inflow_div$resid_lagsarlm <- residuals(slm_inflow_div)
+
+spatial_viz(sf_reg_inflow_div,
+ fill_var = "resid_lagsarlm",
+ legend_title = "Residual (SLM)", palette = "RdBu",
+ main_title = "(c) Residuals of inflow diversity (Moran I statistic: -0.03)",
+ main.title_size = 1.6,
+ legend.hist_height = 0.25, legend.hist_width = 0.5,
+ legend_width = 0.4, legend.hist_size = 0.5,
+ legend.title_size = 1.2, legend.text_size = 0.65)
+```
+
+
+
+``` r
+moran.test(sf_reg_inflow_div$resid_lagsarlm, div_inflow_weights)
+```
+
+ ##
+ ## Moran I test under randomisation
+ ##
+ ## data: sf_reg_inflow_div$resid_lagsarlm
+ ## weights: div_inflow_weights
+ ##
+ ## Moran I statistic standard deviate = -0.77667, p-value = 0.7813
+ ## alternative hypothesis: greater
+ ## sample estimates:
+ ## Moran I statistic Expectation Variance
+ ## -0.032580095 -0.002840909 0.001466181
+
+``` r
+AIC(ols_inflow_div, slm_inflow_div, sem_inflow_div)
+```
+
+ ## df AIC
+ ## ols_inflow_div 8 -752.1945
+ ## slm_inflow_div 9 -844.2264
+ ## sem_inflow_div 9 -854.4022
+
+### Outflow diversity
+
+``` r
+reg_outflow_div <- mean_resale %>%
+ filter(grid_id %in% qualified_grids_outflow) %>%
+ left_join(., div_outflow %>% st_set_geometry(NULL) %>% dplyr::select(grid_id, norm_div_shannon)) %>%
+ left_join(., industrial_areas_grids %>% dplyr::select(grid_id, pct_industrial_area)) %>%
+ left_join(., residential_areas_grids %>% dplyr::select(grid_id, pct_residential_area)) %>%
+ left_join(., dist2MRT_nearest %>% dplyr::select(grid_id, dist2MRT_km)) %>%
+ left_join(., df_pct_1_2room_rental %>% dplyr::select(grid_id, pct_1_2room_rental)) %>%
+ left_join(., dist2_center_grid %>% dplyr::select(grid_id, dist_grid2center_km)) %>%
+ replace(., is.na(.), 0)
+head(reg_inflow_div)
+```
+
+ ## # A tibble: 6 × 9
+ ## grid_id mean_resale_price… sd_resale_price_… norm_div_shannon pct_industrial_…
+ ##
+ ## 1 286 3685. 396. 0.720 0
+ ## 2 304 3571. 323. 0.449 0
+ ## 3 321 3808. 445. 0.602 0
+ ## 4 338 4775. 464. 0.485 0.0936
+ ## 5 339 3518. 253. 0.547 0
+ ## 6 354 4186. 461. 0.507 0
+ ## # … with 4 more variables: pct_residential_area , dist2MRT_km ,
+ ## # pct_1_2room_rental , dist_grid2center_km
+
+#### OLS
+
+``` r
+ols_outflow_div <- lm(norm_div_shannon ~ mean_resale_price_sqm + pct_1_2room_rental + pct_industrial_area + pct_residential_area + dist2MRT_km + dist_grid2center_km, data = reg_outflow_div)
+summary(ols_outflow_div)
+```
+
+ ##
+ ## Call:
+ ## lm(formula = norm_div_shannon ~ mean_resale_price_sqm + pct_1_2room_rental +
+ ## pct_industrial_area + pct_residential_area + dist2MRT_km +
+ ## dist_grid2center_km, data = reg_outflow_div)
+ ##
+ ## Residuals:
+ ## Min 1Q Median 3Q Max
+ ## -0.52130 -0.05253 0.01568 0.08560 0.25775
+ ##
+ ## Coefficients:
+ ## Estimate Std. Error t value Pr(>|t|)
+ ## (Intercept) 1.415e+00 8.705e-02 16.254 < 2e-16 ***
+ ## mean_resale_price_sqm -7.073e-05 1.226e-05 -5.768 1.83e-08 ***
+ ## pct_1_2room_rental -1.683e-01 6.625e-02 -2.541 0.01151 *
+ ## pct_industrial_area 3.053e-01 1.870e-01 1.633 0.10344
+ ## pct_residential_area 3.241e-01 1.297e-01 2.498 0.01299 *
+ ## dist2MRT_km -4.264e-02 1.627e-02 -2.621 0.00917 **
+ ## dist_grid2center_km -3.490e-02 2.386e-03 -14.624 < 2e-16 ***
+ ## ---
+ ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+ ##
+ ## Residual standard error: 0.122 on 333 degrees of freedom
+ ## Multiple R-squared: 0.4907, Adjusted R-squared: 0.4815
+ ## F-statistic: 53.47 on 6 and 333 DF, p-value: < 2.2e-16
+
+``` r
+spatial_viz(augment(ols_outflow_div, data = reg_outflow_div) %>% left_join(., grids) %>% st_as_sf(),
+ fill_var = ".resid", palette = "RdBu",
+ legend_title = "Residual (OLS)",
+ main_title = "(d) Residuals of outflow diversity (Moran I statistic: 0.83)",
+ main.title_size = 1.6,
+ legend.hist_height = 0.25, legend.hist_width = 0.5,
+ legend_width = 0.4, legend.hist_size = 0.5,
+ legend.title_size = 1.2, legend.text_size = 0.65)
+```
+
+
+
+``` r
+sf_reg_outflow_div <- reg_outflow_div %>%
+ left_join(., grids) %>%
+ st_as_sf() %>%
+ st_transform(crs = 4326)
+
+div_outflow_sp <- as(sf_reg_outflow_div, 'Spatial')
+div_outflow_neighbors <- poly2nb(div_outflow_sp)
+summary(div_outflow_neighbors)
+```
+
+ ## Neighbour list object:
+ ## Number of regions: 340
+ ## Number of nonzero links: 1398
+ ## Percentage nonzero weights: 1.209343
+ ## Average number of links: 4.111765
+ ## Link number distribution:
+ ##
+ ## 1 2 3 4 5 6
+ ## 6 37 75 82 75 65
+ ## 6 least connected regions:
+ ## 30 47 113 233 236 290 with 1 link
+ ## 65 most connected regions:
+ ## 6 9 10 19 42 49 53 65 70 72 88 101 103 109 117 120 124 126 130 134 139 154 155 158 161 162 166 167 170 176 181 184 189 195 197 199 201 202 207 220 246 247 250 251 252 256 257 258 261 262 269 274 279 295 296 302 305 312 320 326 329 330 331 333 334 with 6 links
+
+``` r
+div_outflow_weights <- nb2listw(div_outflow_neighbors, style="W", zero.policy=TRUE)
+```
+
+``` r
+moran.test(div_outflow_sp$norm_div_shannon, div_outflow_weights)
+```
+
+ ##
+ ## Moran I test under randomisation
+ ##
+ ## data: div_outflow_sp$norm_div_shannon
+ ## weights: div_outflow_weights
+ ##
+ ## Moran I statistic standard deviate = 21.122, p-value < 2.2e-16
+ ## alternative hypothesis: greater
+ ## sample estimates:
+ ## Moran I statistic Expectation Variance
+ ## 0.827636979 -0.002949853 0.001546279
+
+#### Spatial Error (SEM) Models
+
+``` r
+sem_outflow_div <- spatialreg::errorsarlm(norm_div_shannon ~ mean_resale_price_sqm + pct_1_2room_rental + pct_industrial_area + pct_residential_area + dist2MRT_km + dist_grid2center_km, data = reg_outflow_div, listw = div_outflow_weights)
+summary(sem_outflow_div)
+```
+
+ ##
+ ## Call:
+ ## spatialreg::errorsarlm(formula = norm_div_shannon ~ mean_resale_price_sqm +
+ ## pct_1_2room_rental + pct_industrial_area + pct_residential_area +
+ ## dist2MRT_km + dist_grid2center_km, data = reg_outflow_div,
+ ## listw = div_outflow_weights)
+ ##
+ ## Residuals:
+ ## Min 1Q Median 3Q Max
+ ## -0.330392 -0.033828 0.012769 0.045078 0.206674
+ ##
+ ## Type: error
+ ## Coefficients: (asymptotic standard errors)
+ ## Estimate Std. Error z value Pr(>|z|)
+ ## (Intercept) 1.0402e+00 8.1315e-02 12.7917 < 2.2e-16
+ ## mean_resale_price_sqm -1.0795e-05 1.0048e-05 -1.0743 0.282670
+ ## pct_1_2room_rental -2.7653e-02 4.0916e-02 -0.6758 0.499145
+ ## pct_industrial_area 5.3330e-02 1.2732e-01 0.4189 0.675324
+ ## pct_residential_area 1.5487e-01 8.5847e-02 1.8040 0.071232
+ ## dist2MRT_km -4.5331e-02 1.4362e-02 -3.1562 0.001598
+ ## dist_grid2center_km -2.6936e-02 4.0070e-03 -6.7224 1.788e-11
+ ##
+ ## Lambda: 0.78611, LR test value: 276.51, p-value: < 2.22e-16
+ ## Asymptotic standard error: 0.028634
+ ## z-value: 27.454, p-value: < 2.22e-16
+ ## Wald statistic: 753.7, p-value: < 2.22e-16
+ ##
+ ## Log likelihood: 374.6151 for error model
+ ## ML residual variance (sigma squared): 0.0050956, (sigma: 0.071384)
+ ## Number of observations: 340
+ ## Number of parameters estimated: 9
+ ## AIC: -731.23, (AIC for lm: -456.72)
+
+``` r
+sf_reg_outflow_div$resid_error <- residuals(sem_outflow_div)
+spatial_viz(sf_reg_outflow_div,
+ fill_var = "resid_error",
+ legend_title = "Residual (SEM)", palette = "RdBu",
+ main_title = "(d) Residuals of outflow diversity (Moran I statistic: -0.08)",
+ main.title_size = 1.6,
+ legend.hist_height = 0.25, legend.hist_width = 0.5,
+ legend_width = 0.4, legend.hist_size = 0.5,
+ legend.title_size = 1.2, legend.text_size = 0.65)
+```
+
+
+
+``` r
+moran.test(sf_reg_outflow_div$resid_error, div_outflow_weights)
+```
+
+ ##
+ ## Moran I test under randomisation
+ ##
+ ## data: sf_reg_outflow_div$resid_error
+ ## weights: div_outflow_weights
+ ##
+ ## Moran I statistic standard deviate = -1.93, p-value = 0.9732
+ ## alternative hypothesis: greater
+ ## sample estimates:
+ ## Moran I statistic Expectation Variance
+ ## -0.078715371 -0.002949853 0.001541164
+
+#### Spatial Lag Models (SLM):
+
+``` r
+slm_outflow_div <- spatialreg::lagsarlm(norm_div_shannon ~ mean_resale_price_sqm + pct_1_2room_rental + pct_industrial_area + pct_residential_area + dist2MRT_km + dist_grid2center_km, data = reg_outflow_div, listw = div_outflow_weights)
+summary(slm_outflow_div)
+```
+
+ ##
+ ## Call:spatialreg::lagsarlm(formula = norm_div_shannon ~ mean_resale_price_sqm +
+ ## pct_1_2room_rental + pct_industrial_area + pct_residential_area +
+ ## dist2MRT_km + dist_grid2center_km, data = reg_outflow_div,
+ ## listw = div_outflow_weights)
+ ##
+ ## Residuals:
+ ## Min 1Q Median 3Q Max
+ ## -0.334087 -0.035365 0.010145 0.046122 0.201250
+ ##
+ ## Type: lag
+ ## Coefficients: (asymptotic standard errors)
+ ## Estimate Std. Error z value Pr(>|z|)
+ ## (Intercept) 3.7667e-01 6.6190e-02 5.6907 1.265e-08
+ ## mean_resale_price_sqm -2.0634e-05 7.5228e-06 -2.7428 0.006092
+ ## pct_1_2room_rental -5.4066e-02 3.9605e-02 -1.3651 0.172209
+ ## pct_industrial_area 7.8100e-02 1.1172e-01 0.6991 0.484497
+ ## pct_residential_area 1.4265e-01 7.7670e-02 1.8366 0.066267
+ ## dist2MRT_km -2.9105e-02 9.9039e-03 -2.9388 0.003295
+ ## dist_grid2center_km -9.3834e-03 1.7344e-03 -5.4102 6.297e-08
+ ##
+ ## Rho: 0.75238, LR test value: 272.44, p-value: < 2.22e-16
+ ## Asymptotic standard error: 0.030895
+ ## z-value: 24.352, p-value: < 2.22e-16
+ ## Wald statistic: 593.04, p-value: < 2.22e-16
+ ##
+ ## Log likelihood: 372.5805 for lag model
+ ## ML residual variance (sigma squared): 0.0053062, (sigma: 0.072844)
+ ## Number of observations: 340
+ ## Number of parameters estimated: 9
+ ## AIC: -727.16, (AIC for lm: -456.72)
+ ## LM test for residual autocorrelation
+ ## test value: 8.6884, p-value: 0.0032024
+
+``` r
+spatialreg::impacts(slm_outflow_div, listw = div_outflow_weights)
+```
+
+ ## Impact measures (lag, exact):
+ ## Direct Indirect Total
+ ## mean_resale_price_sqm -2.677017e-05 -5.655649e-05 -8.332666e-05
+ ## pct_1_2room_rental -7.014541e-02 -1.481940e-01 -2.183394e-01
+ ## pct_industrial_area 1.013274e-01 2.140712e-01 3.153985e-01
+ ## pct_residential_area 1.850751e-01 3.910024e-01 5.760775e-01
+ ## dist2MRT_km -3.776132e-02 -7.977717e-02 -1.175385e-01
+ ## dist_grid2center_km -1.217409e-02 -2.571982e-02 -3.789391e-02
+
+``` r
+sf_reg_outflow_div$resid_lagsarlm <- residuals(slm_outflow_div)
+spatial_viz(sf_reg_outflow_div,
+ fill_var = "resid_lagsarlm",
+ legend_title = "Residual (SLM)", palette = "-RdBu",
+ main_title = "(d) Residuals of outflow diversity (Moran I statistic: -0.06)",
+ main.title_size = 1.6,
+ legend.hist_height = 0.25, legend.hist_width = 0.5,
+ legend_width = 0.4, legend.hist_size = 0.5,
+ legend.title_size = 1.2, legend.text_size = 0.65)
+```
+
+
+
+``` r
+moran.test(sf_reg_outflow_div$resid_lagsarlm, div_outflow_weights)
+```
+
+ ##
+ ## Moran I test under randomisation
+ ##
+ ## data: sf_reg_outflow_div$resid_lagsarlm
+ ## weights: div_outflow_weights
+ ##
+ ## Moran I statistic standard deviate = -1.4211, p-value = 0.9224
+ ## alternative hypothesis: greater
+ ## sample estimates:
+ ## Moran I statistic Expectation Variance
+ ## -0.058732644 -0.002949853 0.001540864
+
+``` r
+AIC(ols_outflow_div, slm_outflow_div, sem_outflow_div)
+```
+
+ ## df AIC
+ ## ols_outflow_div 8 -456.7188
+ ## slm_outflow_div 9 -727.1609
+ ## sem_outflow_div 9 -731.2302
diff --git a/analysis/10-regression-analysis_files/figure-gfm/unnamed-chunk-10-1.png b/analysis/10-regression-analysis_files/figure-gfm/unnamed-chunk-10-1.png
new file mode 100644
index 0000000..dac3512
Binary files /dev/null and b/analysis/10-regression-analysis_files/figure-gfm/unnamed-chunk-10-1.png differ
diff --git a/analysis/10-regression-analysis_files/figure-gfm/unnamed-chunk-11-1.png b/analysis/10-regression-analysis_files/figure-gfm/unnamed-chunk-11-1.png
new file mode 100644
index 0000000..e3551fe
Binary files /dev/null and b/analysis/10-regression-analysis_files/figure-gfm/unnamed-chunk-11-1.png differ
diff --git a/analysis/10-regression-analysis_files/figure-gfm/unnamed-chunk-13-1.png b/analysis/10-regression-analysis_files/figure-gfm/unnamed-chunk-13-1.png
new file mode 100644
index 0000000..51ba90e
Binary files /dev/null and b/analysis/10-regression-analysis_files/figure-gfm/unnamed-chunk-13-1.png differ
diff --git a/analysis/10-regression-analysis_files/figure-gfm/unnamed-chunk-14-1.png b/analysis/10-regression-analysis_files/figure-gfm/unnamed-chunk-14-1.png
new file mode 100644
index 0000000..7d2b6ce
Binary files /dev/null and b/analysis/10-regression-analysis_files/figure-gfm/unnamed-chunk-14-1.png differ
diff --git a/analysis/10-regression-analysis_files/figure-gfm/unnamed-chunk-16-1.png b/analysis/10-regression-analysis_files/figure-gfm/unnamed-chunk-16-1.png
new file mode 100644
index 0000000..4daceb0
Binary files /dev/null and b/analysis/10-regression-analysis_files/figure-gfm/unnamed-chunk-16-1.png differ
diff --git a/analysis/10-regression-analysis_files/figure-gfm/unnamed-chunk-17-1.png b/analysis/10-regression-analysis_files/figure-gfm/unnamed-chunk-17-1.png
new file mode 100644
index 0000000..ab658bb
Binary files /dev/null and b/analysis/10-regression-analysis_files/figure-gfm/unnamed-chunk-17-1.png differ
diff --git a/analysis/10-regression-analysis_files/figure-gfm/unnamed-chunk-19-1.png b/analysis/10-regression-analysis_files/figure-gfm/unnamed-chunk-19-1.png
new file mode 100644
index 0000000..44b7e47
Binary files /dev/null and b/analysis/10-regression-analysis_files/figure-gfm/unnamed-chunk-19-1.png differ
diff --git a/analysis/10-regression-analysis_files/figure-gfm/unnamed-chunk-20-1.png b/analysis/10-regression-analysis_files/figure-gfm/unnamed-chunk-20-1.png
new file mode 100644
index 0000000..03e6784
Binary files /dev/null and b/analysis/10-regression-analysis_files/figure-gfm/unnamed-chunk-20-1.png differ
diff --git a/analysis/10-regression-analysis_files/figure-gfm/unnamed-chunk-22-1.png b/analysis/10-regression-analysis_files/figure-gfm/unnamed-chunk-22-1.png
new file mode 100644
index 0000000..0dded6c
Binary files /dev/null and b/analysis/10-regression-analysis_files/figure-gfm/unnamed-chunk-22-1.png differ
diff --git a/analysis/10-regression-analysis_files/figure-gfm/unnamed-chunk-23-1.png b/analysis/10-regression-analysis_files/figure-gfm/unnamed-chunk-23-1.png
new file mode 100644
index 0000000..cc70036
Binary files /dev/null and b/analysis/10-regression-analysis_files/figure-gfm/unnamed-chunk-23-1.png differ
diff --git a/analysis/10-regression-analysis_files/figure-gfm/unnamed-chunk-25-1.png b/analysis/10-regression-analysis_files/figure-gfm/unnamed-chunk-25-1.png
new file mode 100644
index 0000000..c024342
Binary files /dev/null and b/analysis/10-regression-analysis_files/figure-gfm/unnamed-chunk-25-1.png differ
diff --git a/analysis/10-regression-analysis_files/figure-gfm/unnamed-chunk-28-1.png b/analysis/10-regression-analysis_files/figure-gfm/unnamed-chunk-28-1.png
new file mode 100644
index 0000000..f136ab5
Binary files /dev/null and b/analysis/10-regression-analysis_files/figure-gfm/unnamed-chunk-28-1.png differ
diff --git a/analysis/10-regression-analysis_files/figure-gfm/unnamed-chunk-29-1.png b/analysis/10-regression-analysis_files/figure-gfm/unnamed-chunk-29-1.png
new file mode 100644
index 0000000..96e4d84
Binary files /dev/null and b/analysis/10-regression-analysis_files/figure-gfm/unnamed-chunk-29-1.png differ
diff --git a/analysis/10-regression-analysis_files/figure-gfm/unnamed-chunk-32-1.png b/analysis/10-regression-analysis_files/figure-gfm/unnamed-chunk-32-1.png
new file mode 100644
index 0000000..554ca65
Binary files /dev/null and b/analysis/10-regression-analysis_files/figure-gfm/unnamed-chunk-32-1.png differ
diff --git a/analysis/10-regression-analysis_files/figure-gfm/unnamed-chunk-35-1.png b/analysis/10-regression-analysis_files/figure-gfm/unnamed-chunk-35-1.png
new file mode 100644
index 0000000..e67dd70
Binary files /dev/null and b/analysis/10-regression-analysis_files/figure-gfm/unnamed-chunk-35-1.png differ
diff --git a/analysis/10-regression-analysis_files/figure-gfm/unnamed-chunk-36-1.png b/analysis/10-regression-analysis_files/figure-gfm/unnamed-chunk-36-1.png
new file mode 100644
index 0000000..1d5ad07
Binary files /dev/null and b/analysis/10-regression-analysis_files/figure-gfm/unnamed-chunk-36-1.png differ
diff --git a/analysis/10-regression-analysis_files/figure-gfm/unnamed-chunk-39-1.png b/analysis/10-regression-analysis_files/figure-gfm/unnamed-chunk-39-1.png
new file mode 100644
index 0000000..a402504
Binary files /dev/null and b/analysis/10-regression-analysis_files/figure-gfm/unnamed-chunk-39-1.png differ
diff --git a/analysis/10-regression-analysis_files/figure-gfm/unnamed-chunk-4-1.png b/analysis/10-regression-analysis_files/figure-gfm/unnamed-chunk-4-1.png
new file mode 100644
index 0000000..f36c66b
Binary files /dev/null and b/analysis/10-regression-analysis_files/figure-gfm/unnamed-chunk-4-1.png differ
diff --git a/analysis/10-regression-analysis_files/figure-gfm/unnamed-chunk-42-1.png b/analysis/10-regression-analysis_files/figure-gfm/unnamed-chunk-42-1.png
new file mode 100644
index 0000000..edc3e00
Binary files /dev/null and b/analysis/10-regression-analysis_files/figure-gfm/unnamed-chunk-42-1.png differ
diff --git a/analysis/10-regression-analysis_files/figure-gfm/unnamed-chunk-43-1.png b/analysis/10-regression-analysis_files/figure-gfm/unnamed-chunk-43-1.png
new file mode 100644
index 0000000..99379e1
Binary files /dev/null and b/analysis/10-regression-analysis_files/figure-gfm/unnamed-chunk-43-1.png differ
diff --git a/analysis/10-regression-analysis_files/figure-gfm/unnamed-chunk-46-1.png b/analysis/10-regression-analysis_files/figure-gfm/unnamed-chunk-46-1.png
new file mode 100644
index 0000000..07d1364
Binary files /dev/null and b/analysis/10-regression-analysis_files/figure-gfm/unnamed-chunk-46-1.png differ
diff --git a/analysis/10-regression-analysis_files/figure-gfm/unnamed-chunk-49-1.png b/analysis/10-regression-analysis_files/figure-gfm/unnamed-chunk-49-1.png
new file mode 100644
index 0000000..0b12d0a
Binary files /dev/null and b/analysis/10-regression-analysis_files/figure-gfm/unnamed-chunk-49-1.png differ
diff --git a/analysis/10-regression-analysis_files/figure-gfm/unnamed-chunk-5-1.png b/analysis/10-regression-analysis_files/figure-gfm/unnamed-chunk-5-1.png
new file mode 100644
index 0000000..c53635b
Binary files /dev/null and b/analysis/10-regression-analysis_files/figure-gfm/unnamed-chunk-5-1.png differ
diff --git a/analysis/10-regression-analysis_files/figure-gfm/unnamed-chunk-50-1.png b/analysis/10-regression-analysis_files/figure-gfm/unnamed-chunk-50-1.png
new file mode 100644
index 0000000..1716bcc
Binary files /dev/null and b/analysis/10-regression-analysis_files/figure-gfm/unnamed-chunk-50-1.png differ
diff --git a/analysis/10-regression-analysis_files/figure-gfm/unnamed-chunk-7-1.png b/analysis/10-regression-analysis_files/figure-gfm/unnamed-chunk-7-1.png
new file mode 100644
index 0000000..c058b5e
Binary files /dev/null and b/analysis/10-regression-analysis_files/figure-gfm/unnamed-chunk-7-1.png differ
diff --git a/analysis/10-regression-analysis_files/figure-gfm/unnamed-chunk-8-1.png b/analysis/10-regression-analysis_files/figure-gfm/unnamed-chunk-8-1.png
new file mode 100644
index 0000000..8a98319
Binary files /dev/null and b/analysis/10-regression-analysis_files/figure-gfm/unnamed-chunk-8-1.png differ