-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathPollsScrape.Rmd
85 lines (62 loc) · 2.44 KB
/
PollsScrape.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
---
title: "Poll Data"
author: "Matt Bixley"
date: "20 February 2020"
output: html_document
---
```{r setup, include=FALSE}
# Echo each chunk's source code in the knitted document by default.
knitr::opts_chunk$set(echo = TRUE)
# Load the project's helper functions from the path below, relative to the
# working directory used when knitting.
source("code/helper_functions.R")
# NOTE(review): check_package() is presumably defined in the helper file above
# and presumably makes these packages available (install/attach) — confirm
# against its definition.
check_package(c("tidyverse", "rvest", "xml2", "stringr","purrr"))
```
# web scrape
Collect the polls.csv data via code rather than by manual collation, as was previously done, from the [wiki](https://en.wikipedia.org/wiki/Opinion_polling_for_the_2020_New_Zealand_general_election) page (table 3, the individual polls).
## tidying the data
Some good regular expression examples can be found [here](https://datascience-enthusiast.com/R/webscraping.html).
```{r 2020}
# Scrape the 2020 opinion-polling table from Wikipedia and tidy it into
# `p2020`: one row per poll, with numeric party results and sample size.
url <- "https://en.wikipedia.org/wiki/Opinion_polling_for_the_2020_New_Zealand_general_election"

# html_table() returns a list with one element per matched node; the XPath
# selects a single table, so keep the first element.
polls <- url %>%
  read_html() %>%
  html_nodes(xpath = '//*[@id="mw-content-text"]/div/table[1]') %>%
  html_table()
polls <- polls[[1]]

# The first three columns describe the poll; every remaining column
# (including "Lead") is treated as a result column.
cols <- colnames(polls)
party <- cols[-c(1:3)]
colnames(polls)[1:3] <- c("Date", "Poll", "Size")

p2020 <- polls %>%
  # Several note rows have been added to the table. Coercing a party result
  # to numeric returns NA for those comment rows, so they are dropped here.
  # (The "NAs introduced by coercion" warning is expected.)
  filter(!is.na(as.numeric(NAT))) %>%
  # Make the polling data numeric.
  mutate(across(all_of(party), as.numeric)) %>%
  # Remove the thousands separator from the sample size before converting.
  mutate(Size = as.numeric(str_replace_all(Size, ",", ""))) %>%
  select(-"Lead")
str(p2020)
```
```{r 2017}
# Scrape the 2017 opinion-polling table from Wikipedia and tidy it into
# `p2017`: one row per poll, numeric party results, a cleaned poll name and
# `day` / `month` / `year` strings extracted from the date text.
url <- "https://en.wikipedia.org/wiki/Opinion_polling_for_the_2017_New_Zealand_general_election"

# html_table() returns a list with one element per matched node; the XPath
# selects a single table, so keep the first element.
polls <- url %>%
  read_html() %>%
  html_nodes(xpath = '//*[@id="mw-content-text"]/div/table[1]') %>%
  html_table(fill = TRUE)
polls <- polls[[1]]

# The first two columns identify the poll; every remaining column is treated
# as a result column.
cols <- colnames(polls)
party <- cols[-c(1:2)]
colnames(polls)[1:2] <- c("Poll", "Date")

p2017 <- polls %>%
  # Several note rows have been added to the table. Coercing the first result
  # column to numeric returns NA for those comment rows, so they are dropped.
  # `.data[[name]]` extracts the column as a plain vector for both data.frames
  # and tibbles; `polls[, name]` stays a one-column tibble, which as.numeric()
  # cannot coerce.
  filter(!is.na(as.numeric(.data[[party[1]]]))) %>%
  # Make the polling data numeric.
  mutate(across(all_of(party), as.numeric)) %>%
  mutate(
    # Remove footnote markers such as "[1]" (everything from the first "[").
    Poll = str_replace_all(Poll, "\\[(.*)", ""),
    # Date tidy: keep the text before the first space, then drop the end of a
    # day range (everything from the en dash onwards).
    day = str_replace_all(Date, " (.*)", ""),
    day = str_replace_all(day, "–(.*)", ""),
    # Alternative using the get_last() helper:
    #   year  = map(Date, get_last, n = 1)
    #   month = map(Date, get_last, n = 2)
    # First run of four digits in the date text.
    year = str_extract(Date, "([0-9]{4})"),
    # First run of three letters. `[A-Za-z]` is used because `[A-z]` also
    # matches the punctuation characters that sit between "Z" and "a".
    month = str_extract(Date, "([A-Za-z]{3})")
  )
p2017
```