# Bike_counter_get.R
# Bike counter API on Broadway, Cambridge MA
# https://data.cambridgema.gov/resource/gxzm-dpwp.json # Broadway
# https://data.cambridgema.gov/resource/gqic-86ts.json # Hampshire, 6 months of data, not formatted the same as Broadway
# https://data.edmonton.ca/resource/tq23-qn4m.json # Edmonton
fetchtime <- Sys.time()
library(RSocrata)
library(dplyr)
# Grab data from API ----
api_get <- 'https://data.cambridgema.gov/resource/gxzm-dpwp.csv' # eventually a variable set by the user choosing a station or city name
pattern <- ".{9}(?=\\.csv$|\\.json)" # 9-character Socrata dataset ID just before the file extension; the lookahead requires perl = TRUE
parse_socrata_ID <- regexpr(pattern, api_get, perl = TRUE)
socrata_ID <- substring(api_get, first = parse_socrata_ID, last = parse_socrata_ID + attr(parse_socrata_ID, 'match.length') - 1)
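# For the default api_get above, socrata_ID is "gxzm-dpwp"; the same ID is reused below to name the cache file.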
# Load historical count once. Takes 7s initially to download, vs. 0.1s to load from working directory
hist_count_name = file.path('Data', paste0("BikeCountHist_", socrata_ID, ".RData"))
# We will increment the historical data daily
if(!file.exists(hist_count_name)){
  hist_count <- read.socrata(api_get)
  hist_count <- hist_count[order(hist_count$datetime),]
  original_rownum <- nrow(hist_count)
  hist_count <- hist_count[!duplicated(hist_count[,c('datetime','day','entries','exits','total')]),]
  dedup_rownum <- nrow(hist_count); dup_rows <- original_rownum - dedup_rownum
  save(list = c("hist_count", "dup_rows"), file = hist_count_name)
} else { load(hist_count_name) }
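# The cached .RData is assumed to be the only local state; deleting it (e.g. file.remove(hist_count_name))
# forces a full re-download on the next run.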
rows_hist <- nrow(hist_count)
last_day <- max(hist_count$date)
# Try to get fresher data if the newest stored day is at least a day old.
# Defaults to hist_count if nothing fresher is found.
count <- hist_count # default; replaced below if fresher rows arrive
if(Sys.Date() > as.Date(last_day)){
  # Combine $order and $offset. The API default is to order new to old, while our RSocrata pull above returned results old to new.
  response <- httr::GET(paste0(api_get, "?$order=date&$offset=", rows_hist + dup_rows))
  r_df <- data.frame() # stays empty if the endpoint is not a CSV resource
  if(grepl('csv', api_get)){
    # read_csv parses the datetime column, so supply the correct time zone as well
    r_df <- readr::read_csv(httr::content(response,
                                          as = "text",
                                          type = "text/csv",
                                          encoding = "utf-8"),
                            locale = readr::locale(tz = 'America/New_York')
    )
  }
  # Check whether there is actually new data, or whether posting is just delayed relative to Sys.Date()
  if(nrow(r_df) > 0){
    new_count <- as.data.frame(r_df)
    new_count <- new_count[order(new_count$datetime),]
    original_rownum <- nrow(new_count)
    new_count <- new_count[!duplicated(new_count[,c('datetime','day','entries','exits','total')]),]
    new_dedup_rownum <- nrow(new_count); new_dup_rows <- original_rownum - new_dedup_rownum
    dup_rows <- dup_rows + new_dup_rows
    hist_count <- rbind(hist_count, new_count)
    count <- hist_count
  }
}
save(list=c('hist_count', 'dup_rows'), file = hist_count_name)
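# Optional check on how much was appended this run (nrow(hist_count) - rows_hist is the number of new rows):
# cat('Appended', nrow(hist_count) - rows_hist, 'new rows to', hist_count_name, '\n')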
fetchtimediff <- Sys.time() - fetchtime
# Get some metrics ----
metrictime <- Sys.time()
# read.socrata reads 'date' as a date-time, all at midnight. Reformat it as an actual date, without a time component.
# Also rename variables here -- we still fetch and store with the original names -- and ensure the counts are numeric.
count <- count %>%
  mutate(date = as.Date(date),
         year = format(datetime, '%Y')) %>%
  rename(Total = total,
         Eastbound = exits,
         Westbound = entries) %>%
  mutate(Total = as.numeric(Total),
         Eastbound = as.numeric(Eastbound),
         Westbound = as.numeric(Westbound))
# Count by year, month, day of week, and hour of day, for each day. This is the input for the ZINB and RF models.
hourly_day = count %>%
  mutate(hour = as.numeric(format(datetime, '%H')),
         month = format(datetime, '%m')) %>%
  group_by(year, date, month, day, hour) %>%
  dplyr::summarise(Total = sum(Total),
                   Westbound = sum(Westbound),
                   Eastbound = sum(Eastbound))
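# As a rough illustration only of the downstream ZINB use noted above (the actual model specification
# lives elsewhere and may differ), something like pscl::zeroinfl could be fit on hourly_day;
# 'pscl' is an assumed dependency here, not one this script loads:
# zinb_fit <- pscl::zeroinfl(Total ~ hour + day + month, data = hourly_day, dist = "negbin")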
# Average counts by hour of day and month. Used in the hourly view.
hourly_hour_month <- count %>%
  mutate(hour = as.numeric(format(datetime, '%H')),
         month = format(datetime, '%m')) %>%
  group_by(hour, month) %>%
  summarize(Total = mean(Total),
            Westbound = mean(Westbound),
            Eastbound = mean(Eastbound))
# Daily summary, no hourly breakdown. This is used in the default daily view, the day of week view, and the time series model.
daily = count %>%
  group_by(year, date, day_of_week = as.factor(day)) %>%
  dplyr::summarise(Total = sum(Total),
                   Westbound = sum(Westbound),
                   Eastbound = sum(Eastbound))
max_day = daily %>% ungroup() %>% dplyr::filter(Total == max(Total))
latest_day = daily %>% ungroup() %>% dplyr::filter(date == max(date))
# Put the day-of-week factor in calendar order: prefix each alphabetical level with its weekday
# number, re-factor so the levels sort by that number, then relabel with the plain day names.
# (Alphabetical levels are Friday, Monday, Saturday, Sunday, Thursday, Tuesday, Wednesday.)
levels(daily$day_of_week) <- paste(c(6, 2, 7, 1, 5, 3, 4),
                                   levels(daily$day_of_week))
daily$day_of_week <- as.factor(as.character(daily$day_of_week))
levels(daily$day_of_week) <- c('Sunday', 'Monday', 'Tuesday', 'Wednesday',
                               'Thursday', 'Friday', 'Saturday')
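# A one-step alternative (assuming the raw 'day' column holds full weekday names) would be to build the
# factor with explicit levels when grouping above, e.g.
# day_of_week = factor(day, levels = c('Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday'))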
metrictimediff <- Sys.time() - metrictime
# cat('Fetch took:', fetchtimediff, attr(fetchtimediff, 'units'), '\n') # Use these for profiling run time
# cat('Metrics took:', metrictimediff, attr(metrictimediff, 'units'), '\n')