From 46814cdb51578567b66bd6daec25359c15a43ddb Mon Sep 17 00:00:00 2001
From: Jaco DuToit
Date: Tue, 3 Oct 2023 11:33:43 +0200
Subject: [PATCH] added web traffic dataset and info (#1326)

* added web traffic dataset and info

* removed CSV file and updated the url
---
 river/datasets/web_traffic.py | 57 +++++++++++++++++++++++++++++++++++
 1 file changed, 57 insertions(+)
 create mode 100644 river/datasets/web_traffic.py

diff --git a/river/datasets/web_traffic.py b/river/datasets/web_traffic.py
new file mode 100644
index 0000000000..9a253f8923
--- /dev/null
+++ b/river/datasets/web_traffic.py
@@ -0,0 +1,57 @@
+from __future__ import annotations
+
+from river import stream
+
+from . import base
+
+
+class WebTraffic(base.RemoteDataset):
+    """Web sessions information from an events company based in South Africa.
+
+    The goal is to predict the number of web sessions in 4 different regions in South Africa.
+
+    The data consists of 15 minute interval traffic values between '2023-06-16 00:00:00' and '2023-09-15 23:45:00' for each region.
+    Two types of sessions are captured `sessionsA` and `sessionsB`. The `isMissing` flag is equal to 1 if any of the servers failed
+    to capture sessions, otherwise if all servers functioned properly this flag is equal to 0.
+
+    Things to consider:
+
+    * region `R5` captures sessions in backup mode. Strictly speaking, `R5` is not necessary to predict.
+    * Can `sessionsA` and `sessionsB` events be predicted accurately for each region over the next day (next 96 intervals)?
+    * What is the best way to deal with the missing values?
+    * How can model selection be used (a multi-model approach)?
+    * Can dependence (correlation) between regions be utilised for more accurate predictions?
+    * Can both `sessionsA` and `sessionsB` be predicted simultaneously with one model?
+
+    This dataset is well suited for time series forecasting models, as well as anomaly detection
+    methods. Ideally, the goal is to build a time series forecasting model that is robust to
+    anomalous events and generalises well on normal operating conditions.
+
+    """
+
+    def __init__(self):
+        super().__init__(
+            url="https://maxhalford.github.io/files/datasets/web-traffic.csv.zip",
+            filename="web-traffic.csv",
+            task=base.REG,
+            # Every column except the `sessionsA` target is a feature:
+            # `dateTime`, `region`, `sessionsB` and `isMissing`.
+            n_features=4,
+            n_samples=44_160,
+        )
+
+    def _iter(self):
+        """Stream the CSV as `(x, y)` pairs with `sessionsA` as the target."""
+        return stream.iter_csv(
+            self.path,
+            target="sessionsA",
+            # `iter_csv` leaves every value as a string unless a converter is given, so
+            # the session counts have to be cast for the regression target to be numeric.
+            converters={
+                "region": str,
+                "sessionsA": float,
+                "sessionsB": float,
+                "isMissing": int,
+            },
+            parse_dates={"dateTime": "%Y-%m-%d %H:%M:%S"},
+        )