-
-
Notifications
You must be signed in to change notification settings - Fork 553
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
added web traffic dataset and info (#1326)
* added web traffic dataset and info * removed CSV file and updated the url
- Loading branch information
Showing
1 changed file
with
48 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
from __future__ import annotations | ||
|
||
from river import stream | ||
|
||
from . import base | ||
|
||
|
||
class WebTraffic(base.RemoteDataset):
    """Web sessions information from an events company based in South Africa.

    The goal is to predict the number of web sessions in 4 different regions in South Africa.

    The data consists of 15 minute interval traffic values between '2023-06-16 00:00:00' and
    '2023-09-15 23:45:00' for each region. Two types of sessions are captured: `sessionsA` and
    `sessionsB`. The `isMissing` flag is equal to 1 if any of the servers failed to capture
    sessions, otherwise if all servers functioned properly this flag is equal to 0.

    Things to consider:

    * Region `R5` captures sessions in backup mode. Strictly speaking, `R5` is not necessary to
      predict.
    * Can `sessionsA` and `sessionsB` events be predicted accurately for each region over the
      next day (next 96 intervals)?
    * What is the best way to deal with the missing values?
    * How can model selection be used (a multi-model approach)?
    * Can dependence (correlation) between regions be utilised for more accurate predictions?
    * Can both `sessionsA` and `sessionsB` be predicted simultaneously with one model?

    This dataset is well suited for time series forecasting models, as well as anomaly detection
    methods. Ideally, the goal is to build a time series forecasting model that is robust to the
    anomalous events and generalises well on normal operating conditions.

    Iterating over the dataset yields `(x, y)` pairs where `y` is the `sessionsA` value converted
    to a `float` (or `None` when the cell is blank or not numeric) and `x` holds the remaining
    columns, with `dateTime` parsed into a `datetime` and `isMissing` cast to `int`.

    """

    def __init__(self) -> None:
        super().__init__(
            url="https://maxhalford.github.io/files/datasets/web-traffic.csv.zip",
            filename="web-traffic.csv",
            task=base.REG,
            # NOTE(review): verify this count against the CSV columns that remain
            # once the `sessionsA` target has been removed from each row.
            n_features=2,
            n_samples=44_160,
        )

    @staticmethod
    def _to_float(raw):
        """Convert a raw CSV cell to a float, or return `None` if it cannot be parsed."""
        try:
            return float(raw)
        except (TypeError, ValueError):
            # Blank or non-numeric cell: report the value as missing instead of
            # aborting the whole stream.
            return None

    def _iter(self):
        """Stream the CSV file as `(features, target)` pairs with `sessionsA` as the target."""
        return stream.iter_csv(
            self.path,
            target="sessionsA",
            converters={
                "region": str,
                "isMissing": int,
                # CSV cells are strings unless converted; a regression target
                # has to be numeric, so cast it explicitly.
                "sessionsA": self._to_float,
            },
            parse_dates={"dateTime": "%Y-%m-%d %H:%M:%S"},
        )