From 151f9e8ee6a178f080cb87a8002186e906cef2d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Morales?= Date: Wed, 20 Nov 2024 12:21:19 -0600 Subject: [PATCH] feat(auto): support input_size (#451) --- mlforecast/auto.py | 4 ++++ mlforecast/optimization.py | 4 ++++ nbs/auto.ipynb | 49 +++++++++++++++++++++++++++++++++----- nbs/optimization.ipynb | 16 +++++++++++-- 4 files changed, 65 insertions(+), 8 deletions(-) diff --git a/mlforecast/auto.py b/mlforecast/auto.py index bc015256..e2de31a8 100644 --- a/mlforecast/auto.py +++ b/mlforecast/auto.py @@ -445,6 +445,7 @@ def fit( h: int, num_samples: int, step_size: Optional[int] = None, + input_size: Optional[int] = None, refit: Union[bool, int] = False, loss: Optional[Callable[[DataFrame, DataFrame], float]] = None, id_col: str = "unique_id", @@ -470,6 +471,8 @@ def fit( Number of trials to run step_size : int, optional (default=None) Step size between each cross validation window. If None it will be equal to `h`. + input_size : int, optional (default=None) + Maximum training samples per serie in each window. If None, will use an expanding window. refit : bool or int (default=False) Retrain model for each cross validation window. If False, the models are trained at the beginning and then used to predict each window. @@ -550,6 +553,7 @@ def config_fn(trial: optuna.Trial) -> Dict[str, Any]: n_windows=n_windows, h=h, step_size=step_size, + input_size=input_size, refit=refit, id_col=id_col, time_col=time_col, diff --git a/mlforecast/optimization.py b/mlforecast/optimization.py index 58bcd61e..37a8df97 100644 --- a/mlforecast/optimization.py +++ b/mlforecast/optimization.py @@ -30,6 +30,7 @@ def mlforecast_objective( n_windows: int, h: int, step_size: Optional[int] = None, + input_size: Optional[int] = None, refit: Union[bool, int] = False, id_col: str = "unique_id", time_col: str = "ds", @@ -56,6 +57,8 @@ def mlforecast_objective( Forecast horizon. step_size : int, optional (default=None) Step size between each cross validation window. If None it will be equal to `h`. + input_size : int, optional (default=None) + Maximum training samples per serie in each window. If None, will use an expanding window. refit : bool or int (default=False) Retrain model for each cross validation window. If False, the models are trained at the beginning and then used to predict each window. @@ -86,6 +89,7 @@ def objective(trial: optuna.Trial) -> float: time_col=time_col, freq=freq, step_size=step_size, + input_size=input_size, ) model_copy = clone(model) model_params = config["model_params"] diff --git a/nbs/auto.ipynb b/nbs/auto.ipynb index bdb10958..0abfd09b 100644 --- a/nbs/auto.ipynb +++ b/nbs/auto.ipynb @@ -523,6 +523,7 @@ " h: int,\n", " num_samples: int,\n", " step_size: Optional[int] = None,\n", + " input_size: Optional[int] = None,\n", " refit: Union[bool, int] = False,\n", " loss: Optional[Callable[[DataFrame, DataFrame], float]] = None,\n", " id_col: str = 'unique_id',\n", @@ -548,6 +549,8 @@ " Number of trials to run\n", " step_size : int, optional (default=None)\n", " Step size between each cross validation window. If None it will be equal to `h`.\n", + " input_size : int, optional (default=None)\n", + " Maximum training samples per serie in each window. 
If None, will use an expanding window.\n", " refit : bool or int (default=False)\n", " Retrain model for each cross validation window.\n", " If False, the models are trained at the beginning and then used to predict each window.\n", @@ -625,6 +628,7 @@ " n_windows=n_windows,\n", " h=h,\n", " step_size=step_size,\n", + " input_size=input_size,\n", " refit=refit,\n", " id_col=id_col,\n", " time_col=time_col,\n", @@ -818,6 +822,7 @@ "> (df:Union[pandas.core.frame.DataFrame,polars.datafram\n", "> e.frame.DataFrame], n_windows:int, h:int,\n", "> num_samples:int, step_size:Optional[int]=None,\n", + "> input_size:Optional[int]=None,\n", "> refit:Union[bool,int]=False, loss:Optional[Callable[[\n", "> Union[pandas.core.frame.DataFrame,polars.dataframe.fr\n", "> ame.DataFrame],Union[pandas.core.frame.DataFrame,pola\n", @@ -839,6 +844,7 @@ "| h | int | | Forecast horizon. |\n", "| num_samples | int | | Number of trials to run |\n", "| step_size | Optional | None | Step size between each cross validation window. If None it will be equal to `h`. |\n", + "| input_size | Optional | None | Maximum training samples per serie in each window. If None, will use an expanding window. |\n", "| refit | Union | False | Retrain model for each cross validation window.
If False, the models are trained at the beginning and then used to predict each window.<br>If positive int, the models are retrained every `refit` windows. |\n", "| loss | Optional | None | Function that takes the validation and train dataframes and produces a float.
If `None` will use the average SMAPE across series. |\n", "| id_col | str | unique_id | Column that identifies each serie. |\n", @@ -861,6 +867,7 @@ "> (df:Union[pandas.core.frame.DataFrame,polars.datafram\n", "> e.frame.DataFrame], n_windows:int, h:int,\n", "> num_samples:int, step_size:Optional[int]=None,\n", + "> input_size:Optional[int]=None,\n", "> refit:Union[bool,int]=False, loss:Optional[Callable[[\n", "> Union[pandas.core.frame.DataFrame,polars.dataframe.fr\n", "> ame.DataFrame],Union[pandas.core.frame.DataFrame,pola\n", @@ -882,6 +889,7 @@ "| h | int | | Forecast horizon. |\n", "| num_samples | int | | Number of trials to run |\n", "| step_size | Optional | None | Step size between each cross validation window. If None it will be equal to `h`. |\n", + "| input_size | Optional | None | Maximum training samples per serie in each window. If None, will use an expanding window. |\n", "| refit | Union | False | Retrain model for each cross validation window.
If False, the models are trained at the beginning and then used to predict each window.<br>If positive int, the models are retrained every `refit` windows. |\n", "| loss | Optional | None | Function that takes the validation and train dataframes and produces a float.
If `None` will use the average SMAPE across series. |\n", "| id_col | str | unique_id | Column that identifies each serie. |\n", @@ -914,7 +922,7 @@ "text/markdown": [ "---\n", "\n", - "[source](https://github.com/Nixtla/mlforecast/blob/main/mlforecast/auto.py#L574){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", + "[source](https://github.com/Nixtla/mlforecast/blob/main/mlforecast/auto.py#L592){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", "\n", "### AutoMLForecast.predict\n", "\n", @@ -934,7 +942,7 @@ "text/plain": [ "---\n", "\n", - "[source](https://github.com/Nixtla/mlforecast/blob/main/mlforecast/auto.py#L574){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", + "[source](https://github.com/Nixtla/mlforecast/blob/main/mlforecast/auto.py#L592){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", "\n", "### AutoMLForecast.predict\n", "\n", @@ -972,7 +980,7 @@ "text/markdown": [ "---\n", "\n", - "[source](https://github.com/Nixtla/mlforecast/blob/main/mlforecast/auto.py#L606){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", + "[source](https://github.com/Nixtla/mlforecast/blob/main/mlforecast/auto.py#L624){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", "\n", "### AutoMLForecast.save\n", "\n", @@ -988,7 +996,7 @@ "text/plain": [ "---\n", "\n", - "[source](https://github.com/Nixtla/mlforecast/blob/main/mlforecast/auto.py#L606){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", + "[source](https://github.com/Nixtla/mlforecast/blob/main/mlforecast/auto.py#L624){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", "\n", "### AutoMLForecast.save\n", "\n", @@ -1022,7 +1030,7 @@ "text/markdown": [ "---\n", "\n", - "[source](https://github.com/Nixtla/mlforecast/blob/main/mlforecast/auto.py#L616){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", + "[source](https://github.com/Nixtla/mlforecast/blob/main/mlforecast/auto.py#L634){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", "\n", "### AutoMLForecast.forecast_fitted_values\n", "\n", @@ -1040,7 +1048,7 @@ "text/plain": [ "---\n", "\n", - "[source](https://github.com/Nixtla/mlforecast/blob/main/mlforecast/auto.py#L616){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", + "[source](https://github.com/Nixtla/mlforecast/blob/main/mlforecast/auto.py#L634){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", "\n", "### AutoMLForecast.forecast_fitted_values\n", "\n", @@ -1072,6 +1080,8 @@ "metadata": {}, "outputs": [], "source": [ + "import time\n", + "\n", "import pandas as pd\n", "from datasetsforecast.m4 import M4, M4Evaluation, M4Info\n", "from sklearn.linear_model import Ridge\n", @@ -1788,6 +1798,33 @@ " preds2.rename(columns={'id': 'unique_id', 'time': 'ds'}),\n", ")" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "caf14191-370e-41b9-a322-2c9b0a7a5f6e", + "metadata": {}, + "outputs": [], + "source": [ + "#| hide\n", + "# using input_size\n", + "fit_kwargs = dict(\n", + " n_windows=3,\n", + " h=h,\n", + " num_samples=5,\n", + " optimize_kwargs={'timeout': 60}, \n", + ")\n", + "\n", + "start = time.perf_counter()\n", + "auto_mlf.fit(df=train, **fit_kwargs)\n", + "no_limit = time.perf_counter() - start\n", + "\n", + "start = time.perf_counter()\n", + "auto_mlf.fit(df=train, input_size=50, **fit_kwargs)\n", + "with_limit = time.perf_counter() - start\n", + "\n", + "assert with_limit < no_limit" + ] } ], "metadata": { diff --git a/nbs/optimization.ipynb 
b/nbs/optimization.ipynb index f9b579c4..440c1f68 100644 --- a/nbs/optimization.ipynb +++ b/nbs/optimization.ipynb @@ -81,6 +81,7 @@ " n_windows: int,\n", " h: int,\n", " step_size: Optional[int] = None,\n", + " input_size: Optional[int] = None,\n", " refit: Union[bool, int] = False,\n", " id_col: str = 'unique_id',\n", " time_col: str = 'ds',\n", @@ -107,6 +108,8 @@ " Forecast horizon.\n", " step_size : int, optional (default=None)\n", " Step size between each cross validation window. If None it will be equal to `h`.\n", + " input_size : int, optional (default=None)\n", + " Maximum training samples per serie in each window. If None, will use an expanding window.\n", " refit : bool or int (default=False)\n", " Retrain model for each cross validation window.\n", " If False, the models are trained at the beginning and then used to predict each window.\n", @@ -136,6 +139,7 @@ " time_col=time_col,\n", " freq=freq,\n", " step_size=step_size,\n", + " input_size=input_size,\n", " )\n", " model_copy = clone(model)\n", " model_params = config['model_params']\n", @@ -232,11 +236,13 @@ "> ial._trial.Trial],Dict[str,Any]], loss:Callable,\n", "> model:sklearn.base.BaseEstimator,\n", "> freq:Union[int,str], n_windows:int, h:int,\n", + "> step_size:Optional[int]=None,\n", + "> input_size:Optional[int]=None,\n", "> refit:Union[bool,int]=False,\n", "> id_col:str='unique_id', time_col:str='ds',\n", "> target_col:str='y')\n", "\n", - "optuna objective function for the MLForecast class\n", + "*optuna objective function for the MLForecast class*\n", "\n", "| | **Type** | **Default** | **Details** |\n", "| -- | -------- | ----------- | ----------- |\n", @@ -247,6 +253,8 @@ "| freq | Union | | pandas' or polars' offset alias or integer denoting the frequency of the series. |\n", "| n_windows | int | | Number of windows to evaluate. |\n", "| h | int | | Forecast horizon. |\n", + "| step_size | Optional | None | Step size between each cross validation window. If None it will be equal to `h`. |\n", + "| input_size | Optional | None | |\n", "| refit | Union | False | Retrain model for each cross validation window.
If False, the models are trained at the beginning and then used to predict each window.
If positive int, the models are retrained every `refit` windows. |\n", "| id_col | str | unique_id | Column that identifies each serie. |\n", "| time_col | str | ds | Column that identifies each timestep, its values can be timestamps or integers. |\n", @@ -266,11 +274,13 @@ "> ial._trial.Trial],Dict[str,Any]], loss:Callable,\n", "> model:sklearn.base.BaseEstimator,\n", "> freq:Union[int,str], n_windows:int, h:int,\n", + "> step_size:Optional[int]=None,\n", + "> input_size:Optional[int]=None,\n", "> refit:Union[bool,int]=False,\n", "> id_col:str='unique_id', time_col:str='ds',\n", "> target_col:str='y')\n", "\n", - "optuna objective function for the MLForecast class\n", + "*optuna objective function for the MLForecast class*\n", "\n", "| | **Type** | **Default** | **Details** |\n", "| -- | -------- | ----------- | ----------- |\n", @@ -281,6 +291,8 @@ "| freq | Union | | pandas' or polars' offset alias or integer denoting the frequency of the series. |\n", "| n_windows | int | | Number of windows to evaluate. |\n", "| h | int | | Forecast horizon. |\n", + "| step_size | Optional | None | Step size between each cross validation window. If None it will be equal to `h`. |\n", + "| input_size | Optional | None | |\n", "| refit | Union | False | Retrain model for each cross validation window.
If False, the models are trained at the beginning and then used to predict each window.<br>If positive int, the models are retrained every `refit` windows. |\n", "| id_col | str | unique_id | Column that identifies each serie. |\n", "| time_col | str | ds | Column that identifies each timestep, its values can be timestamps or integers. |\n",