From 43f49b48844570c9d50950940c2161b711a4edba Mon Sep 17 00:00:00 2001
From: Luis Moneda
Date: Mon, 3 Jan 2022 10:21:49 -0300
Subject: [PATCH] Refactor gridsearch (#45)

* Change hyper-opt logic to avoid fitting the same model multiple times
* Bump version
* Update docs
* Change non-existent variable name
---
 README.md                       | 19 +++++-----
 pyproject.toml                  |  2 +-
 time_robust_forest/hyper_opt.py | 63 ++++++++++++++++++++++++---------
 3 files changed, 58 insertions(+), 26 deletions(-)

diff --git a/README.md b/README.md
index 775606c..0fa16d9 100644
--- a/README.md
+++ b/README.md
@@ -51,24 +51,25 @@ aggregated. Options: {"avg": average, "max": maximum, the worst case}.
 
 To use the environment-wise optimization:
 
-```python
+```python
 from time_robust_forest.hyper_opt import env_wise_hyper_opt
 
-params_grid = {"n_estimators": [30, 60, 120],
+params_grid = {"n_estimators": [30, 60, 120],
                "max_depth": [5, 10],
                "min_impurity_decrease": [1e-1, 1e-3, 0],
                "min_sample_periods": [5, 10, 30],
                "period_criterion": ["max", "avg"]}
-
+
 model = TimeForestClassifier(time_column=time_column)
-
-opt_param = env_wise_hyper_opt(training_data[features + [time_column]],
-                               training_data[TARGET],
-                               model,
+
+opt_param = env_wise_hyper_opt(training_data[features + [time_column]],
+                               training_data[TARGET],
+                               model,
                                time_column,
                                params_grid,
                                cv=5,
-                               score=roc_auc_score)
+                               scorer=make_scorer(roc_auc_score,
+                                                  needs_proba=True))
 ```
 
@@ -82,7 +83,7 @@ Don't simply use a timestamp column from the dataset, make it discrete before an
 This project is licensed under the terms of the `BSD-3` license. See [LICENSE](https://github.com/lgmoneda/time-robust-forest/blob/main/LICENSE) for more details.
 
-## Useful links
+## Useful links
 
 - [Introducing the Time Robust Tree blog post](http://lgmoneda.github.io/2021/12/03/introducing-time-robust-tree.html)
diff --git a/pyproject.toml b/pyproject.toml
index 7918357..2be4cd3 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,7 +6,7 @@ build-backend = "poetry.core.masonry.api"
 
 [tool.poetry]
 name = "time-robust-forest"
-version = "0.1.11"
+version = "0.1.12"
 description = "Explores time information to train a robust random forest"
 readme = "README.md"
 authors = [
diff --git a/time_robust_forest/hyper_opt.py b/time_robust_forest/hyper_opt.py
index 493eebc..b12a8e2 100644
--- a/time_robust_forest/hyper_opt.py
+++ b/time_robust_forest/hyper_opt.py
@@ -1,3 +1,5 @@
+from functools import partial
+
 import pandas as pd
 from sklearn.metrics import make_scorer, roc_auc_score
 from sklearn.model_selection import GridSearchCV, StratifiedKFold
@@ -5,12 +7,15 @@
 
 
 def extract_results_from_grid_cv(cv_results, kfolds, envs):
     """
-    Extract the resuls from a fitted grid search object from sklearn so
+    Extract the results from a fitted grid search object from sklearn
+    to enable picking the best using custom logic.
     """
+    split_keys = [i for i in cv_results.keys() if "split" in i]
     split_env = {
-        split_key: envs[i % len(envs)] for i, split_key in enumerate(split_keys)
+        split_key: split_key.split("env_")[-1]
+        for i, split_key in enumerate(split_keys)
     }
     params_idx = [i for i in range(len(cv_results["params"]))]
     all_folds_df = []
@@ -43,26 +48,34 @@ def select_best_model_from_results_df(results_df):
     )
     results_df = results_df.groupby("params_idx").agg(second_agg_dict)
 
-    return results_df.iloc[results_df["perf"].argmax()]["params"]
+    return results_df.iloc[results_df["perf"].argmax()]["params"], results_df
 
 
-def leave_one_env_out_cv(data, env_column="period", cv=5):
+def env_stratified_folds(data, env_column="period", cv=5):
     """
-    Create folds that keep only one environment in the test fold.
+    Create folds that are stratified on the environment.
     """
     envs = data[env_column].unique()
     cv_sets = []
     kfolds = StratifiedKFold(n_splits=cv)
     for train_idx, test_idx in kfolds.split(data, data[env_column]):
-        for env in envs:
-            all_env_elements = data[data[env_column] == env].index
-            test_env_idx = [i for i in test_idx if i in all_env_elements]
-            cv_sets.append((train_idx, test_env_idx))
+        cv_sets.append((train_idx, test_idx))
 
     return cv_sets
 
 
-def grid_search(X, y, model, param_grid, env_cvs, score):
+def env_wise_score(estimator, X, y, scorer, env, env_column):
+    """
+    Filter data to evaluate only a specific environment using a
+    certain scorer.
+    """
+    env_mask = X[env_column] == env
+    evaluation = scorer(estimator, X[env_mask], y[env_mask])
+
+    return evaluation
+
+
+def grid_search(X, y, model, param_grid, env_cvs, scorer):
     """
     FIt the grid search and return it.
     """
@@ -71,9 +84,10 @@
         model,
         param_grid=param_grid,
         cv=env_cvs,
-        scoring=make_scorer(score),
+        scoring=scorer,
         n_jobs=-1,
         verbose=0,
+        refit=False,
     )
     grid_cv.fit(X, y)
 
@@ -81,20 +95,37 @@
 
 
 def env_wise_hyper_opt(
-    X, y, model, env_column, param_grid, cv=5, score=roc_auc_score
+    X,
+    y,
+    model,
+    env_column,
+    param_grid,
+    cv=5,
+    scorer=make_scorer(roc_auc_score, needs_proba=True),
+    ret_results=False,
 ):
     """
     Optimize the hyper parmaters of a model considering the leave one
     env out cross-validation and selecting the worst case regarding the
     test performance in the different environments.
     """
-    env_cvs = leave_one_env_out_cv(X, env_column, cv)
+    env_cvs = env_stratified_folds(X, env_column, cv)
+    envs = X[env_column].unique()
+
+    scoring_fs = {
+        f"{scorer.__repr__()}_env_{env}": partial(
+            env_wise_score, scorer=scorer, env=env, env_column=env_column
+        )
+        for env in envs
+    }
 
-    grid_cv = grid_search(X, y, model, param_grid, env_cvs, score)
+    grid_cv = grid_search(X, y, model, param_grid, env_cvs, scoring_fs)
 
-    envs = X[env_column].unique()
     results_df = extract_results_from_grid_cv(grid_cv.cv_results_, cv, envs)
-    opt_params = select_best_model_from_results_df(results_df)
+    opt_params, agg_results_df = select_best_model_from_results_df(results_df)
+
+    if ret_results:
+        return opt_params, results_df, agg_results_df
 
     return opt_params