From 43f49b48844570c9d50950940c2161b711a4edba Mon Sep 17 00:00:00 2001
From: Luis Moneda
Date: Mon, 3 Jan 2022 10:21:49 -0300
Subject: [PATCH] Refactor gridsearch (#45)

* Change hyper-opt logic to avoid fitting the same model multiple times
* Bump version
* Update docs
* Change non-existent variable name
---
 README.md                       | 19 +++++-----
 pyproject.toml                  |  2 +-
 time_robust_forest/hyper_opt.py | 63 ++++++++++++++++++++++++---------
 3 files changed, 58 insertions(+), 26 deletions(-)

diff --git a/README.md b/README.md
index 775606c..0fa16d9 100644
--- a/README.md
+++ b/README.md
@@ -51,24 +51,25 @@ aggregated. Options: {"avg": average, "max": maximum, the worst case}.
 
 To use the environment-wise optimization:
 
-```python
+```python
 from time_robust_forest.hyper_opt import env_wise_hyper_opt
 
-params_grid = {"n_estimators": [30, 60, 120],
+params_grid = {"n_estimators": [30, 60, 120],
                "max_depth": [5, 10],
                "min_impurity_decrease": [1e-1, 1e-3, 0],
                "min_sample_periods": [5, 10, 30],
                "period_criterion": ["max", "avg"]}
-
+
 model = TimeForestClassifier(time_column=time_column)
-
-opt_param = env_wise_hyper_opt(training_data[features + [time_column]],
-                               training_data[TARGET],
-                               model,
+
+opt_param = env_wise_hyper_opt(training_data[features + [time_column]],
+                               training_data[TARGET],
+                               model,
                                time_column,
                                params_grid,
                                cv=5,
-                               score=roc_auc_score)
+                               scorer=make_scorer(roc_auc_score,
+                                                  needs_proba=True))
 ```
 
@@ -82,7 +83,7 @@ Don't simply use a timestamp column from the dataset, make it discrete before an
 This project is licensed under the terms of the `BSD-3` license. See [LICENSE](https://github.com/lgmoneda/time-robust-forest/blob/main/LICENSE) for more details.
 
-## Useful links
+## Useful links
 
 - [Introducing the Time Robust Tree blog post](http://lgmoneda.github.io/2021/12/03/introducing-time-robust-tree.html)
diff --git a/pyproject.toml b/pyproject.toml
index 7918357..2be4cd3 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,7 +6,7 @@ build-backend = "poetry.core.masonry.api"
 
 [tool.poetry]
 name = "time-robust-forest"
-version = "0.1.11"
+version = "0.1.12"
 description = "Explores time information to train a robust random forest"
 readme = "README.md"
 authors = [
diff --git a/time_robust_forest/hyper_opt.py b/time_robust_forest/hyper_opt.py
index 493eebc..b12a8e2 100644
--- a/time_robust_forest/hyper_opt.py
+++ b/time_robust_forest/hyper_opt.py
@@ -1,3 +1,5 @@
+from functools import partial
+
 import pandas as pd
 from sklearn.metrics import make_scorer, roc_auc_score
 from sklearn.model_selection import GridSearchCV, StratifiedKFold
@@ -5,12 +7,15 @@
 
 
 def extract_results_from_grid_cv(cv_results, kfolds, envs):
     """
-    Extract the resuls from a fitted grid search object from sklearn so
+    Extract the results from a fitted grid search object from sklearn
+    to enable picking the best using custom logic.
     """
+    split_keys = [i for i in cv_results.keys() if "split" in i]
     split_env = {
-        split_key: envs[i % len(envs)] for i, split_key in enumerate(split_keys)
+        split_key: split_key.split("env_")[-1]
+        for i, split_key in enumerate(split_keys)
     }
     params_idx = [i for i in range(len(cv_results["params"]))]
     all_folds_df = []
@@ -43,26 +48,34 @@ def select_best_model_from_results_df(results_df):
     )
     results_df = results_df.groupby("params_idx").agg(second_agg_dict)
 
-    return results_df.iloc[results_df["perf"].argmax()]["params"]
+    return results_df.iloc[results_df["perf"].argmax()]["params"], results_df
 
 
-def leave_one_env_out_cv(data, env_column="period", cv=5):
+def env_stratified_folds(data, env_column="period", cv=5):
     """
-    Create folds that keep only one environment in the test fold.
+    Create folds that are stratified on the environment.
     """
     envs = data[env_column].unique()
     cv_sets = []
     kfolds = StratifiedKFold(n_splits=cv)
     for train_idx, test_idx in kfolds.split(data, data[env_column]):
-        for env in envs:
-            all_env_elements = data[data[env_column] == env].index
-            test_env_idx = [i for i in test_idx if i in all_env_elements]
-            cv_sets.append((train_idx, test_env_idx))
+        cv_sets.append((train_idx, test_idx))
 
     return cv_sets
 
 
-def grid_search(X, y, model, param_grid, env_cvs, score):
+def env_wise_score(estimator, X, y, scorer, env, env_column):
+    """
+    Filter data to evaluate only a specific environment using a
+    certain scorer.
+    """
+    env_mask = X[env_column] == env
+    evaluation = scorer(estimator, X[env_mask], y[env_mask])
+
+    return evaluation
+
+
+def grid_search(X, y, model, param_grid, env_cvs, scorer):
     """
     FIt the grid search and return it.
     """
@@ -71,9 +84,10 @@
         model,
         param_grid=param_grid,
         cv=env_cvs,
-        scoring=make_scorer(score),
+        scoring=scorer,
         n_jobs=-1,
         verbose=0,
+        refit=False,
     )
     grid_cv.fit(X, y)
 
@@ -81,20 +95,37 @@
 
 
 def env_wise_hyper_opt(
-    X, y, model, env_column, param_grid, cv=5, score=roc_auc_score
+    X,
+    y,
+    model,
+    env_column,
+    param_grid,
+    cv=5,
+    scorer=make_scorer(roc_auc_score, needs_proba=True),
+    ret_results=False,
 ):
     """
     Optimize the hyper parmaters of a model considering the leave one
     env out cross-validation and selecting the worst case regarding the
     test performance in the different environments.
     """
-    env_cvs = leave_one_env_out_cv(X, env_column, cv)
+    env_cvs = env_stratified_folds(X, env_column, cv)
+    envs = X[env_column].unique()
+
+    scoring_fs = {
+        f"{scorer.__repr__()}_env_{env}": partial(
+            env_wise_score, scorer=scorer, env=env, env_column=env_column
+        )
+        for env in envs
+    }
 
-    grid_cv = grid_search(X, y, model, param_grid, env_cvs, score)
+    grid_cv = grid_search(X, y, model, param_grid, env_cvs, scoring_fs)
 
-    envs = X[env_column].unique()
     results_df = extract_results_from_grid_cv(grid_cv.cv_results_, cv, envs)
-    opt_params = select_best_model_from_results_df(results_df)
+    opt_params, agg_results_df = select_best_model_from_results_df(results_df)
+
+    if ret_results:
+        return opt_params, results_df, agg_results_df
 
     return opt_params