From 7d4c709dc3caaea2ebc587db533254e296a8bb83 Mon Sep 17 00:00:00 2001 From: julianteichgraber Date: Wed, 12 Apr 2023 17:09:34 +0100 Subject: [PATCH] option to remove data after fit --- auto_causality/optimiser.py | 12 ++- .../autocausality/test_drop_data_after_fit.py | 80 +++++++++++++++++++ 2 files changed, 88 insertions(+), 4 deletions(-) create mode 100644 tests/autocausality/test_drop_data_after_fit.py diff --git a/auto_causality/optimiser.py b/auto_causality/optimiser.py index e56f4985..5f0fbb07 100644 --- a/auto_causality/optimiser.py +++ b/auto_causality/optimiser.py @@ -82,7 +82,6 @@ class AutoCausality: def __init__( self, - data_df=None, metric="energy_distance", metrics_to_report=None, time_budget=None, @@ -107,7 +106,6 @@ def __init__( """constructor. Args: - data_df (pandas.DataFrame): dataset to perform causal inference on metric (str): metric to optimise. Defaults to "erupt" for CATE, "energy_distance" for IV metrics_to_report (list). additional metrics to compute and report. @@ -185,7 +183,6 @@ def __init__( self._best_estimators = defaultdict(lambda: (float("-inf"), None)) self.original_estimator_list = estimator_list - self.data_df = data_df or pd.DataFrame() self.causal_model = None self.identified_estimand = None self.problem = None @@ -257,13 +254,14 @@ def fit( estimator_list: Optional[Union[str, List[str]]] = None, resume: Optional[bool] = False, time_budget: Optional[int] = None, + store_data: Optional[bool] = True, ): """Performs AutoML on list of causal inference estimators - If estimator has a search space specified in its parameters, HPO is performed on the whole model. 
- Otherwise, only its component models are optimised Args: - data_df (pandas.DataFrame): dataset for causal inference + data (pandas.DataFrame): dataset for causal inference treatment (str): name of treatment variable outcome (str): name of outcome variable common_causes (List[str]): list of names of common causes @@ -273,6 +271,7 @@ def fit( estimator_list (Optional[Union[str, List[str]]]): subset of estimators to consider resume (Optional[bool]): set to True to continue previous fit time_budget (Optional[int]): change new time budget allocated to fit, useful for warm starts. + store_data (Optional[bool]): Set true if keep train_df, test_df after fit """ if not isinstance(data, CausalityDataset): @@ -456,6 +455,11 @@ def fit( ) self.update_summary_scores() + + if not store_data: + delattr(self, 'train_df') + delattr(self, 'test_df') + delattr(self, 'data') def update_summary_scores(self): self.scores = Scorer.best_score_by_estimator(self.results.results, self.metric) diff --git a/tests/autocausality/test_drop_data_after_fit.py b/tests/autocausality/test_drop_data_after_fit.py new file mode 100644 index 00000000..4ca349a7 --- /dev/null +++ b/tests/autocausality/test_drop_data_after_fit.py @@ -0,0 +1,80 @@ +import pytest +import warnings + +from auto_causality import AutoCausality +from auto_causality.datasets import synth_ihdp, linear_multi_dataset +from auto_causality.params import SimpleParamService + +warnings.filterwarnings("ignore") # suppress sklearn deprecation warnings for now.. 
+
+
+class TestDropDataAfterFit(object):
+    def test_fit_and_drop_data(self):
+        """fit(store_data=False) drops the stored data but leaves the model usable"""
+        from auto_causality.shap import shap_values  # noqa F401
+
+        data = synth_ihdp()
+        data.preprocess_dataset()
+
+        cfg = SimpleParamService(
+            propensity_model=None,
+            outcome_model=None,
+            n_jobs=-1,
+            include_experimental=False,
+            multivalue=False,
+        )
+        estimator_list = cfg.estimator_names_from_patterns("backdoor", "all", 1)
+        auto_causality = AutoCausality(
+            num_samples=len(estimator_list),
+            components_time_budget=5,
+            estimator_list=estimator_list,
+            use_ray=False,
+            verbose=3,
+            components_verbose=2,
+            resources_per_trial={"cpu": 0.5},
+        )
+
+        auto_causality.fit(data, store_data=False)
+        for attr in ("train_df", "test_df", "data"):
+            assert not hasattr(auto_causality, attr), f"{attr} was not dropped"
+        auto_causality.effect(data.data)
+        auto_causality.score_dataset(data.data, "test")
+
+        # now let's test Shapley values calculation
+        for est_name, scores in auto_causality.scores.items():
+            # Dummy model doesn't support Shapley values
+            # Orthoforest shapley calc is VERY slow
+            if "Dummy" not in est_name and "Ortho" not in est_name:
+                print("Calculating Shapley values for", est_name)
+                shap_values(scores["estimator"], data.data[:10])
+
+        print(f"Best estimator: {auto_causality.best_estimator}")
+
+    def test_fit_and_keep_data(self):
+        """fit() with the default store_data=True keeps the stored data"""
+        data = linear_multi_dataset(10000)
+        cfg = SimpleParamService(
+            propensity_model=None,
+            outcome_model=None,
+            n_jobs=-1,
+            include_experimental=False,
+            multivalue=True,
+        )
+        estimator_list = cfg.estimator_names_from_patterns(
+            "backdoor", "all", data_rows=len(data)
+        )
+
+        data.preprocess_dataset()
+
+        ac = AutoCausality(
+            estimator_list="all",
+            num_samples=len(estimator_list),
+            components_time_budget=5,
+        )
+        ac.fit(data)
+        assert all(hasattr(ac, attr) for attr in ("train_df", "test_df", "data"))
+        # TODO add an effect() call and an effect_tt call
+
+
+if __name__ == "__main__":
+    pytest.main([__file__])