From 860e5ce3074dcd026cd8ae5d68e28c9dbba8bbb8 Mon Sep 17 00:00:00 2001 From: Felipe Olmos <92923444+folmos-at-orange@users.noreply.github.com> Date: Thu, 12 Dec 2024 15:57:24 +0100 Subject: [PATCH] Remove n_pairs parameter from KhiopsRegressor It was never supported. --- khiops/sklearn/estimators.py | 111 ++++++++++++++++------------- tests/test_estimator_attributes.py | 2 +- tests/test_sklearn.py | 27 ------- 3 files changed, 61 insertions(+), 79 deletions(-) diff --git a/khiops/sklearn/estimators.py b/khiops/sklearn/estimators.py index addcc88a..fb976393 100644 --- a/khiops/sklearn/estimators.py +++ b/khiops/sklearn/estimators.py @@ -1388,7 +1388,6 @@ class KhiopsSupervisedEstimator(KhiopsEstimator): def __init__( self, n_features=100, - n_pairs=0, n_trees=10, specific_pairs=None, all_possible_pairs=True, @@ -1407,7 +1406,6 @@ def __init__( internal_sort=internal_sort, ) self.n_features = n_features - self.n_pairs = n_pairs self.n_trees = n_trees self.specific_pairs = specific_pairs self.all_possible_pairs = all_possible_pairs @@ -1497,25 +1495,6 @@ def _fit_check_params(self, ds, **kwargs): raise TypeError(type_error_message("n_trees", self.n_trees, int)) if self.n_trees < 0: raise ValueError("'n_trees' must be positive") - if not isinstance(self.n_pairs, int): - raise TypeError(type_error_message("n_pairs", self.n_pairs, int)) - if self.n_pairs < 0: - raise ValueError("'n_pairs' must be positive") - if self.specific_pairs is not None: - if not is_list_like(self.specific_pairs): - raise TypeError( - type_error_message( - "specific_pairs", self.specific_pairs, "list-like" - ) - ) - else: - for pair in self.specific_pairs: - if not isinstance(pair, tuple): - raise TypeError(type_error_message(pair, pair, tuple)) - if not isinstance(self.all_possible_pairs, bool): - raise TypeError( - type_error_message("all_possible_pairs", self.all_possible_pairs, bool) - ) if self.construction_rules is not None: if not is_list_like(self.construction_rules): raise TypeError( @@ -1602,7 +1581,6 @@ def _fit_prepare_training_function_inputs(self, ds, computation_dir): # Rename parameters to be compatible with khiops.core kwargs["max_constructed_variables"] = kwargs.pop("n_features") - kwargs["max_pairs"] = kwargs.pop("n_pairs") kwargs["max_trees"] = kwargs.pop("n_trees") # Add the additional_data_tables parameter @@ -1766,7 +1744,6 @@ class KhiopsPredictor(KhiopsSupervisedEstimator): def __init__( self, n_features=100, - n_pairs=0, n_trees=10, n_selected_features=0, n_evaluated_features=0, @@ -1781,7 +1758,6 @@ def __init__( ): super().__init__( n_features=n_features, - n_pairs=n_pairs, n_trees=n_trees, specific_pairs=specific_pairs, all_possible_pairs=all_possible_pairs, @@ -2061,12 +2037,9 @@ def __init__( ): super().__init__( n_features=n_features, - n_pairs=n_pairs, n_trees=n_trees, n_selected_features=n_selected_features, n_evaluated_features=n_evaluated_features, - specific_pairs=specific_pairs, - all_possible_pairs=all_possible_pairs, construction_rules=construction_rules, verbose=verbose, output_dir=output_dir, @@ -2074,6 +2047,9 @@ def __init__( key=key, internal_sort=internal_sort, ) + self.n_pairs = n_pairs + self.specific_pairs = specific_pairs + self.all_possible_pairs = all_possible_pairs self.group_target_value = group_target_value self._khiops_model_prefix = "SNB_" self._predicted_target_meta_data_tag = "Prediction" @@ -2119,12 +2095,44 @@ def _fit_check_params(self, ds, **kwargs): # Call parent method super()._fit_check_params(ds, **kwargs) + # Check the pair related parameters + if not isinstance(self.n_pairs, int): + raise TypeError(type_error_message("n_pairs", self.n_pairs, int)) + if self.n_pairs < 0: + raise ValueError("'n_pairs' must be positive") + if self.specific_pairs is not None: + if not is_list_like(self.specific_pairs): + raise TypeError( + type_error_message( + "specific_pairs", self.specific_pairs, "list-like" + ) + ) + else: + for pair in self.specific_pairs: + if not isinstance(pair, tuple): + raise TypeError(type_error_message(pair, pair, tuple)) + if not isinstance(self.all_possible_pairs, bool): + raise TypeError( + type_error_message("all_possible_pairs", self.all_possible_pairs, bool) + ) + # Check 'group_target_value' parameter if not isinstance(self.group_target_value, bool): raise TypeError( type_error_message("group_target_value", self.group_target_value, bool) ) + def _fit_prepare_training_function_inputs(self, ds, computation_dir): + # Call the parent method + args, kwargs = super()._fit_prepare_training_function_inputs( + ds, computation_dir + ) + + # Rename parameters to be compatible with khiops.core + kwargs["max_pairs"] = kwargs.pop("n_pairs") + + return args, kwargs + def fit(self, X, y, **kwargs): """Fits a Selective Naive Bayes classifier according to X, y @@ -2406,27 +2414,12 @@ class KhiopsRegressor(RegressorMixin, KhiopsPredictor): n_features : int, default 100 *Multi-table only* : Maximum number of multi-table aggregate features to construct. See :doc:`/multi_table_primer` for more details. - n_pairs : int, default 0 - Maximum number of pair features to construct. These features are 2D grid - partitions of univariate feature pairs. The grid is optimized such that in each - cell the target distribution is well approximated by a constant histogram. Only - pairs that are jointly more informative than their marginals may be taken into - account in the regressor. n_selected_features : int, default 0 Maximum number of features to be selected in the SNB predictor. If equal to 0 it selects all the features kept in the training. n_evaluated_features : int, default 0 Maximum number of features to be evaluated in the SNB predictor training. If equal to 0 it evaluates all informative features. - specific_pairs : list of tuple, optional - User-specified pairs as a list of 2-tuples of feature names. If a given tuple - contains only one non-empty feature name, then it generates all the pairs - containing it (within the maximum limit ``n_pairs``). These pairs have top - priority: they are constructed first. - all_possible_pairs : bool, default ``True`` - If ``True`` tries to create all possible pairs within the limit ``n_pairs``. - Pairs specified with ``specific_pairs`` have top priority: they are constructed - first. construction_rules : list of str, optional Allowed rules for the automatic feature construction. If not set, it uses all possible rules. @@ -2508,12 +2501,9 @@ class KhiopsRegressor(RegressorMixin, KhiopsPredictor): def __init__( self, n_features=100, - n_pairs=0, n_trees=0, n_selected_features=0, n_evaluated_features=0, - specific_pairs=None, - all_possible_pairs=True, construction_rules=None, verbose=False, output_dir=None, @@ -2523,12 +2513,9 @@ def __init__( ): super().__init__( n_features=n_features, - n_pairs=n_pairs, n_trees=n_trees, n_selected_features=n_selected_features, n_evaluated_features=n_evaluated_features, - specific_pairs=specific_pairs, - all_possible_pairs=all_possible_pairs, construction_rules=construction_rules, verbose=verbose, output_dir=output_dir, @@ -2820,10 +2807,7 @@ def __init__( ): super().__init__( n_features=n_features, - n_pairs=n_pairs, n_trees=n_trees, - specific_pairs=specific_pairs, - all_possible_pairs=all_possible_pairs, construction_rules=construction_rules, verbose=verbose, output_dir=output_dir, @@ -2831,6 +2815,9 @@ def __init__( key=key, internal_sort=internal_sort, ) + self.n_pairs = n_pairs + self.specific_pairs = specific_pairs + self.all_possible_pairs = all_possible_pairs self.categorical_target = categorical_target self.group_target_value = group_target_value self.transform_type_categorical = transform_type_categorical @@ -2903,6 +2890,27 @@ def _fit_check_params(self, ds, **kwargs): # Call parent method super()._fit_check_params(ds, **kwargs) + # Check the pair related parameters + if not isinstance(self.n_pairs, int): + raise TypeError(type_error_message("n_pairs", self.n_pairs, int)) + if self.n_pairs < 0: + raise ValueError("'n_pairs' must be positive") + if self.specific_pairs is not None: + if not is_list_like(self.specific_pairs): + raise TypeError( + type_error_message( + "specific_pairs", self.specific_pairs, "list-like" + ) + ) + else: + for pair in self.specific_pairs: + if not isinstance(pair, tuple): + raise TypeError(type_error_message(pair, pair, tuple)) + if not isinstance(self.all_possible_pairs, bool): + raise TypeError( + type_error_message("all_possible_pairs", self.all_possible_pairs, bool) + ) + # Check 'transform_type_categorical' parameter if not isinstance(self.transform_type_categorical, str): raise TypeError( @@ -3025,6 +3033,7 @@ def _fit_prepare_training_function_inputs(self, ds, computation_dir): ) # Rename encoder parameters, delete unused ones # to be compatible with khiops.core + kwargs["max_pairs"] = kwargs.pop("n_pairs") kwargs["keep_initial_categorical_variables"] = kwargs["keep_initial_variables"] kwargs["keep_initial_numerical_variables"] = kwargs.pop( "keep_initial_variables" diff --git a/tests/test_estimator_attributes.py b/tests/test_estimator_attributes.py index 299d64ff..49c4464e 100644 --- a/tests/test_estimator_attributes.py +++ b/tests/test_estimator_attributes.py @@ -215,7 +215,7 @@ def test_regressor_attributes_monotable(self): adult_df = pd.read_csv(adult_dataset_path, sep="\t").sample(750) X = adult_df.drop("age", axis=1) y = adult_df["age"] - khr_adult = KhiopsRegressor(n_trees=0, n_pairs=5) + khr_adult = KhiopsRegressor(n_trees=0) with warnings.catch_warnings(): warnings.filterwarnings( action="ignore", diff --git a/tests/test_sklearn.py b/tests/test_sklearn.py index 67b7ffcd..f62b42c2 100644 --- a/tests/test_sklearn.py +++ b/tests/test_sklearn.py @@ -1112,12 +1112,9 @@ def setUpClass(cls): "field_separator": "\t", "detect_format": False, "header_line": True, - "max_pairs": 1, "max_trees": 0, "max_selected_variables": 1, "max_evaluated_variables": 3, - "specific_pairs": [("age", "race")], - "all_possible_pairs": False, "construction_rules": ["TableMode", "TableSelection"], "additional_data_tables": {}, } @@ -1204,12 +1201,9 @@ def setUpClass(cls): "field_separator": "\t", "detect_format": False, "header_line": True, - "max_pairs": 1, "max_trees": 0, "max_selected_variables": 1, "max_evaluated_variables": 3, - "specific_pairs": [("age", "race")], - "all_possible_pairs": False, "construction_rules": ["TableMode", "TableSelection"], "additional_data_tables": {}, } @@ -1304,12 +1298,9 @@ def setUpClass(cls): "detect_format": False, "header_line": True, "max_constructed_variables": 10, - "max_pairs": 1, "max_trees": 0, "max_selected_variables": 1, "max_evaluated_variables": 3, - "specific_pairs": [], - "all_possible_pairs": False, "construction_rules": ["TableMode", "TableSelection"], "additional_data_tables": { "SpliceJunction`SpliceJunctionDNA" @@ -1414,12 +1405,9 @@ def setUpClass(cls): "detect_format": False, "header_line": True, "max_constructed_variables": 10, - "max_pairs": 1, "max_trees": 0, "max_selected_variables": 1, "max_evaluated_variables": 3, - "specific_pairs": [], - "all_possible_pairs": False, "construction_rules": ["TableMode", "TableSelection"], "log_file_path": os.path.join( cls.output_dir, "khiops.log" @@ -2405,11 +2393,8 @@ def test_parameter_transfer_regressor_fit_from_monotable_dataframe(self): schema_type="monotable", source_type="dataframe", extra_estimator_kwargs={ - "n_pairs": 1, "n_selected_features": 1, "n_evaluated_features": 3, - "specific_pairs": [("age", "race")], - "all_possible_pairs": False, "construction_rules": ["TableMode", "TableSelection"], }, ) @@ -2424,11 +2409,8 @@ def test_parameter_transfer_regressor_fit_from_monotable_dataframe_with_df_y( schema_type="monotable", source_type="dataframe_xy", extra_estimator_kwargs={ - "n_pairs": 1, "n_selected_features": 1, "n_evaluated_features": 3, - "specific_pairs": [("age", "race")], - "all_possible_pairs": False, "construction_rules": ["TableMode", "TableSelection"], }, ) @@ -2441,11 +2423,8 @@ def test_parameter_transfer_regressor_fit_from_monotable_file_dataset(self): schema_type="monotable", source_type="file_dataset", extra_estimator_kwargs={ - "n_pairs": 1, "n_selected_features": 1, "n_evaluated_features": 3, - "specific_pairs": [("age", "race")], - "all_possible_pairs": False, "construction_rules": ["TableMode", "TableSelection"], }, ) @@ -2459,12 +2438,9 @@ def test_parameter_transfer_regressor_fit_from_multitable_dataframe(self): source_type="dataframe", extra_estimator_kwargs={ "n_features": 10, - "n_pairs": 1, "n_trees": 0, "n_selected_features": 1, "n_evaluated_features": 3, - "specific_pairs": [], - "all_possible_pairs": False, "construction_rules": ["TableMode", "TableSelection"], }, ) @@ -2478,12 +2454,9 @@ def test_parameter_transfer_regressor_fit_from_multitable_file_dataset(self): source_type="file_dataset", extra_estimator_kwargs={ "n_features": 10, - "n_pairs": 1, "n_trees": 0, "n_selected_features": 1, "n_evaluated_features": 3, - "specific_pairs": [], - "all_possible_pairs": False, "construction_rules": ["TableMode", "TableSelection"], }, )