From 19ebe3caa9f9a6fa6556ee99e908f3764b71bb33 Mon Sep 17 00:00:00 2001 From: NAIR BENREKIA Nour Eddine Yassine INNOV/IT-S Date: Wed, 11 Dec 2024 14:01:36 +0100 Subject: [PATCH] support float and boolean targets in KhiopsClassifier --- doc/samples/samples_sklearn.rst | 89 +++++++++++++++++++++ khiops/samples/samples_sklearn.ipynb | 115 +++++++++++++++++++++++++++ khiops/samples/samples_sklearn.py | 51 +++++++++++- khiops/sklearn/estimators.py | 37 +++++++-- tests/test_sklearn_output_types.py | 7 +- 5 files changed, 288 insertions(+), 11 deletions(-) diff --git a/doc/samples/samples_sklearn.rst b/doc/samples/samples_sklearn.rst index 0ca0320b..a76da219 100644 --- a/doc/samples/samples_sklearn.rst +++ b/doc/samples/samples_sklearn.rst @@ -479,6 +479,95 @@ Samples test_auc = metrics.roc_auc_score(y_test, y_test_probas[:, 1]) print(f"Test accuracy = {test_accuracy}") print(f"Test auc = {test_auc}") +.. autofunction:: khiops_classifier_float_target +.. code-block:: python + + # Imports + import os + import pandas as pd + from khiops import core as kh + from khiops.sklearn import KhiopsClassifier + from sklearn.model_selection import train_test_split + + # Load the dataset into a pandas dataframe + adult_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt") + adult_df = pd.read_csv(adult_path, sep="\t") + adult_df["class"] = adult_df["class"].replace({"less": 0.0, "more": 1.0}) + + # Split the whole dataframe into train and test (70%-30%) + adult_train_df, adult_test_df = train_test_split( + adult_df, test_size=0.3, random_state=1 + ) + + # Split the dataset into: + # - the X feature table + # - the y target vector ("class" column) + X_train = adult_train_df.drop("class", axis=1) + X_test = adult_test_df.drop("class", axis=1) + y_train = adult_train_df["class"] + + # Create the classifier object + khc = KhiopsClassifier() + + # Train the classifier + khc.fit(X_train, y_train) + # Predict the classes on the test dataset + y_test_pred = khc.predict(X_test) + print("Predicted classes (first 10):") + print(y_test_pred[0:10]) + print("---") + + # Predict the class probabilities on the test dataset + y_test_probas = khc.predict_proba(X_test) + print(f"Class order: {khc.classes_}") + print("Predicted class probabilities (first 10):") + print(y_test_probas[0:10]) + print("---") +.. autofunction:: khiops_classifier_boolean_target +.. code-block:: python + + # Imports + import os + import pandas as pd + from khiops import core as kh + from khiops.sklearn import KhiopsClassifier + from sklearn.model_selection import train_test_split + + # Load the dataset into a pandas dataframe + adult_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt") + adult_df = pd.read_csv(adult_path, sep="\t") + adult_df["class"] = adult_df["class"].replace({"less": False, "more": True}) + + # Split the whole dataframe into train and test (70%-30%) + adult_train_df, adult_test_df = train_test_split( + adult_df, test_size=0.3, random_state=1 + ) + + # Split the dataset into: + # - the X feature table + # - the y target vector ("class" column) + X_train = adult_train_df.drop("class", axis=1) + X_test = adult_test_df.drop("class", axis=1) + y_train = adult_train_df["class"] + + # Create the classifier object + khc = KhiopsClassifier() + + # Train the classifier + khc.fit(X_train, y_train) + + # Predict the classes on the test dataset + y_test_pred = khc.predict(X_test) + print("Predicted classes (first 10):") + print(y_test_pred[0:10]) + print("---") + + # Predict the class probabilities on the test dataset + y_test_probas = khc.predict_proba(X_test) + print(f"Class order: {khc.classes_}") + print("Predicted class probabilities (first 10):") + print(y_test_probas[0:10]) + print("---") .. autofunction:: khiops_regressor .. code-block:: python diff --git a/khiops/samples/samples_sklearn.ipynb b/khiops/samples/samples_sklearn.ipynb index 13d5fd30..f2c31ff7 100644 --- a/khiops/samples/samples_sklearn.ipynb +++ b/khiops/samples/samples_sklearn.ipynb @@ -545,6 +545,121 @@ "print(f\"Test auc = {test_auc}\")" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `khiops_classifier_float_target()`\n\n", + "Trains a `.KhiopsClassifier` on a monotable dataframe\n with a float target\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Imports\n", + "import os\n", + "import pandas as pd\n", + "from khiops import core as kh\n", + "from khiops.sklearn import KhiopsClassifier\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "# Load the dataset into a pandas dataframe\n", + "adult_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.txt\")\n", + "adult_df = pd.read_csv(adult_path, sep=\"\\t\")\n", + "adult_df[\"class\"] = adult_df[\"class\"].replace({\"less\": 0.0, \"more\": 1.0})\n", + "\n", + "# Split the whole dataframe into train and test (70%-30%)\n", + "adult_train_df, adult_test_df = train_test_split(\n", + " adult_df, test_size=0.3, random_state=1\n", + ")\n", + "\n", + "# Split the dataset into:\n", + "# - the X feature table\n", + "# - the y target vector (\"class\" column)\n", + "X_train = adult_train_df.drop(\"class\", axis=1)\n", + "X_test = adult_test_df.drop(\"class\", axis=1)\n", + "y_train = adult_train_df[\"class\"]\n", + "\n", + "# Create the classifier object\n", + "khc = KhiopsClassifier()\n", + "\n", + "# Train the classifier\n", + "khc.fit(X_train, y_train)\n", + "# Predict the classes on the test dataset\n", + "y_test_pred = khc.predict(X_test)\n", + "print(\"Predicted classes (first 10):\")\n", + "print(y_test_pred[0:10])\n", + "print(\"---\")\n", + "\n", + "# Predict the class probabilities on the test dataset\n", + "y_test_probas = khc.predict_proba(X_test)\n", + "print(f\"Class order: {khc.classes_}\")\n", + "print(\"Predicted class probabilities (first 10):\")\n", + "print(y_test_probas[0:10])\n", + "print(\"---\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `khiops_classifier_boolean_target()`\n\n", + "Trains a `.KhiopsClassifier` on a monotable dataframe\n with a boolean target\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Imports\n", + "import os\n", + "import pandas as pd\n", + "from khiops import core as kh\n", + "from khiops.sklearn import KhiopsClassifier\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "# Load the dataset into a pandas dataframe\n", + "adult_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.txt\")\n", + "adult_df = pd.read_csv(adult_path, sep=\"\\t\")\n", + "adult_df[\"class\"] = adult_df[\"class\"].replace({\"less\": False, \"more\": True})\n", + "\n", + "# Split the whole dataframe into train and test (70%-30%)\n", + "adult_train_df, adult_test_df = train_test_split(\n", + " adult_df, test_size=0.3, random_state=1\n", + ")\n", + "\n", + "# Split the dataset into:\n", + "# - the X feature table\n", + "# - the y target vector (\"class\" column)\n", + "X_train = adult_train_df.drop(\"class\", axis=1)\n", + "X_test = adult_test_df.drop(\"class\", axis=1)\n", + "y_train = adult_train_df[\"class\"]\n", + "\n", + "# Create the classifier object\n", + "khc = KhiopsClassifier()\n", + "\n", + "# Train the classifier\n", + "khc.fit(X_train, y_train)\n", + "\n", + "# Predict the classes on the test dataset\n", + "y_test_pred = khc.predict(X_test)\n", + "print(\"Predicted classes (first 10):\")\n", + "print(y_test_pred[0:10])\n", + "print(\"---\")\n", + "\n", + "# Predict the class probabilities on the test dataset\n", + "y_test_probas = khc.predict_proba(X_test)\n", + "print(f\"Class order: {khc.classes_}\")\n", + "print(\"Predicted class probabilities (first 10):\")\n", + "print(y_test_probas[0:10])\n", + "print(\"---\")" + ] + }, { "cell_type": "markdown", "metadata": {}, diff --git a/khiops/samples/samples_sklearn.py b/khiops/samples/samples_sklearn.py index e951d74a..6053932a 100644 --- a/khiops/samples/samples_sklearn.py +++ b/khiops/samples/samples_sklearn.py @@ -119,6 +119,52 @@ def khiops_classifier_float_target(): # Create the classifier object khc = KhiopsClassifier() + # Train the classifier + khc.fit(X_train, y_train) + # Predict the classes on the test dataset + y_test_pred = khc.predict(X_test) + print("Predicted classes (first 10):") + print(y_test_pred[0:10]) + print("---") + + # Predict the class probabilities on the test dataset + y_test_probas = khc.predict_proba(X_test) + print(f"Class order: {khc.classes_}") + print("Predicted class probabilities (first 10):") + print(y_test_probas[0:10]) + print("---") + + +def khiops_classifier_boolean_target(): + """Trains a `.KhiopsClassifier` on a monotable dataframe + with a boolean target""" + # Imports + import os + import pandas as pd + from khiops import core as kh + from khiops.sklearn import KhiopsClassifier + from sklearn.model_selection import train_test_split + + # Load the dataset into a pandas dataframe + adult_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt") + adult_df = pd.read_csv(adult_path, sep="\t") + adult_df["class"] = adult_df["class"].replace({"less": False, "more": True}) + + # Split the whole dataframe into train and test (70%-30%) + adult_train_df, adult_test_df = train_test_split( + adult_df, test_size=0.3, random_state=1 + ) + + # Split the dataset into: + # - the X feature table + # - the y target vector ("class" column) + X_train = adult_train_df.drop("class", axis=1) + X_test = adult_test_df.drop("class", axis=1) + y_train = adult_train_df["class"] + + # Create the classifier object + khc = KhiopsClassifier() + # Train the classifier khc.fit(X_train, y_train) @@ -1108,8 +1154,6 @@ def khiops_classifier_multitable_star_file(): print(f"Test auc = {test_auc}") -exported_samples = [khiops_classifier_float_target] -""" exported_samples = [ khiops_classifier, khiops_classifier_multiclass, @@ -1118,6 +1162,8 @@ def khiops_classifier_multitable_star_file(): khiops_classifier_sparse, khiops_classifier_pickle, khiops_classifier_with_hyperparameters, + khiops_classifier_float_target, + khiops_classifier_boolean_target, khiops_regressor, khiops_encoder, khiops_encoder_multitable_star, @@ -1129,7 +1175,6 @@ def khiops_classifier_multitable_star_file(): khiops_classifier_multitable_list, khiops_classifier_multitable_star_file, ] -""" def execute_samples(args): diff --git a/khiops/sklearn/estimators.py b/khiops/sklearn/estimators.py index 2e736b3c..c772b9ac 100644 --- a/khiops/sklearn/estimators.py +++ b/khiops/sklearn/estimators.py @@ -153,6 +153,7 @@ def _check_categorical_target_type(ds): or pd.api.types.is_string_dtype(ds.target_column.dtype) or pd.api.types.is_integer_dtype(ds.target_column.dtype) or pd.api.types.is_float_dtype(ds.target_column.dtype) + or pd.api.types.is_bool_dtype(ds.target_column.dtype) ): raise ValueError( f"'y' has invalid type '{ds.target_column_type}'. " @@ -2093,6 +2094,24 @@ def _is_real_target_dtype_integer(self): ) ) + def _is_real_target_dtype_float(self): + return self._original_target_dtype is not None and ( + pd.api.types.is_float_dtype(self._original_target_dtype) + or ( + isinstance(self._original_target_dtype, pd.CategoricalDtype) + and pd.api.types.is_float_dtype(self._original_target_dtype.categories) + ) + ) + + def _is_real_target_dtype_bool(self): + return self._original_target_dtype is not None and ( + pd.api.types.is_bool_dtype(self._original_target_dtype) + or ( + isinstance(self._original_target_dtype, pd.CategoricalDtype) + and pd.api.types.is_bool_dtype(self._original_target_dtype.categories) + ) + ) + def _sorted_prob_variable_names(self): """Returns the model probability variable names in the order of self.classes_""" assert self.is_fitted_, "Model not fit yet" @@ -2195,11 +2214,15 @@ def _fit_training_post_process(self, ds): for key in variable.meta_data.keys: if key.startswith("TargetProb"): self.classes_.append(variable.meta_data.get_value(key)) - if ds.is_in_memory and self._is_real_target_dtype_integer(): - self.classes_ = [int(class_value) for class_value in self.classes_] + if ds.is_in_memory: + if self._is_real_target_dtype_integer(): + self.classes_ = [int(class_value) for class_value in self.classes_] + elif self._is_real_target_dtype_float(): + self.classes_ = [float(class_value) for class_value in self.classes_] + elif self._is_real_target_dtype_bool(): + self.classes_ = [class_value == "True" for class_value in self.classes_] self.classes_.sort() self.classes_ = column_or_1d(self.classes_) - # Count number of classes self.n_classes_ = len(self.classes_) @@ -2273,10 +2296,10 @@ def predict(self, X): self._original_target_dtype ): y_pred = y_pred.astype(str, copy=False) - elif pd.api.types.is_float_dtype(self._original_target_type): - print(self._original_target_type) - y_pred = y_pred.astype(str, copy=False) - print(y_pred) + elif pd.api.types.is_float_dtype(self._original_target_dtype): + y_pred = y_pred.astype(float, copy=False) + elif pd.api.types.is_bool_dtype(self._original_target_dtype): + y_pred = y_pred.astype(bool, copy=False) # If category first coerce the type to the categories' type else: assert isinstance(self._original_target_dtype, pd.CategoricalDtype), ( diff --git a/tests/test_sklearn_output_types.py b/tests/test_sklearn_output_types.py index 7e728f70..8005fd34 100644 --- a/tests/test_sklearn_output_types.py +++ b/tests/test_sklearn_output_types.py @@ -54,11 +54,12 @@ def test_classifier_output_types(self): """Test the KhiopsClassifier output types and classes of predict* methods""" X, y = create_iris() X_mt, X_sec_mt, _ = create_iris_mt() - fixtures = { "ys": { "int": y, "int binary": y.replace({0: 0, 1: 0, 2: 1}), + "float": y.replace({0: 0.0, 1: 1.0, 2: 2.0}).astype(float), + "bool": y.replace({0: True, 1: True, 2: False}), "string": y.replace({0: "se", 1: "vi", 2: "ve"}), "string binary": y.replace({0: "vi_or_se", 1: "vi_or_se", 2: "ve"}), "int as string": y.replace({0: "8", 1: "9", 2: "10"}), @@ -69,6 +70,8 @@ def test_classifier_output_types(self): "y_type_check": { "int": pd.api.types.is_integer_dtype, "int binary": pd.api.types.is_integer_dtype, + "float": pd.api.types.is_float_dtype, + "bool": pd.api.types.is_bool_dtype, "string": pd.api.types.is_string_dtype, "string binary": pd.api.types.is_string_dtype, "int as string": pd.api.types.is_string_dtype, @@ -79,6 +82,8 @@ def test_classifier_output_types(self): "expected_classes": { "int": column_or_1d([0, 1, 2]), "int binary": column_or_1d([0, 1]), + "float": column_or_1d([0.0, 1.0, 2.0]), + "bool": column_or_1d([False, True]), "string": column_or_1d(["se", "ve", "vi"]), "string binary": column_or_1d(["ve", "vi_or_se"]), "int as string": column_or_1d(["10", "8", "9"]),