support float and boolean targets in KhiopsClassifier

KhiopsML · Dec 11, 2024 · 19ebe3c · 19ebe3c
1 parent 747d9c4
commit 19ebe3c
Show file tree

Hide file tree

Showing 5 changed files with 288 additions and 11 deletions.
diff --git a/doc/samples/samples_sklearn.rst b/doc/samples/samples_sklearn.rst
@@ -479,6 +479,95 @@ Samples
     test_auc = metrics.roc_auc_score(y_test, y_test_probas[:, 1])
     print(f"Test accuracy = {test_accuracy}")
     print(f"Test auc      = {test_auc}")
+.. autofunction:: khiops_classifier_float_target
+.. code-block:: python
+
+    # Imports
+    import os
+    import pandas as pd
+    from khiops import core as kh
+    from khiops.sklearn import KhiopsClassifier
+    from sklearn.model_selection import train_test_split
+
+    # Load the dataset into a pandas dataframe
+    adult_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt")
+    adult_df = pd.read_csv(adult_path, sep="\t")
+    adult_df["class"] = adult_df["class"].replace({"less": 0.0, "more": 1.0})
+
+    # Split the whole dataframe into train and test (70%-30%)
+    adult_train_df, adult_test_df = train_test_split(
+        adult_df, test_size=0.3, random_state=1
+    )
+
+    # Split the dataset into:
+    # - the X feature table
+    # - the y target vector ("class" column)
+    X_train = adult_train_df.drop("class", axis=1)
+    X_test = adult_test_df.drop("class", axis=1)
+    y_train = adult_train_df["class"]
+
+    # Create the classifier object
+    khc = KhiopsClassifier()
+
+    # Train the classifier
+    khc.fit(X_train, y_train)
+    # Predict the classes on the test dataset
+    y_test_pred = khc.predict(X_test)
+    print("Predicted classes (first 10):")
+    print(y_test_pred[0:10])
+    print("---")
+
+    # Predict the class probabilities on the test dataset
+    y_test_probas = khc.predict_proba(X_test)
+    print(f"Class order: {khc.classes_}")
+    print("Predicted class probabilities (first 10):")
+    print(y_test_probas[0:10])
+    print("---")
+.. autofunction:: khiops_classifier_boolean_target
+.. code-block:: python
+
+    # Imports
+    import os
+    import pandas as pd
+    from khiops import core as kh
+    from khiops.sklearn import KhiopsClassifier
+    from sklearn.model_selection import train_test_split
+
+    # Load the dataset into a pandas dataframe
+    adult_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt")
+    adult_df = pd.read_csv(adult_path, sep="\t")
+    adult_df["class"] = adult_df["class"].replace({"less": False, "more": True})
+
+    # Split the whole dataframe into train and test (70%-30%)
+    adult_train_df, adult_test_df = train_test_split(
+        adult_df, test_size=0.3, random_state=1
+    )
+
+    # Split the dataset into:
+    # - the X feature table
+    # - the y target vector ("class" column)
+    X_train = adult_train_df.drop("class", axis=1)
+    X_test = adult_test_df.drop("class", axis=1)
+    y_train = adult_train_df["class"]
+
+    # Create the classifier object
+    khc = KhiopsClassifier()
+
+    # Train the classifier
+    khc.fit(X_train, y_train)
+
+    # Predict the classes on the test dataset
+    y_test_pred = khc.predict(X_test)
+    print("Predicted classes (first 10):")
+    print(y_test_pred[0:10])
+    print("---")
+
+    # Predict the class probabilities on the test dataset
+    y_test_probas = khc.predict_proba(X_test)
+    print(f"Class order: {khc.classes_}")
+    print("Predicted class probabilities (first 10):")
+    print(y_test_probas[0:10])
+    print("---")
 .. autofunction:: khiops_regressor
 .. code-block:: python
 

diff --git a/khiops/samples/samples_sklearn.ipynb b/khiops/samples/samples_sklearn.ipynb
@@ -545,6 +545,121 @@
     "print(f\"Test auc      = {test_auc}\")"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### `khiops_classifier_float_target()`\n\n",
+    "Trains a `.KhiopsClassifier` on a monotable dataframe\n    with a float target\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Imports\n",
+    "import os\n",
+    "import pandas as pd\n",
+    "from khiops import core as kh\n",
+    "from khiops.sklearn import KhiopsClassifier\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "\n",
+    "# Load the dataset into a pandas dataframe\n",
+    "adult_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.txt\")\n",
+    "adult_df = pd.read_csv(adult_path, sep=\"\\t\")\n",
+    "adult_df[\"class\"] = adult_df[\"class\"].replace({\"less\": 0.0, \"more\": 1.0})\n",
+    "\n",
+    "# Split the whole dataframe into train and test (70%-30%)\n",
+    "adult_train_df, adult_test_df = train_test_split(\n",
+    "    adult_df, test_size=0.3, random_state=1\n",
+    ")\n",
+    "\n",
+    "# Split the dataset into:\n",
+    "# - the X feature table\n",
+    "# - the y target vector (\"class\" column)\n",
+    "X_train = adult_train_df.drop(\"class\", axis=1)\n",
+    "X_test = adult_test_df.drop(\"class\", axis=1)\n",
+    "y_train = adult_train_df[\"class\"]\n",
+    "\n",
+    "# Create the classifier object\n",
+    "khc = KhiopsClassifier()\n",
+    "\n",
+    "# Train the classifier\n",
+    "khc.fit(X_train, y_train)\n",
+    "# Predict the classes on the test dataset\n",
+    "y_test_pred = khc.predict(X_test)\n",
+    "print(\"Predicted classes (first 10):\")\n",
+    "print(y_test_pred[0:10])\n",
+    "print(\"---\")\n",
+    "\n",
+    "# Predict the class probabilities on the test dataset\n",
+    "y_test_probas = khc.predict_proba(X_test)\n",
+    "print(f\"Class order: {khc.classes_}\")\n",
+    "print(\"Predicted class probabilities (first 10):\")\n",
+    "print(y_test_probas[0:10])\n",
+    "print(\"---\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### `khiops_classifier_boolean_target()`\n\n",
+    "Trains a `.KhiopsClassifier` on a monotable dataframe\n    with a boolean target\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Imports\n",
+    "import os\n",
+    "import pandas as pd\n",
+    "from khiops import core as kh\n",
+    "from khiops.sklearn import KhiopsClassifier\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "\n",
+    "# Load the dataset into a pandas dataframe\n",
+    "adult_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.txt\")\n",
+    "adult_df = pd.read_csv(adult_path, sep=\"\\t\")\n",
+    "adult_df[\"class\"] = adult_df[\"class\"].replace({\"less\": False, \"more\": True})\n",
+    "\n",
+    "# Split the whole dataframe into train and test (70%-30%)\n",
+    "adult_train_df, adult_test_df = train_test_split(\n",
+    "    adult_df, test_size=0.3, random_state=1\n",
+    ")\n",
+    "\n",
+    "# Split the dataset into:\n",
+    "# - the X feature table\n",
+    "# - the y target vector (\"class\" column)\n",
+    "X_train = adult_train_df.drop(\"class\", axis=1)\n",
+    "X_test = adult_test_df.drop(\"class\", axis=1)\n",
+    "y_train = adult_train_df[\"class\"]\n",
+    "\n",
+    "# Create the classifier object\n",
+    "khc = KhiopsClassifier()\n",
+    "\n",
+    "# Train the classifier\n",
+    "khc.fit(X_train, y_train)\n",
+    "\n",
+    "# Predict the classes on the test dataset\n",
+    "y_test_pred = khc.predict(X_test)\n",
+    "print(\"Predicted classes (first 10):\")\n",
+    "print(y_test_pred[0:10])\n",
+    "print(\"---\")\n",
+    "\n",
+    "# Predict the class probabilities on the test dataset\n",
+    "y_test_probas = khc.predict_proba(X_test)\n",
+    "print(f\"Class order: {khc.classes_}\")\n",
+    "print(\"Predicted class probabilities (first 10):\")\n",
+    "print(y_test_probas[0:10])\n",
+    "print(\"---\")"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},

diff --git a/khiops/samples/samples_sklearn.py b/khiops/samples/samples_sklearn.py
@@ -119,6 +119,52 @@ def khiops_classifier_float_target():
     # Create the classifier object
     khc = KhiopsClassifier()
 
+    # Train the classifier
+    khc.fit(X_train, y_train)
+    # Predict the classes on the test dataset
+    y_test_pred = khc.predict(X_test)
+    print("Predicted classes (first 10):")
+    print(y_test_pred[0:10])
+    print("---")
+
+    # Predict the class probabilities on the test dataset
+    y_test_probas = khc.predict_proba(X_test)
+    print(f"Class order: {khc.classes_}")
+    print("Predicted class probabilities (first 10):")
+    print(y_test_probas[0:10])
+    print("---")
+
+
+def khiops_classifier_boolean_target():
+    """Trains a `.KhiopsClassifier` on a monotable dataframe
+    with a boolean target"""
+    # Imports
+    import os
+    import pandas as pd
+    from khiops import core as kh
+    from khiops.sklearn import KhiopsClassifier
+    from sklearn.model_selection import train_test_split
+
+    # Load the dataset into a pandas dataframe
+    adult_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt")
+    adult_df = pd.read_csv(adult_path, sep="\t")
+    adult_df["class"] = adult_df["class"].replace({"less": False, "more": True})
+
+    # Split the whole dataframe into train and test (70%-30%)
+    adult_train_df, adult_test_df = train_test_split(
+        adult_df, test_size=0.3, random_state=1
+    )
+
+    # Split the dataset into:
+    # - the X feature table
+    # - the y target vector ("class" column)
+    X_train = adult_train_df.drop("class", axis=1)
+    X_test = adult_test_df.drop("class", axis=1)
+    y_train = adult_train_df["class"]
+
+    # Create the classifier object
+    khc = KhiopsClassifier()
+
     # Train the classifier
     khc.fit(X_train, y_train)
 
@@ -1108,8 +1154,6 @@ def khiops_classifier_multitable_star_file():
     print(f"Test auc      = {test_auc}")
 
 
-exported_samples = [khiops_classifier_float_target]
-"""
 exported_samples = [
     khiops_classifier,
     khiops_classifier_multiclass,
@@ -1118,6 +1162,8 @@ def khiops_classifier_multitable_star_file():
     khiops_classifier_sparse,
     khiops_classifier_pickle,
     khiops_classifier_with_hyperparameters,
+    khiops_classifier_float_target,
+    khiops_classifier_boolean_target,
     khiops_regressor,
     khiops_encoder,
     khiops_encoder_multitable_star,
@@ -1129,7 +1175,6 @@ def khiops_classifier_multitable_star_file():
     khiops_classifier_multitable_list,
     khiops_classifier_multitable_star_file,
 ]
-"""
 
 
 def execute_samples(args):

diff --git a/khiops/sklearn/estimators.py b/khiops/sklearn/estimators.py
@@ -153,6 +153,7 @@ def _check_categorical_target_type(ds):
         or pd.api.types.is_string_dtype(ds.target_column.dtype)
         or pd.api.types.is_integer_dtype(ds.target_column.dtype)
         or pd.api.types.is_float_dtype(ds.target_column.dtype)
+        or pd.api.types.is_bool_dtype(ds.target_column.dtype)
     ):
         raise ValueError(
             f"'y' has invalid type '{ds.target_column_type}'. "
@@ -2093,6 +2094,24 @@ def _is_real_target_dtype_integer(self):
             )
         )
 
+    def _is_real_target_dtype_float(self):
+        """Returns True if the original target dtype is float, or is a categorical
+        dtype whose categories are float; False if no original dtype is set (None)"""
+        return self._original_target_dtype is not None and (
+            pd.api.types.is_float_dtype(self._original_target_dtype)
+            or (
+                isinstance(self._original_target_dtype, pd.CategoricalDtype)
+                and pd.api.types.is_float_dtype(self._original_target_dtype.categories)
+            )
+        )
+
+    def _is_real_target_dtype_bool(self):
+        """Returns True if the original target dtype is boolean, or is a categorical
+        dtype whose categories are boolean; False if no original dtype is set (None)"""
+        return self._original_target_dtype is not None and (
+            pd.api.types.is_bool_dtype(self._original_target_dtype)
+            or (
+                isinstance(self._original_target_dtype, pd.CategoricalDtype)
+                and pd.api.types.is_bool_dtype(self._original_target_dtype.categories)
+            )
+        )
+
     def _sorted_prob_variable_names(self):
         """Returns the model probability variable names in the order of self.classes_"""
         assert self.is_fitted_, "Model not fit yet"
@@ -2195,11 +2214,15 @@ def _fit_training_post_process(self, ds):
             for key in variable.meta_data.keys:
                 if key.startswith("TargetProb"):
                     self.classes_.append(variable.meta_data.get_value(key))
-        if ds.is_in_memory and self._is_real_target_dtype_integer():
-            self.classes_ = [int(class_value) for class_value in self.classes_]
+        if ds.is_in_memory:
+            if self._is_real_target_dtype_integer():
+                self.classes_ = [int(class_value) for class_value in self.classes_]
+            elif self._is_real_target_dtype_float():
+                self.classes_ = [float(class_value) for class_value in self.classes_]
+            elif self._is_real_target_dtype_bool():
+                self.classes_ = [class_value == "True" for class_value in self.classes_]
             self.classes_.sort()
         self.classes_ = column_or_1d(self.classes_)
-
         # Count number of classes
         self.n_classes_ = len(self.classes_)
 
@@ -2273,10 +2296,10 @@ def predict(self, X):
                 self._original_target_dtype
             ):
                 y_pred = y_pred.astype(str, copy=False)
-            elif pd.api.types.is_float_dtype(self._original_target_type):
-                print(self._original_target_type)
-                y_pred = y_pred.astype(str, copy=False)
-                print(y_pred)
+            elif pd.api.types.is_float_dtype(self._original_target_dtype):
+                y_pred = y_pred.astype(float, copy=False)
+            elif pd.api.types.is_bool_dtype(self._original_target_dtype):
+                y_pred = y_pred.astype(bool, copy=False)
             # If category first coerce the type to the categories' type
             else:
                 assert isinstance(self._original_target_dtype, pd.CategoricalDtype), (

diff --git a/tests/test_sklearn_output_types.py b/tests/test_sklearn_output_types.py
@@ -54,11 +54,12 @@ def test_classifier_output_types(self):
         """Test the KhiopsClassifier output types and classes of predict* methods"""
         X, y = create_iris()
         X_mt, X_sec_mt, _ = create_iris_mt()
-
         fixtures = {
             "ys": {
                 "int": y,
                 "int binary": y.replace({0: 0, 1: 0, 2: 1}),
+                "float": y.replace({0: 0.0, 1: 1.0, 2: 2.0}).astype(float),
+                "bool": y.replace({0: True, 1: True, 2: False}),
                 "string": y.replace({0: "se", 1: "vi", 2: "ve"}),
                 "string binary": y.replace({0: "vi_or_se", 1: "vi_or_se", 2: "ve"}),
                 "int as string": y.replace({0: "8", 1: "9", 2: "10"}),
@@ -69,6 +70,8 @@ def test_classifier_output_types(self):
             "y_type_check": {
                 "int": pd.api.types.is_integer_dtype,
                 "int binary": pd.api.types.is_integer_dtype,
+                "float": pd.api.types.is_float_dtype,
+                "bool": pd.api.types.is_bool_dtype,
                 "string": pd.api.types.is_string_dtype,
                 "string binary": pd.api.types.is_string_dtype,
                 "int as string": pd.api.types.is_string_dtype,
@@ -79,6 +82,8 @@ def test_classifier_output_types(self):
             "expected_classes": {
                 "int": column_or_1d([0, 1, 2]),
                 "int binary": column_or_1d([0, 1]),
+                "float": column_or_1d([0.0, 1.0, 2.0]),
+                "bool": column_or_1d([False, True]),
                 "string": column_or_1d(["se", "ve", "vi"]),
                 "string binary": column_or_1d(["ve", "vi_or_se"]),
                 "int as string": column_or_1d(["10", "8", "9"]),