Skip to content

Commit

Permalink
support float and boolean targets in KhiopsClassifier
Browse files Browse the repository at this point in the history
  • Loading branch information
nairbenrekia committed Dec 11, 2024
1 parent 747d9c4 commit 19ebe3c
Show file tree
Hide file tree
Showing 5 changed files with 288 additions and 11 deletions.
89 changes: 89 additions & 0 deletions doc/samples/samples_sklearn.rst
Original file line number Diff line number Diff line change
Expand Up @@ -479,6 +479,95 @@ Samples
test_auc = metrics.roc_auc_score(y_test, y_test_probas[:, 1])
print(f"Test accuracy = {test_accuracy}")
print(f"Test auc = {test_auc}")
.. autofunction:: khiops_classifier_float_target
.. code-block:: python
# Imports
import os
import pandas as pd
from khiops import core as kh
from khiops.sklearn import KhiopsClassifier
from sklearn.model_selection import train_test_split
# Load the dataset into a pandas dataframe
adult_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt")
adult_df = pd.read_csv(adult_path, sep="\t")
adult_df["class"] = adult_df["class"].replace({"less": 0.0, "more": 1.0})
# Split the whole dataframe into train and test (70%-30%)
adult_train_df, adult_test_df = train_test_split(
adult_df, test_size=0.3, random_state=1
)
# Split the dataset into:
# - the X feature table
# - the y target vector ("class" column)
X_train = adult_train_df.drop("class", axis=1)
X_test = adult_test_df.drop("class", axis=1)
y_train = adult_train_df["class"]
# Create the classifier object
khc = KhiopsClassifier()
# Train the classifier
khc.fit(X_train, y_train)
# Predict the classes on the test dataset
y_test_pred = khc.predict(X_test)
print("Predicted classes (first 10):")
print(y_test_pred[0:10])
print("---")
# Predict the class probabilities on the test dataset
y_test_probas = khc.predict_proba(X_test)
print(f"Class order: {khc.classes_}")
print("Predicted class probabilities (first 10):")
print(y_test_probas[0:10])
print("---")
.. autofunction:: khiops_classifier_boolean_target
.. code-block:: python
# Imports
import os
import pandas as pd
from khiops import core as kh
from khiops.sklearn import KhiopsClassifier
from sklearn.model_selection import train_test_split
# Load the dataset into a pandas dataframe
adult_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt")
adult_df = pd.read_csv(adult_path, sep="\t")
adult_df["class"] = adult_df["class"].replace({"less": False, "more": True})
# Split the whole dataframe into train and test (70%-30%)
adult_train_df, adult_test_df = train_test_split(
adult_df, test_size=0.3, random_state=1
)
# Split the dataset into:
# - the X feature table
# - the y target vector ("class" column)
X_train = adult_train_df.drop("class", axis=1)
X_test = adult_test_df.drop("class", axis=1)
y_train = adult_train_df["class"]
# Create the classifier object
khc = KhiopsClassifier()
# Train the classifier
khc.fit(X_train, y_train)
# Predict the classes on the test dataset
y_test_pred = khc.predict(X_test)
print("Predicted classes (first 10):")
print(y_test_pred[0:10])
print("---")
# Predict the class probabilities on the test dataset
y_test_probas = khc.predict_proba(X_test)
print(f"Class order: {khc.classes_}")
print("Predicted class probabilities (first 10):")
print(y_test_probas[0:10])
print("---")
.. autofunction:: khiops_regressor
.. code-block:: python
Expand Down
115 changes: 115 additions & 0 deletions khiops/samples/samples_sklearn.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -545,6 +545,121 @@
"print(f\"Test auc = {test_auc}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### `khiops_classifier_float_target()`\n\n",
"Trains a `.KhiopsClassifier` on a monotable dataframe\n with a float target\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Imports\n",
"import os\n",
"import pandas as pd\n",
"from khiops import core as kh\n",
"from khiops.sklearn import KhiopsClassifier\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"# Load the dataset into a pandas dataframe\n",
"adult_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.txt\")\n",
"adult_df = pd.read_csv(adult_path, sep=\"\\t\")\n",
"adult_df[\"class\"] = adult_df[\"class\"].replace({\"less\": 0.0, \"more\": 1.0})\n",
"\n",
"# Split the whole dataframe into train and test (70%-30%)\n",
"adult_train_df, adult_test_df = train_test_split(\n",
" adult_df, test_size=0.3, random_state=1\n",
")\n",
"\n",
"# Split the dataset into:\n",
"# - the X feature table\n",
"# - the y target vector (\"class\" column)\n",
"X_train = adult_train_df.drop(\"class\", axis=1)\n",
"X_test = adult_test_df.drop(\"class\", axis=1)\n",
"y_train = adult_train_df[\"class\"]\n",
"\n",
"# Create the classifier object\n",
"khc = KhiopsClassifier()\n",
"\n",
"# Train the classifier\n",
"khc.fit(X_train, y_train)\n",
"# Predict the classes on the test dataset\n",
"y_test_pred = khc.predict(X_test)\n",
"print(\"Predicted classes (first 10):\")\n",
"print(y_test_pred[0:10])\n",
"print(\"---\")\n",
"\n",
"# Predict the class probabilities on the test dataset\n",
"y_test_probas = khc.predict_proba(X_test)\n",
"print(f\"Class order: {khc.classes_}\")\n",
"print(\"Predicted class probabilities (first 10):\")\n",
"print(y_test_probas[0:10])\n",
"print(\"---\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### `khiops_classifier_boolean_target()`\n\n",
"Trains a `.KhiopsClassifier` on a monotable dataframe\n with a boolean target\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Imports\n",
"import os\n",
"import pandas as pd\n",
"from khiops import core as kh\n",
"from khiops.sklearn import KhiopsClassifier\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"# Load the dataset into a pandas dataframe\n",
"adult_path = os.path.join(kh.get_samples_dir(), \"Adult\", \"Adult.txt\")\n",
"adult_df = pd.read_csv(adult_path, sep=\"\\t\")\n",
"adult_df[\"class\"] = adult_df[\"class\"].replace({\"less\": False, \"more\": True})\n",
"\n",
"# Split the whole dataframe into train and test (70%-30%)\n",
"adult_train_df, adult_test_df = train_test_split(\n",
" adult_df, test_size=0.3, random_state=1\n",
")\n",
"\n",
"# Split the dataset into:\n",
"# - the X feature table\n",
"# - the y target vector (\"class\" column)\n",
"X_train = adult_train_df.drop(\"class\", axis=1)\n",
"X_test = adult_test_df.drop(\"class\", axis=1)\n",
"y_train = adult_train_df[\"class\"]\n",
"\n",
"# Create the classifier object\n",
"khc = KhiopsClassifier()\n",
"\n",
"# Train the classifier\n",
"khc.fit(X_train, y_train)\n",
"\n",
"# Predict the classes on the test dataset\n",
"y_test_pred = khc.predict(X_test)\n",
"print(\"Predicted classes (first 10):\")\n",
"print(y_test_pred[0:10])\n",
"print(\"---\")\n",
"\n",
"# Predict the class probabilities on the test dataset\n",
"y_test_probas = khc.predict_proba(X_test)\n",
"print(f\"Class order: {khc.classes_}\")\n",
"print(\"Predicted class probabilities (first 10):\")\n",
"print(y_test_probas[0:10])\n",
"print(\"---\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand Down
51 changes: 48 additions & 3 deletions khiops/samples/samples_sklearn.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,52 @@ def khiops_classifier_float_target():
# Create the classifier object
khc = KhiopsClassifier()

# Train the classifier
khc.fit(X_train, y_train)
# Predict the classes on the test dataset
y_test_pred = khc.predict(X_test)
print("Predicted classes (first 10):")
print(y_test_pred[0:10])
print("---")

# Predict the class probabilities on the test dataset
y_test_probas = khc.predict_proba(X_test)
print(f"Class order: {khc.classes_}")
print("Predicted class probabilities (first 10):")
print(y_test_probas[0:10])
print("---")


def khiops_classifier_boolean_target():
"""Trains a `.KhiopsClassifier` on a monotable dataframe
with a boolean target"""
# Imports
import os
import pandas as pd
from khiops import core as kh
from khiops.sklearn import KhiopsClassifier
from sklearn.model_selection import train_test_split

# Load the dataset into a pandas dataframe
adult_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt")
adult_df = pd.read_csv(adult_path, sep="\t")
adult_df["class"] = adult_df["class"].replace({"less": False, "more": True})

# Split the whole dataframe into train and test (70%-30%)
adult_train_df, adult_test_df = train_test_split(
adult_df, test_size=0.3, random_state=1
)

# Split the dataset into:
# - the X feature table
# - the y target vector ("class" column)
X_train = adult_train_df.drop("class", axis=1)
X_test = adult_test_df.drop("class", axis=1)
y_train = adult_train_df["class"]

# Create the classifier object
khc = KhiopsClassifier()

# Train the classifier
khc.fit(X_train, y_train)

Expand Down Expand Up @@ -1108,8 +1154,6 @@ def khiops_classifier_multitable_star_file():
print(f"Test auc = {test_auc}")


exported_samples = [khiops_classifier_float_target]
"""
exported_samples = [
khiops_classifier,
khiops_classifier_multiclass,
Expand All @@ -1118,6 +1162,8 @@ def khiops_classifier_multitable_star_file():
khiops_classifier_sparse,
khiops_classifier_pickle,
khiops_classifier_with_hyperparameters,
khiops_classifier_float_target,
khiops_classifier_boolean_target,
khiops_regressor,
khiops_encoder,
khiops_encoder_multitable_star,
Expand All @@ -1129,7 +1175,6 @@ def khiops_classifier_multitable_star_file():
khiops_classifier_multitable_list,
khiops_classifier_multitable_star_file,
]
"""


def execute_samples(args):
Expand Down
37 changes: 30 additions & 7 deletions khiops/sklearn/estimators.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,7 @@ def _check_categorical_target_type(ds):
or pd.api.types.is_string_dtype(ds.target_column.dtype)
or pd.api.types.is_integer_dtype(ds.target_column.dtype)
or pd.api.types.is_float_dtype(ds.target_column.dtype)
or pd.api.types.is_bool_dtype(ds.target_column.dtype)
):
raise ValueError(
f"'y' has invalid type '{ds.target_column_type}'. "
Expand Down Expand Up @@ -2093,6 +2094,24 @@ def _is_real_target_dtype_integer(self):
)
)

def _is_real_target_dtype_float(self):
    """Tell whether the recorded original target dtype is float-based

    Returns ``True`` when ``self._original_target_dtype`` is either a float
    dtype or a `pandas.CategoricalDtype` whose categories are floats. Returns
    ``False`` otherwise, including when no original dtype was recorded
    (``None``).
    """
    target_dtype = self._original_target_dtype

    # No dtype recorded: nothing to qualify as float
    if target_dtype is None:
        return False

    # Plain float dtype
    if pd.api.types.is_float_dtype(target_dtype):
        return True

    # Otherwise only a categorical dtype with float categories qualifies
    if not isinstance(target_dtype, pd.CategoricalDtype):
        return False
    return pd.api.types.is_float_dtype(target_dtype.categories)

def _is_real_target_dtype_bool(self):
    """Tell whether the recorded original target dtype is boolean-based

    Returns ``True`` when ``self._original_target_dtype`` is either a boolean
    dtype or a `pandas.CategoricalDtype` whose categories are booleans.
    Returns ``False`` otherwise, including when no original dtype was
    recorded (``None``).
    """
    target_dtype = self._original_target_dtype

    # No dtype recorded: nothing to qualify as boolean
    if target_dtype is None:
        return False

    # Plain boolean dtype
    if pd.api.types.is_bool_dtype(target_dtype):
        return True

    # Otherwise only a categorical dtype with boolean categories qualifies
    if not isinstance(target_dtype, pd.CategoricalDtype):
        return False
    return pd.api.types.is_bool_dtype(target_dtype.categories)

def _sorted_prob_variable_names(self):
"""Returns the model probability variable names in the order of self.classes_"""
assert self.is_fitted_, "Model not fit yet"
Expand Down Expand Up @@ -2195,11 +2214,15 @@ def _fit_training_post_process(self, ds):
for key in variable.meta_data.keys:
if key.startswith("TargetProb"):
self.classes_.append(variable.meta_data.get_value(key))
if ds.is_in_memory and self._is_real_target_dtype_integer():
self.classes_ = [int(class_value) for class_value in self.classes_]
if ds.is_in_memory:
if self._is_real_target_dtype_integer():
self.classes_ = [int(class_value) for class_value in self.classes_]
elif self._is_real_target_dtype_float():
self.classes_ = [float(class_value) for class_value in self.classes_]
elif self._is_real_target_dtype_bool():
self.classes_ = [class_value == "True" for class_value in self.classes_]
self.classes_.sort()
self.classes_ = column_or_1d(self.classes_)

# Count number of classes
self.n_classes_ = len(self.classes_)

Expand Down Expand Up @@ -2273,10 +2296,10 @@ def predict(self, X):
self._original_target_dtype
):
y_pred = y_pred.astype(str, copy=False)
elif pd.api.types.is_float_dtype(self._original_target_type):
print(self._original_target_type)
y_pred = y_pred.astype(str, copy=False)
print(y_pred)
elif pd.api.types.is_float_dtype(self._original_target_dtype):
y_pred = y_pred.astype(float, copy=False)
elif pd.api.types.is_bool_dtype(self._original_target_dtype):
y_pred = y_pred.astype(bool, copy=False)
# If category first coerce the type to the categories' type
else:
assert isinstance(self._original_target_dtype, pd.CategoricalDtype), (
Expand Down
7 changes: 6 additions & 1 deletion tests/test_sklearn_output_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,11 +54,12 @@ def test_classifier_output_types(self):
"""Test the KhiopsClassifier output types and classes of predict* methods"""
X, y = create_iris()
X_mt, X_sec_mt, _ = create_iris_mt()

fixtures = {
"ys": {
"int": y,
"int binary": y.replace({0: 0, 1: 0, 2: 1}),
"float": y.replace({0: 0.0, 1: 1.0, 2: 2.0}).astype(float),
"bool": y.replace({0: True, 1: True, 2: False}),
"string": y.replace({0: "se", 1: "vi", 2: "ve"}),
"string binary": y.replace({0: "vi_or_se", 1: "vi_or_se", 2: "ve"}),
"int as string": y.replace({0: "8", 1: "9", 2: "10"}),
Expand All @@ -69,6 +70,8 @@ def test_classifier_output_types(self):
"y_type_check": {
"int": pd.api.types.is_integer_dtype,
"int binary": pd.api.types.is_integer_dtype,
"float": pd.api.types.is_float_dtype,
"bool": pd.api.types.is_bool_dtype,
"string": pd.api.types.is_string_dtype,
"string binary": pd.api.types.is_string_dtype,
"int as string": pd.api.types.is_string_dtype,
Expand All @@ -79,6 +82,8 @@ def test_classifier_output_types(self):
"expected_classes": {
"int": column_or_1d([0, 1, 2]),
"int binary": column_or_1d([0, 1]),
"float": column_or_1d([0.0, 1.0, 2.0]),
"bool": column_or_1d([False, True]),
"string": column_or_1d(["se", "ve", "vi"]),
"string binary": column_or_1d(["ve", "vi_or_se"]),
"int as string": column_or_1d(["10", "8", "9"]),
Expand Down

0 comments on commit 19ebe3c

Please sign in to comment.