diff --git a/category_encoders/cat_boost.py b/category_encoders/cat_boost.py index 8ce57240..11b813df 100644 --- a/category_encoders/cat_boost.py +++ b/category_encoders/cat_boost.py @@ -103,7 +103,6 @@ def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, def _fit(self, X, y, **kwargs): X = X.copy(deep=True) - self._mean = y.mean() self.mapping = {col: self._fit_column_map(X[col], y) for col in self.cols} diff --git a/category_encoders/utils.py b/category_encoders/utils.py index 3a519366..1671fc55 100644 --- a/category_encoders/utils.py +++ b/category_encoders/utils.py @@ -6,12 +6,13 @@ import pandas as pd import numpy as np import sklearn.base -from pandas.api.types import is_object_dtype, is_string_dtype +from pandas.api.types import is_object_dtype, is_string_dtype, is_numeric_dtype from pandas.core.dtypes.dtypes import CategoricalDtype from sklearn.base import BaseEstimator, TransformerMixin from sklearn.exceptions import NotFittedError from typing import Dict, List, Optional, Union from scipy.sparse import csr_matrix +from sklearn.preprocessing import LabelEncoder __author__ = 'willmcginnis' @@ -294,11 +295,18 @@ def fit(self, X, y=None, **kwargs): Returns self. """ - self._check_fit_inputs(X, y) X, y = convert_inputs(X, y) + self._check_fit_inputs(X, y) self.feature_names_in_ = X.columns.tolist() self.n_features_in_ = len(self.feature_names_in_) + if self._get_tags().get('supervised_encoder'): + if not is_numeric_dtype(y): + self.lab_encoder_ = LabelEncoder() + y = self.lab_encoder_.fit_transform(y) + else: + self.lab_encoder_ = None + self._dim = X.shape[1] self._determine_fit_columns(X) @@ -324,8 +332,12 @@ def fit(self, X, y=None, **kwargs): return self def _check_fit_inputs(self, X, y): - if self._get_tags().get('supervised_encoder') and y is None: - raise ValueError('Supervised encoders need a target for the fitting. The target cannot be None') + if self._get_tags().get('supervised_encoder'): + if y is None: + raise ValueError('Supervised encoders need a target for the fitting. The target cannot be None') + else: + if y.isna().any(): # Target column should never have missing values + raise ValueError("The target column y must not contain missing values.") def _check_transform_inputs(self, X): if self.handle_missing == 'error': @@ -435,6 +447,8 @@ def transform(self, X, y=None, override_return_df=False): # first check the type X, y = convert_inputs(X, y, deep=True) self._check_transform_inputs(X) + if y is not None and self.lab_encoder_ is not None: + y = self.lab_encoder_.transform(y) if not list(self.cols): return X diff --git a/category_encoders/woe.py b/category_encoders/woe.py index af0781bb..a49f6fc2 100644 --- a/category_encoders/woe.py +++ b/category_encoders/woe.py @@ -3,6 +3,7 @@ from category_encoders.ordinal import OrdinalEncoder import category_encoders.utils as util from sklearn.utils.random import check_random_state +import pandas as pd __author__ = 'Jan Motl' @@ -87,6 +88,7 @@ def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, def _fit(self, X, y, **kwargs): # The label must be binary with values {0,1} + y = pd.Series(y) unique = y.unique() if len(unique) != 2: raise ValueError("The target column y must be binary. But the target contains " + str(len(unique)) + " unique value(s).")