Skip to content

Commit

Permalink
Fix numericalencoder (#22)
Browse files Browse the repository at this point in the history
* fix NumericalEncoder with default values

* fix NumericalEncoder with default values
  • Loading branch information
Fabien VAVRAND authored and Lionel MASSOULARD committed Oct 10, 2019
1 parent 3bbc4ab commit f819e2d
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 15 deletions.
22 changes: 9 additions & 13 deletions aikit/transformers/categories.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,16 +185,6 @@ def fit(self, X, y=None):

self.variable_modality_mapping = {col: self.modalities_filter(X[col]) for col in self._columns_to_encode}

self._variable_modality_dict = {}
for col in self._columns_to_encode:
# ddict = defaultdict(lambda :-1, self.variable_modality_mapping[col])
ddict = dict(self.variable_modality_mapping[col])
if "__null__" in self.variable_modality_mapping[col]:
ddict[np.nan] = self.variable_modality_mapping[col]["__null__"]
ddict[None] = self.variable_modality_mapping[col]["__null__"]

self._variable_modality_dict[col] = ddict

# Note: if we do not want an encoding in which the codes follow increasing order, the numbers could be randomized here

if self.encoding_type == "num":
Expand Down Expand Up @@ -266,9 +256,15 @@ def transform(self, X):
return result

def _transform_to_encode(self, X):
all_result_series = [
X[col].map(defaultdict(lambda: -1, self._variable_modality_dict[col])) for col in self._columns_to_encode
]

all_result_series = []
for col, mapping in self.variable_modality_mapping.items():
default_value = -1 if "__default__" not in mapping else mapping["__default__"]
mapping = defaultdict(lambda: default_value, mapping)
if "__null__" in mapping:
mapping[np.nan] = mapping["__null__"]
mapping[None] = mapping["__null__"]
all_result_series.append(X[col].map(mapping))

if self.encoding_type == "num":
result = pd.concat(all_result_series, axis=1, ignore_index=True, copy=False).astype(np.int32)
Expand Down
27 changes: 25 additions & 2 deletions tests/transformers/test_categories.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,8 +268,7 @@ def test_NumericalEncoder_num_fit_parameters():

np.random.seed(123)
df = get_sample_df(100, seed=123)
ind = np.arange(len(df))
df.index = ind
df.index = np.arange(len(df))

df["cat_col_1"] = df["text_col"].apply(lambda s: s[0:3])
df["cat_col_2"] = df["text_col"].apply(lambda s: s[4:7])
Expand Down Expand Up @@ -337,6 +336,30 @@ def test_NumericalEncoder_num_fit_parameters():
assert res["cat_col_3"].nunique() == 4


def test_NumericalEncoder_default_and_null_values():
    """Check that unseen values encode to ``__default__`` and missing values to ``__null__``."""
    np.random.seed(123)
    df = get_sample_df(100, seed=123)
    df.index = np.arange(len(df))

    df["cat_col_1"] = df["text_col"].apply(lambda s: s[0:3])
    # ``.loc`` slicing is label-based and inclusive: rows 0 through 10 become missing.
    df.loc[0:10, "cat_col_1"] = None

    # With these settings the fitted mapping is expected to hold both the
    # '__default__' and the '__null__' special entries (asserted just below).
    encoder = NumericalEncoder(encoding_type="num", min_modalities_number=2, max_cum_proba=0.8, max_na_percentage=0)
    encoder.fit_transform(df)

    mapping = encoder.model.variable_modality_mapping["cat_col_1"]
    assert "__default__" in mapping
    assert "__null__" in mapping

    # Every row holds a value never seen during fit, so every row (not just
    # the first distinct one) must be encoded with the '__default__' code.
    df["cat_col_1"] = "zzz"
    res = encoder.transform(df)
    assert (res["cat_col_1"] == mapping["__default__"]).all()

    # Every row is missing, so every row must be encoded with the '__null__' code.
    df["cat_col_1"] = None
    res = encoder.transform(df)
    assert (res["cat_col_1"] == mapping["__null__"]).all()


def test_NumericalEncoder_with_boolean():
dfX = pd.DataFrame({"c": [True, False] * 200})

Expand Down

0 comments on commit f819e2d

Please sign in to comment.