Integrate latest SG contributions (#51)
* Fix pandas 1.0 compatibility (#45)

* pd.core.algorithms doesn't exist anymore

* compatibility with both pandas 1 and pandas 0

* fix test for pandas 1

* change pd.categories handling

* fix typo in test

* .isnull().sum() doesn't work with sparse input ...

* add helper to remove sparse columns

* fix na_replacing for sparse columns

* test on na_replacing

* change requirements to allow pandas 1

* new test for sparse

* fix sparse compatibility for pandas 1

* helper to make something sparse

* change test for pd1

* fix test

* fix conversion if there are categories

* add test with RandomTrainTestCv: shows it can't be used with cross_val_predict and cross_validation (#48)

* Doc improve3 (#46)

* improve doc

* read version from package

* move notebook and scripts inside docs

* massive doc refactoring

* doc refactoring continued

* ignore all .bat files from docs

* doc refactor

* add a stacking notebook

* update notebooks

* python script to execute_notebooks

* downgrade title

* Lgbm wrapper (#47)

* add a function for testing a model

* add wrapper

* fix typo + comments

* create lgbm tests

* remove duplicate import

* cleaning up

* make test optional if no lgbm

* typos and spaces

* dataframe as input in test

* spaces

Co-authored-by: Lionel MASSOULARD <[email protected]>
gfournier and Lionel MASSOULARD authored Apr 2, 2020
1 parent 210f5cc commit fde5e89
Showing 69 changed files with 105,914 additions and 2,273 deletions.
4 changes: 1 addition & 3 deletions .gitignore
@@ -120,6 +120,4 @@ venv.bak/
pytest_report.html
pytest_report_not_long.html
.DS_Store
/docs/make.bat
/docs/make_auto.bat
/docs/make_html.bat
/docs/*.bat
2 changes: 1 addition & 1 deletion aikit/cross_validation.py
@@ -993,6 +993,6 @@ def _iter_test_masks(self, X, y, groups=None):

groups_test = np.sort(np.unique(ugroups[index_test])) # groups to keep in test

boolean_test_index = pd.core.algorithms.match(groups, groups_test) != -1
boolean_test_index = pd.Series(groups).isin(groups_test).values

yield boolean_test_index
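A minimal standalone sketch (not aikit code) of the new idiom: Series.isin builds the same boolean test mask on both pandas 0.x and 1.x, replacing the removed pd.core.algorithms.match:

    import numpy as np
    import pandas as pd

    groups = np.array(["a", "b", "c", "a", "b"])
    groups_test = np.array(["b", "c"])  # groups to keep in test

    # same mask that pd.core.algorithms.match(groups, groups_test) != -1 used to produce
    boolean_test_index = pd.Series(groups).isin(groups_test).values
    print(boolean_test_index)  # [False  True  True False  True]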
11 changes: 11 additions & 0 deletions aikit/models/__init__.py
@@ -8,6 +8,14 @@
from .stacking import StackerClassifier, StackerRegressor, OutSamplerTransformer
from .base import DBSCANWrapper, KMeansWrapper, AgglomerativeClusteringWrapper

try:
import lightgbm
except ImportError:
lightgbm = None

if lightgbm is not None:
from .sklearn_lightgbm_wrapper import LGBMClassifier, LGBMRegressor, LGBMRanker

__all__ = [
"StackerClassifier",
"StackerRegressor",
@@ -16,3 +24,6 @@
"KMeansWrapper",
"AgglomerativeClusteringWrapper",
]

if lightgbm is not None:
__all__ += ["LGBMClassifier", "LGBMRegressor", "LGBMRanker"]
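A short sketch (assuming aikit is installed) of how downstream code can cope with the conditional export above; the fallback estimator is purely illustrative:

    from aikit import models

    if hasattr(models, "LGBMClassifier"):  # exported only when lightgbm is installed
        estimator = models.LGBMClassifier()
    else:
        from sklearn.linear_model import LogisticRegression
        estimator = LogisticRegression()  # illustrative fallback without lightgbm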
989 changes: 989 additions & 0 deletions aikit/models/sklearn_lightgbm_wrapper.py

Large diffs are not rendered by default.
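Since the wrapper diff is not rendered, here is a hedged usage sketch, assuming the new LGBMClassifier follows the usual scikit-learn fit/predict API (DataFrame input is covered by the new tests):

    import numpy as np
    import pandas as pd
    from aikit.models import LGBMClassifier  # only importable when lightgbm is installed

    X = pd.DataFrame({"a": np.random.randn(100), "b": np.random.randn(100)})
    y = (X["a"] + X["b"] > 0).astype(int)

    clf = LGBMClassifier()  # hyper-parameters left at their defaults
    clf.fit(X, y)
    proba = clf.predict_proba(X)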

87 changes: 71 additions & 16 deletions aikit/tools/data_structure_helper.py
@@ -11,6 +11,8 @@

from aikit.enums import DataTypes

_IS_PD1 = int(pd.__version__.split(".")[0]) >= 1


def get_type(data):
"""Retrieve the type of a data
@@ -45,8 +47,8 @@ def get_type(data):
elif type_of_data == np.ndarray:
return DataTypes.NumpyArray

elif type_of_data == pd.SparseDataFrame:
return DataTypes.SparseDataFrame
elif not _IS_PD1 and type_of_data == pd.SparseDataFrame:
return DataTypes.SparseDataFrame # Won't exist in pandas 1.*.*

elif sparse.issparse(data):
return DataTypes.SparseArray
@@ -56,17 +58,37 @@


def get_rid_of_categories(df):
""" helper function to remove pd.categories in a DataFrame """
did_copy = False
for col in df.columns:
if str(df[col].dtype) == "category":
if not did_copy:
df = df.copy()
did_copy = True

df[col] = df[col].get_values()
df[col] = df[col].astype(df[col].cat.categories.dtype)

return df

def get_rid_of_sparse_columns(xx):
""" helper function to remove sparse column in a DataFrame """

if not isinstance(xx, pd.DataFrame):
raise TypeError("This function is for DataFrames only")

did_copy = False
for col in xx.columns:
if hasattr(xx[col], "sparse"):
if not did_copy:
result = xx.copy()
did_copy = True
result[col] = result[col].sparse.to_dense()

if not did_copy:
result = xx # nothing was done

return result


def convert_to_dataframe(xx, mapped_type=None):
""" convert something to a DataFrame """
@@ -105,7 +127,7 @@ def convert_to_array(xx, mapped_type=None):
return convert_to_array(convert_to_dataframe(xx))

if mapped_type == DataTypes.DataFrame:
return xx.values
return get_rid_of_categories(xx).values

elif mapped_type == DataTypes.Serie:
return xx.values.reshape((xx.shape[0], 1))
@@ -139,7 +161,10 @@ def convert_to_sparsearray(xx, mapped_type=None):
return convert_to_sparsearray(convert_to_dataframe(xx))

if mapped_type == DataTypes.DataFrame:
return sparse.csr_matrix(xx.values)
casting_dtype = np.concatenate([np.zeros(1, dtype=s.type) for s in get_rid_of_categories(xx).dtypes]).dtype
return sparse.csr_matrix(xx.values.astype(casting_dtype))
# look at the dtypes in the DataFrame
# concatenating one zero per dtype yields the smallest common dtype that can hold them all

elif mapped_type == DataTypes.Serie:
sparse.csr_matrix(xx.values[:, np.newaxis])  # np.newaxis to make sure I have 2 dimensions
@@ -160,6 +185,13 @@ def convert_to_sparsearray(xx, mapped_type=None):
raise TypeError("I don't know how to convert that %s" % type(xx))


def convert_to_sparseserie(xx):
""" helper to convert a serie to its sparse equivalent """
if hasattr(xx, "sparse"):
return xx # nothing to do

return xx.astype(pd.SparseDtype(xx.dtype))

def convert_to_sparsedataframe(xx, mapped_type=None):
""" convert something to a Sparse DataFrame """

@@ -170,19 +202,39 @@ def convert_to_sparsedataframe(xx, mapped_type=None):
return convert_to_sparsedataframe(convert_to_dataframe(xx))

if mapped_type == DataTypes.DataFrame:
return pd.SparseDataFrame(xx, default_fill_value=0)
if _IS_PD1:
result = xx.copy()
for col in xx.columns:
result[col] = xx[col].astype(pd.SparseDtype(xx.dtypes[col]))
return result
else:
return pd.SparseDataFrame(xx, default_fill_value=0)

elif mapped_type == DataTypes.Serie:
return pd.SparseDataFrame(pd.DataFrame(xx), default_fill_value=0)
if _IS_PD1:
return pd.DataFrame(xx, dtype=pd.SparseDtype(xx.dtype), index=xx.index)
else:
return pd.SparseDataFrame(pd.DataFrame(xx), default_fill_value=0)


elif mapped_type == DataTypes.NumpyArray:

if xx.ndim == 1:
return pd.SparseDataFrame(xx.reshape((xx.shape[0], 1)), default_fill_value=0)
if _IS_PD1:
return pd.DataFrame({0: pd.arrays.SparseArray(xx)})
else:
return pd.SparseDataFrame(xx.reshape((xx.shape[0], 1)), default_fill_value=0)
else:
return pd.SparseDataFrame(xx, default_fill_value=0)
if _IS_PD1:
return pd.DataFrame({j:pd.arrays.SparseArray(xx[:,j]) for j in range(xx.shape[1])})
else:
return pd.SparseDataFrame(xx, default_fill_value=0)

elif mapped_type == DataTypes.SparseArray:
return pd.SparseDataFrame(xx, default_fill_value=0)
if _IS_PD1:
return pd.DataFrame.sparse.from_spmatrix(xx)
else:
return pd.SparseDataFrame(xx, default_fill_value=0)

elif mapped_type == DataTypes.SparseDataFrame:
return xx
@@ -204,6 +256,9 @@ def convert_tononsparse(xx, mapped_type=None):

elif mapped_type == DataTypes.SparseDataFrame:
return convert_to_dataframe(xx)

elif _IS_PD1 and mapped_type == DataTypes.DataFrame:
return get_rid_of_sparse_columns(xx)

else:
return xx
@@ -307,10 +362,10 @@ def _nbrows(data):
return s[0]


def guess_output_type(all_datas):
def guess_output_type(all_datas, max_number_of_cells_for_non_sparse=10000000):
""" try to guess which output type should be better based on size of the data """

MAX_NUMBER_OF_CELLS = 10000000 # 1000 * 10000 # 1000 columns and 10 000 rows
# 1000 * 10000 # 1000 columns and 10 000 rows
all_types = [get_type(data) for data in all_datas]
all_types = list(np.unique([t for t in all_types if t is not None]))

@@ -337,15 +392,15 @@ def guess_output_type(all_datas):
# careful: np.sum result should be cast to int, otherwise it can be np.int32 and the product below can overflow

# Lots of data point
if expected_number_of_columns * _nbrows(all_datas[0]) >= MAX_NUMBER_OF_CELLS:
if expected_number_of_columns * _nbrows(all_datas[0]) >= max_number_of_cells_for_non_sparse:
return DataTypes.SparseArray
else:
return DataTypes.NumpyArray

else:
expected_number_of_columns = int(np.sum([_nbcols(data) for data in all_datas]))
# careful: np.sum result should be cast to int, otherwise it can be np.int32 and generate overflow
if expected_number_of_columns * _nbrows(all_datas[0]) >= MAX_NUMBER_OF_CELLS:
if expected_number_of_columns * _nbrows(all_datas[0]) >= max_number_of_cells_for_non_sparse:
return DataTypes.SparseArray
# return DataTypes.SparseDataFrame
else:
@@ -464,7 +519,7 @@ def guess_hstack_index(all_datas, raise_if_different=False):
return all_indexes[0]


def generic_hstack(all_datas, output_type=None, all_columns_names=None):
def generic_hstack(all_datas, output_type=None, all_columns_names=None, max_number_of_cells_for_non_sparse=10000000):
""" generic function to concatenate horizontaly some datas objects
All datas should have the same number of rows
@@ -487,7 +542,7 @@ def generic_hstack(all_datas, output_type=None, all_columns_names=None):
"""

if output_type is None:
output_type = guess_output_type(all_datas)
output_type = guess_output_type(all_datas, max_number_of_cells_for_non_sparse=max_number_of_cells_for_non_sparse)

all_datas = [data for data in all_datas if _nbcols(data) > 0]
nb_of_datas = len(all_datas)
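For reference, a standalone sketch (pandas >= 1.0) of the sparse idioms the new _IS_PD1 branches rely on (pd.SparseDtype, the .sparse accessor and pd.DataFrame.sparse.from_spmatrix), which replace the removed pd.SparseDataFrame:

    import numpy as np
    import pandas as pd
    from scipy import sparse

    # dense Series -> sparse column, as convert_to_sparseserie does
    s = pd.Series([0.0, 0.0, 1.5])
    s_sparse = s.astype(pd.SparseDtype(s.dtype))

    # sparse column -> dense, as get_rid_of_sparse_columns does
    back = s_sparse.sparse.to_dense()

    # scipy sparse matrix -> DataFrame with sparse columns
    # (the pandas 1 branch of convert_to_sparsedataframe for SparseArray input)
    m = sparse.csr_matrix(np.eye(3))
    df = pd.DataFrame.sparse.from_spmatrix(m)

    # hasattr(column, "sparse") is the check used throughout to detect sparse columns
    assert hasattr(df[0], "sparse") and not hasattr(s, "sparse")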
2 changes: 1 addition & 1 deletion aikit/tools/db_informations.py
@@ -113,7 +113,7 @@ def has_missing_values(s):
if not isinstance(s, pd.Series):
raise TypeError("s should be a Serie, not a '%s'" % type(s))

return bool(s.isnull().sum() > 0) # to prevent np.bool_
return bool(np.asarray(s.isnull()).sum().sum() > 0) # to prevent np.bool_


def get_columns_informations(dfX):
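A standalone sketch of the workaround: densify the null mask with np.asarray before summing, since the commit reports that .isnull().sum() does not work directly on sparse input:

    import numpy as np
    import pandas as pd

    s = pd.Series([1.0, np.nan, 0.0]).astype(pd.SparseDtype("float64"))

    # densify the boolean mask first, then summing always yields a plain count
    has_missing = bool(np.asarray(s.isnull()).sum() > 0)
    print(has_missing)  # True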
3 changes: 1 addition & 2 deletions aikit/transformers/categories.py
@@ -74,8 +74,7 @@ def modalities_filter(self, input_serie):
raise TypeError("input_serie should be a pd.Series")

value_count = input_serie.value_counts()
nb_null = input_serie.isnull().sum()

nb_null = np.asarray(input_serie.isnull().values).sum() # Remark: input_serie.isnull().sum() doesn't work on a sparse Series
if nb_null > self.max_na_percentage * len(input_serie):
value_count["__null__"] = nb_null
value_count.sort_values(ascending=False, inplace=True)
21 changes: 14 additions & 7 deletions aikit/transformers/target.py
@@ -131,15 +131,18 @@ def smoothing(self, nb):
@staticmethod
def na_remplacing(serie):
""" remplace None with '_missing_' to make it a modality """


if hasattr(serie, "sparse"):
serie = serie.sparse.to_dense().copy()

ii_null = serie.isnull()

if not ii_null.any():
return serie

result_serie = serie.copy()
result_serie[ii_null] = "_missing_"

return result_serie

def fit(self, X, y):
@@ -200,7 +203,7 @@ def fit(self, X, y):
# Columns on which we want None to be a special modality
self._na_to_null = dict()
for col in self._columns_to_encode:
ii_null = X[col].isnull()
ii_null = X[col].isnull().values # for sparse column I need to call '.values' so that '.sum()' works on it
self._na_to_null[col] = ii_null.sum() >= self.max_na_percentage * len(X)

self._target_aggregat, self._target_aggregat_global = self._fit_aggregat(X, sy, noise_level=None)
@@ -317,8 +320,12 @@ def _transform_aggregat(self, X, target_aggregat, target_aggregat_global):
Xcol = self.na_remplacing(X[col])
else:
Xcol = X[col]

result = Xcol.apply(lambda x: self.get_value(x, target_aggregat[col], target_aggregat_global[col]))

if hasattr(Xcol, "sparse"):
result = Xcol.sparse.to_dense().apply(lambda x: self.get_value(x, target_aggregat[col], target_aggregat_global[col]))
# for some reason apply doesn't work correctly on sparse data
else:
result = Xcol.apply(lambda x: self.get_value(x, target_aggregat[col], target_aggregat_global[col]))
# result.columns = ["%s__%s" % (col,c) for c in result.columns]
all_results.append(result)

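A standalone sketch of the pattern applied in na_remplacing and _transform_aggregat: convert the sparse column to dense with .sparse.to_dense() before boolean assignment or .apply:

    import numpy as np
    import pandas as pd

    col = pd.Series([1.0, np.nan, 2.0]).astype(pd.SparseDtype("float64"))

    dense = col.sparse.to_dense().copy()
    dense[dense.isnull()] = -1.0  # stand-in for the "_missing_" modality
    result = dense.apply(lambda x: x * 10)  # .apply on the dense copy works as usual
    print(result.tolist())  # [10.0, -10.0, 20.0]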
15 changes: 14 additions & 1 deletion docs/adding_new_models.rst
@@ -13,6 +13,7 @@ A model needs to be added at two different places in order to be fully integrate
Let's see what needs to be done to include a hypothetical new model::

class ReallyCoolNewTransformer(BaseEstimator, TransformerMixin):
""" This is a great new transformer """
def __init__(self, super_choice):
self.super_choice = super_choice
@@ -63,7 +64,8 @@ To do that you need to use the @register decorator::
custom_hyper = {"super_parameters":hp.HyperChoice(("superchoice_a","superchoice_b"))}

See _model_register for complete description of register
See :ref:`model_register` for a complete description of the register.
See :ref:`hyper_parameters` for a complete description of the hyper-parameters.

Remark:
The registers behave like singletons so you can modify them in any part of the code.
Expand All @@ -74,5 +76,16 @@ If a model is stable and tested enough the new entry can be added to the python
* 'model_definition.py' : for the simple register
* ml_machine/ml_machine_registration.py : for the full auto-ml register

(See :ref:`contribution` for details about how to contribute to the evolution of the library)

Remark: you don't need to use the wrapper for your model to be incorporated in the framework. However, it is best to do so: that way you can focus on the logic and let the wrapper make your model more generic.
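For completeness, a minimal self-contained version of the hypothetical transformer used on this page; the fit/transform bodies are illustrative only, following the scikit-learn API:

    import numpy as np
    from sklearn.base import BaseEstimator, TransformerMixin

    class ReallyCoolNewTransformer(BaseEstimator, TransformerMixin):
        """ This is a great new transformer (illustrative behaviour only) """

        def __init__(self, super_choice="superchoice_a"):
            self.super_choice = super_choice

        def fit(self, X, y=None):
            return self  # nothing to learn in this sketch

        def transform(self, X):
            X = np.asarray(X, dtype=float)
            if self.super_choice == "superchoice_a":
                return X * 2.0
            return X - X.mean(axis=0)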


.. toctree::
:maxdepth: 1
:hidden:

model_register
hyper_parameters


22 changes: 0 additions & 22 deletions docs/aikit.datasets.rst

This file was deleted.

22 changes: 0 additions & 22 deletions docs/aikit.datasets.tests.rst

This file was deleted.
