Integrate latest SG contributions (#51)
* Fix pandas 1.0 compatibility (#45)

* pd.core.algorithms doesn't exist anymore

* compatibility with both pandas 1 and pandas 0

* fix test for pandas 1

* change pd.categories handling

* fix typo in test

* .isnull().sum() doesn't work with sparse input ...

* add helper to remove sparse columns

* fix na_replacing for sparse columns

* test on na_replacing

* change requirements to allow pandas 1

* new test for sparse

* fix sparse compatibility for pandas 1

* helper to make something sparse

* change test for pd1

* fix test

* fix conversion if there are categories

* add test with RandomTrainTestCv: shows it can't be used with cross_val_predict and cross_validation (#48)

* Doc improve3 (#46)

* improve doc

* read version from package

* move notebook and scripts inside docs

* massive doc refactoring

* doc refactoring continued

* ignore all .bat files from docs

* doc refactor

* add a stacking notebook

* update notebooks

* python script to execute_notebooks

* downgrade title

* Lgbm wrapper (#47)

* add a function for testing a model

* add wrapper

* fix typo + comments

* create lgbm tests

* remove duplicate import

* cleaning up

* make test optional if no lgbm

* typos and spaces

* dataframe as input in test

* spaces

Co-authored-by: Lionel MASSOULARD <[email protected]>
gfournier and Lionel MASSOULARD authored Apr 2, 2020
1 parent 210f5cc commit fde5e89
Showing 69 changed files with 105,914 additions and 2,273 deletions.
4 changes: 1 addition & 3 deletions .gitignore
@@ -120,6 +120,4 @@ venv.bak/
pytest_report.html
pytest_report_not_long.html
.DS_Store
/docs/make.bat
/docs/make_auto.bat
/docs/make_html.bat
/docs/*.bat
2 changes: 1 addition & 1 deletion aikit/cross_validation.py
@@ -993,6 +993,6 @@ def _iter_test_masks(self, X, y, groups=None):

groups_test = np.sort(np.unique(ugroups[index_test])) # groups to keep in test

boolean_test_index = pd.core.algorithms.match(groups, groups_test) != -1
boolean_test_index = pd.Series(groups).isin(groups_test).values

yield boolean_test_index
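A minimal standalone sketch (not aikit code) of the new idiom: Series.isin builds the same boolean test mask on both pandas 0.x and 1.x, replacing the removed pd.core.algorithms.match:

    import numpy as np
    import pandas as pd

    groups = np.array(["a", "b", "c", "a", "b"])
    groups_test = np.array(["b", "c"])  # groups to keep in test

    # same mask that pd.core.algorithms.match(groups, groups_test) != -1 used to produce
    boolean_test_index = pd.Series(groups).isin(groups_test).values
    print(boolean_test_index)  # [False  True  True False  True]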
11 changes: 11 additions & 0 deletions aikit/models/__init__.py
@@ -8,6 +8,14 @@
from .stacking import StackerClassifier, StackerRegressor, OutSamplerTransformer
from .base import DBSCANWrapper, KMeansWrapper, AgglomerativeClusteringWrapper

try:
import lightgbm
except ImportError:
lightgbm = None

if lightgbm is not None:
from .sklearn_lightgbm_wrapper import LGBMClassifier, LGBMRegressor, LGBMRanker

__all__ = [
"StackerClassifier",
"StackerRegressor",
@@ -16,3 +24,6 @@
"KMeansWrapper",
"AgglomerativeClusteringWrapper",
]

if lightgbm is not None:
__all__ += ["LGBMClassifier", "LGBMRegressor", "LGBMRanker"]
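A short sketch (assuming aikit is installed) of how downstream code can cope with the conditional export above; the fallback estimator is purely illustrative:

    from aikit import models

    if hasattr(models, "LGBMClassifier"):  # exported only when lightgbm is installed
        estimator = models.LGBMClassifier()
    else:
        from sklearn.linear_model import LogisticRegression
        estimator = LogisticRegression()  # illustrative fallback without lightgbm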
989 changes: 989 additions & 0 deletions aikit/models/sklearn_lightgbm_wrapper.py

Large diffs are not rendered by default.
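Since the wrapper diff is not rendered, here is a hedged usage sketch, assuming the new LGBMClassifier follows the usual scikit-learn fit/predict API (DataFrame input is covered by the new tests):

    import numpy as np
    import pandas as pd
    from aikit.models import LGBMClassifier  # only importable when lightgbm is installed

    X = pd.DataFrame({"a": np.random.randn(100), "b": np.random.randn(100)})
    y = (X["a"] + X["b"] > 0).astype(int)

    clf = LGBMClassifier()  # hyper-parameters left at their defaults
    clf.fit(X, y)
    proba = clf.predict_proba(X)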

87 changes: 71 additions & 16 deletions aikit/tools/data_structure_helper.py
@@ -11,6 +11,8 @@

from aikit.enums import DataTypes

_IS_PD1 = int(pd.__version__.split(".")[0]) >= 1


def get_type(data):
"""Retrieve the type of a data
@@ -45,8 +47,8 @@ def get_type(data):
elif type_of_data == np.ndarray:
return DataTypes.NumpyArray

elif type_of_data == pd.SparseDataFrame:
return DataTypes.SparseDataFrame
elif not _IS_PD1 and type_of_data == pd.SparseDataFrame:
return DataTypes.SparseDataFrame # Won't exist in pandas 1.*.*

elif sparse.issparse(data):
return DataTypes.SparseArray
@@ -56,17 +58,37 @@


def get_rid_of_categories(df):
""" helper function to remove pd.categories in a DataFrame """
did_copy = False
for col in df.columns:
if str(df[col].dtype) == "category":
if not did_copy:
df = df.copy()
did_copy = True

df[col] = df[col].get_values()
df[col] = df[col].astype(df[col].cat.categories.dtype)

return df

def get_rid_of_sparse_columns(xx):
""" helper function to remove sparse column in a DataFrame """

if not isinstance(xx, pd.DataFrame):
raise TypeError("This function is for DataFrames only")

did_copy = False
for col in xx.columns:
if hasattr(xx[col], "sparse"):
if not did_copy:
result = xx.copy()
did_copy = True
result[col] = result[col].sparse.to_dense()

if not did_copy:
result = xx # nothing was done

return result


def convert_to_dataframe(xx, mapped_type=None):
""" convert something to a DataFrame """
@@ -105,7 +127,7 @@ def convert_to_array(xx, mapped_type=None):
return convert_to_array(convert_to_dataframe(xx))

if mapped_type == DataTypes.DataFrame:
return xx.values
return get_rid_of_categories(xx).values

elif mapped_type == DataTypes.Serie:
return xx.values.reshape((xx.shape[0], 1))
@@ -139,7 +161,10 @@ def convert_to_sparsearray(xx, mapped_type=None):
return convert_to_sparsearray(convert_to_dataframe(xx))

if mapped_type == DataTypes.DataFrame:
return sparse.csr_matrix(xx.values)
casting_dtype = np.concatenate([np.zeros(1, dtype=s.type) for s in get_rid_of_categories(xx).dtypes]).dtype
return sparse.csr_matrix(xx.values.astype(casting_dtype))
# look at the dtypes in the DataFrame
# concatenating one zero per dtype yields the smallest common dtype that can hold them all

elif mapped_type == DataTypes.Serie:
sparse.csr_matrix(xx.values[:, np.newaxis])  # np.newaxis to make sure I have 2 dimensions
@@ -160,6 +185,13 @@ def convert_to_sparsearray(xx, mapped_type=None):
raise TypeError("I don't know how to convert that %s" % type(xx))


def convert_to_sparseserie(xx):
""" helper to convert a serie to its sparse equivalent """
if hasattr(xx, "sparse"):
return xx # nothing to do

return xx.astype(pd.SparseDtype(xx.dtype))

def convert_to_sparsedataframe(xx, mapped_type=None):
""" convert something to a Sparse DataFrame """

@@ -170,19 +202,39 @@ def convert_to_sparsedataframe(xx, mapped_type=None):
return convert_to_sparsedataframe(convert_to_dataframe(xx))

if mapped_type == DataTypes.DataFrame:
return pd.SparseDataFrame(xx, default_fill_value=0)
if _IS_PD1:
result = xx.copy()
for col in xx.columns:
result[col] = xx[col].astype(pd.SparseDtype(xx.dtypes[col]))
return result
else:
return pd.SparseDataFrame(xx, default_fill_value=0)

elif mapped_type == DataTypes.Serie:
return pd.SparseDataFrame(pd.DataFrame(xx), default_fill_value=0)
if _IS_PD1:
return pd.DataFrame(xx, dtype=pd.SparseDtype(xx.dtype), index=xx.index)
else:
return pd.SparseDataFrame(pd.DataFrame(xx), default_fill_value=0)


elif mapped_type == DataTypes.NumpyArray:

if xx.ndim == 1:
return pd.SparseDataFrame(xx.reshape((xx.shape[0], 1)), default_fill_value=0)
if _IS_PD1:
return pd.DataFrame({0: pd.arrays.SparseArray(xx)})
else:
return pd.SparseDataFrame(xx.reshape((xx.shape[0], 1)), default_fill_value=0)
else:
return pd.SparseDataFrame(xx, default_fill_value=0)
if _IS_PD1:
return pd.DataFrame({j:pd.arrays.SparseArray(xx[:,j]) for j in range(xx.shape[1])})
else:
return pd.SparseDataFrame(xx, default_fill_value=0)

elif mapped_type == DataTypes.SparseArray:
return pd.SparseDataFrame(xx, default_fill_value=0)
if _IS_PD1:
return pd.DataFrame.sparse.from_spmatrix(xx)
else:
return pd.SparseDataFrame(xx, default_fill_value=0)

elif mapped_type == DataTypes.SparseDataFrame:
return xx
@@ -204,6 +256,9 @@ def convert_tononsparse(xx, mapped_type=None):

elif mapped_type == DataTypes.SparseDataFrame:
return convert_to_dataframe(xx)

elif _IS_PD1 and mapped_type == DataTypes.DataFrame:
return get_rid_of_sparse_columns(xx)

else:
return xx
@@ -307,10 +362,10 @@ def _nbrows(data):
return s[0]


def guess_output_type(all_datas):
def guess_output_type(all_datas, max_number_of_cells_for_non_sparse=10000000):
""" try to guess which output type should be better based on size of the data """

MAX_NUMBER_OF_CELLS = 10000000 # 1000 * 10000 # 1000 columns and 10 000 rows
# 1000 * 10000 # 1000 columns and 10 000 rows
all_types = [get_type(data) for data in all_datas]
all_types = list(np.unique([t for t in all_types if t is not None]))

@@ -337,15 +392,15 @@ def guess_output_type(all_datas):
# careful: np.sum result should be cast to int, otherwise it can be np.int32 and the product below can overflow

# Lots of data point
if expected_number_of_columns * _nbrows(all_datas[0]) >= MAX_NUMBER_OF_CELLS:
if expected_number_of_columns * _nbrows(all_datas[0]) >= max_number_of_cells_for_non_sparse:
return DataTypes.SparseArray
else:
return DataTypes.NumpyArray

else:
expected_number_of_columns = int(np.sum([_nbcols(data) for data in all_datas]))
# careful: np.sum result should be cast to int, otherwise it can be np.int32 and generate overflow
if expected_number_of_columns * _nbrows(all_datas[0]) >= MAX_NUMBER_OF_CELLS:
if expected_number_of_columns * _nbrows(all_datas[0]) >= max_number_of_cells_for_non_sparse:
return DataTypes.SparseArray
# return DataTypes.SparseDataFrame
else:
@@ -464,7 +519,7 @@ def guess_hstack_index(all_datas, raise_if_different=False):
return all_indexes[0]


def generic_hstack(all_datas, output_type=None, all_columns_names=None):
def generic_hstack(all_datas, output_type=None, all_columns_names=None, max_number_of_cells_for_non_sparse=10000000):
""" generic function to concatenate horizontaly some datas objects
All datas should have the same number of rows
@@ -487,7 +542,7 @@ def generic_hstack(all_datas, output_type=None, all_columns_names=None):
"""

if output_type is None:
output_type = guess_output_type(all_datas)
output_type = guess_output_type(all_datas, max_number_of_cells_for_non_sparse=max_number_of_cells_for_non_sparse)

all_datas = [data for data in all_datas if _nbcols(data) > 0]
nb_of_datas = len(all_datas)
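For reference, a standalone sketch (pandas >= 1.0) of the sparse idioms the new _IS_PD1 branches rely on (pd.SparseDtype, the .sparse accessor and pd.DataFrame.sparse.from_spmatrix), which replace the removed pd.SparseDataFrame:

    import numpy as np
    import pandas as pd
    from scipy import sparse

    # dense Series -> sparse column, as convert_to_sparseserie does
    s = pd.Series([0.0, 0.0, 1.5])
    s_sparse = s.astype(pd.SparseDtype(s.dtype))

    # sparse column -> dense, as get_rid_of_sparse_columns does
    back = s_sparse.sparse.to_dense()

    # scipy sparse matrix -> DataFrame with sparse columns
    # (the pandas 1 branch of convert_to_sparsedataframe for SparseArray input)
    m = sparse.csr_matrix(np.eye(3))
    df = pd.DataFrame.sparse.from_spmatrix(m)

    # hasattr(column, "sparse") is the check used throughout to detect sparse columns
    assert hasattr(df[0], "sparse") and not hasattr(s, "sparse")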
2 changes: 1 addition & 1 deletion aikit/tools/db_informations.py
@@ -113,7 +113,7 @@ def has_missing_values(s):
if not isinstance(s, pd.Series):
raise TypeError("s should be a Serie, not a '%s'" % type(s))

return bool(s.isnull().sum() > 0) # to prevent np.bool_
return bool(np.asarray(s.isnull()).sum().sum() > 0) # to prevent np.bool_


def get_columns_informations(dfX):
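A standalone sketch of the workaround: densify the null mask with np.asarray before summing, since the commit reports that .isnull().sum() does not work directly on sparse input:

    import numpy as np
    import pandas as pd

    s = pd.Series([1.0, np.nan, 0.0]).astype(pd.SparseDtype("float64"))

    # densify the boolean mask first, then summing always yields a plain count
    has_missing = bool(np.asarray(s.isnull()).sum() > 0)
    print(has_missing)  # True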
3 changes: 1 addition & 2 deletions aikit/transformers/categories.py
@@ -74,8 +74,7 @@ def modalities_filter(self, input_serie):
raise TypeError("input_serie should be a pd.Series")

value_count = input_serie.value_counts()
nb_null = input_serie.isnull().sum()

nb_null = np.asarray(input_serie.isnull().values).sum() # Remark: input_serie.isnull().sum() doesn't work on a sparse Series
if nb_null > self.max_na_percentage * len(input_serie):
value_count["__null__"] = nb_null
value_count.sort_values(ascending=False, inplace=True)
21 changes: 14 additions & 7 deletions aikit/transformers/target.py
@@ -131,15 +131,18 @@ def smoothing(self, nb):
@staticmethod
def na_remplacing(serie):
""" remplace None with '_missing_' to make it a modality """


if hasattr(serie, "sparse"):
serie = serie.sparse.to_dense().copy()

ii_null = serie.isnull()

if not ii_null.any():
return serie

result_serie = serie.copy()
result_serie[ii_null] = "_missing_"

return result_serie

def fit(self, X, y):
@@ -200,7 +203,7 @@ def fit(self, X, y):
# Columns on which we want None to be a special modality
self._na_to_null = dict()
for col in self._columns_to_encode:
ii_null = X[col].isnull()
ii_null = X[col].isnull().values # for sparse column I need to call '.values' so that '.sum()' works on it
self._na_to_null[col] = ii_null.sum() >= self.max_na_percentage * len(X)

self._target_aggregat, self._target_aggregat_global = self._fit_aggregat(X, sy, noise_level=None)
@@ -317,8 +320,12 @@ def _transform_aggregat(self, X, target_aggregat, target_aggregat_global):
Xcol = self.na_remplacing(X[col])
else:
Xcol = X[col]

result = Xcol.apply(lambda x: self.get_value(x, target_aggregat[col], target_aggregat_global[col]))

if hasattr(Xcol, "sparse"):
result = Xcol.sparse.to_dense().apply(lambda x: self.get_value(x, target_aggregat[col], target_aggregat_global[col]))
# for some reason apply doesn't work correctly on sparse data
else:
result = Xcol.apply(lambda x: self.get_value(x, target_aggregat[col], target_aggregat_global[col]))
# result.columns = ["%s__%s" % (col,c) for c in result.columns]
all_results.append(result)

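A standalone sketch of the pattern applied in na_remplacing and _transform_aggregat: convert the sparse column to dense with .sparse.to_dense() before boolean assignment or .apply:

    import numpy as np
    import pandas as pd

    col = pd.Series([1.0, np.nan, 2.0]).astype(pd.SparseDtype("float64"))

    dense = col.sparse.to_dense().copy()
    dense[dense.isnull()] = -1.0  # stand-in for the "_missing_" modality
    result = dense.apply(lambda x: x * 10)  # .apply on the dense copy works as usual
    print(result.tolist())  # [10.0, -10.0, 20.0]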
15 changes: 14 additions & 1 deletion docs/adding_new_models.rst
@@ -13,6 +13,7 @@ A model needs to be added at two different places in order to be fully integrate
Let's see what needs to be done to include a hypothetical new model::

class ReallyCoolNewTransformer(BaseEstimator, TransformerMixin):
""" This is a great new transformer """
def __init__(self, super_choice):
self.super_choice = super_choice
@@ -63,7 +64,8 @@ To do that you need to use the @register decorator::
custom_hyper = {"super_parameters":hp.HyperChoice(("superchoice_a","superchoice_b"))}

See _model_register for complete description of register
See :ref:`model_register` for a complete description of the register.
See :ref:`hyper_parameters` for a complete description of the hyper-parameters.

Remark:
The registers behave like singletons so you can modify them in any part of the code.
Expand All @@ -74,5 +76,16 @@ If a model is stable and tested enough the new entry can be added to the python
* 'model_definition.py' : for the simple register
* ml_machine/ml_machine_registration.py : for the full auto-ml register

(See :ref:`contribution` for details about how to contribute to the evolution of the library)

Remark: you don't need to use the wrapper for your model to be incorporated in the framework. However, it is best to do so: that way you can focus on the logic and let the wrapper make your model more generic.
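For completeness, a minimal self-contained version of the hypothetical transformer used on this page; the fit/transform bodies are illustrative only, following the scikit-learn API:

    import numpy as np
    from sklearn.base import BaseEstimator, TransformerMixin

    class ReallyCoolNewTransformer(BaseEstimator, TransformerMixin):
        """ This is a great new transformer (illustrative behaviour only) """

        def __init__(self, super_choice="superchoice_a"):
            self.super_choice = super_choice

        def fit(self, X, y=None):
            return self  # nothing to learn in this sketch

        def transform(self, X):
            X = np.asarray(X, dtype=float)
            if self.super_choice == "superchoice_a":
                return X * 2.0
            return X - X.mean(axis=0)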


.. toctree::
:maxdepth: 1
:hidden:

model_register
hyper_parameters


22 changes: 0 additions & 22 deletions docs/aikit.datasets.rst

This file was deleted.

22 changes: 0 additions & 22 deletions docs/aikit.datasets.tests.rst

This file was deleted.
