Skip to content

Commit

Permalink
Merge pull request #5 from elisemercury/v1.0.0-bug-fixes
Browse files Browse the repository at this point in the history
V1.0.0 bug fixes
  • Loading branch information
elisemercury authored Jun 15, 2022
2 parents 9ad8c59 + b9875e3 commit ead49e1
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 22 deletions.
22 changes: 11 additions & 11 deletions AutoClean/Modules.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ class MissingValues:

def handle(self, df, _n_neighbors=3):
# function for handling missing values in the data
logger.info('Started handling of missing values...', self.missing_num.upper())
logger.info('Started handling of missing values...', str(self.missing_num).upper())
start = timer()
self.count_missing = df.isna().sum().sum()

Expand All @@ -30,7 +30,7 @@ def handle(self, df, _n_neighbors=3):
df.reset_index(drop=True)

if self.missing_num: # numeric data
logger.info('Started handling of NUMERICAL missing values... Method: "{}"', self.missing_num.upper())
logger.info('Started handling of NUMERICAL missing values... Method: "{}"', str(self.missing_num).upper())
# automated handling
if self.missing_num == 'auto':
self.missing_num = 'linreg'
Expand All @@ -57,7 +57,7 @@ def handle(self, df, _n_neighbors=3):
logger.debug('Deletion of {} NUMERIC missing value(s) succeeded', self.count_missing-df.isna().sum().sum())

if self.missing_categ: # categorical data
logger.info('Started handling of CATEGORICAL missing values... Method: "{}"', self.missing_categ.upper())
logger.info('Started handling of CATEGORICAL missing values... Method: "{}"', str(self.missing_categ).upper())
# automated handling
if self.missing_categ == 'auto':
self.missing_categ = 'logreg'
Expand Down Expand Up @@ -105,9 +105,9 @@ def _impute(self, df, imputer, type):
else:
df[feature] = df_imputed
if counter != 0:
logger.debug('{} imputation of {} value(s) succeeded for feature "{}"', self.missing_num.upper(), counter, feature)
logger.debug('{} imputation of {} value(s) succeeded for feature "{}"', str(self.missing_num).upper(), counter, feature)
except:
logger.warning('{} imputation failed for feature "{}"', self.missing_num.upper(), feature)
logger.warning('{} imputation failed for feature "{}"', str(self.missing_num).upper(), feature)
else:
# categorical features
for feature in df.columns:
Expand All @@ -132,7 +132,7 @@ def _impute(self, df, imputer, type):
if counter != 0:
logger.debug('{} imputation of {} value(s) succeeded for feature "{}"', self.missing_categ.upper(), counter, feature)
except:
logger.warning('{} imputation failed for feature "{}"', self.missing_categ.upper(), feature)
logger.warning('{} imputation failed for feature "{}"', str(self.missing_categ).upper(), feature)
return df

def _lin_regression_impute(self, df, model):
Expand Down Expand Up @@ -256,7 +256,7 @@ class Outliers:
def handle(self, df):
# function for handling of outliers in the data
if self.outliers:
logger.info('Started handling of outliers... Method: "{}"', self.outliers.upper())
logger.info('Started handling of outliers... Method: "{}"', str(self.outliers).upper())
start = timer()

if self.outliers == 'winz':
Expand Down Expand Up @@ -426,7 +426,7 @@ def handle(self, df):
target_cols = cols_categ # encode ALL columns
else:
target_cols = self.encode_categ[1] # encode only specific columns
logger.info('Started encoding categorical features... Method: "AUTO"')
logger.info('Started encoding categorical features... Method: "{}"', self.encode_categ[0])
start = timer()
for feature in target_cols:
if feature in cols_categ:
Expand Down Expand Up @@ -456,12 +456,12 @@ def handle(self, df):

elif self.encode_categ[0] == 'onehot':
df = EncodeCateg._to_onehot(df, feature)
logger.debug('Encoding to {} succeeded for feature "{}"', self.encode_categ[0].upper(), feature)
logger.debug('Encoding to {} succeeded for feature "{}"', str(self.encode_categ[0]).upper(), feature)
elif self.encode_categ[0] == 'label':
df = EncodeCateg._to_label(df, feature)
logger.debug('Encoding to {} succeeded for feature "{}"', self.encode_categ[0].upper(), feature)
logger.debug('Encoding to {} succeeded for feature "{}"', str(self.encode_categ[0]).upper(), feature)
except:
logger.warning('Encoding to {} failed for feature "{}"', self.encode_categ[0].upper(), feature)
logger.warning('Encoding to {} failed for feature "{}"', str(self.encode_categ[0]).upper(), feature)
end = timer()
logger.info('Completed encoding of categorical features in {} seconds', round(end-start, 6))
return df
Expand Down
37 changes: 26 additions & 11 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
```python
pip install py-AutoClean
```

:thought_balloon: Read more on the AutoClean algorithm in my **Medium** article [Automated Data Cleaning with Python](https://eliselandman.medium.com/automated-data-cleaning-with-python-94d44d854423).

View AutoClean on [PyPi](https://pypi.org/project/py-AutoClean/).
Expand Down Expand Up @@ -76,12 +77,12 @@ AutoClean(dataset, missing_num='auto', missing_categ='auto', encode_categ=['auto

| Parameter | Type | Default Value | Other Values |
| ------ | :---: | :---: | ------ |
| missing_num | `str` | `'auto'` | `linreg`, `knn`, `mean`, `median`, `most_frequent`, `delete`, `False` |
| missing_categ | `str` | `'auto'` | `logreg`, `knn`, `most_frequent`, `delete`, `False` |
| missing_num | `str` | `'auto'` | `'linreg'`, `'knn'`, `'mean'`, `'median'`, `'most_frequent'`, `'delete'`, `False` |
| missing_categ | `str` | `'auto'` | `'logreg'`, `'knn'`, `'most_frequent'`, `'delete'`, `False` |
| encode_categ | `list` | `['auto']` | `['onehot']`, `['label']`, `False` ; to encode only specific columns add a list of column names or indexes: `['auto', ['col1', 2]]` |
| extract_datetime | `str` | `'s'` | `D`, `M`, `Y`, `h`, `m`, `False` |
| outliers | `str` | `'winz'` | `delete`|
| outlier_param | `int`, `float` | `1.5` | any int or float, `False` |
| extract_datetime | `str` | `'s'` | `'D'`, `'M'`, `'Y'`, `'h'`, `'m'`, `False` |
| outliers | `str` | `'winz'` | `'delete'`|
| outlier_param | `int`, `float` | `1.5` | any int or float, `False` (recommended not to change default) |
| logfile | `bool` | `True` | `False` |
| verbose | `bool` | `False` | `True` |

Expand All @@ -97,15 +98,29 @@ Defines how **categorical** missing values in the data are handled. Missing valu

You can specify the handling method by setting `missing_categ` to: `'logreg'`, `'knn'`, `'most_frequent'`, `'delete'` or to `False` if you want to skip this step.

### encode_categ

Defines how **categorical** values should be encoded. Categorical values can be onehot- or label-encoded.

The parameter must be handed as Python `list` type. When set to `['auto']`, AutoClean:

* onehot-encodes features that have **less than 10 unique data values**
* label-encodes features that have **less than 20 unique data values**
* does not encode features having **more than 20 unique data values**

You can specify the encoding method manually by setting `encode_categ` to `['onehot']` or `['label']`. By default, AutoClean will encode all categorical features. You can specify which features to encode by giving the column names or indexes as parameter, for example `['onehot', ['column_1', 2]]` - this will onehot-encode the column with column name 'column_1' and the column with index '2'.

Set `encode_categ` to `False` to skip categorical encoding.

### extract_datetime

AutoClean can search the data for datetime features, and **extract** the values to separate columns. When set to `s`, it extracts the datetime values up to the seconds i. e. day, month, year, hour, minutes, seconds.
AutoClean can search the data for datetime features, and **extract** the values to separate columns. When set to `'s'`, it extracts the datetime values up to the seconds, i.e. day, month, year, hour, minutes, seconds.

You can set the granularity of the extraction manually by setting `extract_datetime`to `D` for day, `M` for month, `Y` for year, `h` for hour, `m` for minutes or to `False` if you want to skip this step.
You can set the granularity of the extraction manually by setting `extract_datetime` to `'D'` for day, `'M'` for month, `'Y'` for year, `'h'` for hour, `'m'` for minutes or to `False` if you want to skip this step.

### outliers

Defines how **outliers** in the data are handled. Outliers can be manipulated with two different methods: winsorization or deletion. You can specfiy the method by setting `outliers` to `winz` for winzorization, `delete`for deletion or to `False` if you want to skip this step.
Defines how **outliers** in the data are handled. Outliers can be manipulated with two different methods: winsorization or deletion. You can specify the method by setting `outliers` to `'winz'` for winsorization, `'delete'` for deletion or to `False` if you want to skip this step.

*When are outliers considered to be outliers?*
Observations are considered outliers if they are outside the following bounds:
Expand All @@ -119,13 +134,13 @@ where

As soon as a value is below the lower or above the upper bound, the chosen outlier handling method is applied, i.e. either winsorization, meaning it will be replaced by the respective lower or upper bound, or the observation will be deleted.

You can **customize** the outlier bounds by changing the default `outlier_param` value of `1.5` to any integer or float of your choice.
You can **customize** the outlier bounds by changing the default `outlier_param` value of `1.5` to any integer or float of your choice. **It is not recommended to change the `outlier_param` value!**

### outlier_param

! Recommended not to change default value

You can **customize** the outlier bounds by changing the default `outlier_param` value of `1.5` to any integer or float of your choice.
You can **customize** the outlier bounds by changing the default `outlier_param` value of `1.5` to any integer or float of your choice. **It is not recommended to change the `outlier_param` value!**

### logfile

Expand All @@ -135,4 +150,4 @@ You can view a [sample logfile here](https://github.com/elisemercury/AutoClean/b

### verbose

Defines whether the logfile output should be shown on the console while the AutoClean process runs. Set to `True` if you want to follow the process logs in real-time.
Defined whether the logfile output should be shown on the console while the AutoClean process runs. Set to `True` if you want to follow the process logs in real-time.

0 comments on commit ead49e1

Please sign in to comment.