Skip to content

Commit

Permalink
Merge pull request #5 from elisemercury/v1.0.0-bug-fixes
Browse files Browse the repository at this point in the history
V1.0.0 bug fixes
  • Loading branch information
elisemercury authored Jun 15, 2022
2 parents 9ad8c59 + b9875e3 commit ead49e1
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 22 deletions.
22 changes: 11 additions & 11 deletions AutoClean/Modules.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ class MissingValues:

def handle(self, df, _n_neighbors=3):
# function for handling missing values in the data
logger.info('Started handling of missing values...', self.missing_num.upper())
logger.info('Started handling of missing values...', str(self.missing_num).upper())
start = timer()
self.count_missing = df.isna().sum().sum()

Expand All @@ -30,7 +30,7 @@ def handle(self, df, _n_neighbors=3):
df.reset_index(drop=True)

if self.missing_num: # numeric data
logger.info('Started handling of NUMERICAL missing values... Method: "{}"', self.missing_num.upper())
logger.info('Started handling of NUMERICAL missing values... Method: "{}"', str(self.missing_num).upper())
# automated handling
if self.missing_num == 'auto':
self.missing_num = 'linreg'
Expand All @@ -57,7 +57,7 @@ def handle(self, df, _n_neighbors=3):
logger.debug('Deletion of {} NUMERIC missing value(s) succeeded', self.count_missing-df.isna().sum().sum())

if self.missing_categ: # categorical data
logger.info('Started handling of CATEGORICAL missing values... Method: "{}"', self.missing_categ.upper())
logger.info('Started handling of CATEGORICAL missing values... Method: "{}"', str(self.missing_categ).upper())
# automated handling
if self.missing_categ == 'auto':
self.missing_categ = 'logreg'
Expand Down Expand Up @@ -105,9 +105,9 @@ def _impute(self, df, imputer, type):
else:
df[feature] = df_imputed
if counter != 0:
logger.debug('{} imputation of {} value(s) succeeded for feature "{}"', self.missing_num.upper(), counter, feature)
logger.debug('{} imputation of {} value(s) succeeded for feature "{}"', str(self.missing_num).upper(), counter, feature)
except:
logger.warning('{} imputation failed for feature "{}"', self.missing_num.upper(), feature)
logger.warning('{} imputation failed for feature "{}"', str(self.missing_num).upper(), feature)
else:
# categorical features
for feature in df.columns:
Expand All @@ -132,7 +132,7 @@ def _impute(self, df, imputer, type):
if counter != 0:
logger.debug('{} imputation of {} value(s) succeeded for feature "{}"', self.missing_categ.upper(), counter, feature)
except:
logger.warning('{} imputation failed for feature "{}"', self.missing_categ.upper(), feature)
logger.warning('{} imputation failed for feature "{}"', str(self.missing_categ).upper(), feature)
return df

def _lin_regression_impute(self, df, model):
Expand Down Expand Up @@ -256,7 +256,7 @@ class Outliers:
def handle(self, df):
# function for handling of outliers in the data
if self.outliers:
logger.info('Started handling of outliers... Method: "{}"', self.outliers.upper())
logger.info('Started handling of outliers... Method: "{}"', str(self.outliers).upper())
start = timer()

if self.outliers == 'winz':
Expand Down Expand Up @@ -426,7 +426,7 @@ def handle(self, df):
target_cols = cols_categ # encode ALL columns
else:
target_cols = self.encode_categ[1] # encode only specific columns
logger.info('Started encoding categorical features... Method: "AUTO"')
logger.info('Started encoding categorical features... Method: "{}"', self.encode_categ[0])
start = timer()
for feature in target_cols:
if feature in cols_categ:
Expand Down Expand Up @@ -456,12 +456,12 @@ def handle(self, df):

elif self.encode_categ[0] == 'onehot':
df = EncodeCateg._to_onehot(df, feature)
logger.debug('Encoding to {} succeeded for feature "{}"', self.encode_categ[0].upper(), feature)
logger.debug('Encoding to {} succeeded for feature "{}"', str(self.encode_categ[0]).upper(), feature)
elif self.encode_categ[0] == 'label':
df = EncodeCateg._to_label(df, feature)
logger.debug('Encoding to {} succeeded for feature "{}"', self.encode_categ[0].upper(), feature)
logger.debug('Encoding to {} succeeded for feature "{}"', str(self.encode_categ[0]).upper(), feature)
except:
logger.warning('Encoding to {} failed for feature "{}"', self.encode_categ[0].upper(), feature)
logger.warning('Encoding to {} failed for feature "{}"', str(self.encode_categ[0]).upper(), feature)
end = timer()
logger.info('Completed encoding of categorical features in {} seconds', round(end-start, 6))
return df
Expand Down
37 changes: 26 additions & 11 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
```python
pip install py-AutoClean
```

:thought_balloon: Read more on the AutoClean algorithm in my **Medium** article [Automated Data Cleaning with Python](https://eliselandman.medium.com/automated-data-cleaning-with-python-94d44d854423).

View AutoClean on [PyPi](https://pypi.org/project/py-AutoClean/).
Expand Down Expand Up @@ -76,12 +77,12 @@ AutoClean(dataset, missing_num='auto', missing_categ='auto', encode_categ=['auto

| Parameter | Type | Default Value | Other Values |
| ------ | :---: | :---: | ------ |
| missing_num | `str` | `'auto'` | `linreg`, `knn`, `mean`, `median`, `most_frequent`, `delete`, `False` |
| missing_categ | `str` | `'auto'` | `logreg`, `knn`, `most_frequent`, `delete`, `False` |
| missing_num | `str` | `'auto'` | `'linreg'`, `'knn'`, `'mean'`, `'median'`, `'most_frequent'`, `'delete'`, `False` |
| missing_categ | `str` | `'auto'` | `'logreg'`, `'knn'`, `'most_frequent'`, `'delete'`, `False` |
| encode_categ | `list` | `['auto']` | `['onehot']`, `['label']`, `False` ; to encode only specific columns add a list of column names or indexes: `['auto', ['col1', 2]]` |
| extract_datetime | `str` | `'s'` | `D`, `M`, `Y`, `h`, `m`, `False` |
| outliers | `str` | `'winz'` | `delete`|
| outlier_param | `int`, `float` | `1.5` | any int or float, `False` |
| extract_datetime | `str` | `'s'` | `'D'`, `'M'`, `'Y'`, `'h'`, `'m'`, `False` |
| outliers | `str` | `'winz'` | `'delete'`|
| outlier_param | `int`, `float` | `1.5` | any int or float, `False` (recommended not to change default) |
| logfile | `bool` | `True` | `False` |
| verbose | `bool` | `False` | `True` |

Expand All @@ -97,15 +98,29 @@ Defines how **categorical** missing values in the data are handled. Missing valu

You can specify the handling method by setting `missing_categ` to: `'logreg'`, `'knn'`, `'most_frequent'`, `'delete'` or to `False` if you want to skip this step.

### encode_categ

Defines how **categorical** values should be encoded. Categorical values can be onehot- or label-encoded.

The parameter must be handed as Python `list` type. When set to `['auto']`, AutoClean:

* onehot-encodes features that have **less than 10 unique data values**
* label-encodes features that have **less than 20 unique data values**
* does not encode features having **more than 20 unique data values**

You can specify the encoding method manually by setting `encode_categ` to `['onehot']` or `['label']`. By default, AutoClean will encode all categorical features. You can specify which features to encode by giving the column names or indexes as parameter, for example `['onehot', ['column_1', 2]]` - this will onehot-encode the column with column name 'column_1' and the column with index '2'.

Set `encode_categ` to `False` to skip categorical encoding.

### extract_datetime

AutoClean can search the data for datetime features, and **extract** the values to separate columns. When set to `s`, it extracts the datetime values up to the seconds i. e. day, month, year, hour, minutes, seconds.
AutoClean can search the data for datetime features, and **extract** the values to separate columns. When set to `'s'`, it extracts the datetime values up to the seconds, i.e. day, month, year, hour, minutes, seconds.

You can set the granularity of the extraction manually by setting `extract_datetime`to `D` for day, `M` for month, `Y` for year, `h` for hour, `m` for minutes or to `False` if you want to skip this step.
You can set the granularity of the extraction manually by setting `extract_datetime` to `'D'` for day, `'M'` for month, `'Y'` for year, `'h'` for hour, `'m'` for minutes or to `False` if you want to skip this step.

### outliers

Defines how **outliers** in the data are handled. Outliers can be manipulated with two different methods: winsorization or deletion. You can specfiy the method by setting `outliers` to `winz` for winzorization, `delete`for deletion or to `False` if you want to skip this step.
Defines how **outliers** in the data are handled. Outliers can be manipulated with two different methods: winsorization or deletion. You can specify the method by setting `outliers` to `'winz'` for winsorization, `'delete'` for deletion or to `False` if you want to skip this step.

*When are outliers considered to be outliers?*
Observations are considered outliers if they are outside the following bounds:
Expand All @@ -119,13 +134,13 @@ where

As soon as a value is below the lower or above the upper bound, the chosen outlier handling method is applied, i.e. either winsorization, meaning it will be replaced by the respective lower or upper bound, or the observation will be deleted.

You can **customize** the outlier bounds by changing the default `outlier_param` value of `1.5` to any integer or float of your choice.
You can **customize** the outlier bounds by changing the default `outlier_param` value of `1.5` to any integer or float of your choice. **It is not recommended to change the `outlier_param` value!**

### outlier_param

! Recommended not to change default value

You can **customize** the outlier bounds by changing the default `outlier_param` value of `1.5` to any integer or float of your choice.
You can **customize** the outlier bounds by changing the default `outlier_param` value of `1.5` to any integer or float of your choice. **It is not recommended to change the `outlier_param` value!**

### logfile

Expand All @@ -135,4 +150,4 @@ You can view a [sample logfile here](https://github.com/elisemercury/AutoClean/b

### verbose

Defines whether the logfile output should be shown on the console while the AutoClean process runs. Set to `True` if you want to follow the process logs in real-time.
Defined whether the logfile output should be shown on the console while the AutoClean process runs. Set to `True` if you want to follow the process logs in real-time.

0 comments on commit ead49e1

Please sign in to comment.