-
Notifications
You must be signed in to change notification settings - Fork 40
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Padronização de código e comentários
- Loading branch information
1 parent
5ad72b6
commit 26bd772
Showing
14 changed files
with
1,084 additions
and
765 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
6 changes: 2 additions & 4 deletions
6
hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/inference/main.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,12 +1,10 @@ | ||
import argparse | ||
import sys | ||
import os | ||
import argparse | ||
import logging | ||
from sagemaker_inference import model_server | ||
|
||
logging.getLogger().setLevel(logging.INFO) | ||
|
||
|
||
if __name__ == "__main__":
    # Start the model server once, delegating request handling to the
    # handler service registered under "serving.handler".
    model_server.start_model_server(handler_service="serving.handler")
128 changes: 79 additions & 49 deletions
128
hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/processor/preprocessor.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,71 +1,101 @@ | ||
from ml.preprocessing.preprocessing import Preprocessing | ||
from ml.preprocessing.dataquality import DataQuality | ||
from ml.data_source.spreadsheet import Spreadsheet | ||
import great_expectations as ge | ||
from datetime import date | ||
import pandas as pd | ||
import argparse | ||
import logging | ||
from datetime import date | ||
|
||
import pandas as pd | ||
import glob | ||
import json | ||
from joblib import dump, load | ||
import great_expectations as ge | ||
|
||
from ml.preprocessing.preprocessing import Preprocessing | ||
from ml.preprocessing.dataquality import DataQuality | ||
from ml.data_source.spreadsheet import Spreadsheet | ||
|
||
# Let INFO-level records through the root logger so the step's progress
# messages are emitted.
logging.getLogger().setLevel('INFO')

# Base locations read from and written to by this processing step. They are
# module-level so that data_quality() and preprocessing() can use them.
path_input = '/opt/ml/processing/input/'
path_output = '/opt/ml/processing/output/'
# NOTE: this rebinds the imported `date` class to today's date formatted as
# YYYYMMDD. From here on `date` is a plain string (data_quality() uses it to
# name the validation report), so `date.today()` can no longer be called.
date = date.today().strftime('%Y%m%d')
|
||
def data_quality(df, step_train):
    """
    Run the data-quality stage for the train or the test flow.

    When ``step_train`` is true, a DataQuality object is created and applied
    to ``df``, and the resulting expectation suite is saved under the output
    path. Otherwise ``df`` is validated against the expectation suite found
    under the input path and the validation result is written as a JSON file
    named after the module-level ``date`` string.

    Parameters
    ----------
    df : pd.DataFrame
        Train or test dataset
    step_train : boolean
        True for the train step, False for the test step
    """
    if step_train:
        # NOTE(review): 'Survived' is part of the checked columns, so the
        # saved suite presumably expects that column at validation time too.
        # Confirm the test/inference data actually carries it.
        dq = DataQuality(discrete_cat_cols=['Sex', 'Pclass', 'Survived'])
        df_ge = dq.perform(df)
        df_ge.save_expectation_suite(path_output +
                                     'expectations/expectations.json')
    else:
        df_ge = ge.dataset.PandasDataset(df)
        ge_val = df_ge.validate(expectation_suite=path_input +
                                'expectations/expectations.json',
                                only_return_failures=False)
        # One report per run date; a second run on the same day overwrites it.
        with open(f'{path_output}validations/{date}.json', 'w') as f:
            json.dump(ge_val.to_json_dict(), f)
|
||
|
||
def preprocessing(df, step_train):
    """
    Run the preprocessing stage for the train or the test flow.

    In the train flow a new Preprocessing object is built and executed on
    ``df``; the object itself and the resulting train/validation splits are
    saved under the output path. In the test flow the previously saved
    object is loaded from the input path, executed on ``df``, and the result
    is saved as the inference file.

    Parameters
    ----------
    df : pd.DataFrame
        Train or test dataset
    step_train : boolean
        True for the train step, False for the test step
    """
    if not step_train:
        # Test flow: reuse the object persisted by an earlier train run.
        fitted = load(path_input + 'preprocessing/preprocessing.pkl')
        inference_df = fitted.execute(df, step_train=False)
        logging.info("Saving")
        inference_path = path_output + 'processed/inference/inference.csv'
        inference_df.to_csv(inference_path, index=False)
        return

    # Train flow: configure, execute, then persist the object and both splits.
    normalization_cols = {'min-max': ['Age']}
    one_hot_cols = ['Pclass', 'Sex']
    pipeline = Preprocessing(normalization_cols, one_hot_cols)
    train_df, val_df = pipeline.execute(df, step_train=True, val_size=0.2)
    logging.info("Saving")
    dump(pipeline, path_output + 'preprocessing/preprocessing.pkl')
    train_df.to_csv(path_output + 'processed/train/train.csv', index=False)
    val_df.to_csv(path_output + 'processed/val/val.csv', index=False)
|
||
|
||
if __name__ == '__main__':
    # Execute the processor step in the virtual environment
    logging.info('Starting the preprocessing')

    # Read the step argument (train or test); anything other than "train"
    # is handled as the test flow.
    parser = argparse.ArgumentParser()
    parser.add_argument('--step', type=str, default='train')
    args = parser.parse_args()
    step_train = args.step == "train"
    logging.info(f'step_train: {step_train}')

    logging.info('Reading the inputs')
    csv_files = glob.glob(path_input+"raw_data/*.csv")
    if not csv_files:
        # Fail with an explicit message instead of an IndexError on [0].
        raise FileNotFoundError(
            f'No CSV file found under {path_input}raw_data/')
    # Only the first match is processed.
    file = csv_files[0]
    logging.info(f'Reading file: {file}')
    df = Spreadsheet().get_data(file)

    logging.info("Data Quality")
    data_quality(df, step_train)

    logging.info("Preprocessing")
    preprocessing(df, step_train)
Oops, something went wrong.