Skip to content

Commit

Permalink
Padronização de código e comentários
Browse files Browse the repository at this point in the history
  • Loading branch information
karenstemartins committed Jul 23, 2021
1 parent 5ad72b6 commit 26bd772
Show file tree
Hide file tree
Showing 14 changed files with 1,084 additions and 765 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -116,9 +116,9 @@ The bash command will access the Dockerfile in the folder, create the image and

To test the images in ECR, execute the following notebooks:

- project-name/src/ml/notebooks/Sagemaker_Processor.ipynb
- project-name/src/ml/notebooks/Sagemaker_Train.ipynb
- project-name/src/ml/notebooks/Sagemaker_Inference.ipynb
- project-name/src/ml/notebooks/1_Sagemaker_Processor.ipynb
- project-name/src/ml/notebooks/2_Sagemaker_Train.ipynb
- project-name/src/ml/notebooks/3_Sagemaker_Inference.ipynb

## Stepfunctions

Expand Down Expand Up @@ -240,5 +240,5 @@ Next, create and attach another new policy to the role you created:

To create and test the Step Functions state machines, execute the following notebooks:

- project-name/src/ml/notebooks/Sagemaker_StepFunctions_Train.ipynb
- project-name/src/ml/notebooks/Sagemaker_StepFunctions_Inference.ipynb
- project-name/src/ml/notebooks/4_Sagemaker_StepFunctions_Train.ipynb
- project-name/src/ml/notebooks/5_Sagemaker_StepFunctions_Inference.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@

import os
import logging
import pandas as pd
from joblib import load
from six import StringIO
import pandas as pd

from ml.model.wrapper import Wrapper
from sagemaker_inference.default_inference_handler import DefaultInferenceHandler
Expand All @@ -17,49 +17,106 @@
# Path to access the model
MODEL_DIR = '/opt/ml/model'

def _csv_to_pandas(string_like): # type: (str) -> pd.DataFrame
"""Convert a CSV object to a pandas DataFrame.
Args:
string_like (str): CSV string.
Returns:
(pd.DataFrame): pandas DataFrame
"""

def _csv_to_pandas(string_like):
"""
Convert a CSV object to a pandas DataFrame.
Parameters
----------
string_like : String
CSV string.
Returns
-------
pd.DataFrame : pandas DataFrame
"""
stream = StringIO(string_like)
res = pd.read_csv(stream)
return res


class HandlerService(DefaultHandlerService, DefaultInferenceHandler):
"""
Execute the inference step in the virtual environment
Execute the inference step in the virtual environment
"""
def __init__(self):
    """
    Build a Transformer that uses this object as its default inference
    handler and pass it to the parent initializer.
    """
    super(HandlerService, self).__init__(
        transformer=transformer.Transformer(default_inference_handler=self)
    )

# Loads the model from the disk

def default_model_fn(self, model_dir):
logging.info('Loading the model')
"""
Loads the model from the disk
Parameters
----------
model_dir : string
Path of the model
Returns
-------
pkl : model
"""
logging.info('Loading the model')
return load(os.path.join(MODEL_DIR, "model.pkl"))

# Parse and check the format of the input data

def default_input_fn(self, input_data, content_type):
"""
Parse and check the format of the input data
Parameters
----------
input_data : string
CSV string
content_type : string
Type of the file
Returns
-------
pd.DataFrame : pandas DataFrame
"""
global colunas
if content_type != "text/csv":
raise Exception("Invalid content-type: %s" % content_type)
return _csv_to_pandas(input_data)

# Run our model and do the prediction
return _csv_to_pandas(input_data)

def default_predict_fn(self, df, model):
logging.info('Predicting...')
resultados = model.predict(df,included_input=True)
logging.info('Prediction Complete')
"""
Run our model and do the prediction
Parameters
----------
df : pd.DataFrame
Data to be predicted
model : pkl
Model to predict the data
Returns
-------
pd.DataFrame : pandas DataFrame
"""
logging.info('Predicting...')
resultados = model.predict(df, included_input=True)
logging.info('Prediction Complete')
return resultados.reset_index(drop=True).T.reset_index().T

# Gets the prediction output and format it to be returned to the user

def default_output_fn(self, prediction, accept):
logging.info('Saving')
"""
Gets the prediction output and format it to be returned to the user
Parameters
----------
prediction : pd.DataFrame
Predicted dataset
accept : string
Output type
Returns
-------
CSV : CSV file
"""
logging.info('Saving')
if accept != "text/csv":
raise Exception("Invalid accept: %s" % accept)
return encoder.encode(prediction, accept)
return encoder.encode(prediction, accept)
Original file line number Diff line number Diff line change
@@ -1,12 +1,10 @@
import argparse
import sys
import os
import argparse
import logging
from sagemaker_inference import model_server

logging.getLogger().setLevel(logging.INFO)


if __name__ == "__main__":

model_server.start_model_server(handler_service="serving.handler")
model_server.start_model_server(handler_service="serving.handler")
Original file line number Diff line number Diff line change
@@ -1,71 +1,101 @@
from ml.preprocessing.preprocessing import Preprocessing
from ml.preprocessing.dataquality import DataQuality
from ml.data_source.spreadsheet import Spreadsheet
import great_expectations as ge
from datetime import date
import pandas as pd
import argparse
import logging
from datetime import date

import pandas as pd
import glob
import json
from joblib import dump, load
import great_expectations as ge

from ml.preprocessing.preprocessing import Preprocessing
from ml.preprocessing.dataquality import DataQuality
from ml.data_source.spreadsheet import Spreadsheet

logging.getLogger().setLevel('INFO')

if __name__=='__main__':
path_input = '/opt/ml/processing/input/'
path_output = '/opt/ml/processing/output/'
date = date.today().strftime('%Y%m%d')

def data_quality(df, step_train):
"""
If True, it creates the DataQuality object,
otherwise it loads an existing one
Parameters
----------
df : pd.Dataframe
Train or test dataset
step_train : boolean
Train or test
"""
Execute the processor step in the virtual environment
if step_train:
dq = DataQuality(discrete_cat_cols=['Sex', 'Pclass', 'Survived'])
df_ge = dq.perform(df)
df_ge.save_expectation_suite(path_output +
'expectations/expectations.json')
else:
df_ge = ge.dataset.PandasDataset(df)
ge_val = df_ge.validate(expectation_suite=path_input +
'expectations/expectations.json',
only_return_failures=False)
with open(f'{path_output}validations/{date}.json', 'w') as f:
json.dump(ge_val.to_json_dict(), f)


def preprocessing(df, step_train):
    """
    Run the Preprocessing step and write its outputs to disk.

    When ``step_train`` is truthy, a new Preprocessing object is created
    (min-max normalization config for 'Age', one-hot config for 'Pclass'
    and 'Sex'), executed on ``df`` with ``val_size=0.2``, dumped to
    ``preprocessing/preprocessing.pkl`` under the output path, and the two
    returned frames are saved as the train and val CSV files.
    Otherwise a previously dumped Preprocessing object is loaded from the
    input path, executed on ``df``, and the result is saved as the
    inference CSV file.

    Parameters
    ----------
    df : pd.Dataframe
        Train or test dataset
    step_train : boolean
        Train or test
    """
    if not step_train:
        # Inference: reuse the object saved by a previous train run.
        fitted = load(path_input+'preprocessing/preprocessing.pkl')
        inference = fitted.execute(df, step_train=False)
        logging.info("Saving")
        inference.to_csv(path_output+'processed/inference/inference.csv',
                         index=False)
        return

    # Train: build the object, split the data and persist everything.
    fitted = Preprocessing({'min-max': ['Age']}, ['Pclass', 'Sex'])
    train_part, val_part = fitted.execute(df, step_train=True, val_size=0.2)
    logging.info("Saving")
    dump(fitted, path_output+'preprocessing/preprocessing.pkl')
    train_part.to_csv(path_output+'processed/train/train.csv', index=False)
    val_part.to_csv(path_output+'processed/val/val.csv', index=False)


if __name__ == '__main__':
"""
Execute the processor step in the virtual environment
"""
logging.info('Starting the preprocessing')

# Read the step argument (train or test)
parser = argparse.ArgumentParser()
parser.add_argument('--step', type=str, default='train')
args = parser.parse_args()
args = parser.parse_args()
step_train = True if args.step == "train" else False
logging.info(f'step_train: {step_train}')

logging.info('Reading the inputs')
file = glob.glob("/opt/ml/processing/input/raw_data/*.csv")[0]
file = glob.glob(path_input+"raw_data/*.csv")[0]
logging.info(f'Reading file: {file}')
df = Spreadsheet().get_data(file)

logging.info("Data Quality")
# If True, it creates the DataQuality object, otherwise it loads an existing one
if step_train:
dq = DataQuality(discrete_cat_cols=['Sex', 'Pclass'])
df_ge = dq.perform(df, target='Survived')
df_ge.save_expectation_suite('/opt/ml/processing/output/expectations/expectations.json')
else:
date = date.today().strftime('%Y%m%d')
df_without_target = df.copy()
if 'Survived' in df_without_target.columns:
df_without_target.drop(columns=['Survived'], inplace=True)
df_ge = ge.dataset.PandasDataset(df_without_target)
ge_val = df_ge.validate(expectation_suite='/opt/ml/processing/input/expectations/expectations.json', only_return_failures=False)
with open(f'/opt/ml/processing/output/validations/{date}.json', 'w') as f:
json.dump(ge_val.to_json_dict(), f)
data_quality(df, step_train)

logging.info("Preprocessing")
# If True, it creates the Preprocessing object, otherwise it loads an existing one
if step_train:
norm_cols = {'min-max': ['Age']}
oneHot_cols = ['Pclass','Sex']
p = Preprocessing(norm_cols, oneHot_cols)
train, test_train = p.execute(df, step_train = True, val_size = 0.2)
else:
p = load("/opt/ml/processing/input/preprocessing/preprocessing.pkl")
test = p.execute(df, step_train = False)

logging.info("Saving")
# If True, it saves the Preprocessing to be used later in the inference step
if step_train:
dump(p, '/opt/ml/processing/output/preprocessing/preprocessing.pkl')

# If True, it saves the train and val files, otherwise it saves only the inference file
if step_train:
train.to_csv('/opt/ml/processing/output/processed/train/train.csv', index=False)
test_train.to_csv('/opt/ml/processing/output/processed/val/val.csv', index=False)
else:
test.to_csv('/opt/ml/processing/output/processed/inference/inference.csv', index=False)
preprocessing(df, step_train)
Loading

0 comments on commit 26bd772

Please sign in to comment.