Padronização de código e comentários

A3Data · Jul 23, 2021 · 26bd772 · 26bd772
1 parent 5ad72b6
commit 26bd772
Show file tree

Hide file tree

Showing 14 changed files with 1,084 additions and 765 deletions.
diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/README.tpl.md b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/README.tpl.md
@@ -116,9 +116,9 @@ The bash command will access the Dockerfile in the folder, create the image and
 
 To test the images in ECR, execute the following notebooks:
 
-- project-name/src/ml/notebooks/Sagemaker_Processor.ipynb
-- project-name/src/ml/notebooks/Sagemaker_Train.ipynb
-- project-name/src/ml/notebooks/Sagemaker_Inference.ipynb
+- project-name/src/ml/notebooks/1_Sagemaker_Processor.ipynb
+- project-name/src/ml/notebooks/2_Sagemaker_Train.ipynb
+- project-name/src/ml/notebooks/3_Sagemaker_Inference.ipynb
 
 ## Stepfunctions
 
@@ -240,5 +240,5 @@ Next, create and attach another new policy to the role you created:
 
 To create and test the Step Functions state machines, execute the following notebooks:
 
-- project-name/src/ml/notebooks/Sagemaker_StepFunctions_Train.ipynb
-- project-name/src/ml/notebooks/Sagemaker_StepFunctions_Inference.ipynb
+- project-name/src/ml/notebooks/4_Sagemaker_StepFunctions_Train.ipynb
+- project-name/src/ml/notebooks/5_Sagemaker_StepFunctions_Inference.ipynb
diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/inference/handler.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/inference/handler.py
@@ -3,9 +3,9 @@
 
 import os
 import logging
-import pandas as pd
 from joblib import load
 from six import StringIO
+import pandas as pd
 
 from ml.model.wrapper import Wrapper
 from sagemaker_inference.default_inference_handler import DefaultInferenceHandler
@@ -17,49 +17,106 @@
 # Path to access the model
 MODEL_DIR = '/opt/ml/model'
 
-def _csv_to_pandas(string_like):  # type: (str) -> pd.DataFrame
-    """Convert a CSV object to a pandas DataFrame.
-    Args:
-        string_like (str): CSV string.
-        
-    Returns:
-        (pd.DataFrame): pandas DataFrame
-    """    
+
+def _csv_to_pandas(string_like):
+    """
+    Convert a CSV object to a pandas DataFrame.
+
+    Parameters
+    ----------
+    string_like : String
+                  CSV string.
+
+    Returns
+    -------
+    pd.DataFrame : pandas DataFrame
+    """
     stream = StringIO(string_like)
     res = pd.read_csv(stream)
     return res
 
+
 class HandlerService(DefaultHandlerService, DefaultInferenceHandler):
     """
-        Execute the inference step in the virtual environment
-        
+    Execute the inference step in the virtual environment
+
     """
     def __init__(self):
         op = transformer.Transformer(default_inference_handler=self)
         super(HandlerService, self).__init__(transformer=op)
-
-    # Loads the model from the disk
+
     def default_model_fn(self, model_dir):
-        logging.info('Loading the model')   
+        """
+        Loads the model from the disk
+
+        Parameters
+        ----------            
+        model_dir   : string
+                      Path of the model (ignored; MODEL_DIR is used)
+
+        Returns
+        -------
+        pkl : model
+        """
+        logging.info('Loading the model')
         return load(os.path.join(MODEL_DIR, "model.pkl"))
-
-    # Parse and check the format of the input data
+
     def default_input_fn(self, input_data, content_type):
+        """
+        Parse and check the format of the input data
+
+        Parameters
+        ----------            
+        input_data   : string
+                       CSV string
+        content_type : string
+                       Type of the file
+
+        Returns
+        -------
+        pd.DataFrame : pandas DataFrame
+        """
         global colunas
         if content_type != "text/csv":
             raise Exception("Invalid content-type: %s" % content_type)
-        return _csv_to_pandas(input_data)                           
-
-    # Run our model and do the prediction
+        return _csv_to_pandas(input_data)
+
     def default_predict_fn(self, df, model):
-        logging.info('Predicting...')        
-        resultados = model.predict(df,included_input=True)
-        logging.info('Prediction Complete')     
+        """
+        Run our model and do the prediction
+
+        Parameters
+        ----------            
+        df    : pd.DataFrame
+                Data to be predicted
+        model : pkl
+                Model to predict the data
+
+        Returns
+        -------
+        pd.DataFrame : pandas DataFrame
+        """
+        logging.info('Predicting...')
+        resultados = model.predict(df, included_input=True)
+        logging.info('Prediction Complete')
         return resultados.reset_index(drop=True).T.reset_index().T
-
-    # Gets the prediction output and format it to be returned to the user
+
     def default_output_fn(self, prediction, accept):
-        logging.info('Saving') 
+        """
+        Gets the prediction output and format it to be returned to the user
+
+        Parameters
+        ----------            
+        prediction    : pd.DataFrame
+                        Predicted dataset
+        accept        : string
+                        Output type
+
+        Returns
+        -------
+        CSV : prediction encoded as text/csv
+        """
+        logging.info('Saving')
         if accept != "text/csv":
             raise Exception("Invalid accept: %s" % accept)
-        return encoder.encode(prediction, accept)
+        return encoder.encode(prediction, accept)
diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/inference/main.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/inference/main.py
@@ -1,12 +1,10 @@
-import argparse
 import sys
 import os
+import argparse
 import logging
 from sagemaker_inference import model_server
 
 logging.getLogger().setLevel(logging.INFO)
 
-
 if __name__ == "__main__":
-
-    model_server.start_model_server(handler_service="serving.handler")
+    model_server.start_model_server(handler_service="serving.handler")
diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/processor/preprocessor.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/processor/preprocessor.py
@@ -1,71 +1,101 @@
-from ml.preprocessing.preprocessing import Preprocessing
-from ml.preprocessing.dataquality import DataQuality
-from ml.data_source.spreadsheet import Spreadsheet
-import great_expectations as ge
-from datetime import date
-import pandas as pd
 import argparse
 import logging
+from datetime import date
+
+import pandas as pd
 import glob
 import json
 from joblib import dump, load
+import great_expectations as ge
+
+from ml.preprocessing.preprocessing import Preprocessing
+from ml.preprocessing.dataquality import DataQuality
+from ml.data_source.spreadsheet import Spreadsheet
 
 logging.getLogger().setLevel('INFO')
 
-if __name__=='__main__':
+path_input = '/opt/ml/processing/input/'
+path_output = '/opt/ml/processing/output/'
+date = date.today().strftime('%Y%m%d')
+
+def data_quality(df, step_train):
+    """
+    If step_train is True, it creates the DataQuality object and saves its
+    expectation suite, otherwise it validates df against an existing suite
+
+    Parameters
+    ----------            
+    df          : pd.DataFrame
+                  Train or test dataset
+    step_train  : boolean
+                  True for the train step, False otherwise
+
     """
-        Execute the processor step in the virtual environment
-        
+    if step_train:
+        dq = DataQuality(discrete_cat_cols=['Sex', 'Pclass', 'Survived'])
+        df_ge = dq.perform(df)
+        df_ge.save_expectation_suite(path_output +
+                                     'expectations/expectations.json')
+    else:
+        df_ge = ge.dataset.PandasDataset(df)
+        ge_val = df_ge.validate(expectation_suite=path_input +
+                                'expectations/expectations.json',
+                                only_return_failures=False)
+        with open(f'{path_output}validations/{date}.json', 'w') as f:
+            json.dump(ge_val.to_json_dict(), f)
+
+
+def preprocessing(df, step_train):
+    """
+    If step_train is True, it creates and saves the Preprocessing object,
+    otherwise it loads an existing one
+
+    Parameters
+    ----------            
+    df          : pd.DataFrame
+                  Train or test dataset
+    step_train  : boolean
+                  True for the train step, False otherwise
+
+    """
+    if step_train:
+        norm_cols = {'min-max': ['Age']}
+        oneHot_cols = ['Pclass', 'Sex']
+        p = Preprocessing(norm_cols, oneHot_cols)
+        train, test_train = p.execute(df, step_train=True, val_size=0.2)
+        logging.info("Saving")
+        dump(p, path_output+'preprocessing/preprocessing.pkl')
+        train.to_csv(path_output+'processed/train/train.csv', index=False)
+        test_train.to_csv(path_output+'processed/val/val.csv', index=False)
+    else:
+        p = load(path_input+'preprocessing/preprocessing.pkl')
+        test = p.execute(df, step_train=False)
+        logging.info("Saving")
+        test.to_csv(path_output+'processed/inference/inference.csv',
+                    index=False)
+
+
+if __name__ == '__main__':
+    """
+    Execute the processor step in the virtual environment
+
     """
     logging.info('Starting the preprocessing')
-    
+
     # Read the step argument (train or test)
     parser = argparse.ArgumentParser()
     parser.add_argument('--step', type=str, default='train')
-    args = parser.parse_args()    
+    args = parser.parse_args()
     step_train = True if args.step == "train" else False
     logging.info(f'step_train: {step_train}')
-    
+
     logging.info('Reading the inputs')
-    file = glob.glob("/opt/ml/processing/input/raw_data/*.csv")[0]
+    file = glob.glob(path_input+"raw_data/*.csv")[0]
     logging.info(f'Reading file: {file}')
     df = Spreadsheet().get_data(file)
-      
+
     logging.info("Data Quality")
-    # If True, it creates the DataQuality object, otherwise it loads an existing one
-    if step_train:
-        dq = DataQuality(discrete_cat_cols=['Sex', 'Pclass']) 
-        df_ge = dq.perform(df, target='Survived')
-        df_ge.save_expectation_suite('/opt/ml/processing/output/expectations/expectations.json')
-    else:
-        date = date.today().strftime('%Y%m%d')
-        df_without_target = df.copy()
-        if 'Survived' in df_without_target.columns:
-            df_without_target.drop(columns=['Survived'], inplace=True)
-        df_ge = ge.dataset.PandasDataset(df_without_target)
-        ge_val = df_ge.validate(expectation_suite='/opt/ml/processing/input/expectations/expectations.json', only_return_failures=False)
-        with open(f'/opt/ml/processing/output/validations/{date}.json', 'w') as f: 
-            json.dump(ge_val.to_json_dict(), f)
+    data_quality(df, step_train)
 
     logging.info("Preprocessing")
-    # If True, it creates the Preprocessing object, otherwise it loads an existing one
-    if step_train:
-        norm_cols = {'min-max': ['Age']}
-        oneHot_cols = ['Pclass','Sex']
-        p = Preprocessing(norm_cols, oneHot_cols)
-        train, test_train = p.execute(df, step_train = True, val_size = 0.2)
-    else:
-        p = load("/opt/ml/processing/input/preprocessing/preprocessing.pkl")
-        test = p.execute(df, step_train = False)
-
-    logging.info("Saving")
-    # If True, it saves the Preprocessing to be used later in the inference step
-    if step_train:
-        dump(p, '/opt/ml/processing/output/preprocessing/preprocessing.pkl')
-
-    # If True, it saves the train and val files, otherwise it saves only the inference file    
-    if step_train:
-        train.to_csv('/opt/ml/processing/output/processed/train/train.csv', index=False)
-        test_train.to_csv('/opt/ml/processing/output/processed/val/val.csv', index=False)
-    else:
-        test.to_csv('/opt/ml/processing/output/processed/inference/inference.csv', index=False)
+    preprocessing(df, step_train)