diff --git a/.gitignore b/.gitignore
index 8314989..a1549ea 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,6 @@
 __pycache__/
 *egg-info
-.vscode/
\ No newline at end of file
+.vscode/
+.ipynb_checkpoints
+*/.ipynb_checkpoints/*
+hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/.ipynb_checkpoints/*
diff --git a/README.md b/README.md
index bd6b2d7..eb7249f 100644
--- a/README.md
+++ b/README.md
@@ -74,13 +74,21 @@ After installed Hermione:
 hermione new project_hermione
 ```
 
-1. Hit Enter if you want to start with an example code
+2. Hit Enter if you want to start with an example code
 
 ```
 Do you want to start with an implemented example (recommended) [y/n]? [y]:
 ```
 
-3. Hermione already creates a virtual environment for the project. For Windows users, activate it with
+3. If you choose an implemented example, select the Sagemaker or Local version
+
+```
+Do you want to start with:
+    (1) Sagemaker
+    (2) Local version
+```
+
+4. Hermione already creates a virtual environment for the project. For Windows users, activate it with
 
 ```cmd
 _env\Scripts\activate
@@ -93,13 +101,13 @@
 source _env/bin/activate
 ```
 
-4. After activating, you should install some libraries. There are a few suggestions in “requirements.txt” file:
+5. After activating, you should install some libraries. There are a few suggestions in the “requirements.txt” file:
 
 ```
 pip install -r requirements.txt
 ```
 
-1. Now we will train some models from the example, using MLflow ❤. To do so, inside *src* directory, just type: _hermione train_. The “hermione train” command will search for a `train.py` file and execute it. In the example, models and metrics are already controlled via MLflow.
+6. Now, if you selected the Local version, we will train some models from the example, using MLflow ❤. To do so, inside the *src* directory, just type: _hermione train_. The “hermione train” command will search for a `train.py` file and execute it. In the example, models and metrics are already controlled via MLflow.
 
 ![](https://cdn-images-1.medium.com/max/800/1*MmVcmAYspxWdzbd5r00W5g.png)
 
@@ -118,16 +126,19 @@
 mlflow ui
 ```
 
 ![](https://cdn-images-1.medium.com/max/800/1*c_rDEqERZR6r8JVI3TMTcQ.png)
 
-8. To make batch predictions using your `predict.py` file, type `hermione predict`. The default implemented version will print some predictions for you in the terminal.
+7. To make batch predictions using your `predict.py` file, type `hermione predict`. The default implemented version will print some predictions for you in the terminal.
 
 ```
 hermione predict
 ```
 
-9. In the Titanic example, we also provide a step by step notebook. To view it, just type jupyter notebook inside directory `/src/notebooks/`.
+8. In the Titanic example, we also provide a step by step notebook. To view it, just type jupyter notebook inside directory `/src/notebooks/`.
 
 ![](https://cdn-images-1.medium.com/max/800/1*U3ToR5jDjQJihT9EnxeDdg.png)
 
+
+9. If you selected the Sagemaker version, click [here](hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/README.tpl.md) to check a tutorial.
+
 Do you want to create your **project from scratch**? There click [here](tutorial_base.md) to check a tutorial.
@@ -186,6 +197,7 @@
 Here we describe briefly what each class is doing:
 
 - **Preprocessing** - concentrates all preprocessing steps that must be performed on the data before the model is trained.
 - **Normalization** - applies normalization and denormalization to reported columns. This class contains the following normalization algorithms already implemented: StandardScaler e MinMaxScaler.
- **TextVectorizer** - transforms text into vector. Implemented methods: Bag of words, TF_IDF, Embedding: mean, median e indexing.
+- **DataQuality** - concentrates all data validation steps that must be performed on the data to ensure its quality.
 
 ### Visualization
 
diff --git a/hermione/cli.py b/hermione/cli.py
index 31a6609..73b67ef 100644
--- a/hermione/cli.py
+++ b/hermione/cli.py
@@ -53,7 +53,8 @@ def new(project_name, implemented):
     }
     os.makedirs(os.path.join(LOCAL_PATH, project_name))
     if is_imp:
-        write_module(os.path.join(LOCAL_PATH, project_name), '__IMPLEMENTED_BASE__', True, custom_inputs)
+        option = click.prompt('Do you want to start with: \n\t(1) Sagemaker \n\t(2) Local version \n', type=int, default=2)
+        implemented_version_type(project_name, custom_inputs, option)
     else:
         write_module(os.path.join(LOCAL_PATH, project_name), '__NOT_IMPLEMENTED_BASE__', True, custom_inputs)
@@ -66,7 +67,14 @@
     os.system('git init')
     print("A git repository was created. You should add your files and make your first commit.\n")
 
-
+def implemented_version_type(project_name, custom_inputs, option):
+    """
+    Write the chosen implemented template (Sagemaker or Local version) into the new project
+    """
+    if option == 1:
+        write_module(os.path.join(LOCAL_PATH, project_name), '__IMPLEMENTED_SAGEMAKER__', True, custom_inputs)
+    else:
+        write_module(os.path.join(LOCAL_PATH, project_name), '__IMPLEMENTED_BASE__', True, custom_inputs)
 
 @cli.command()
 def train():
diff --git a/hermione/module_templates/__IMPLEMENTED_BASE__/.ipynb_checkpoints/README.tpl-checkpoint.md b/hermione/module_templates/__IMPLEMENTED_BASE__/.ipynb_checkpoints/README.tpl-checkpoint.md
new file mode 100644
index 0000000..98bb4e6
--- /dev/null
+++ b/hermione/module_templates/__IMPLEMENTED_BASE__/.ipynb_checkpoints/README.tpl-checkpoint.md
@@ -0,0 +1,6 @@
+# {{ inputs['project_name'] }}
+
+Project started in {{ inputs['project_start_date'] }}.
+
+
+**Please, complete here information on using and testing this project.**
diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__.json b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__.json
new file mode 100644
index 0000000..aa8798f
--- /dev/null
+++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__.json
@@ -0,0 +1,7 @@
+{
+    "info": "Base files with implemented example",
+    "input_info": [
+        ["project_name", "My Project", "Enter your project name"],
+        ["project_start_date", "01/01/21", "Enter the date your project started"]
+    ]
+}
\ No newline at end of file
diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/.tpl.gitignore b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/.tpl.gitignore
new file mode 100644
index 0000000..95ac7a3
--- /dev/null
+++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/.tpl.gitignore
@@ -0,0 +1,8 @@
+.ipynb_checkpoints
+mlruns/
+__pycache__/
+.vscode/
+catboost_info/
+.metaflow
+data/
+*_env/
\ No newline at end of file
diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/README.tpl.md b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/README.tpl.md
new file mode 100644
index 0000000..882eefe
--- /dev/null
+++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/README.tpl.md
@@ -0,0 +1,244 @@
+# Hermione Sagemaker
+
+This document explains how to execute the Titanic project example.
+
+
+## Sagemaker
+
+Our code is divided into three steps: Processor, Train and Inference. In the Processor step, we preprocess the training, validation and inference data.
The Train step receives the preprocessed training and validation data, and uses them to train and validate a new model. The Inference step receives the inference data and the model, and generates predictions for the data.
+
+### Permissions
+
+If you are running this code on a SageMaker notebook instance, do the following to provide IAM permissions to the notebook:
+
+1. Open the Amazon [SageMaker console](https://console.aws.amazon.com/sagemaker/).
+2. Select Notebook instances and choose the name of your notebook instance.
+3. Under Permissions and encryption select the role ARN to view the role on the IAM console.
+4. Under the Permissions tab, choose Attach policies and search for AmazonS3FullAccess.
+5. Select the check box next to AmazonS3FullAccess.
+6. Search for AmazonSageMakerFullAccess and AWSStepFunctionsFullAccess and select their check boxes.
+7. Choose Attach policy. You will then be redirected to the details page for the role.
+8. Copy and save the IAM role ARN for later use.
+
+Next, we will create a new policy to attach.
+
+9. Click Attach policies again and then Create policy.
+10. Enter the following in the JSON tab:
+
+```json
+{
+    "Version": "2012-10-17",
+    "Statement": [
+        {
+            "Sid": "VisualEditor0",
+            "Effect": "Allow",
+            "Action": [
+                "s3:PutObject",
+                "s3:GetObject",
+                "logs:CreateLogStream",
+                "codebuild:DeleteProject",
+                "codebuild:StartBuild",
+                "s3:DeleteObject",
+                "codebuild:CreateProject",
+                "codebuild:BatchGetBuilds"
+            ],
+            "Resource": [
+                "arn:aws:s3:::sagemaker-*/*",
+                "arn:aws:codebuild:*:*:project/sagemaker-studio*",
+                "arn:aws:logs:*:*:log-group:/aws/codebuild/sagemaker-studio*"
+            ]
+        },
+        {
+            "Sid": "VisualEditor1",
+            "Effect": "Allow",
+            "Action": [
+                "logs:GetLogEvents",
+                "s3:CreateBucket",
+                "logs:PutLogEvents"
+            ],
+            "Resource": [
+                "arn:aws:logs:*:*:log-group:/aws/codebuild/sagemaker-studio*:log-stream:*",
+                "arn:aws:s3:::sagemaker*"
+            ]
+        },
+        {
+            "Sid": "VisualEditor2",
+            "Effect": "Allow",
+            "Action": [
+                "iam:GetRole",
+                "ecr:CreateRepository",
+                "iam:ListRoles",
+                "ecr:GetAuthorizationToken",
+                "ecr:UploadLayerPart",
+                "ecr:ListImages",
+                "logs:CreateLogGroup",
+                "ecr:PutImage",
+                "iam:PassRole",
+                "sagemaker:*",
+                "ecr:BatchGetImage",
+                "ecr:CompleteLayerUpload",
+                "ecr:DescribeImages",
+                "ecr:DescribeRepositories",
+                "ecr:InitiateLayerUpload",
+                "ecr:BatchCheckLayerAvailability"
+            ],
+            "Resource": "*"
+        }
+    ]
+}
+```
+
+11. Choose Next:Tags and add a tag, if you want to.
+12. Choose Next:Review and add a name such as AmazonSageMaker-ExecutionPolicy.
+13. Choose Create Policy.
+14. Select Roles and search for your role.
+15. Under the Permissions tab, click Attach policies.
+16. Search for your newly created policy and select the check box next to it.
+17. Choose Attach policy.
+
+### Docker images
+
+First, we need to create an image and upload it to ECR for each one of the steps.
To do that, execute the following commands in the terminal:
+
+```bash
+cd Sagemaker/project-name
+source project-name_env/bin/activate
+pip install -r requirements.txt
+python -m ipykernel install --user --name project-name_env --display-name "project-name"
+bash build_and_push.sh processor hermione-processor
+bash build_and_push.sh train hermione-train
+bash build_and_push.sh inference hermione-inference
+```
+
+The bash script reads the Dockerfile in the given folder (processor, train or inference), builds the image and pushes it to ECR with the specified name.
+
+### Notebooks
+
+To test the images in ECR, execute the following notebooks:
+
+- project-name/src/ml/notebooks/1_Sagemaker_Processor.ipynb
+- project-name/src/ml/notebooks/2_Sagemaker_Train.ipynb
+- project-name/src/ml/notebooks/3_Sagemaker_Inference.ipynb
+
+## Step Functions
+
+We also create two Step Functions state machines to execute the whole process. The first machine processes the training data and creates the model, and the second one processes the inference data and generates its predictions.
+
+### Permissions
+
+The Step Functions workflow requires an IAM role to interact with other services in the AWS environment. To create one, follow these [AWS steps](https://github.com/aws/amazon-sagemaker-examples/blob/master/step-functions-data-science-sdk/step_functions_mlworkflow_processing/step_functions_mlworkflow_scikit_learn_data_processing_and_model_evaluation.ipynb):
+
+
+1. Go to the [IAM console](https://console.aws.amazon.com/iam/).
+2. Select Roles and then Create role.
+3. Under Choose the service that will use this role select Step Functions.
+4. Choose Next until you can enter a Role name.
+5. Enter a name such as AmazonSageMaker-StepFunctionsWorkflowExecutionRole and then select Create role.
+6. Search and click on the IAM Role you just created.
+7. Click Attach policies and then select CloudWatchEventsFullAccess.
+8. Click on Attach policy.
+
+
+Next, create and attach another new policy to the role you created:
+
+9. Click Attach policies again and then Create policy.
+10. 
Enter the following in the JSON tab:
+
+
+```json
+{
+    "Version": "2012-10-17",
+    "Statement": [
+        {
+            "Sid": "VisualEditor0",
+            "Effect": "Allow",
+            "Action": [
+                "events:PutTargets",
+                "events:DescribeRule",
+                "events:PutRule"
+            ],
+            "Resource": [
+                "arn:aws:events:*:*:rule/StepFunctionsGetEventsForSageMakerTrainingJobsRule",
+                "arn:aws:events:*:*:rule/StepFunctionsGetEventsForSageMakerTransformJobsRule",
+                "arn:aws:events:*:*:rule/StepFunctionsGetEventsForSageMakerTuningJobsRule",
+                "arn:aws:events:*:*:rule/StepFunctionsGetEventsForECSTaskRule",
+                "arn:aws:events:*:*:rule/StepFunctionsGetEventsForBatchJobsRule",
+                "arn:aws:events:*:*:rule/StepFunctionsGetEventsForStepFunctionsExecutionRule",
+                "arn:aws:events:*:*:rule/StepFunctionsGetEventsForSageMakerProcessingJobsRule"
+            ]
+        },
+        {
+            "Sid": "VisualEditor1",
+            "Effect": "Allow",
+            "Action": "iam:PassRole",
+            "Resource": "NOTEBOOK_ROLE_ARN",
+            "Condition": {
+                "StringEquals": {
+                    "iam:PassedToService": "sagemaker.amazonaws.com"
+                }
+            }
+        },
+        {
+            "Sid": "VisualEditor2",
+            "Effect": "Allow",
+            "Action": [
+                "batch:DescribeJobs",
+                "batch:SubmitJob",
+                "batch:TerminateJob",
+                "dynamodb:DeleteItem",
+                "dynamodb:GetItem",
+                "dynamodb:PutItem",
+                "dynamodb:UpdateItem",
+                "ecs:DescribeTasks",
+                "ecs:RunTask",
+                "ecs:StopTask",
+                "glue:BatchStopJobRun",
+                "glue:GetJobRun",
+                "glue:GetJobRuns",
+                "glue:StartJobRun",
+                "lambda:InvokeFunction",
+                "sagemaker:CreateEndpoint",
+                "sagemaker:CreateEndpointConfig",
+                "sagemaker:CreateHyperParameterTuningJob",
+                "sagemaker:CreateModel",
+                "sagemaker:CreateProcessingJob",
+                "sagemaker:CreateTrainingJob",
+                "sagemaker:CreateTransformJob",
+                "sagemaker:DeleteEndpoint",
+                "sagemaker:DeleteEndpointConfig",
+                "sagemaker:DescribeHyperParameterTuningJob",
+                "sagemaker:DescribeProcessingJob",
+                "sagemaker:DescribeTrainingJob",
+                "sagemaker:DescribeTransformJob",
+                "sagemaker:ListProcessingJobs",
+                "sagemaker:ListTags",
+                "sagemaker:StopHyperParameterTuningJob",
+                "sagemaker:StopProcessingJob",
+                "sagemaker:StopTrainingJob",
+                "sagemaker:StopTransformJob",
+                "sagemaker:UpdateEndpoint",
+                "sns:Publish",
+                "sqs:SendMessage"
+            ],
+            "Resource": "*"
+        }
+    ]
+}
+```
+
+11. Replace NOTEBOOK_ROLE_ARN with the ARN for your notebook that you used in the previous step, in the Sagemaker Permissions section above.
+12. Choose Review policy and give the policy a name such as AmazonSageMaker-StepFunctionsWorkflowExecutionPolicy.
+13. Choose Create policy.
+14. Select Roles and search for your AmazonSageMaker-StepFunctionsWorkflowExecutionRole role.
+15. Click Attach policies.
+16. Search for your newly created AmazonSageMaker-StepFunctionsWorkflowExecutionPolicy policy and select the check box next to it.
+17. Choose Attach policy.
+18. Copy the AmazonSageMaker-StepFunctionsWorkflowExecutionRole Role ARN at the top of the Summary. You will use it in the next step.
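+
+As a quick sanity check that the new role works, you can run something like the sketch below (a minimal example, not the project's actual workflow: the ARN is a placeholder you must replace with the one copied in step 18, the workflow name is arbitrary, and two Pass states stand in for the real Processor and Train steps; it uses the `stepfunctions` package already listed in requirements.txt):
+
+```python
+# Minimal smoke test for the Step Functions execution role (ARN below is a placeholder).
+from stepfunctions.steps import Chain, Pass
+from stepfunctions.workflow import Workflow
+
+# Placeholder: paste the AmazonSageMaker-StepFunctionsWorkflowExecutionRole ARN copied above
+workflow_execution_role = "arn:aws:iam::<account-id>:role/AmazonSageMaker-StepFunctionsWorkflowExecutionRole"
+
+# Two no-op states standing in for the real Processor and Train steps
+definition = Chain([Pass("Preprocess"), Pass("Train")])
+
+workflow = Workflow(
+    name="hermione-role-smoke-test",  # hypothetical name
+    definition=definition,
+    role=workflow_execution_role,
+)
+workflow.create()   # registers the state machine; fails fast if the role is wrong
+workflow.execute()  # starts one execution
+```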
+
+
+### Notebooks
+
+To create and test the Step Functions state machines, execute the following notebooks:
+
+- project-name/src/ml/notebooks/4_Sagemaker_StepFunctions_Train.ipynb
+- project-name/src/ml/notebooks/5_Sagemaker_StepFunctions_Inference.ipynb
\ No newline at end of file
diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/build_and_push.sh b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/build_and_push.sh
new file mode 100644
index 0000000..b1ea715
--- /dev/null
+++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/build_and_push.sh
@@ -0,0 +1,52 @@
+#!/usr/bin/env bash
+
+# This script builds the Docker image for one step and pushes it to ECR to be ready for use
+# by SageMaker.
+
+# The arguments to this script are the step folder (processor, train or inference) and the
+# image name. The image name is used on the local machine and combined with the account and
+# region to form the repository name for ECR.
+mode=$1
+image=$2
+
+
+if [ "$image" == "" ]
+then
+    echo "Usage: $0 <mode> <image-name>"
+    exit 1
+fi
+
+
+# Get the account number associated with the current IAM credentials
+account=$(aws sts get-caller-identity --query Account --output text)
+
+if [ $? -ne 0 ]
+then
+    exit 255
+fi
+
+
+# Get the region defined in the current configuration (default to us-east-1 if none defined)
+region=$(aws configure get region)
+region=${region:-us-east-1}
+
+
+fullname="${account}.dkr.ecr.${region}.amazonaws.com/${image}:latest"
+
+# If the repository doesn't exist in ECR, create it.
+
+aws ecr describe-repositories --repository-names "${image}" > /dev/null 2>&1
+
+if [ $? -ne 0 ]
+then
+    aws ecr create-repository --repository-name "${image}" > /dev/null
+fi
+
+# Get the login command from ECR and execute it directly
+aws ecr get-login-password --region "${region}" | docker login --username AWS --password-stdin "${account}".dkr.ecr."${region}".amazonaws.com
+
+# Build the docker image locally with the image name and then push it to ECR
+# with the full name.
+docker build -f ${mode}/Dockerfile -t ${image} .
+docker tag ${image} ${fullname} + +docker push ${fullname} \ No newline at end of file diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/data/raw/raw_test.csv b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/data/raw/raw_test.csv new file mode 100644 index 0000000..f0dbfb0 --- /dev/null +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/data/raw/raw_test.csv @@ -0,0 +1,269 @@ +Survived,Pclass,Sex,Age +1,3,female,1.0 +0,3,male, +0,1,male,30.0 +0,1,male,61.0 +0,3,male,27.0 +0,2,male,46.0 +1,2,female,40.0 +0,2,female,27.0 +0,3,female,18.0 +0,3,female,29.0 +0,3,female,9.0 +0,3,female,28.0 +1,1,female,52.0 +0,3,male,24.5 +0,1,female,50.0 +1,3,female,0.75 +1,1,female,58.0 +0,1,male, +0,3,female,45.0 +0,2,male,29.0 +0,3,female, +1,2,female,8.0 +0,2,male,39.0 +0,3,male,4.0 +0,3,male,2.0 +0,3,male,25.0 +0,3,male,22.0 +1,1,female,17.0 +0,3,male,19.0 +0,3,male, +0,3,female,2.0 +1,2,female,40.0 +0,2,male,34.0 +1,3,male,26.0 +0,2,male,19.0 +0,3,male,11.0 +0,3,male,42.0 +0,3,male,51.0 +1,2,female,24.0 +1,1,male,40.0 +1,1,female,14.0 +0,3,male, +0,3,female, +1,3,female,63.0 +0,3,male,16.0 +0,2,male,25.0 +1,1,female,39.0 +0,1,male,42.0 +0,3,male,20.0 +1,2,female,24.0 +1,3,female, +0,3,female,6.0 +0,3,male,20.5 +0,3,male,35.0 +0,2,male,24.0 +0,3,male, +0,3,male,16.0 +0,3,male,18.0 +0,1,male,29.0 +0,3,male,14.0 +1,1,female,33.0 +1,1,female,18.0 +1,1,male,11.0 +1,1,female, +0,3,male,24.0 +0,3,male,34.0 +0,2,male,48.0 +0,2,male,50.0 +0,1,male, +1,2,female, +1,1,female,49.0 +0,3,male,50.0 +1,1,male, +0,1,male,65.0 +0,3,male,21.0 +0,3,male,28.0 +0,3,male,41.0 +0,3,male,21.0 +1,1,female, +1,1,female, +0,3,female,26.0 +0,3,male,28.5 +0,3,male,9.0 +1,3,male, +0,3,male,24.0 +0,2,male,33.0 +1,3,female,1.0 +0,3,male,33.0 +1,1,male, +0,3,male,25.0 +1,3,female,18.0 +1,2,male, +0,2,male,54.0 +1,3,male,3.0 +0,1,male,37.0 +0,3,male,19.0 +1,1,female,24.0 +1,1,female,35.0 +0,3,male,33.0 +1,2,female,24.0 +0,3,male,1.0 +0,1,male,58.0 +0,1,male,45.0 +1,3,female,15.0 +0,3,male, +0,1,male,31.0 +0,3,male,26.0 +0,3,male,28.5 +0,3,male,35.0 +1,2,female,36.0 +1,2,male,0.83 +1,1,male,31.0 +1,1,female,31.0 +0,3,male,32.0 +0,3,male,26.0 +0,2,male,44.0 +0,1,male,60.0 +0,2,male,54.0 +0,3,male,18.0 +0,1,male,19.0 +0,3,male,19.0 +0,3,male,43.0 +1,1,male,42.0 +0,3,male, +0,2,male,27.0 +0,2,male,21.0 +0,3,female,43.0 +0,1,male, +1,3,female, +0,3,male,20.0 +1,1,female,44.0 +0,3,male,29.0 +0,3,male,25.0 +1,2,female,30.0 +1,1,female,48.0 +0,3,male,18.0 +1,1,female,50.0 +0,3,female,30.0 +0,3,male, +1,2,female,41.0 +0,3,male,65.0 +1,1,male,32.0 +0,3,male,17.0 +1,2,male,0.83 +1,3,female,5.0 +0,3,male,28.0 +0,3,male,39.0 +1,3,male,32.0 +1,3,male,27.0 +1,1,female,24.0 +1,1,male,49.0 +0,3,male,11.0 +0,3,female,23.0 +0,3,male,16.0 +1,1,female,22.0 +0,3,male, +1,1,female,58.0 +1,3,female,5.0 +1,2,female,42.0 +1,1,female,49.0 +1,2,female,40.0 +0,1,male,38.0 +1,1,male,25.0 +0,1,female,25.0 +0,1,male, +1,1,female,22.0 +1,2,female,54.0 +0,3,male,20.0 +1,3,female, +0,3,male,40.0 +0,3,male,23.0 +0,1,male, +0,3,female, +0,2,male,28.0 +0,3,male, +1,3,male,45.0 +1,2,male,1.0 +0,3,male, +0,2,male,27.0 +1,1,female,16.0 +0,3,male,31.0 +0,3,male,45.5 +0,3,male,21.0 +1,1,male,23.0 +0,2,male,52.0 +1,2,female,3.0 +0,2,male,16.0 +1,1,female,30.0 +0,3,male,21.0 +1,3,female,16.0 +1,2,male,19.0 +0,3,male, +1,2,female,25.0 +1,2,male,32.0 +0,3,female,30.5 +0,1,male,21.0 +1,3,male,25.0 +1,3,female,35.0 +1,2,female,17.0 +1,3,male, +1,1,female,16.0 +1,3,female, +1,1,male,42.0 +0,1,male,62.0 +0,1,male,40.0 +0,3,male,19.0 +0,3,male, +1,2,female,25.0 
+0,1,male, +0,3,male,42.0 +0,3,male,29.0 +1,2,female,19.0 +0,3,female,3.0 +0,3,male,35.0 +0,3,male,30.5 +0,3,male, +0,2,male,34.0 +0,3,male, +1,3,male, +0,3,male,25.0 +0,3,male, +1,1,male,60.0 +0,3,male,30.0 +1,1,male,50.0 +0,3,female,9.0 +0,3,male,25.0 +0,2,male,27.0 +0,3,male,40.5 +0,2,male,30.0 +1,2,female,28.0 +1,3,female,30.0 +0,3,male,74.0 +0,3,female,25.0 +0,3,male,34.0 +0,2,male,31.0 +1,3,male,27.0 +1,1,female,35.0 +0,3,male,7.0 +1,1,female, +0,2,male,23.0 +0,3,male,30.0 +0,3,male,2.0 +1,3,female,24.0 +0,3,male, +0,3,male, +0,2,male,59.0 +0,2,male,51.0 +0,1,male,22.0 +1,1,male,34.0 +1,3,female,33.0 +0,3,male,24.0 +0,3,female,47.0 +0,1,male,47.0 +0,2,male,36.0 +0,3,male, +1,2,female,14.0 +0,3,female,41.0 +0,1,male, +1,3,female, +0,1,male, +0,3,male,33.0 +1,3,female,31.0 +0,3,male,17.0 +0,3,male,19.0 +0,3,female,2.0 +1,2,female,18.0 +0,3,male, +1,1,female,52.0 diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/data/raw/raw_train.csv b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/data/raw/raw_train.csv new file mode 100644 index 0000000..45c2b4a --- /dev/null +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/data/raw/raw_train.csv @@ -0,0 +1,624 @@ +Survived,Pclass,Sex,Age +1,2,female,34.0 +1,2,female,31.0 +1,1,male,36.0 +1,3,male,29.0 +0,2,male,18.0 +1,1,female,63.0 +0,3,male, +1,2,female,28.0 +1,2,female,50.0 +1,3,female, +0,3,male,20.0 +0,3,male,22.0 +1,2,female,48.0 +0,3,female,40.0 +0,2,male,42.0 +1,1,female, +0,2,male, +1,3,female,0.75 +0,3,male, +0,1,male,54.0 +0,3,male, +1,1,female,19.0 +0,3,male,28.0 +1,3,female, +0,2,male,25.0 +0,3,female,39.0 +0,1,male,28.0 +1,3,male,3.0 +0,3,male,17.0 +0,3,male, +0,3,male,22.0 +1,2,male,1.0 +1,1,female,24.0 +0,3,male,49.0 +1,2,male,34.0 +1,2,female,45.0 +1,2,female,36.0 +1,3,male,19.0 +0,3,male, +0,3,male,24.0 +1,3,male,20.0 +0,3,male,44.0 +0,2,male,25.0 +0,2,male,21.0 +0,2,male,43.0 +1,3,male,24.0 +0,3,male,18.0 +0,2,male,70.0 +0,3,male,22.0 +1,3,female,4.0 +0,3,female, +0,3,female,18.0 +1,2,female,13.0 +1,3,female,19.0 +1,3,female,15.0 +0,2,male, +0,2,male,47.0 +0,1,male,55.0 +0,3,male, +0,2,male,32.5 +0,3,male,20.0 +1,2,female,28.0 +0,3,male,26.0 +0,3,male,22.0 +1,2,female,29.0 +0,3,male,28.0 +1,2,female,32.0 +1,2,female,17.0 +0,2,male,29.0 +0,1,male,51.0 +0,1,male,45.5 +0,3,male,25.0 +1,3,male,6.0 +1,3,male, +0,3,male, +0,2,male,18.0 +0,1,male,18.0 +0,3,male, +0,3,male,21.0 +1,1,female,56.0 +0,3,male, +0,3,male,59.0 +0,3,male, +1,1,male,36.0 +0,3,male,36.0 +0,1,male,44.0 +0,1,male,38.0 +0,3,female,31.0 +0,3,female,25.0 +0,1,male,31.0 +0,3,male,42.0 +1,3,female,22.0 +1,3,female,24.0 +0,2,male,34.0 +1,3,female,22.0 +0,2,male,60.0 +0,2,male,36.0 +0,3,male, +1,1,male,28.0 +0,3,male, +0,3,male,21.0 +0,3,male,34.0 +0,3,female, +0,3,male, +0,3,female,45.0 +1,1,female,58.0 +0,3,male,4.0 +1,3,male,20.0 +0,1,male, +1,2,female,27.0 +0,2,male,66.0 +0,1,male,47.0 +1,2,female,30.0 +1,3,male,32.0 +0,1,male,24.0 +1,2,female,55.0 +1,1,female,38.0 +0,3,male, +0,3,male,22.0 +0,1,male,56.0 +0,3,male,28.0 +0,3,male,24.0 +0,3,female,32.0 +0,3,male,26.0 +1,1,male,52.0 +1,2,female,28.0 +1,1,female,48.0 +1,1,male,27.0 +0,3,male,40.5 +0,3,male,70.5 +0,2,male,37.0 +0,3,male,33.0 +1,2,female,4.0 +0,3,male,28.0 +0,3,female,31.0 +1,1,male,28.0 +0,3,female,45.0 +0,1,male,40.0 +0,3,male, +0,3,male,22.0 +0,2,male,21.0 +1,3,male,0.42 +0,3,male, +1,2,female,19.0 +0,3,female,30.0 +0,3,male,21.0 +1,1,female,30.0 +0,1,male,58.0 +0,1,male,61.0 +0,3,male,44.0 +0,3,male,17.0 +0,3,female,18.0 +0,3,female,8.0 +0,2,male,28.0 +0,3,male,61.0 
+0,2,female,24.0 +0,3,female,2.0 +0,3,male,22.0 +1,3,female, +0,1,male,36.0 +1,2,female,24.0 +1,3,male,22.0 +0,3,male, +1,1,female,18.0 +1,1,female,35.0 +0,3,female, +0,1,male,65.0 +0,3,female, +0,3,male,16.0 +0,3,male,33.0 +1,1,female, +0,2,male,30.0 +0,1,male, +0,3,male, +0,3,male,19.0 +0,3,female,11.0 +0,3,male,16.0 +1,1,female,43.0 +0,2,male,36.5 +0,3,male,20.0 +0,3,male,40.0 +0,3,male, +0,3,male,28.0 +0,3,female,16.0 +0,3,female,17.0 +0,2,male,24.0 +1,3,female, +1,3,female,15.0 +1,3,female, +0,3,female, +0,3,female,39.0 +0,3,female,21.0 +1,1,male,35.0 +0,3,female,9.0 +0,3,female,18.0 +0,3,male,22.0 +1,3,male, +0,3,male,19.0 +0,3,male,7.0 +0,3,male,36.0 +0,3,male, +0,3,male,30.0 +1,1,female,53.0 +0,3,male,25.0 +1,3,male,32.0 +0,3,male,29.0 +0,2,male,26.0 +0,2,male,36.0 +0,2,male,39.0 +0,1,male, +1,2,female,36.0 +1,1,female,22.0 +0,1,male,46.0 +1,1,male,49.0 +1,1,male, +1,1,female,29.0 +0,3,male, +0,1,female,2.0 +0,3,male,15.0 +1,3,female, +0,1,male,64.0 +0,3,male,1.0 +1,3,male,18.0 +1,3,male,12.0 +0,3,male,18.0 +0,2,male,24.0 +0,3,male, +0,3,female, +0,1,male,47.0 +1,3,female,2.0 +0,3,male, +0,3,male,39.0 +0,3,male, +0,2,male,35.0 +1,1,female,36.0 +0,3,male, +1,3,female, +1,3,male, +0,2,male, +0,2,male,23.0 +0,1,male, +1,2,female,4.0 +1,1,female, +1,3,female,4.0 +0,2,male,23.0 +0,3,male,20.0 +1,2,female,35.0 +1,1,female,39.0 +0,3,male,24.0 +0,3,female,20.0 +0,2,male,19.0 +1,1,female,39.0 +0,3,female, +1,1,male,27.0 +0,3,female,22.0 +0,3,male,48.0 +0,1,male,49.0 +0,1,male,19.0 +0,3,male, +1,3,female,27.0 +1,1,male,37.0 +0,3,male,36.0 +1,3,male,21.0 +0,2,male,32.0 +0,3,female, +1,3,female, +0,3,male,26.0 +0,3,male, +0,3,male,17.0 +0,2,male,30.0 +1,3,female,5.0 +1,3,male,44.0 +1,2,male,42.0 +0,3,male,24.0 +0,2,male,57.0 +1,1,female,60.0 +0,3,male,24.0 +0,3,female, +1,3,female,22.0 +0,3,male, +1,1,female,24.0 +0,3,male,16.0 +0,3,male,21.0 +0,3,male,37.0 +1,3,female, +0,2,male,25.0 +1,1,female,47.0 +1,3,female, +0,1,male,54.0 +1,1,female,18.0 +0,3,male,28.0 +0,2,male,23.0 +1,2,male,8.0 +0,3,male, +0,3,male,35.0 +1,1,female,38.0 +1,2,female,50.0 +1,1,male,4.0 +0,1,male,45.0 +0,3,male,21.0 +1,3,male,31.0 +0,3,male, +1,3,male,9.0 +0,3,male, +0,3,male,23.0 +1,1,male,17.0 +0,3,male,44.0 +1,3,male,39.0 +1,1,female,17.0 +0,3,male,20.0 +1,1,female,39.0 +0,3,male,19.0 +0,2,male,31.0 +1,3,male,30.0 +0,2,male,18.0 +0,3,male, +0,2,male, +1,3,male, +0,2,male, +0,3,male,29.0 +1,3,female,38.0 +1,3,male,29.0 +0,2,male,29.0 +1,2,female,29.0 +1,1,female,38.0 +0,3,male,36.0 +1,3,female, +0,3,male, +1,1,female, +0,3,female,21.0 +1,1,female,45.0 +0,1,male,36.0 +0,3,male,23.0 +0,1,male,28.0 +0,3,male, +1,3,female, +1,2,female,29.0 +0,3,male,32.0 +0,3,male,21.0 +0,3,female,21.0 +0,3,male,30.0 +1,2,male,62.0 +1,1,female,33.0 +0,1,male, +0,2,male, +1,1,male, +0,3,male,34.0 +0,3,female,28.0 +0,1,male,33.0 +0,2,male,42.0 +1,2,female,34.0 +1,1,female,32.0 +0,2,female,44.0 +0,3,male,21.0 +0,3,male, +0,3,male,19.0 +0,3,male, +0,3,male,55.5 +0,2,male,19.0 +0,3,male,47.0 +1,2,female,7.0 +0,3,male,43.0 +1,2,female,24.0 +1,3,female, +0,3,male,38.0 +1,1,female,35.0 +1,1,female,41.0 +0,3,male, +1,1,male,35.0 +1,3,female,19.0 +0,3,male, +0,2,male,34.0 +1,1,male,48.0 +1,2,female,33.0 +0,3,male, +0,2,male,16.0 +0,1,male, +0,1,male,37.0 +0,3,male,29.0 +1,3,male,16.0 +0,3,male,22.0 +0,3,male, +0,3,male,32.0 +0,3,male,8.0 +1,1,female,21.0 +1,3,male,32.0 +0,3,male,28.0 +1,1,female,15.0 +1,1,female,19.0 +0,2,male,25.0 +0,3,male,41.0 +1,2,male,31.0 +0,2,male,35.0 +0,3,male, +0,3,female,18.0 +0,3,female,24.0 +1,1,female,30.0 
+1,3,female,4.0 +1,3,female,18.0 +1,1,female,36.0 +0,3,female,29.0 +1,3,male,26.0 +1,1,female,23.0 +0,3,male,32.0 +0,3,male,26.0 +1,1,female,44.0 +1,3,female,23.0 +1,1,male,36.0 +0,3,male,20.0 +0,3,male,22.0 +1,1,female,36.0 +1,3,female, +0,2,female,26.0 +1,2,female,42.0 +0,3,female, +1,1,female,24.0 +0,3,male, +1,3,male,1.0 +1,1,male, +1,3,male,20.0 +0,3,male,20.0 +0,3,male,28.0 +1,2,male,3.0 +1,1,male,45.0 +0,3,male,25.0 +0,3,female,9.0 +0,2,male,39.0 +0,3,male, +0,1,male,46.0 +0,3,male, +1,2,female,18.0 +0,3,male,16.0 +0,3,male,47.0 +0,3,female,37.0 +0,3,male,14.0 +0,1,male, +0,3,male,31.0 +0,1,male, +0,2,male,23.0 +0,3,male,33.0 +0,3,male,39.0 +0,2,male,28.0 +1,1,female,40.0 +0,3,male,42.0 +1,1,female,31.0 +0,3,male,33.0 +0,3,male,25.0 +1,3,female,31.0 +0,2,male, +0,3,male,19.0 +0,3,male,38.0 +1,2,female,45.0 +0,3,female,48.0 +1,1,male,27.0 +1,1,female,30.0 +0,3,male, +0,3,male,4.0 +0,3,male, +1,2,female,28.0 +0,3,male, +0,2,male,32.0 +1,1,male,48.0 +0,3,male, +0,3,male, +0,1,male,39.0 +1,3,female,18.0 +1,3,male, +0,3,male,10.0 +0,3,male,32.0 +1,3,female,27.0 +0,3,male, +0,2,female,57.0 +1,2,female,5.0 +1,2,female,33.0 +0,3,male,18.0 +0,1,male,27.0 +0,3,male,9.0 +1,1,female,54.0 +0,3,male,51.0 +1,1,male,51.0 +1,1,female,40.0 +0,3,male, +1,1,female,33.0 +1,3,female,22.0 +1,1,female,23.0 +1,1,male,56.0 +1,2,female,22.0 +0,3,male,27.0 +0,3,male,22.0 +1,1,female,35.0 +0,2,male,18.0 +0,3,female,10.0 +0,3,male,17.0 +1,3,male, +1,3,male,4.0 +0,3,male,23.5 +1,3,female,26.0 +1,1,female,62.0 +0,3,male, +0,3,male,16.0 +1,3,female, +0,3,male,40.0 +1,1,male,48.0 +0,3,male,2.0 +0,1,male,71.0 +1,2,male,3.0 +1,1,female, +0,3,male,26.0 +0,3,male,26.0 +1,1,female,22.0 +0,1,male,64.0 +0,3,female, +1,3,female, +0,1,male,56.0 +0,3,female,14.0 +0,3,male, +0,2,male,36.0 +0,1,male,52.0 +0,3,male,51.0 +1,2,male,0.67 +1,2,female,50.0 +1,1,female,35.0 +0,3,male, +1,3,male,9.0 +1,2,female,24.0 +1,2,female,23.0 +0,3,male,40.0 +0,1,male,24.0 +0,3,male,19.0 +0,1,male,47.0 +0,2,male,28.0 +1,2,female,21.0 +0,3,male, +0,1,male,70.0 +0,3,male,21.0 +1,1,female,54.0 +0,3,male,38.0 +1,2,female,2.0 +1,3,female,13.0 +1,1,male,0.92 +0,1,male,50.0 +1,1,female,30.0 +0,2,male,52.0 +0,2,female,38.0 +1,3,female, +0,1,male,62.0 +0,3,male,45.0 +1,3,female, +0,3,male,30.0 +1,3,female,29.0 +0,1,male,29.0 +1,1,male,36.0 +1,2,female,6.0 +0,3,male, +0,3,male,18.0 +1,1,male,26.0 +0,2,male,54.0 +0,3,male,35.0 +0,3,male, +1,1,female,51.0 +0,3,male,34.5 +1,3,male,25.0 +1,1,male,25.0 +1,3,male,27.0 +1,2,female,27.0 +0,3,male, +1,3,female,24.0 +0,3,male, +0,2,male,30.0 +1,2,female,34.0 +0,3,male, +0,3,female,22.0 +0,3,female, +0,3,male,22.0 +1,1,female,26.0 +0,3,male, +1,1,male,80.0 +0,2,male,30.0 +1,3,female, +0,3,female, +1,1,female,42.0 +0,2,male,34.0 +0,2,male,23.0 +0,3,male, +1,1,female,16.0 +1,3,female,36.0 +1,2,female, +1,1,female,19.0 +1,3,female,14.0 +1,2,male,2.0 +1,3,female,26.0 +0,3,male, +1,3,female,27.0 +0,2,male,31.0 +1,3,female,17.0 +1,3,female,16.0 +1,2,female,22.0 +0,3,male, +0,3,male,30.0 +0,3,male, +0,1,male,45.0 +0,3,female, +0,3,female,14.5 +1,1,male,38.0 +0,1,male,50.0 +0,3,male,26.0 +1,3,female, +0,3,female,41.0 +0,3,female,20.0 +0,1,male,71.0 +1,1,male,35.0 +0,3,male, +1,3,male,32.0 +1,2,female,32.5 +1,3,female,21.0 +0,3,male,36.0 +1,2,male, +1,1,female,21.0 +1,2,female,34.0 +1,2,female,30.0 +0,3,male,32.0 +0,3,male,30.0 +1,3,male,29.0 diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/inference/Dockerfile b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/inference/Dockerfile new file mode 100644 
index 0000000..b9524cc --- /dev/null +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/inference/Dockerfile @@ -0,0 +1,59 @@ +FROM ubuntu:latest +# Set a docker label to advertise multi-model support on the container +LABEL com.amazonaws.sagemaker.capabilities.multi-models=false +# Set a docker label to enable container to use SAGEMAKER_BIND_TO_PORT environment variable if present +LABEL com.amazonaws.sagemaker.capabilities.accept-bind-to-port=true + +# Install some handful libraries like curl, wget, git, build-essential, zlib +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + gcc \ + python3.7 \ + python3-dev \ + python3-pip \ + ca-certificates \ + git \ + curl \ + openjdk-8-jre-headless\ + wget &&\ + rm -rf /var/lib/apt/lists/* + +# install the SageMaker Inference Toolkit +RUN pip3 install --no-cache \ + multi-model-server \ + sagemaker-inference \ + retrying + +# Change working directory +WORKDIR / + +# Install requirements +COPY requirements.txt /opt/ml/code/src/requirements.txt +RUN pip3 install --no-cache -r /opt/ml/code/src/requirements.txt + +# set some environment variables +ENV PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 \ + PYTHONIOENCODING=UTF-8 \ + LANG=C.UTF-8 \ + LC_ALL=C.UTF-8 + +# copy folders for code +COPY src/config/ /opt/ml/code/config/ +COPY src/ml/ /opt/ml/code/ml/ +COPY src/util.py /opt/ml/code/util.py + +# Copy entrypoint script to the image and make it executable +COPY inference/main.py /opt/ml/code/main.py +COPY inference/handler.py /opt/ml/code/serving/handler.py + +# install sagemaker training +RUN pip3 install --no-cache --upgrade \ + boto3 \ + sagemaker + +# Setting PYTHONPATH to access the copied code +ENV PYTHONPATH="/opt/ml/code:${PATH}" + +# Add a Python script and configure Docker to run it +ENTRYPOINT ["python3", "/opt/ml/code/main.py"] diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/inference/handler.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/inference/handler.py new file mode 100644 index 0000000..b0a0ce5 --- /dev/null +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/inference/handler.py @@ -0,0 +1,122 @@ +import sys +sys.path.append("..") + +import os +import logging +from joblib import load +from six import StringIO +import pandas as pd + +from ml.model.wrapper import Wrapper +from sagemaker_inference.default_inference_handler import DefaultInferenceHandler +from sagemaker_inference.default_handler_service import DefaultHandlerService +from sagemaker_inference import content_types, errors, transformer, encoder, decoder + +logging.getLogger().setLevel('INFO') + +# Path to access the model +MODEL_DIR = '/opt/ml/model' + + +def _csv_to_pandas(string_like): + """ + Convert a CSV object to a pandas DataFrame. + + Parameters + ---------- + string_like : String + CSV string. 
+ + Returns + ------- + pd.DataFrame : pandas DataFrame + """ + stream = StringIO(string_like) + res = pd.read_csv(stream) + return res + + +class HandlerService(DefaultHandlerService, DefaultInferenceHandler): + """ + Execute the inference step in the virtual environment + + """ + def __init__(self): + op = transformer.Transformer(default_inference_handler=self) + super(HandlerService, self).__init__(transformer=op) + + def default_model_fn(self, model_dir): + """ + Loads the model from the disk + + Parameters + ---------- + model_dir : string + Path of the model + + Returns + ------- + pkl : model + """ + logging.info('Loading the model') + return load(os.path.join(MODEL_DIR, "model.pkl")) + + def default_input_fn(self, input_data, content_type): + """ + Parse and check the format of the input data + + Parameters + ---------- + input_data : string + CSV string + content_type : string + Type of the file + + Returns + ------- + pd.DataFrame : pandas DataFrame + """ + global colunas + if content_type != "text/csv": + raise Exception("Invalid content-type: %s" % content_type) + return _csv_to_pandas(input_data) + + def default_predict_fn(self, df, model): + """ + Run our model and do the prediction + + Parameters + ---------- + df : pd.DataFrame + Data to be predicted + model : pkl + Model to predict the data + + Returns + ------- + pd.DataFrame : pandas DataFrame + """ + logging.info('Predicting...') + resultados = model.predict(df, included_input=True) + logging.info('Prediction Complete') + return resultados.reset_index(drop=True).T.reset_index().T + + def default_output_fn(self, prediction, accept): + """ + Gets the prediction output and format it to be returned to the user + + Parameters + ---------- + prediction : pd.DataFrame + Predicted dataset + accept : string + Output type + + Returns + ------- + CSV : CSV file + """ + logging.info('Saving') + if accept != "text/csv": + raise Exception("Invalid accept: %s" % accept) + return encoder.encode(prediction, accept) diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/inference/main.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/inference/main.py new file mode 100644 index 0000000..803a9e9 --- /dev/null +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/inference/main.py @@ -0,0 +1,10 @@ +import sys +import os +import argparse +import logging +from sagemaker_inference import model_server + +logging.getLogger().setLevel(logging.INFO) + +if __name__ == "__main__": + model_server.start_model_server(handler_service="serving.handler") diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/processor/Dockerfile b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/processor/Dockerfile new file mode 100644 index 0000000..38fa906 --- /dev/null +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/processor/Dockerfile @@ -0,0 +1,60 @@ +FROM ubuntu:latest +# Set a docker label to advertise multi-model support on the container +LABEL com.amazonaws.sagemaker.capabilities.multi-models=false +# Set a docker label to enable container to use SAGEMAKER_BIND_TO_PORT environment variable if present +LABEL com.amazonaws.sagemaker.capabilities.accept-bind-to-port=true + +# No question/dialog is asked during apt-get install +ARG DEBIAN_FRONTEND=noninteractive + +# Setting the Timezone Environment Variable +ENV TZ=America/Sao_Paulo + +# install ubuntu libraries +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + gcc \ + python3.7 \ + python3-dev \ + python3-pip \ + ca-certificates 
\ + git \ + curl \ + nginx \ + openjdk-8-jre-headless\ + wget &&\ + rm -rf /var/lib/apt/lists/* + +# Create folders for code +RUN mkdir /opt/ml && \ + mkdir /opt/ml/processing && \ + mkdir /opt/ml/processing/input && \ + mkdir /opt/ml/processing/input/raw_data && \ + mkdir /opt/ml/processing/input/preprocessing && \ + mkdir /opt/ml/processing/input/expectations && \ + mkdir /opt/ml/processing/output && \ + mkdir /opt/ml/processing/output/processed && \ + mkdir /opt/ml/processing/output/processed/train && \ + mkdir /opt/ml/processing/output/processed/val && \ + mkdir /opt/ml/processing/output/processed/inference && \ + mkdir /opt/ml/processing/output/expectations && \ + mkdir /opt/ml/processing/output/validations + +# Install requirements +COPY requirements.txt /opt/ml/code/src/requirements.txt +RUN pip3 install --no-cache -r /opt/ml/code/src/requirements.txt + +# Copy entrypoint script to the image and make it executable +COPY src/config/ /opt/ml/code/src/config/ +COPY src/ml/ /opt/ml/processing/ml/ +COPY src/util.py /opt/ml/processing/util.py +COPY processor/preprocessor.py /opt/ml/processing/preprocessor.py + +# Change working directory +WORKDIR /opt/ml/processing + +# Setting PYTHONPATH to access the copied code +ENV PYTHONPATH="/opt/ml/processing:${PATH}" + +# Add a Python script and configure Docker to run it +ENTRYPOINT ["python3", "preprocessor.py"] diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/processor/preprocessor.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/processor/preprocessor.py new file mode 100644 index 0000000..bb269eb --- /dev/null +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/processor/preprocessor.py @@ -0,0 +1,101 @@ +import argparse +import logging +from datetime import date + +import pandas as pd +import glob +import json +from joblib import dump, load +import great_expectations as ge + +from ml.preprocessing.preprocessing import Preprocessing +from ml.preprocessing.dataquality import DataQuality +from ml.data_source.spreadsheet import Spreadsheet + +logging.getLogger().setLevel('INFO') + +path_input = '/opt/ml/processing/input/' +path_output = '/opt/ml/processing/output/' +date = date.today().strftime('%Y%m%d') + +def data_quality(df, step_train): + """ + If True, it creates the DataQuality object, + otherwise it loads an existing one + + Parameters + ---------- + df : pd.Dataframe + Train or test dataset + step_train : boolean + Train or test + + """ + if step_train: + dq = DataQuality(discrete_cat_cols=['Sex', 'Pclass', 'Survived']) + df_ge = dq.perform(df) + df_ge.save_expectation_suite(path_output + + 'expectations/expectations.json') + else: + df_ge = ge.dataset.PandasDataset(df) + ge_val = df_ge.validate(expectation_suite=path_input + + 'expectations/expectations.json', + only_return_failures=False) + with open(f'{path_output}validations/{date}.json', 'w') as f: + json.dump(ge_val.to_json_dict(), f) + + +def preprocessing(df, step_train): + """ + If True, it creates the Preprocessing object, + otherwise it loads an existing one + + Parameters + ---------- + df : pd.Dataframe + Train or test dataset + step_train : boolean + Train or test + + """ + if step_train: + norm_cols = {'min-max': ['Age']} + oneHot_cols = ['Pclass', 'Sex'] + p = Preprocessing(norm_cols, oneHot_cols) + train, test_train = p.execute(df, step_train=True, val_size=0.2) + logging.info("Saving") + dump(p, path_output+'preprocessing/preprocessing.pkl') + train.to_csv(path_output+'processed/train/train.csv', index=False) + 
test_train.to_csv(path_output+'processed/val/val.csv', index=False) + else: + p = load(path_input+'preprocessing/preprocessing.pkl') + test = p.execute(df, step_train=False) + logging.info("Saving") + test.to_csv(path_output+'processed/inference/inference.csv', + index=False) + + +if __name__ == '__main__': + """ + Execute the processor step in the virtual environment + + """ + logging.info('Starting the preprocessing') + + # Read the step argument (train or test) + parser = argparse.ArgumentParser() + parser.add_argument('--step', type=str, default='train') + args = parser.parse_args() + step_train = True if args.step == "train" else False + logging.info(f'step_train: {step_train}') + + logging.info('Reading the inputs') + file = glob.glob(path_input+"raw_data/*.csv")[0] + logging.info(f'Reading file: {file}') + df = Spreadsheet().get_data(file) + + logging.info("Data Quality") + data_quality(df, step_train) + + logging.info("Preprocessing") + preprocessing(df, step_train) diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/requirements.txt b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/requirements.txt new file mode 100644 index 0000000..3b158af --- /dev/null +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/requirements.txt @@ -0,0 +1,32 @@ +category-encoders +coverage +datetime +Flask +gunicorn +hermione-ml +matplotlib +mlflow +mlxtend +numpy +pandas +plotly +pytest +seaborn +scikit-learn +scipy +statsmodels +tqdm +yellowbrick +vega_datasets +altair +pandas_profiling +streamlit_pandas_profiling +interpret-community +lime +lightgbm +great_expectations +stepfunctions +sagemaker-inference +ipykernel +boto3 +sagemaker diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/config/config.json b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/config/config.json new file mode 100644 index 0000000..c34a7bc --- /dev/null +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/config/config.json @@ -0,0 +1,7 @@ +{ +"project_name": "hermione-sagemaker", + "env_path": "hermione-sagemaker/hermione-sagemaker_env", + "files_path": "../data/raw/", + "key": "<<<>>>", + "user": "<<<>>>" + } \ No newline at end of file diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/analysis/cluster.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/analysis/cluster.py new file mode 100644 index 0000000..5e5f7a6 --- /dev/null +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/analysis/cluster.py @@ -0,0 +1,166 @@ +from sklearn.mixture import GaussianMixture +from sklearn.cluster import KMeans +from sklearn import metrics +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt + +class Cluster: + + @classmethod + def analyzeK(cls, X, k_min = 2, k_max = 20): + """ + Plot the result of the methods (elbow, silhouette and calinski_harabas) to find the best k + + Parameters + ---------- + X : array + values ​​that will be used to find the best k + k_min : int + minimum interval for K + k_max : int + maximum range for K + + Returns + ------- + None + """ + + if X is None: + raise Exception("Error: X is None.") + if k_min is None or k_max is None: + raise Exception("Error: Range is None.") + if k_min < 2: + raise Exception("Error: k_min < 2") + + wss = [] + s_gmm = [] + s_kmeans = [] + ch_gmm = [] + ch_kmeans = [] + + K = range(k_min, k_max) + + for k in K: + kmeans = KMeans(n_clusters=k) + kmeans.fit(X) + gmm = GaussianMixture(n_components=k, covariance_type='full') + gmm.fit(X) + + labels_kmeans = 
kmeans.predict(X) + labels_gmm = gmm.predict(X) + + s_kmeans.append(metrics.silhouette_score(X, labels_kmeans, metric='euclidean')) + s_gmm.append(metrics.silhouette_score(X, labels_gmm, metric='euclidean')) + + ch_kmeans.append(metrics.calinski_harabasz_score(X, labels_kmeans)) + ch_gmm.append(metrics.calinski_harabasz_score(X, labels_gmm)) + + wss.append(kmeans.inertia_) + + cls._elbow(K, wss) + cls._silhouette_coefficient(K, s_kmeans, s_gmm) + cls._calinski_harabaz(K, ch_kmeans, ch_gmm) + + @classmethod + def _elbow(cls, K, wss): + """ + Function plots the result of the elbow method + + Parameters + ---------- + k : array + possible k values + k_min : array + Total WSS measures cluster compression and we want it to be as small as possible + Returns + ------- + None + """ + plt.plot(K, wss, 'bx-') + plt.xlabel('k') + plt.ylabel('WSS') + plt.title('The Elbow Method showing the optimal k') + plt.show() + + @classmethod + def _silhouette_coefficient(cls, K, s_kmeans, s_gmm): + """ + Function plots the result of the silhouette method for kmeans and Gaussian Mixture Models + + Parameters + ---------- + k : array + k values + s_kmeans : array + Silhouette kmeans values + s_gmm : array + Silhouette Gaussian Mixture Models values + + Returns + ---- + None + """ + plt.plot(K, s_kmeans, 'xr-') # plotting t, a separately + plt.plot(K, s_gmm, 'ob-') + plt.legend(["kmeans", "gmm"]) + plt.xlabel('k') + plt.ylabel('Mean Silhouette Coefficient') + plt.title('Mean Silhouette Coefficient for each k') + plt.show() + + @classmethod + def _calinski_harabaz(cls, K, ch_kmeans, ch_gmm): + """ + Function plots the result of the calinski_harabaz method for kmeans and Gaussian Mixture Models + + Parameters + ---------- + k : array + possible k values + s_kmeans : array + calinski_harabaz kmeans values + s_gmm : array + Gaussian Mixture Models values + + Returns + ------- + None + """ + plt.plot(K, ch_kmeans, 'xr-') # plotting t, a separately + plt.plot(K, ch_gmm, 'ob-') + plt.legend(["kmeans", "gmm"]) + plt.xlabel('k') + plt.ylabel('Calinski and Harabaz score') + plt.title('Calinski and Harabaz score for each k') + plt.show() + + @classmethod + def plot_cluster(cls, df_res_algorithm, algorithm_name = "K-means"): + """ + Function that plots clusters + + Parameters + ---------- + df_res_algoritmo : pd.DataFrame + Dataframe must have the following columns (x, y, cluster) + algorithm_name : str + algorithm name + Return + ------- + None + """ + # verifica quantos clusters tem + qtde_cluster = df_res_algorithm.cluster.max()+1 + plots = [] + for cluster in range(qtde_cluster): + p = plt.scatter(df_res_algorithm[df_res_algorithm['cluster'] == cluster].x, + df_res_algorithm[df_res_algorithm['cluster'] == cluster].y) + plots.append(p) + plt.legend(tuple(plots), + (tuple(["Cluster {}".format(c) for c in range(1, qtde_cluster+1)])), + loc=2, fontsize=8, bbox_to_anchor=(1.05, 1)) + plt.xlabel("X") + plt.ylabel("Y") + plt.title("Clusters created by "+algorithm_name) + plt.show() \ No newline at end of file diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/analysis/feature_selection.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/analysis/feature_selection.py new file mode 100644 index 0000000..4b3a7bf --- /dev/null +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/analysis/feature_selection.py @@ -0,0 +1,387 @@ +from sklearn.feature_selection import VarianceThreshold +from sklearn.feature_selection import SelectKBest +from sklearn.feature_selection import SelectPercentile 
+from sklearn.feature_selection import RFE +from sklearn.feature_selection import SelectFromModel +from sklearn.feature_selection import SequentialFeatureSelector +from mlxtend.feature_selection import ExhaustiveFeatureSelector +from abc import ABC, abstractmethod +import numpy as np +import pandas as pd + +class SelectAlgorithm(ABC): + """ + Abstract class for feature selection algorithms + """ + def transform(self, df: pd.DataFrame): + """ + Select features based on fit + + Parameters + ---------- + df : pd.DataFrame + dataframe with features to be selected + + Returns + ------- + pd.DataFrame + dataframe with selected features only + """ + return df[df.columns[self.selected_columns]] + + def get_support(self): + """ + Get a mask, or integer index, of the features selected + + Parameters + ---------- + + Returns + ------- + np.array + """ + return self.selected_columns + + @abstractmethod + def fit(self) -> None: + """ + Abstract method that is implemented in classes that inherit it + """ + pass + +class SelectCoefficients(SelectAlgorithm): + """ + Class to select features based on model coefficients + """ + def __init__(self, model, num_feat = None): + """ + Constructor + + Parameters + ---------- + model : + should be an instance of a classification or regression model class from scikit-learn and have coef_.ravel method + + num_feats : int + number of features to be selected + Returns + ------- + SelectCoefficients + """ + self.model = model + self.num_feat = num_feat + + def fit(self, X: pd.DataFrame, y = None): + """ + Identify the features to be selected. + + Parameters + ---------- + X : pd.DataFrame + features to be selected + + y : pd.DataFrame + target values + + Returns + ------- + None + """ + self.num_feat = int(X.shape[1]/2) if self.num_feat == None else self.num_feat + trained_model = self.model.fit(X,y) + self.selected_columns = np.argsort(np.abs(trained_model.coef_.ravel()))[-self.num_feat:] + +class SelectCorrelation(SelectAlgorithm): + """ + Class to select features based on correlation between features + """ + def __init__(self, threshold = 1.0): + """ + Constructor + + Parameters + ---------- + threshold : float + correlation threshold + Returns + ------- + SelectCorrelation + """ + self.threshold = threshold + def fit(self, X: pd.DataFrame, y = None): + """ + Identify the features to be selected. 
+ + Parameters + ---------- + X : pd.DataFrame + features to be selected + + y : pd.DataFrame + target values + + Returns + ------- + None + """ + corr = X.corr() + self.selected_columns = np.full((corr.shape[0],), True, dtype=bool) + [self.check_correlation(corr.iloc[i,j],j) for i in range(corr.shape[0]) for j in range(i+1, corr.shape[0])] + + def check_correlation(self,corr,j): + """ + Auxiliar method to check if correlation between features is above threshold + Parameters + ---------- + corr : float + correlation between two atributes + + j : int + index of column to be removed in case corr >= self.threshold + + Returns + ------- + None + """ + if np.abs(corr) >= self.threshold and self.selected_columns[j]: + self.selected_columns[j] = False + +class MyExhaustiveFeatureSelector(ExhaustiveFeatureSelector): + """ + Class that inherits from ExhaustiveFeatureSelector (from mlxtend) and implements get_support method for + compatibility issues + """ + def get_support(self): + return list(self.best_idx_) + +class SelectEnsemble(SelectAlgorithm): + """ + Class to select features based on ensemble of methods + """ + def __init__(self, dic_selection: dict, num_feat = None): + """ + Constructor + + Parameters + ---------- + dic_selection : dict + dict with name of the algorithm as keys and dicts of parameters as values + Ex: dic_selection = { 'variance': {'threshold' : 0.3}, + 'recursive': {'estimator' : LinearSVC(), 'n_features_to_select' : 2}} + num_feats : int + number of features to be selected + Returns + ------- + SelectCoefficients + """ + self.dic_selection = dic_selection + self.num_feat = num_feat + + def fit(self, X: pd.DataFrame, y = None): + """ + Identify the features to be selected. + + Parameters + ---------- + X : pd.DataFrame + features to be selected + + y : pd.DataFrame + target values + + Returns + ------- + None + """ + self.num_feat = int(X.shape[1]/2) if self.num_feat == None else self.num_feat + self.column_dic = {} + for i,column in enumerate(X.columns): + self.column_dic[column] = i + self.column_count = [0 for column in X.columns] + selections = [FeatureSelector(selector,**self.dic_selection[selector]) for selector in self.dic_selection] + [selection.fit(X,y) for selection in selections] + [self.increment_count(column) for selection in selections for column in selection.selected_columns] + self.selected_columns = np.argsort(self.column_count)[-self.num_feat:] + + def increment_count(self,column): + """ + Auxiliar method to increment the count of a column + Parameters + ---------- + column : int + column which the count will be incremented + + Returns + ------- + None + """ + self.column_count[self.column_dic[column]]+=1 + +class FeatureSelector: + + def __init__(self, selector, **kwargs): + """ + Constructor + + Parameters + ---------- + selector : str + name of algorithm to be applied + **kwargs : + optional and positional arguments of the choosen algorithm (selector) + Returns + ------- + FeatureSelector + Examples + --------- + variance thresholding: f = FeatureSelector('variance', threshold=0.3) #Instantiating + f.fit(X[,y]) #fitting (y is optional for variance thresholding) + X = f.transform(X) #transforming + + filter-based, k best (MAD): f = FeatureSelector('univariate_kbest', score_func=FeatureSelector.mean_abs_diff, k=2) #Instantiating + #score_func can be any function f: R^n -> R^n (n = number of columns) + f.fit(X,y) #fitting + X = f.transform(X) #transforming + + wrapper, recursive: f = FeatureSelector('recursive', estimator = LinearSVC(), 
n_features_to_select=2) #Instantiating + #estimator should be an instance of a classification or regression model class from scikit-learn + #one can use a custom class but it must be compatible with scikit-learn arquitecture + f.fit(X,y) #fitting + X = f.transform(X) #transforming + + wrapper, sequential: f = FeatureSelector('sequential', estimator = LinearSVC(), direction='forward') #Instantiating + #estimator should be an instance of a classification or regression model class from scikit-learn + #one can use a custom class but it must be compatible with scikit-learn arquitecture + f.fit(X,y) #fitting + X = f.transform(X) #transforming + + to better understand the optional arguments of each algorithm see: https://scikit-learn.org/stable/modules/feature_selection.html + """ + self.selector = selector + self.selectors = {'variance': VarianceThreshold, + 'univariate_kbest': SelectKBest, + 'univariate_percentile': SelectPercentile, + 'recursive': RFE, + 'model':SelectFromModel, + 'sequential':SequentialFeatureSelector, + 'exaustive':MyExhaustiveFeatureSelector, + 'correlation':SelectCorrelation, + 'coefficients':SelectCoefficients, + 'ensemble':SelectEnsemble} + self.kwargs = kwargs + self.fitted = False + + def fit(self, X: pd.DataFrame, y = None): + """ + Identify the features to be selected. + + Parameters + ---------- + X : pd.DataFrame + features to be selected + + y : pd.DataFrame + target values + + Returns + ------- + None + """ + self.columns = X.columns + self.selection = self.selectors[self.selector](**self.kwargs) + self.selection.fit(X,y) + self.selected_columns = self.columns[self.selection.get_support()] + self.fitted = True + + def transform(self, df: pd.DataFrame): + """ + Select features based on fit + + Parameters + ---------- + pd.DataFrame + dataframe with features to be selected + + Returns + ------- + df : pd.DataFrame + dataframe with selected features only + """ + if not self.fitted: + raise Exception("Not yet trained.") + + + #return self.selection.transform(df) + return df[self.selected_columns] + + def inverse_transform(self, df: pd.DataFrame): + """ + Apply the invese_transform of vectorizer to each column + Options: index, bag_of_words and tf_idf + + Parameters + ---------- + df : pd.DataFrame + dataframe with columns to be unvectorizer + + Returns + ------- + pd.DataFrame + """ + pass + + #return df + + @staticmethod + def mean_abs_diff(X, y=None): + """ + method to compute the mean absolute difference (MAD) of all atributes of X + + Parameters + ---------- + X : pd.DataFrame + dataframe + y: any type + not necessary, used only for compatibility issues + + Returns + ------- + pd.DataFrame + """ + return np.sum(np.abs(X - np.mean(X, axis = 0)), axis = 0)/X.shape[0] + + @staticmethod + def variance(X, y=None): + """ + method to compute the mean variance of all atributes of X + + Parameters + ---------- + X : pd.DataFrame + dataframe + y: any type + not necessary, used only for compatibility issues + + Returns + ------- + pd.DataFrame + """ + return np.sum((X - np.mean(X, axis = 0)**2), axis = 0)/X.shape[0] + + @staticmethod + def disp_ratio(X, y=None): + """ + method to compute the dispersion ratio of all atributes od X + + Parameters + ---------- + X : pd.DataFrame + dataframe + y: any type + not necessary, used only for compatibility issues + + Returns + ------- + pd.DataFrame + """ + return np.mean(X, axis = 0)/np.power(np.prod(X, axis = 0),1/X.shape[0]) diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/analysis/pca.py 
diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/analysis/pca.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/analysis/pca.py
new file mode 100644
index 0000000..2596a64
--- /dev/null
+++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/analysis/pca.py
@@ -0,0 +1,149 @@
+import pandas as pd
+from sklearn.decomposition import PCA as PCA_sklearn
+from sklearn import metrics
+
+class PCA:
+
+    def __init__(self, columns, prefix="prefix", k=2):
+        """
+        Constructor
+
+        Parameters
+        ----------
+        columns : list
+            Columns for dimensionality reduction
+        prefix : str
+            column prefix
+        k : int
+            Number of dimensions
+
+        Returns
+        -------
+        PCA
+        """
+        self.columns = columns
+        self.prefix = prefix
+        self.k = k
+        self.pca = None
+
+
+    def __find_k(self, df, threshold):
+        """
+        Find how many k dimensions will be kept in the reduction
+
+        Parameters
+        ----------
+        df : pd.Dataframe
+            dataframe to be reduced
+        threshold : float
+            cut-off on the explained variance of two consecutive components
+
+        Returns
+        -------
+        int
+        """
+        self.pca = PCA_sklearn(n_components=len(self.columns))
+        self.pca.fit(df[ self.columns ].values)
+        for i in range(len(self.columns)-1):
+            if self.pca.explained_variance_ratio_[i]+self.pca.explained_variance_ratio_[i+1] < threshold:
+                if i == 0:
+                    raise Exception("Not reduced: the explained variance of the components is too low")
+                return i+1
+
+    def __check(self, df: pd.DataFrame):
+        """
+        Check whether the dataframe contains all the configured columns
+
+        Parameters
+        ----------
+        df : pd.Dataframe
+            dataframe to be reduced
+
+        Returns
+        -------
+        bool
+        """
+        if not all(col in list(df.columns) for col in self.columns):
+            raise Exception('Missing columns')
+        return True
+
+
+    def transform(self, df: pd.DataFrame):
+        """
+        Transform the data
+
+        Parameters
+        ----------
+        df : pd.Dataframe
+            dataframe to be reduced
+
+        Returns
+        -------
+        None
+        """
+        self.__check(df)
+        if self.pca is None:
+            raise Exception("Error - object not fitted")
+        reduced = self.pca.transform(df[self.columns].values)
+        for col in range(self.k):
+            df[self.prefix+"_"+str(col)] = [line[col] for line in reduced]
+        df.drop(self.columns, axis=1, inplace=True)
+
+
+    def fit(self, df : pd.DataFrame, threshold=0.4):
+        """
+        Compute the PCA object
+
+        Parameters
+        ----------
+        df : pd.Dataframe
+            dataframe to be reduced
+        threshold : float
+            explained-variance cut-off, used only when k is None
+
+        Returns
+        -------
+        None
+        """
+        self.__check(df)
+        if self.k is None:
+            self.k = self.__find_k(df,threshold)
+        self.pca = PCA_sklearn(n_components=self.k)
+        self.pca.fit(df[ self.columns ].values)
+
+
+    def fit_transform (self, df : pd.DataFrame, threshold=0.4):
+        """
+        Fit to data, then transform it.
+
+        Parameters
+        ----------
+        df : pd.Dataframe
+            dataframe to be reduced
+        threshold : float
+            explained-variance cut-off, used only when k is None
+
+        Returns
+        -------
+        None
+        """
+        self.__check(df)
+        if self.k is None:
+            self.k = self.__find_k(df,threshold)
+        self.pca = PCA_sklearn(n_components=self.k)
+        self.pca.fit(df[ self.columns ].values)
+        self.transform(df)
+        self.report()
+
+
+
+
+    def report(self):
+        """
+        Prints the explained variance of each component
+
+        Parameters
+        ----------
+        None
+
+        Returns
+        -------
+        None
+        """
+        for col in range(self.k):
+            print("Explained variance ({col}): {ratio}".
+                  format(col = self.prefix+"_"+str(col),
+                         ratio = str(self.pca.explained_variance_ratio_[col])))
\ No newline at end of file
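Editor's note: a short, hedged sketch of the PCA wrapper above. The random frame, column names, and 'pca' prefix are illustrative; note that transform mutates the dataframe in place:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.rand(100, 3), columns=['f0', 'f1', 'f2'])

pca = PCA(columns=['f0', 'f1', 'f2'], prefix='pca', k=2)
pca.fit(df)
pca.transform(df)  # in place: drops f0..f2, adds pca_0 and pca_1
pca.report()       # prints the explained variance of each component
```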
diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/analysis/vif.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/analysis/vif.py
new file mode 100644
index 0000000..79535f8
--- /dev/null
+++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/analysis/vif.py
@@ -0,0 +1,48 @@
+import pandas as pd
+from statsmodels.stats.outliers_influence import variance_inflation_factor
+
+class VIF:
+
+    @classmethod
+    def analyze(cls, df: pd.DataFrame, thresh=5.0, verbose=True):
+        """
+        Multicollinearity analysis
+
+        Parameters
+        ----------
+        df : pd.DataFrame
+            Dataframe with the numeric explanatory variables
+        thresh : float
+            VIF cut-off value above which a variable is dropped
+        verbose : bool
+            if True, prints the variables that could be removed
+
+
+        Returns
+        -------
+        pd.DataFrame
+        """
+        variables = list(range(df.shape[1]))
+        dropped = True
+        while dropped:
+            dropped = False
+            vif = [variance_inflation_factor(df.iloc[:, variables].values, ix)
+                   for ix in range(df.iloc[:, variables].shape[1])]
+
+            maxloc = vif.index(max(vif))
+            if max(vif) > thresh:
+                m = max(vif)
+                index_max = [i for i, j in enumerate(vif) if j == m]
+                if verbose:
+                    cols_possibles_remove = [str(df.iloc[:, variables].columns[i]) for i in index_max]
+                    print("Columns that can be removed -> " + ", ".join(cols_possibles_remove))
+                    print("------")
+                    print('dropping \'' + str(df.iloc[:, variables].columns[maxloc]) +
+                          '\' at index: ' + str(maxloc))
+                    print("_____________________________________________________________")
+                del variables[maxloc]
+                dropped = True
+
+        print('Remaining variables:')
+        print(df.columns[variables])
+        return df.iloc[:, variables]
\ No newline at end of file
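Editor's note: a hedged example of the VIF analysis above. The toy data are illustrative, with `x2` built to be nearly collinear with `x0` so one of the pair is dropped:

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
x0 = rng.normal(size=100)
x1 = rng.normal(size=100)
df = pd.DataFrame({'x0': x0, 'x1': x1, 'x2': x0 + 0.01 * rng.normal(size=100)})

reduced = VIF.analyze(df, thresh=5.0)  # iteratively drops the highest-VIF column
```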
diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/data_source/base.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/data_source/base.py
new file mode 100644
index 0000000..82d4d10
--- /dev/null
+++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/data_source/base.py
@@ -0,0 +1,12 @@
+from abc import ABC, abstractmethod
+import pandas as pd
+
+class DataSource(ABC):
+
+    @abstractmethod
+    def get_data(self) -> pd.DataFrame:
+        """
+        Abstract method that is implemented in the classes that inherit from it
+        """
+        pass
+
diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/data_source/database.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/data_source/database.py
new file mode 100644
index 0000000..a5554d8
--- /dev/null
+++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/data_source/database.py
@@ -0,0 +1,70 @@
+import pandas as pd
+
+from ml.data_source.base import DataSource
+
+class DataBase(DataSource):
+
+    def __init__(self):
+        """
+        Constructor
+
+        Parameters
+        ----------
+        None
+
+        Returns
+        -------
+        DataBase
+        """
+        pass
+
+    def get_data(self)->pd.DataFrame:
+        """
+        Returns a flat table as a DataFrame
+
+        Parameters
+        ----------
+        None
+
+        Returns
+        -------
+        pd.DataFrame
+            Dataframe with data
+        """
+        pass
+
+    def open_connection(self, connection):
+        """
+        Opens the connection to the database
+
+        Parameters
+        ----------
+        connection : str
+            connection string for the database
+
+        Returns
+        -------
+        bool
+            True if the connection is open
+
+        """
+        pass
+
+    def close_connection(self, connection ):
+        """
+        Closes the connection to the database
+
+        Parameters
+        ----------
+        connection : str
+            connection string for the database
+
+        Returns
+        -------
+        bool
+            True if the connection was closed
+
+        """
+        pass
diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/data_source/spreadsheet.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/data_source/spreadsheet.py
new file mode 100644
index 0000000..7f48cff
--- /dev/null
+++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/data_source/spreadsheet.py
@@ -0,0 +1,24 @@
+import pandas as pd
+
+from ml.data_source.base import DataSource
+
+class Spreadsheet(DataSource):
+    """
+    Class to read files from spreadsheets or raw text files
+    """
+
+    def get_data(self, path)->pd.DataFrame:
+        """
+        Returns a flat table as a DataFrame
+
+        Parameters
+        ----------
+        path : str
+            path to the file to be read
+
+        Returns
+        -------
+        pd.DataFrame
+            Dataframe with data
+        """
+        return pd.read_csv(path)[['Survived', 'Pclass', 'Sex', 'Age']]
\ No newline at end of file
diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/model/metrics.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/model/metrics.py
new file mode 100644
index 0000000..f9ed342
--- /dev/null
+++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/model/metrics.py
@@ -0,0 +1,222 @@
+from sklearn.metrics import *
+import numpy as np
+from sklearn.metrics import make_scorer
+from sklearn.model_selection import cross_validate
+
+
+class Metrics:
+
+    @classmethod
+    def smape(cls, A, F):
+        """
+        Calculates the SMAPE value between the actual and the predicted values
+
+        Parameters
+        ----------
+        A : array
+            Target values
+        F : array
+            Predicted values
+
+        Returns
+        -------
+        float: SMAPE value
+        """
+        return 100/len(A) * np.sum(np.abs(F - A) / (np.abs(A) + np.abs(F)))
+
+    @classmethod
+    def __custom_score(cls, y_true, y_pred):
+        """
+        Creates a custom metric
+
+        Parameters
+        ----------
+        y_true : array
+            Target values
+        y_pred : array
+            Predicted values
+
+        Returns
+        -------
+        sklearn.metrics
+        """
+        #return sklearn.metrics.fbeta_score(y_true, y_pred, 2)
+        pass
+
+    @classmethod
+    def customized(cls, y_true, y_pred):
+        """
+        Creates a custom metric
+
+        Parameters
+        ----------
+        y_true : array
+            Target values
+        y_pred : array
+            Predicted values
+
+        Returns
+        -------
+        float
+        """
+        custom_metric = make_scorer(cls.__custom_score, greater_is_better=True)
+        return custom_metric
+
+    @classmethod
+    def mape(cls, y_true, y_pred):
+        """
+        Calculates the MAPE value between the actual and the predicted values
+
+        Parameters
+        ----------
+        y_true : array
+            Target values
+        y_pred : array
+            Predicted values
+
+        Returns
+        -------
+        float : value of MAPE
+        """
+        y_true, y_pred = np.array(y_true), np.array(y_pred)
+        return np.mean(np.abs(((y_true+1) - (y_pred+1)) / (y_true+1))) * 100
+
+    @classmethod
+    def regression(cls, y_true, y_pred):
+        """
+        Calculates some metrics for regression problems
+
+        
Parameters + ---------- + y_true : array + Target values + y_pred : array + Predicted values + + Returns + ------- + dict : metrics results + """ + results = {'mean_absolute_error': round(mean_absolute_error( + y_true, y_pred), 7), + 'root_mean_squared_error': round(np.sqrt( + mean_squared_error(y_true, y_pred)), 7), + 'r2': round(r2_score(y_true, y_pred), 7), + 'smape': round(cls.smape(y_true, y_pred), 7), + 'mape': round(cls.mape(y_true, y_pred), 7) + } + return results + + @classmethod + def crossvalidation(cls, model, X, y, classification: bool, + cv=5, agg=np.mean): + if classification: + if len(set(y)) > 2: + metrics = ['accuracy', 'f1_weighted', + 'recall_weighted', 'precision_weighted'] + else: + metrics = ['accuracy', 'f1', 'recall', 'precision', 'roc_auc'] + else: + metrics = ['mean_absolute_error', 'r2', 'root_mean_squared_error', + 'smape', 'mape'] + res_metrics = cross_validate(model, X, y, cv=cv, + return_train_score=False, + scoring=metrics) + results = {metric.replace("test_", ""): round(agg( + res_metrics[metric]), 7) + for metric in res_metrics} + return results + + @classmethod + def __multiclass_classification(cls, y_true, y_pred): + """ + Calculates some metrics for multiclass classification problems + + Parameters + ---------- + y_true : array + Target values + y_pred : array + Predicted values + + Returns + ------- + dict : metrics results + """ + results = {'accuracy': accuracy_score(y_true, y_pred), + 'f1': f1_score(y_true, y_pred, average='weighted'), + 'precision': precision_score(y_true, y_pred, + average='weighted'), + 'recall': recall_score(y_true, y_pred, + average='weighted')} + return results + + @classmethod + def __binary_classification(cls, y_true, y_pred, y_probs): + """ + Calculates some metrics for binary classification problems + + Parameters + ---------- + y_true : array + Target values + y_pred : array + Predicted values + + Returns + ------- + dict : metrics results + """ + results = {'accuracy': accuracy_score(y_true, y_pred), + 'f1': f1_score(y_true, y_pred), + 'precision': precision_score(y_true, y_pred), + 'recall': recall_score(y_true, y_pred), + 'roc_auc': roc_auc_score(y_true, y_probs)} + return results + + @classmethod + def classification(cls, y_true, y_pred, y_probs): + """ + Checks which classification method will be applied: + binary or multiclass + + Parameters + ---------- + y_true : array + Target values + y_pred : array + Predicted values + y_probs : array + Probabilities values + + Returns + ------- + dict: metrics results + """ + if len(set(y_true)) > 2: + results = cls.__multiclass_classification(y_true, y_pred) + else: + results = cls.__binary_classification(y_true, y_pred, y_probs) + return results + + @classmethod + def clusterization(cls, X, labels): + """ + Calculates some metrics on clustering quality + + Parameters + ---------- + X : array[array], shape (n_linha, n_colunas) + Matrix with the values that were used in the cluster + labels : array, shape (n_linha, 1) + Vector with labels selected by the clustering method + (eg KMeans) + + Returns + ------- + dict : metrics results + """ + results = {'silhouette': silhouette_score(X, labels, + metric='euclidean'), + 'calinski_harabaz': calinski_harabaz_score(X, labels)} + return results diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/model/trainer.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/model/trainer.py new file mode 100644 index 0000000..e73706c --- /dev/null +++ 
b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/model/trainer.py @@ -0,0 +1,253 @@ +from joblib import dump, load +from datetime import date +import mlflow.pyfunc +from mlflow import pyfunc +from interpret.ext.blackbox import TabularExplainer, MimicExplainer +from interpret.ext.glassbox import * +import pandas as pd + +from util import load_yaml, load_json + + +class Wrapper(mlflow.pyfunc.PythonModel): + def __init__(self, model=None, metrics=None, columns=None): + """ + Constructor + + Parameters + ---------- + model : object + If it's just a model: enter all parameters + if it is more than one model: do not enter parameters + and use the add method to add each of the models + metrics : dict + Dictionary with the metrics of the result + of the model + columns : list + list with columns names + Returns + ------- + WrapperModel + """ + self.artifacts = dict() + self.artifacts["model"] = model + self.artifacts["metrics"] = metrics + self.artifacts["columns"] = columns + self.artifacts["creation_date"] = date.today() + + def predict(self, model_input, included_input=False): + """ + Method that returns the result of the prediction on a dataset + + Parameters + ---------- + df : pd.DataFrame + Data to be predicted + + Returns + ------- + list + """ + df_processed = model_input.copy() + model = self.artifacts["model"] + columns = self.artifacts["columns"] + result = model.predict(df_processed[columns]) + if included_input: + model_input['predict'] = result + result = model_input + return result + + def predict_proba(self, model_input, binary=False): + """ + Method that returns the result of the prediction on a dataset + + Parameters + ---------- + df : pd.DataFrame + data to be predicted + + Returns + ------- + list + """ + df_processed = model_input.copy() + model = self.artifacts["model"] + columns = self.artifacts["columns"] + if binary: + return model.predict_proba(df_processed[columns])[:, 1] + else: + return model.predict_proba(df_processed[columns]) + + def save_model(self, path): + """ + Saves the model object to a specific path + + Parameters + ---------- + path : str + path where the model object will be saved + + Returns + ------- + None + """ + dump(self, path) + + @staticmethod + def load_model(path): + """ + Loads the model object in a specific path + + Parameters + ---------- + path : str + path where the model object will be loaded. + + Returns + ------- + None + """ + model = load(path) + return model + + def save(self, path): + """ + Save model as a Wrapper class + + Parameters + ---------- + path : str + path where the model object will be loaded. 
+ + Returns + ------- + None + """ + path_artifacts = path + "_artifacts.pkl" + dump(self.artifacts, path_artifacts) + content = load_json("config/arquivos.json") + conda_env = load_yaml(content["path_yaml"]) + mlflow.pyfunc.save_model( + path=path, + python_model=self, + artifacts={"model": path_artifacts}, + conda_env=conda_env, + ) + + def get_metrics(self): + """ + Return metrics + + Parameters + ---------- + self : object Wrapper + + Returns + ------- + dict + """ + return self.artifacts["metrics"] + + def get_columns(self): + """ + Return columns + + Parameters + ---------- + self : object Wrapper + + Returns + ------- + list + """ + return self.artifacts["columns"] + + def get_model(self): + """ + Return model + + Parameters + ---------- + self : object Wrapper + + Returns + ------- + dict + """ + return self.artifacts["model"] + + def train_interpret(self, X, model="tabular"): + """ + Train a interpret model + + Parameters + ---------- + self : object Wrapper + X : pd.DataFrame + Data that were used in the train for interpret + model : string, optional + Model to use for the interpret [tabular,mimic_LGBME, + mimic_Linear,mimic_SGDE,mimic_Dec_Tree] + Returns + ------- + None + """ + mimic_models = { + "mimic_LGBME": LGBMExplainableModel, + "mimic_Linear": LinearExplainableModel, + "mimic_SGDE": SGDExplainableModel, + "mimic_Dec_Tree": DecisionTreeExplainableModel, + } + if model == "tabular": + explainer = TabularExplainer( + self.artifacts["model"], X, features=self.artifacts["columns"] + ) + else: + explainer = MimicExplainer( + self.artifacts["model"], + X, + mimic_models[model], + augment_data=True, + max_num_of_augmentations=10, + features=self.artifacts["columns"], + ) + self.artifacts["explainer"] = explainer + + def local_interpret(self, X, n_feat=3, norm=True): + """ + Return a local interpret for each row in data + + Parameters + ---------- + self : object Wrapper + X : array[array], shape (n_linha, n_colunas) + Matrix with the data that were used to return interpret + n_feat : int, optional + Number of features to return + norm : bool, optional + if True, do normalization in the features importances + + Returns + ------- + pd.DataFrame + """ + local_explanation = self.artifacts["explainer"].explain_local(X) + n_obs = X.shape[0] + predictions = self.artifacts["model"].predict(X) + local_values = local_explanation.get_ranked_local_values() + local_values = [local_values[predictions[i]][i] for i in range(n_obs)] + local_names = local_explanation.get_ranked_local_names() + local_names = [local_names[predictions[i]][i] for i in range(n_obs)] + if norm: + local_values = [ + [(i - min(l)) / (max(l) - min(l)) for i in l] for l in local_values + ] + result = [ + (local_names[i][:n_feat] + local_values[i][:n_feat]) for i in range(n_obs) + ] + column_names = [ + f"Importance_{item}_{str(i)}" + for item in ["Name", "Value"] + for i in range(n_feat) + ] + return pd.DataFrame(result, columns=column_names) diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/model/wrapper.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/model/wrapper.py new file mode 100644 index 0000000..7aeaf19 --- /dev/null +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/model/wrapper.py @@ -0,0 +1,252 @@ +from joblib import dump, load +from datetime import date +import mlflow.pyfunc +from mlflow import pyfunc +from interpret.ext.blackbox import TabularExplainer, MimicExplainer +from interpret.ext.glassbox import * +import pandas as pd + +from util import load_yaml, 
load_json + + +class Wrapper(mlflow.pyfunc.PythonModel): + def __init__(self, model=None, metrics=None, columns=None): + """ + Constructor + + Parameters + ---------- + model : object + If it's just a model: enter all parameters + if it is more than one model: do not enter parameters and use + the add method to add each of the models + metrics : dict + Dictionary with the metrics of the result of the model + columns : list + list with columns names + Returns + ------- + WrapperModel + """ + self.artifacts = dict() + self.artifacts["model"] = model + self.artifacts["metrics"] = metrics + self.artifacts["columns"] = columns + self.artifacts["creation_date"] = date.today() + + def predict(self, model_input, included_input=False): + """ + Method that returns the result of the prediction on a dataset + + Parameters + ---------- + df : pd.DataFrame + Data to be predicted + + Returns + ------- + list + """ + df_processed = model_input.copy() + model = self.artifacts["model"] + columns = self.artifacts["columns"] + result = model.predict(df_processed[columns]) + if included_input: + model_input['predict'] = result + result = model_input + return result + + def predict_proba(self, model_input, binary=False): + """ + Method that returns the result of the prediction on a dataset + + Parameters + ---------- + df : pd.DataFrame + data to be predicted + + Returns + ------- + list + """ + df_processed = model_input.copy() + model = self.artifacts["model"] + columns = self.artifacts["columns"] + if binary: + return model.predict_proba(df_processed[columns])[:, 1] + else: + return model.predict_proba(df_processed[columns]) + + def save_model(self, path): + """ + Saves the model object to a specific path + + Parameters + ---------- + path : str + path where the model object will be saved + + Returns + ------- + None + """ + dump(self, path) + + @staticmethod + def load_model(path): + """ + Loads the model object in a specific path + + Parameters + ---------- + path : str + path where the model object will be loaded. + + Returns + ------- + None + """ + model = load(path) + return model + + def save(self, path): + """ + Save model as a Wrapper class + + Parameters + ---------- + path : str + path where the model object will be loaded. 
+ + Returns + ------- + None + """ + path_artifacts = path + "_artifacts.pkl" + dump(self.artifacts, path_artifacts) + content = load_json("config/arquivos.json") + conda_env = load_yaml(content["path_yaml"]) + mlflow.pyfunc.save_model( + path=path, + python_model=self, + artifacts={"model": path_artifacts}, + conda_env=conda_env, + ) + + def get_metrics(self): + """ + Return metrics + + Parameters + ---------- + self : object Wrapper + + Returns + ------- + dict + """ + return self.artifacts["metrics"] + + def get_columns(self): + """ + Return columns + + Parameters + ---------- + self : object Wrapper + + Returns + ------- + list + """ + return self.artifacts["columns"] + + def get_model(self): + """ + Return model + + Parameters + ---------- + self : object Wrapper + + Returns + ------- + dict + """ + return self.artifacts["model"] + + def train_interpret(self, X, model="tabular"): + """ + Train a interpret model + + Parameters + ---------- + self : object Wrapper + X : pd.DataFrame + Data that were used in the train for interpret + model : string, optional + Model to use for the interpret [tabular,mimic_LGBME, + mimic_Linear,mimic_SGDE,mimic_Dec_Tree] + Returns + ------- + None + """ + mimic_models = { + "mimic_LGBME": LGBMExplainableModel, + "mimic_Linear": LinearExplainableModel, + "mimic_SGDE": SGDExplainableModel, + "mimic_Dec_Tree": DecisionTreeExplainableModel, + } + if model == "tabular": + explainer = TabularExplainer( + self.artifacts["model"], X, features=self.artifacts["columns"] + ) + else: + explainer = MimicExplainer( + self.artifacts["model"], + X, + mimic_models[model], + augment_data=True, + max_num_of_augmentations=10, + features=self.artifacts["columns"], + ) + self.artifacts["explainer"] = explainer + + def local_interpret(self, X, n_feat=3, norm=True): + """ + Return a local interpret for each row in data + + Parameters + ---------- + self : object Wrapper + X : array[array], shape (n_linha, n_colunas) + Matrix with the data that were used to return interpret + n_feat : int, optional + Number of features to return + norm : bool, optional + if True, do normalization in the features importances + + Returns + ------- + pd.DataFrame + """ + local_explanation = self.artifacts["explainer"].explain_local(X) + n_obs = X.shape[0] + predictions = self.artifacts["model"].predict(X) + local_values = local_explanation.get_ranked_local_values() + local_values = [local_values[predictions[i]][i] for i in range(n_obs)] + local_names = local_explanation.get_ranked_local_names() + local_names = [local_names[predictions[i]][i] for i in range(n_obs)] + if norm: + local_values = [ + [(i - min(l)) / (max(l) - min(l)) for i in l] for l in local_values + ] + result = [ + (local_names[i][:n_feat] + local_values[i][:n_feat]) for i in range(n_obs) + ] + column_names = [ + f"Importance_{item}_{str(i)}" + for item in ["Name", "Value"] + for i in range(n_feat) + ] + return pd.DataFrame(result, columns=column_names) diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/1_Sagemaker_Processor.ipynb b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/1_Sagemaker_Processor.ipynb new file mode 100644 index 0000000..98961ea --- /dev/null +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/1_Sagemaker_Processor.ipynb @@ -0,0 +1,409 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "0080c0d0", + "metadata": {}, + "source": [ + "# Sagemaker Processor" + ] + }, + { + "cell_type": "markdown", + "id": "7d7b0036", + "metadata": {}, 
+ "source": [ + "This script generates the train, val and inference files with the processor previous uploaded in ECR." + ] + }, + { + "cell_type": "markdown", + "id": "3f2a0229", + "metadata": {}, + "source": [ + "## Import modules" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "6e679a79", + "metadata": {}, + "outputs": [], + "source": [ + "import boto3\n", + "import time\n", + "from datetime import datetime\n", + "from sagemaker import get_execution_role\n", + "from sagemaker.processing import Processor, ProcessingInput, ProcessingOutput" + ] + }, + { + "cell_type": "markdown", + "id": "a9066e74", + "metadata": {}, + "source": [ + "## Setup" + ] + }, + { + "cell_type": "markdown", + "id": "60ec8b7b", + "metadata": {}, + "source": [ + "Modify according to your configurations." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "9d9b2d23", + "metadata": {}, + "outputs": [], + "source": [ + "# Bucket name in S3\n", + "bucket = \"hermione-sagemaker\"" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "1cd1aa77", + "metadata": {}, + "outputs": [], + "source": [ + "# Set session\n", + "region_name=\"us-east-1\"\n", + "boto3.setup_default_session(region_name=region_name)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "464d9cec", + "metadata": {}, + "outputs": [], + "source": [ + "# Get user role\n", + "role = get_execution_role()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "a0649d24", + "metadata": {}, + "outputs": [], + "source": [ + "# Get AWS Account ID\n", + "account_number = boto3.client(\"sts\").get_caller_identity()[\"Account\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "f71c6f3c", + "metadata": {}, + "outputs": [], + "source": [ + "# Image previous uploaded in ECR\n", + "image_name = \"hermione-processor\"\n", + "image_uri = f\"{account_number}.dkr.ecr.{region_name}.amazonaws.com/{image_name}\"" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "db98e9a2", + "metadata": {}, + "outputs": [], + "source": [ + "# Input and output paths to execute train and inference\n", + "paths = {\n", + " 'train_raw': f\"s3://{bucket}/TRAIN_RAW\",\n", + " 'expectations': f\"s3://{bucket}/PREPROCESSING/EXPECTATIONS\",\n", + " 'preprocessing': f\"s3://{bucket}/PREPROCESSING/PREPROCESSING\",\n", + " 'train_processed': f\"s3://{bucket}/PREPROCESSING/TRAIN_PROCESSED\",\n", + " 'val_processed': f\"s3://{bucket}/PREPROCESSING/VAL_PROCESSED\",\n", + " 'test_raw': f\"s3://{bucket}/TEST_RAW\",\n", + " 'inference_processed': f\"s3://{bucket}/PREPROCESSING/INFERENCE_PROCESSED\",\n", + " 'validations': f\"s3://{bucket}/PREPROCESSING/VALIDATIONS\"\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "52ba34ff", + "metadata": {}, + "outputs": [], + "source": [ + "# upload train and test data in S3\n", + "s3 = boto3.resource('s3') \n", + "s3.Bucket(bucket).upload_file('../../../data/raw/raw_train.csv', 'TRAIN_RAW/raw_train.csv')\n", + "s3.Bucket(bucket).upload_file('../../../data/raw/raw_test.csv', 'TEST_RAW/raw_test.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "b1744737", + "metadata": {}, + "outputs": [], + "source": [ + "# instance to run the code\n", + "instance_type_train=\"ml.t3.medium\"\n", + "instance_type_inference=\"ml.t3.medium\"" + ] + }, + { + "cell_type": "markdown", + "id": "281216e9", + "metadata": {}, + "source": [ + "## Processor - Train" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + 
"id": "3191cd98", + "metadata": {}, + "outputs": [], + "source": [ + "# Receives a raw data in S3\n", + "inputs=[\n", + " ProcessingInput(source=paths['train_raw'], \n", + " destination='/opt/ml/processing/input/raw_data', \n", + " input_name=\"raw_data\")\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "9998dd3a", + "metadata": {}, + "outputs": [], + "source": [ + "# Returns the great expectation object, preprocessing object, \n", + "# processed training data and processed validation data, and saves them in S3\n", + "outputs = [\n", + " ProcessingOutput(\n", + " source=\"/opt/ml/processing/output/expectations\",\n", + " destination=paths['expectations'],\n", + " output_name=\"expectations\",\n", + " ),\n", + " ProcessingOutput(\n", + " source=\"/opt/ml/processing/output/preprocessing\",\n", + " destination=paths['preprocessing'],\n", + " output_name=\"preprocessing\",\n", + " ),\n", + " ProcessingOutput(\n", + " source=\"/opt/ml/processing/output/processed/train\",\n", + " destination=paths['train_processed'],\n", + " output_name=\"train_data\",\n", + " ),\n", + " ProcessingOutput(\n", + " source=\"/opt/ml/processing/output/processed/val\",\n", + " destination=paths['val_processed'],\n", + " output_name=\"val_data\",\n", + " )\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "a0d4af1b", + "metadata": {}, + "outputs": [], + "source": [ + "# Creates the processor to access the ECR image\n", + "processor = Processor(image_uri=image_uri,\n", + " role=role,\n", + " instance_count=1,\n", + " instance_type=instance_type_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "065f6fca", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Job Name: hermione-processor-2021-07-22-19-53-22-425\n", + "Inputs: [{'InputName': 'raw_data', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://hermione-sagemaker/TRAIN_RAW', 'LocalPath': '/opt/ml/processing/input/raw_data', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]\n", + "Outputs: [{'OutputName': 'expectations', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://hermione-sagemaker/PREPROCESSING/EXPECTATIONS', 'LocalPath': '/opt/ml/processing/output/expectations', 'S3UploadMode': 'EndOfJob'}}, {'OutputName': 'preprocessing', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://hermione-sagemaker/PREPROCESSING/PREPROCESSING', 'LocalPath': '/opt/ml/processing/output/preprocessing', 'S3UploadMode': 'EndOfJob'}}, {'OutputName': 'train_data', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://hermione-sagemaker/PREPROCESSING/TRAIN_PROCESSED', 'LocalPath': '/opt/ml/processing/output/processed/train', 'S3UploadMode': 'EndOfJob'}}, {'OutputName': 'val_data', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://hermione-sagemaker/PREPROCESSING/VAL_PROCESSED', 'LocalPath': '/opt/ml/processing/output/processed/val', 'S3UploadMode': 'EndOfJob'}}]\n", + "......................................................\u001b[34mINFO:root:Starting the preprocessing\u001b[0m\n", + "\u001b[34mINFO:root:step_train: True\u001b[0m\n", + "\u001b[34mINFO:root:Reading the inputs\u001b[0m\n", + "\u001b[34mINFO:root:Reading file: /opt/ml/processing/input/raw_data/raw_train.csv\u001b[0m\n", + "\u001b[34mINFO:root:Data Quality\u001b[0m\n", + "\u001b[34mINFO:great_expectations.data_asset.data_asset:#01110 expectation(s) included in expectation_suite. 
Omitting 1 expectation(s) that failed when last run; set discard_failed_expectations=False to include them. result_format settings filtered.\u001b[0m\n", + "\u001b[34mINFO:root:Preprocessing\u001b[0m\n", + "\u001b[34mINFO:root:Cleaning data\u001b[0m\n", + "\u001b[34mINFO:root:One hot encoding\u001b[0m\n", + "\u001b[34mWARNING:py.warnings:/usr/local/lib/python3.8/dist-packages/category_encoders/utils.py:21: FutureWarning: is_categorical is deprecated and will be removed in a future version. Use is_categorical_dtype instead\n", + " elif pd.api.types.is_categorical(cols):\n", + "\u001b[0m\n", + "\u001b[34mINFO:root:Divide train and test\u001b[0m\n", + "\u001b[34mINFO:root:Normalizing\u001b[0m\n", + "\u001b[34mWARNING:py.warnings:/usr/local/lib/python3.8/dist-packages/pandas/core/indexing.py:1835: SettingWithCopyWarning: \u001b[0m\n", + "\u001b[34mA value is trying to be set on a copy of a slice from a DataFrame.\u001b[0m\n", + "\u001b[34mTry using .loc[row_indexer,col_indexer] = value instead\n", + "\u001b[0m\n", + "\u001b[34mSee the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " self._setitem_single_column(loc, value[:, i].tolist(), pi)\n", + "\u001b[0m\n", + "\u001b[34mINFO:root:Normalizing\u001b[0m\n", + "\u001b[34mINFO:root:shape train (393, 7) val (99, 7)\u001b[0m\n", + "\u001b[34mINFO:root:Saving\u001b[0m\n", + "\n", + "CPU times: user 1.09 s, sys: 71.1 ms, total: 1.16 s\n", + "Wall time: 9min 48s\n" + ] + } + ], + "source": [ + "%%time\n", + "# Runs the processor to access the ECR image and process the training data\n", + "processor.run(inputs=inputs,\n", + " outputs= outputs,\n", + " arguments=[\"--step\", \"train\"] \n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "5db80626", + "metadata": {}, + "source": [ + "## Processor - Inference" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "8d08c6c9", + "metadata": {}, + "outputs": [], + "source": [ + "# Receives a raw data in S3, the preprocessing and great expectation objects created in the training\n", + "inputs=[\n", + " ProcessingInput(source=paths['test_raw'],\n", + " destination='/opt/ml/processing/input/raw_data', \n", + " input_name='raw_data'),\n", + " ProcessingInput(source=paths['preprocessing'], \n", + " destination='/opt/ml/processing/input/preprocessing', \n", + " input_name='preprocessing'),\n", + " ProcessingInput(source=paths['expectations'], \n", + " destination='/opt/ml/processing/input/expectations', \n", + " input_name='expectations')\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "4273ba95", + "metadata": {}, + "outputs": [], + "source": [ + "# Returns the processed inference data and validations, and saves them in S3\n", + "outputs = [\n", + " ProcessingOutput(\n", + " source=\"/opt/ml/processing/output/processed/inference\",\n", + " destination=paths['inference_processed'],\n", + " output_name=\"inference_data\",\n", + " ),\n", + " ProcessingOutput(\n", + " source=\"/opt/ml/processing/output/validations\",\n", + " destination=paths['validations'],\n", + " output_name=\"validations\",\n", + " )\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "b4d816d3", + "metadata": {}, + "outputs": [], + "source": [ + "# Creates the processor to access the ECR image\n", + "processor = Processor(image_uri=image_uri,\n", + " role=role,\n", + " instance_count=1,\n", + " instance_type=instance_type_inference)" + ] + }, + { + "cell_type": "code", + 
"execution_count": 13, + "id": "28aa9b95", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Job Name: hermione-processor-2021-07-22-19-40-48-848\n", + "Inputs: [{'InputName': 'raw_data', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://hermione-sagemaker/TEST_RAW', 'LocalPath': '/opt/ml/processing/input/raw_data', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'preprocessing', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://hermione-sagemaker/PREPROCESSING/PREPROCESSING', 'LocalPath': '/opt/ml/processing/input/preprocessing', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'expectations', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://hermione-sagemaker/PREPROCESSING/EXPECTATIONS', 'LocalPath': '/opt/ml/processing/input/expectations', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]\n", + "Outputs: [{'OutputName': 'inference_data', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://hermione-sagemaker/PREPROCESSING/INFERENCE_PROCESSED', 'LocalPath': '/opt/ml/processing/output/processed/inference', 'S3UploadMode': 'EndOfJob'}}, {'OutputName': 'validations', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://hermione-sagemaker/PREPROCESSING/VALIDATIONS', 'LocalPath': '/opt/ml/processing/output/validations', 'S3UploadMode': 'EndOfJob'}}]\n", + "...........................................................\u001b[34mINFO:root:Starting the preprocessing\u001b[0m\n", + "\u001b[34mINFO:root:step_train: False\u001b[0m\n", + "\u001b[34mINFO:root:Reading the inputs\u001b[0m\n", + "\u001b[34mINFO:root:Reading file: /opt/ml/processing/input/raw_data/raw_test.csv\u001b[0m\n", + "\u001b[34mINFO:root:Data Quality\u001b[0m\n", + "\u001b[34mINFO:root:Preprocessing\u001b[0m\n", + "\u001b[34mINFO:root:Cleaning data\u001b[0m\n", + "\u001b[34mINFO:root:One hot encoding\u001b[0m\n", + "\u001b[34mINFO:root:Normalizing\u001b[0m\n", + "\u001b[34mINFO:root:shape (222, 7)\u001b[0m\n", + "\u001b[34mINFO:root:Saving\u001b[0m\n", + "\n", + "CPU times: user 1.18 s, sys: 39.6 ms, total: 1.22 s\n", + "Wall time: 10min 15s\n" + ] + } + ], + "source": [ + "%%time\n", + "# Runs the processor to access the ECR image and process the inference data\n", + "processor.run(inputs=inputs,\n", + " outputs= outputs,\n", + " arguments=[\"--step\", \"test\"] \n", + " )" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "conda_python3", + "language": "python", + "name": "conda_python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/2_Sagemaker_Train.ipynb b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/2_Sagemaker_Train.ipynb new file mode 100644 index 0000000..5951690 --- /dev/null +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/2_Sagemaker_Train.ipynb @@ -0,0 +1,398 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "0481ea58", + "metadata": {}, + "source": [ + "# Sagemaker Train" + ] + }, + { + "cell_type": "markdown", + 
"id": "c14f3a6e", + "metadata": {}, + "source": [ + "This script creates and trains the model with the uploaded image in ECR." + ] + }, + { + "cell_type": "markdown", + "id": "737135a7", + "metadata": {}, + "source": [ + "## Import modules" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "010b1646", + "metadata": {}, + "outputs": [], + "source": [ + "import time\n", + "import boto3\n", + "import sagemaker\n", + "from sagemaker import get_execution_role" + ] + }, + { + "cell_type": "markdown", + "id": "ed6ec079", + "metadata": {}, + "source": [ + "## Setup" + ] + }, + { + "cell_type": "markdown", + "id": "ff8d388c", + "metadata": {}, + "source": [ + "Modify according to your configurations." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "6278a767", + "metadata": {}, + "outputs": [], + "source": [ + "# Bucket name in S3\n", + "bucket = \"hermione-sagemaker\"" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "1fe9ed45", + "metadata": {}, + "outputs": [], + "source": [ + "# Set session\n", + "region_name=\"us-east-1\"\n", + "boto3.setup_default_session(region_name=region_name)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "f6216acf", + "metadata": {}, + "outputs": [], + "source": [ + "# Get user role\n", + "role = get_execution_role()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "c9a8d55b", + "metadata": {}, + "outputs": [], + "source": [ + "# Get AWS Account ID\n", + "account_number = boto3.client(\"sts\").get_caller_identity()[\"Account\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "f281ac39", + "metadata": {}, + "outputs": [], + "source": [ + "# Image previous uploaded in ECR\n", + "image_name = \"hermione-train\"\n", + "image_uri = f\"{account_number}.dkr.ecr.{region_name}.amazonaws.com/{image_name}\"" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "4eee7169", + "metadata": {}, + "outputs": [], + "source": [ + "# Input and output paths to execute train\n", + "paths = {\n", + " 'train_processed': f\"s3://{bucket}/PREPROCESSING/TRAIN_PROCESSED\",\n", + " 'val_processed': f\"s3://{bucket}/PREPROCESSING/VAL_PROCESSED\",\n", + " 'model': f\"s3://{bucket}/PREPROCESSING/MODEL\"\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "44002452", + "metadata": {}, + "outputs": [], + "source": [ + "# instance to run the code\n", + "instance_type=\"ml.m5.large\"" + ] + }, + { + "cell_type": "markdown", + "id": "6aa3f5a8", + "metadata": {}, + "source": [ + "## Train" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "77e64d0c", + "metadata": {}, + "outputs": [], + "source": [ + "# Receives the processed train data in S3\n", + "train_config = sagemaker.inputs.TrainingInput(\n", + " paths['train_processed'],\n", + " content_type='text/csv',\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "33726510", + "metadata": {}, + "outputs": [], + "source": [ + "# Receives the processed validation data in S3\n", + "val_config = sagemaker.inputs.TrainingInput(\n", + " paths['val_processed'],\n", + " content_type='text/csv'\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "1f0350b8", + "metadata": {}, + "outputs": [], + "source": [ + "# Saves the model object in S3\n", + "output_path = paths['model']" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "0832ebb9", + "metadata": {}, + "outputs": [], + "source": [ + "# Metrics to visualize in the 
Monitor\n", + "metrics = [\n", + " {\n", + " \"Name\": \"accuracy\",\n", + " \"Regex\": \"accuracy=(.*?);\",\n", + " },\n", + " {\n", + " \"Name\": \"f1\",\n", + " \"Regex\": \"f1=(.*?);\",\n", + " },\n", + " {\n", + " \"Name\": \"precision\",\n", + " \"Regex\": \"precision=(.*?);\",\n", + " },\n", + " {\n", + " \"Name\": \"recall\",\n", + " \"Regex\": \"recall=(.*?);\",\n", + " },\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "7a2931e1", + "metadata": {}, + "outputs": [], + "source": [ + "# Creates the estimator to access the ECR image\n", + "est = sagemaker.estimator.Estimator(\n", + " image_uri,\n", + " role, \n", + " instance_count=1, \n", + " instance_type=instance_type,\n", + " volume_size = 30,\n", + " output_path = output_path,\n", + " base_job_name = \"Hermione-train\",\n", + " use_spot_instances=True,\n", + " max_run = 24*60*60,\n", + " max_wait = 24*60*60, # timeout in seconds. Required if use_spot_instances == True\n", + " metric_definitions=metrics\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "d12aa777", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2021-07-22 20:15:35 Starting - Starting the training job...\n", + "2021-07-22 20:15:59 Starting - Launching requested ML instancesProfilerReport-1626984935: InProgress\n", + "...\n", + "2021-07-22 20:16:35 Starting - Preparing the instances for training.........\n", + "2021-07-22 20:18:00 Downloading - Downloading input data...\n", + "2021-07-22 20:18:20 Training - Downloading the training image.....\u001b[34m2021-07-22 17:19:11,614 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed)\u001b[0m\n", + "\u001b[34m2021-07-22 17:19:11,630 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed)\u001b[0m\n", + "\u001b[34m2021-07-22 17:19:11,640 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed)\u001b[0m\n", + "\u001b[34m2021-07-22 17:19:11,648 sagemaker-training-toolkit INFO Invoking user script\n", + "\u001b[0m\n", + "\u001b[34mTraining Env:\n", + "\u001b[0m\n", + "\u001b[34m{\n", + " \"additional_framework_parameters\": {},\n", + " \"channel_input_dirs\": {\n", + " \"validation\": \"/opt/ml/input/data/validation\",\n", + " \"train\": \"/opt/ml/input/data/train\"\n", + " },\n", + " \"current_host\": \"algo-1\",\n", + " \"framework_module\": null,\n", + " \"hosts\": [\n", + " \"algo-1\"\n", + " ],\n", + " \"hyperparameters\": {},\n", + " \"input_config_dir\": \"/opt/ml/input/config\",\n", + " \"input_data_config\": {\n", + " \"validation\": {\n", + " \"ContentType\": \"text/csv\",\n", + " \"TrainingInputMode\": \"File\",\n", + " \"S3DistributionType\": \"FullyReplicated\",\n", + " \"RecordWrapperType\": \"None\"\n", + " },\n", + " \"train\": {\n", + " \"ContentType\": \"text/csv\",\n", + " \"TrainingInputMode\": \"File\",\n", + " \"S3DistributionType\": \"FullyReplicated\",\n", + " \"RecordWrapperType\": \"None\"\n", + " }\n", + " },\n", + " \"input_dir\": \"/opt/ml/input\",\n", + " \"is_master\": true,\n", + " \"job_name\": \"Hermione-train-2021-07-22-20-15-35-496\",\n", + " \"log_level\": 20,\n", + " \"master_hostname\": \"algo-1\",\n", + " \"model_dir\": \"/opt/ml/model\",\n", + " \"module_dir\": \"/opt/ml/code\",\n", + " \"module_name\": \"train\",\n", + " \"network_interface_name\": \"eth0\",\n", + " \"num_cpus\": 2,\n", + " \"num_gpus\": 0,\n", + " \"output_data_dir\": \"/opt/ml/output/data\",\n", + " \"output_dir\": 
\"/opt/ml/output\",\n", + " \"output_intermediate_dir\": \"/opt/ml/output/intermediate\",\n", + " \"resource_config\": {\n", + " \"current_host\": \"algo-1\",\n", + " \"hosts\": [\n", + " \"algo-1\"\n", + " ],\n", + " \"network_interface_name\": \"eth0\"\n", + " },\n", + " \"user_entry_point\": \"train.py\"\u001b[0m\n", + "\u001b[34m}\n", + "\u001b[0m\n", + "\u001b[34mEnvironment variables:\n", + "\u001b[0m\n", + "\u001b[34mSM_HOSTS=[\"algo-1\"]\u001b[0m\n", + "\u001b[34mSM_NETWORK_INTERFACE_NAME=eth0\u001b[0m\n", + "\u001b[34mSM_HPS={}\u001b[0m\n", + "\u001b[34mSM_USER_ENTRY_POINT=train.py\u001b[0m\n", + "\u001b[34mSM_FRAMEWORK_PARAMS={}\u001b[0m\n", + "\u001b[34mSM_RESOURCE_CONFIG={\"current_host\":\"algo-1\",\"hosts\":[\"algo-1\"],\"network_interface_name\":\"eth0\"}\u001b[0m\n", + "\u001b[34mSM_INPUT_DATA_CONFIG={\"train\":{\"ContentType\":\"text/csv\",\"RecordWrapperType\":\"None\",\"S3DistributionType\":\"FullyReplicated\",\"TrainingInputMode\":\"File\"},\"validation\":{\"ContentType\":\"text/csv\",\"RecordWrapperType\":\"None\",\"S3DistributionType\":\"FullyReplicated\",\"TrainingInputMode\":\"File\"}}\u001b[0m\n", + "\u001b[34mSM_OUTPUT_DATA_DIR=/opt/ml/output/data\u001b[0m\n", + "\u001b[34mSM_CHANNELS=[\"train\",\"validation\"]\u001b[0m\n", + "\u001b[34mSM_CURRENT_HOST=algo-1\u001b[0m\n", + "\u001b[34mSM_MODULE_NAME=train\u001b[0m\n", + "\u001b[34mSM_LOG_LEVEL=20\u001b[0m\n", + "\u001b[34mSM_FRAMEWORK_MODULE=\u001b[0m\n", + "\u001b[34mSM_INPUT_DIR=/opt/ml/input\u001b[0m\n", + "\u001b[34mSM_INPUT_CONFIG_DIR=/opt/ml/input/config\u001b[0m\n", + "\u001b[34mSM_OUTPUT_DIR=/opt/ml/output\u001b[0m\n", + "\u001b[34mSM_NUM_CPUS=2\u001b[0m\n", + "\u001b[34mSM_NUM_GPUS=0\u001b[0m\n", + "\u001b[34mSM_MODEL_DIR=/opt/ml/model\u001b[0m\n", + "\u001b[34mSM_MODULE_DIR=/opt/ml/code\u001b[0m\n", + "\u001b[34mSM_TRAINING_ENV={\"additional_framework_parameters\":{},\"channel_input_dirs\":{\"train\":\"/opt/ml/input/data/train\",\"validation\":\"/opt/ml/input/data/validation\"},\"current_host\":\"algo-1\",\"framework_module\":null,\"hosts\":[\"algo-1\"],\"hyperparameters\":{},\"input_config_dir\":\"/opt/ml/input/config\",\"input_data_config\":{\"train\":{\"ContentType\":\"text/csv\",\"RecordWrapperType\":\"None\",\"S3DistributionType\":\"FullyReplicated\",\"TrainingInputMode\":\"File\"},\"validation\":{\"ContentType\":\"text/csv\",\"RecordWrapperType\":\"None\",\"S3DistributionType\":\"FullyReplicated\",\"TrainingInputMode\":\"File\"}},\"input_dir\":\"/opt/ml/input\",\"is_master\":true,\"job_name\":\"Hermione-train-2021-07-22-20-15-35-496\",\"log_level\":20,\"master_hostname\":\"algo-1\",\"model_dir\":\"/opt/ml/model\",\"module_dir\":\"/opt/ml/code\",\"module_name\":\"train\",\"network_interface_name\":\"eth0\",\"num_cpus\":2,\"num_gpus\":0,\"output_data_dir\":\"/opt/ml/output/data\",\"output_dir\":\"/opt/ml/output\",\"output_intermediate_dir\":\"/opt/ml/output/intermediate\",\"resource_config\":{\"current_host\":\"algo-1\",\"hosts\":[\"algo-1\"],\"network_interface_name\":\"eth0\"},\"user_entry_point\":\"train.py\"}\u001b[0m\n", + "\u001b[34mSM_USER_ARGS=[]\u001b[0m\n", + "\u001b[34mSM_OUTPUT_INTERMEDIATE_DIR=/opt/ml/output/intermediate\u001b[0m\n", + "\u001b[34mSM_CHANNEL_VALIDATION=/opt/ml/input/data/validation\u001b[0m\n", + "\u001b[34mSM_CHANNEL_TRAIN=/opt/ml/input/data/train\u001b[0m\n", + 
"\u001b[34mPYTHONPATH=/usr/local/bin:/opt/ml/code:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/lib/python38.zip:/usr/lib/python3.8:/usr/lib/python3.8/lib-dynload:/usr/local/lib/python3.8/dist-packages:/usr/lib/python3/dist-packages\n", + "\u001b[0m\n", + "\u001b[34mInvoking script with the following command:\n", + "\u001b[0m\n", + "\u001b[34m/usr/bin/python3 train.py\n", + "\n", + "\u001b[0m\n", + "\u001b[34mINFO:root:Starting the training\u001b[0m\n", + "\u001b[34mINFO:root:Reading the inputs\u001b[0m\n", + "\u001b[34mINFO:root:Training the model\u001b[0m\n", + "\u001b[34mINFO:root:Saving\u001b[0m\n", + "\u001b[34mINFO:root:accuracy=0.7373737373737373; f1=0.6976744186046512; precision=0.6382978723404256; recall=0.7692307692307693;\u001b[0m\n", + "\u001b[34mINFO:root:Training complete.\u001b[0m\n", + "\u001b[34m2021-07-22 17:19:17,315 sagemaker-training-toolkit INFO Reporting training SUCCESS\u001b[0m\n", + "\n", + "2021-07-22 20:19:30 Uploading - Uploading generated training model\n", + "2021-07-22 20:19:30 Completed - Training job completed\n", + "Training seconds: 96\n", + "Billable seconds: 39\n", + "Managed Spot Training savings: 59.4%\n", + "CPU times: user 491 ms, sys: 48.5 ms, total: 539 ms\n", + "Wall time: 4min 12s\n" + ] + } + ], + "source": [ + "%%time\n", + "# Train the model and validate\n", + "est.fit({'train':train_config, 'validation':val_config}, wait=True, logs=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bf57258c", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "conda_python3", + "language": "python", + "name": "conda_python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/3_Sagemaker_Inference.ipynb b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/3_Sagemaker_Inference.ipynb new file mode 100644 index 0000000..525a5a0 --- /dev/null +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/3_Sagemaker_Inference.ipynb @@ -0,0 +1,374 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "4558d673", + "metadata": {}, + "source": [ + "# Sagemaker Inference" + ] + }, + { + "cell_type": "markdown", + "id": "733a4c1b", + "metadata": {}, + "source": [ + "This script predicts new data with the uploaded image in ECR." + ] + }, + { + "cell_type": "markdown", + "id": "73ec63de", + "metadata": {}, + "source": [ + "## Import modules" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "9f4bb4b1", + "metadata": {}, + "outputs": [], + "source": [ + "import time\n", + "import boto3\n", + "import sagemaker\n", + "from sagemaker import get_execution_role" + ] + }, + { + "cell_type": "markdown", + "id": "cf4f0baf", + "metadata": {}, + "source": [ + "## Setup" + ] + }, + { + "cell_type": "markdown", + "id": "a36daf9a", + "metadata": {}, + "source": [ + "Modify according to your configurations." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "ff34a81c", + "metadata": {}, + "outputs": [], + "source": [ + "# Bucket name in S3\n", + "bucket = \"hermione-sagemaker\"" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "aa6732aa", + "metadata": {}, + "outputs": [], + "source": [ + "# Set session\n", + "region_name=\"us-east-1\"\n", + "boto3.setup_default_session(region_name=region_name)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "0515bb41", + "metadata": {}, + "outputs": [], + "source": [ + "# Get user role\n", + "role = get_execution_role()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "ef2ae3ae", + "metadata": {}, + "outputs": [], + "source": [ + "# Get AWS Account ID\n", + "account_number = boto3.client(\"sts\").get_caller_identity()[\"Account\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "31861461", + "metadata": {}, + "outputs": [], + "source": [ + "# Image previous uploaded in ECR\n", + "image_name = \"hermione-inference\"\n", + "image_uri = f\"{account_number}.dkr.ecr.{region_name}.amazonaws.com/{image_name}\"" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "1eec0163", + "metadata": {}, + "outputs": [], + "source": [ + "# Input and output paths to execute inference\n", + "paths = {\n", + " 'inference_processed': f\"s3://{bucket}/PREPROCESSING/INFERENCE_PROCESSED/inference.csv\",\n", + " 'model': f\"s3://{bucket}/PREPROCESSING/MODEL/Hermione-train-2021-05-26-12-41-29-505/output/model.tar.gz\",\n", + " 'output_path': f\"s3://{bucket}/PREPROCESSING/OUTPUT\"\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "76ce3950", + "metadata": {}, + "outputs": [], + "source": [ + "# instance to run the code\n", + "instance_type=\"ml.m5.large\"" + ] + }, + { + "cell_type": "markdown", + "id": "f44e5b91", + "metadata": {}, + "source": [ + "## Inference" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "a78cd291", + "metadata": {}, + "outputs": [], + "source": [ + "# Receives the processed inference data in S3\n", + "input_path = paths['inference_processed']" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "c8f2a674", + "metadata": {}, + "outputs": [], + "source": [ + "# Receives the model created during the training in S3\n", + "model_path = paths['model']" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "6ec78d16", + "metadata": {}, + "outputs": [], + "source": [ + "# Saves the prediction in S3\n", + "output_path = paths['output_path']" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "c167eff0", + "metadata": {}, + "outputs": [], + "source": [ + "# Creates the model to access the ECR image\n", + "model = sagemaker.model.Model(\n", + " image_uri= image_uri,\n", + " model_data=model_path,\n", + " role=role)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "0b2651c1", + "metadata": {}, + "outputs": [], + "source": [ + "# Creates a transformer object from the trained model\n", + "transformer = model.transformer(\n", + " instance_count=1,\n", + " instance_type=instance_type, \n", + " output_path=output_path,\n", + " accept = 'text/csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "1c5bd0b8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "..........................\u001b[34mWarning: MMS is using non-default JVM parameters: -XX:-UseContainerSupport\u001b[0m\n", + 
"\u001b[35mWarning: MMS is using non-default JVM parameters: -XX:-UseContainerSupport\u001b[0m\n", + "\u001b[34m2021-07-22 20:28:24,272 [INFO ] main com.amazonaws.ml.mms.ModelServer - \u001b[0m\n", + "\u001b[34mMMS Home: /usr/local/lib/python3.8/dist-packages\u001b[0m\n", + "\u001b[34mCurrent directory: /\u001b[0m\n", + "\u001b[34mTemp directory: /tmp\u001b[0m\n", + "\u001b[34mNumber of GPUs: 0\u001b[0m\n", + "\u001b[34mNumber of CPUs: 2\u001b[0m\n", + "\u001b[34mMax heap size: 1726 M\u001b[0m\n", + "\u001b[34mPython executable: /usr/bin/python3\u001b[0m\n", + "\u001b[34mConfig file: /etc/sagemaker-mms.properties\u001b[0m\n", + "\u001b[34mInference address: http://0.0.0.0:8080\u001b[0m\n", + "\u001b[34mManagement address: http://0.0.0.0:8080\u001b[0m\n", + "\u001b[34mModel Store: /.sagemaker/mms/models\u001b[0m\n", + "\u001b[34mInitial Models: ALL\u001b[0m\n", + "\u001b[34mLog dir: /logs\u001b[0m\n", + "\u001b[34mMetrics dir: /logs\u001b[0m\n", + "\u001b[34mNetty threads: 0\u001b[0m\n", + "\u001b[34mNetty client threads: 0\u001b[0m\n", + "\u001b[34mDefault workers per model: 2\u001b[0m\n", + "\u001b[34mBlacklist Regex: N/A\u001b[0m\n", + "\u001b[34mMaximum Response Size: 6553500\u001b[0m\n", + "\u001b[34mMaximum Request Size: 6553500\u001b[0m\n", + "\u001b[34mPreload model: false\u001b[0m\n", + "\u001b[34mPrefer direct buffer: false\u001b[0m\n", + "\u001b[34m2021-07-22 20:28:24,384 [WARN ] W-9000-model com.amazonaws.ml.mms.wlm.WorkerLifeCycle - attachIOStreams() threadName=W-9000-model\u001b[0m\n", + "\u001b[34m2021-07-22 20:28:24,452 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - model_service_worker started with args: --sock-type unix --sock-name /tmp/.mms.sock.9000 --handler serving.handler --model-path /.sagemaker/mms/models/model --model-name model --preload-model false --tmp-dir /tmp\u001b[0m\n", + "\u001b[35m2021-07-22 20:28:24,272 [INFO ] main com.amazonaws.ml.mms.ModelServer - \u001b[0m\n", + "\u001b[35mMMS Home: /usr/local/lib/python3.8/dist-packages\u001b[0m\n", + "\u001b[35mCurrent directory: /\u001b[0m\n", + "\u001b[35mTemp directory: /tmp\u001b[0m\n", + "\u001b[35mNumber of GPUs: 0\u001b[0m\n", + "\u001b[35mNumber of CPUs: 2\u001b[0m\n", + "\u001b[35mMax heap size: 1726 M\u001b[0m\n", + "\u001b[35mPython executable: /usr/bin/python3\u001b[0m\n", + "\u001b[35mConfig file: /etc/sagemaker-mms.properties\u001b[0m\n", + "\u001b[35mInference address: http://0.0.0.0:8080\u001b[0m\n", + "\u001b[35mManagement address: http://0.0.0.0:8080\u001b[0m\n", + "\u001b[35mModel Store: /.sagemaker/mms/models\u001b[0m\n", + "\u001b[35mInitial Models: ALL\u001b[0m\n", + "\u001b[35mLog dir: /logs\u001b[0m\n", + "\u001b[35mMetrics dir: /logs\u001b[0m\n", + "\u001b[35mNetty threads: 0\u001b[0m\n", + "\u001b[35mNetty client threads: 0\u001b[0m\n", + "\u001b[35mDefault workers per model: 2\u001b[0m\n", + "\u001b[35mBlacklist Regex: N/A\u001b[0m\n", + "\u001b[35mMaximum Response Size: 6553500\u001b[0m\n", + "\u001b[35mMaximum Request Size: 6553500\u001b[0m\n", + "\u001b[35mPreload model: false\u001b[0m\n", + "\u001b[35mPrefer direct buffer: false\u001b[0m\n", + "\u001b[35m2021-07-22 20:28:24,384 [WARN ] W-9000-model com.amazonaws.ml.mms.wlm.WorkerLifeCycle - attachIOStreams() threadName=W-9000-model\u001b[0m\n", + "\u001b[35m2021-07-22 20:28:24,452 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - model_service_worker started with args: --sock-type unix --sock-name /tmp/.mms.sock.9000 --handler serving.handler --model-path /.sagemaker/mms/models/model 
--model-name model --preload-model false --tmp-dir /tmp\u001b[0m\n", + "\u001b[34m2021-07-22 20:28:24,454 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Listening on port: /tmp/.mms.sock.9000\u001b[0m\n", + "\u001b[34m2021-07-22 20:28:24,454 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - [PID] 24\u001b[0m\n", + "\u001b[34m2021-07-22 20:28:24,455 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - MMS worker started.\u001b[0m\n", + "\u001b[34m2021-07-22 20:28:24,455 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Python runtime: 3.8.10\u001b[0m\n", + "\u001b[34m2021-07-22 20:28:24,456 [INFO ] main com.amazonaws.ml.mms.wlm.ModelManager - Model model loaded.\u001b[0m\n", + "\u001b[34m2021-07-22 20:28:24,460 [INFO ] main com.amazonaws.ml.mms.ModelServer - Initialize Inference server with: EpollServerSocketChannel.\u001b[0m\n", + "\u001b[34m2021-07-22 20:28:24,472 [INFO ] W-9000-model com.amazonaws.ml.mms.wlm.WorkerThread - Connecting to: /tmp/.mms.sock.9000\u001b[0m\n", + "\u001b[34m2021-07-22 20:28:24,476 [INFO ] W-9000-model com.amazonaws.ml.mms.wlm.WorkerThread - Connecting to: /tmp/.mms.sock.9000\u001b[0m\n", + "\u001b[34m2021-07-22 20:28:24,536 [INFO ] main com.amazonaws.ml.mms.ModelServer - Inference API bind to: http://0.0.0.0:8080\u001b[0m\n", + "\u001b[34mModel server started.\u001b[0m\n", + "\u001b[34m2021-07-22 20:28:24,555 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Connection accepted: /tmp/.mms.sock.9000.\u001b[0m\n", + "\u001b[34m2021-07-22 20:28:24,555 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Connection accepted: /tmp/.mms.sock.9000.\u001b[0m\n", + "\u001b[34m2021-07-22 20:28:24,567 [WARN ] pool-2-thread-1 com.amazonaws.ml.mms.metrics.MetricCollector - worker pid is not available yet.\u001b[0m\n", + "\u001b[35m2021-07-22 20:28:24,454 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Listening on port: /tmp/.mms.sock.9000\u001b[0m\n", + "\u001b[35m2021-07-22 20:28:24,454 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - [PID] 24\u001b[0m\n", + "\u001b[35m2021-07-22 20:28:24,455 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - MMS worker started.\u001b[0m\n", + "\u001b[35m2021-07-22 20:28:24,455 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Python runtime: 3.8.10\u001b[0m\n", + "\u001b[35m2021-07-22 20:28:24,456 [INFO ] main com.amazonaws.ml.mms.wlm.ModelManager - Model model loaded.\u001b[0m\n", + "\u001b[35m2021-07-22 20:28:24,460 [INFO ] main com.amazonaws.ml.mms.ModelServer - Initialize Inference server with: EpollServerSocketChannel.\u001b[0m\n", + "\u001b[35m2021-07-22 20:28:24,472 [INFO ] W-9000-model com.amazonaws.ml.mms.wlm.WorkerThread - Connecting to: /tmp/.mms.sock.9000\u001b[0m\n", + "\u001b[35m2021-07-22 20:28:24,476 [INFO ] W-9000-model com.amazonaws.ml.mms.wlm.WorkerThread - Connecting to: /tmp/.mms.sock.9000\u001b[0m\n", + "\u001b[35m2021-07-22 20:28:24,536 [INFO ] main com.amazonaws.ml.mms.ModelServer - Inference API bind to: http://0.0.0.0:8080\u001b[0m\n", + "\u001b[35mModel server started.\u001b[0m\n", + "\u001b[35m2021-07-22 20:28:24,555 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Connection accepted: /tmp/.mms.sock.9000.\u001b[0m\n", + "\u001b[35m2021-07-22 20:28:24,555 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Connection accepted: /tmp/.mms.sock.9000.\u001b[0m\n", + 
"\u001b[35m2021-07-22 20:28:24,567 [WARN ] pool-2-thread-1 com.amazonaws.ml.mms.metrics.MetricCollector - worker pid is not available yet.\u001b[0m\n", + "\u001b[34m2021-07-22 20:28:27,441 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - generated new fontManager\u001b[0m\n", + "\u001b[34m2021-07-22 20:28:27,450 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - generated new fontManager\u001b[0m\n", + "\u001b[34m2021-07-22 20:28:27,839 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Loading the model\u001b[0m\n", + "\u001b[34m2021-07-22 20:28:27,854 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Loading the model\u001b[0m\n", + "\u001b[35m2021-07-22 20:28:27,441 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - generated new fontManager\u001b[0m\n", + "\u001b[35m2021-07-22 20:28:27,450 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - generated new fontManager\u001b[0m\n", + "\u001b[35m2021-07-22 20:28:27,839 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Loading the model\u001b[0m\n", + "\u001b[35m2021-07-22 20:28:27,854 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Loading the model\u001b[0m\n", + "\u001b[34m2021-07-22 20:28:27,886 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Model model loaded io_fd=0242a9fffefeff83-0000000a-00000000-2860f330bbe7ac20-d219266e\u001b[0m\n", + "\u001b[34m2021-07-22 20:28:27,898 [INFO ] W-9000-model com.amazonaws.ml.mms.wlm.WorkerThread - Backend response time: 3268\u001b[0m\n", + "\u001b[34m2021-07-22 20:28:27,900 [WARN ] W-9000-model com.amazonaws.ml.mms.wlm.WorkerLifeCycle - attachIOStreams() threadName=W-model-1\u001b[0m\n", + "\u001b[34m2021-07-22 20:28:27,916 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Model model loaded io_fd=0242a9fffefeff83-0000000a-00000001-9aea1030bbe7ac23-7076a78a\u001b[0m\n", + "\u001b[34m2021-07-22 20:28:27,916 [INFO ] W-9000-model com.amazonaws.ml.mms.wlm.WorkerThread - Backend response time: 3285\u001b[0m\n", + "\u001b[34m2021-07-22 20:28:27,916 [WARN ] W-9000-model com.amazonaws.ml.mms.wlm.WorkerLifeCycle - attachIOStreams() threadName=W-model-2\u001b[0m\n", + "\u001b[35m2021-07-22 20:28:27,886 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Model model loaded io_fd=0242a9fffefeff83-0000000a-00000000-2860f330bbe7ac20-d219266e\u001b[0m\n", + "\u001b[35m2021-07-22 20:28:27,898 [INFO ] W-9000-model com.amazonaws.ml.mms.wlm.WorkerThread - Backend response time: 3268\u001b[0m\n", + "\u001b[35m2021-07-22 20:28:27,900 [WARN ] W-9000-model com.amazonaws.ml.mms.wlm.WorkerLifeCycle - attachIOStreams() threadName=W-model-1\u001b[0m\n", + "\u001b[35m2021-07-22 20:28:27,916 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Model model loaded io_fd=0242a9fffefeff83-0000000a-00000001-9aea1030bbe7ac23-7076a78a\u001b[0m\n", + "\u001b[35m2021-07-22 20:28:27,916 [INFO ] W-9000-model com.amazonaws.ml.mms.wlm.WorkerThread - Backend response time: 3285\u001b[0m\n", + "\u001b[35m2021-07-22 20:28:27,916 [WARN ] W-9000-model com.amazonaws.ml.mms.wlm.WorkerLifeCycle - attachIOStreams() threadName=W-model-2\u001b[0m\n", + "\u001b[34m2021-07-22 20:28:31,830 [INFO ] pool-1-thread-4 ACCESS_LOG - /169.254.255.130:60460 \"GET /ping HTTP/1.1\" 200 15\u001b[0m\n", + "\u001b[34m2021-07-22 20:28:31,840 [INFO ] epollEventLoopGroup-3-2 ACCESS_LOG - /169.254.255.130:60464 \"GET 
/execution-parameters HTTP/1.1\" 404 1\u001b[0m\n", + "\u001b[34m2021-07-22 20:28:31,965 [INFO ] W-model-1-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Predicting...\u001b[0m\n", + "\u001b[34m2021-07-22 20:28:31,981 [INFO ] W-model-1-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Prediction Complete\u001b[0m\n", + "\u001b[34m2021-07-22 20:28:31,983 [INFO ] W-model-1-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Saving\u001b[0m\n", + "\u001b[34m2021-07-22 20:28:31,985 [INFO ] W-9000-model com.amazonaws.ml.mms.wlm.WorkerThread - Backend response time: 26\u001b[0m\n", + "\u001b[34m2021-07-22 20:28:31,986 [INFO ] W-9000-model ACCESS_LOG - /169.254.255.130:60468 \"POST /invocations HTTP/1.1\" 200 30\u001b[0m\n", + "\u001b[35m2021-07-22 20:28:31,830 [INFO ] pool-1-thread-4 ACCESS_LOG - /169.254.255.130:60460 \"GET /ping HTTP/1.1\" 200 15\u001b[0m\n", + "\u001b[35m2021-07-22 20:28:31,840 [INFO ] epollEventLoopGroup-3-2 ACCESS_LOG - /169.254.255.130:60464 \"GET /execution-parameters HTTP/1.1\" 404 1\u001b[0m\n", + "\u001b[35m2021-07-22 20:28:31,965 [INFO ] W-model-1-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Predicting...\u001b[0m\n", + "\u001b[35m2021-07-22 20:28:31,981 [INFO ] W-model-1-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Prediction Complete\u001b[0m\n", + "\u001b[35m2021-07-22 20:28:31,983 [INFO ] W-model-1-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Saving\u001b[0m\n", + "\u001b[35m2021-07-22 20:28:31,985 [INFO ] W-9000-model com.amazonaws.ml.mms.wlm.WorkerThread - Backend response time: 26\u001b[0m\n", + "\u001b[35m2021-07-22 20:28:31,986 [INFO ] W-9000-model ACCESS_LOG - /169.254.255.130:60468 \"POST /invocations HTTP/1.1\" 200 30\u001b[0m\n", + "\u001b[32m2021-07-22T20:28:31.846:[sagemaker logs]: MaxConcurrentTransforms=1, MaxPayloadInMB=6, BatchStrategy=MULTI_RECORD\u001b[0m\n", + "\n", + "CPU times: user 602 ms, sys: 31.4 ms, total: 634 ms\n", + "Wall time: 4min 43s\n" + ] + } + ], + "source": [ + "%%time\n", + "# Predicts the data\n", + "transformer.transform(data=input_path, data_type='S3Prefix', content_type='text/csv', split_type='Line')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "79b282ec", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "conda_python3", + "language": "python", + "name": "conda_python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/4_Sagemaker_StepFunctions_Train.ipynb b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/4_Sagemaker_StepFunctions_Train.ipynb new file mode 100644 index 0000000..a4c655a --- /dev/null +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/4_Sagemaker_StepFunctions_Train.ipynb @@ -0,0 +1,540 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Build machine learning workflow to train a model with Amazon SageMaker and AWS Step Functions" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This script creates a Step Function state machine to preprocess the training data and train a model with the images in ECR." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Import modules" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import uuid\n", + "import boto3\n", + "import sagemaker\n", + "from sagemaker import get_execution_role\n", + "from sagemaker.processing import Processor, ProcessingInput, ProcessingOutput\n", + "import stepfunctions\n", + "from stepfunctions.inputs import ExecutionInput\n", + "from stepfunctions.workflow import Workflow\n", + "from stepfunctions.steps import (\n", + " TrainingStep, \n", + " Chain,\n", + " ProcessingStep,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Modify according to your configurations." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# Bucket name in S3\n", + "bucket = \"hermione-sagemaker\"" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# Set session\n", + "region_name=\"us-east-1\"\n", + "boto3.setup_default_session(region_name=region_name)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# Get user role\n", + "role = get_execution_role()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# Role to create and execute step functions\n", + "# paste the AmazonSageMaker-StepFunctionsWorkflowExecutionRole ARN\n", + "workflow_execution_role = \"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# SageMaker expects unique names for each job, model and endpoint.\n", + "# Otherwise, the execution will fail. 
The ExecutionInput creates\n", + "# dynamically names for each execution.\n", + "execution_input = ExecutionInput(\n", + " schema={\n", + " \"PreprocessingJobName\": str,\n", + " \"TrainingJobName\": str\n", + " }\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "# Get AWS Account ID\n", + "account_number = boto3.client(\"sts\").get_caller_identity()[\"Account\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "# Processor image name previous uploaded in ECR\n", + "image_name_processor = \"hermione-processor\"" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "# Training image name previous uploaded in ECR\n", + "image_name_train = \"hermione-train\"" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "# Input and output paths to execute\n", + "paths = {\n", + " 'train_raw': f\"s3://{bucket}/TRAIN_RAW\",\n", + " 'expectations': f\"s3://{bucket}/PREPROCESSING/EXPECTATIONS\",\n", + " 'preprocessing': f\"s3://{bucket}/PREPROCESSING/PREPROCESSING\",\n", + " 'train_processed': f\"s3://{bucket}/PREPROCESSING/TRAIN_PROCESSED\",\n", + " 'val_processed': f\"s3://{bucket}/PREPROCESSING/VAL_PROCESSED\",\n", + " 'model': f\"s3://{bucket}/PREPROCESSING/MODEL\"\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "# instance to run the code\n", + "instance_type_preprocessing=\"ml.t3.medium\"\n", + "instance_type_train=\"ml.m5.large\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Preprocessing Step" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "# Processor image previous uploaded in ECR\n", + "image_uri_processor = f\"{account_number}.dkr.ecr.{region_name}.amazonaws.com/{image_name_processor}\"" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "# Creates the processor to access the ECR image\n", + "processor = Processor(image_uri=image_uri_processor,\n", + " role=role,\n", + " instance_count=1,\n", + " instance_type=instance_type_preprocessing)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "# Creates input and output objects for ProcessingStep\n", + "inputs=[\n", + " ProcessingInput(source=paths['train_raw'], \n", + " destination='/opt/ml/processing/input/raw_data', \n", + " input_name=\"raw_data\")\n", + "]\n", + "outputs = [\n", + " ProcessingOutput(\n", + " source=\"/opt/ml/processing/output/expectations\",\n", + " destination=paths['expectations'],\n", + " output_name=\"expectations\",\n", + " ),\n", + " ProcessingOutput(\n", + " source=\"/opt/ml/processing/output/preprocessing\",\n", + " destination=paths['preprocessing'],\n", + " output_name=\"preprocessing\",\n", + " ),\n", + " ProcessingOutput(\n", + " source=\"/opt/ml/processing/output/processed/train\",\n", + " destination=paths['train_processed'],\n", + " output_name=\"train_data\",\n", + " ),\n", + " ProcessingOutput(\n", + " source=\"/opt/ml/processing/output/processed/val\",\n", + " destination=paths['val_processed'],\n", + " output_name=\"val_data\",\n", + " )\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "# Creates 
the ProcessingStep\n",
+    "processing_step = ProcessingStep(\n",
+    "    \"Preprocessing step\",\n",
+    "    processor=processor,\n",
+    "    job_name=execution_input[\"PreprocessingJobName\"],\n",
+    "    inputs=inputs,\n",
+    "    outputs=outputs,\n",
+    "    container_arguments=[\"--step\", \"train\"]\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## TrainingStep"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Training image previous uploaded in ECR\n",
+    "image_uri_train = f\"{account_number}.dkr.ecr.{region_name}.amazonaws.com/{image_name_train}\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Creates input and output objects for TrainingStep\n",
+    "train_config = sagemaker.inputs.TrainingInput(\n",
+    "    paths['train_processed'],\n",
+    "    content_type='text/csv',\n",
+    ")\n",
+    "val_config = sagemaker.inputs.TrainingInput(\n",
+    "    paths['val_processed'],\n",
+    "    content_type='text/csv'\n",
+    ")\n",
+    "output_path = paths['model']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Creates the estimator to access the ECR image\n",
+    "est = sagemaker.estimator.Estimator(\n",
+    "    image_uri_train,\n",
+    "    role, \n",
+    "    instance_count=1, \n",
+    "    instance_type=instance_type_train,\n",
+    "    volume_size = 30,\n",
+    "    output_path = output_path,\n",
+    "    base_job_name = \"Hermione-Train\",\n",
+    "    use_spot_instances=True,  # Use SPOT instances\n",
+    "    max_run = 24*60*60,\n",
+    "    max_wait = 24*60*60       # timeout in seconds. Required if use_spot_instances == True\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Creates the TrainingStep\n",
+    "training_step = TrainingStep(\n",
+    "    'TrainStep',\n",
+    "    estimator=est,\n",
+    "    data={\n",
+    "        'train': train_config,\n",
+    "        'validation': val_config\n",
+    "    }, \n",
+    "    job_name=execution_input[\"TrainingJobName\"] \n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Create Workflow and Execute"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Creates Fail state to mark the workflow failed in case any of the steps fail.\n",
+    "failed_state_sagemaker_processing_failure = stepfunctions.steps.states.Fail(\n",
+    "    \"ML Workflow failed\", cause=\"SageMakerProcessingJobFailed\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Adds the Error handling in the workflow\n",
+    "catch_state_processing = stepfunctions.steps.states.Catch(\n",
+    "    error_equals=[\"States.TaskFailed\"],\n",
+    "    next_step=failed_state_sagemaker_processing_failure,\n",
+    ")\n",
+    "\n",
+    "processing_step.add_catch(catch_state_processing)\n",
+    "training_step.add_catch(catch_state_processing)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Creates workflow with Pre-Processing Job and Training Job\n",
+    "workflow_graph = Chain([processing_step, training_step])\n",
+    "branching_workflow = Workflow(\n",
+    "    name=\"SFN_Hermione_Train\",\n",
+    "    definition=workflow_graph,\n",
+    "    role=workflow_execution_role,\n",
+    ")\n",
+    "branching_workflow.create()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# 
Generates unique names for Pre-Processing Job and Training Job\n", + "# Each job requires a unique name\n", + "preprocessing_job_name = \"Hermione-Preprocessing-{}\".format(\n", + " uuid.uuid1().hex\n", + ") \n", + "training_job_name = \"Hermione-Training-{}\".format(\n", + " uuid.uuid1().hex\n", + ") " + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + " \n", + " \n", + "
\n", + "
    \n", + "
  • \n", + "
    \n", + " Success\n", + "
  • \n", + "
  • \n", + "
    \n", + " Failed\n", + "
  • \n", + "
  • \n", + "
    \n", + " Cancelled\n", + "
  • \n", + "
  • \n", + "
    \n", + " In Progress\n", + "
  • \n", + "
  • \n", + "
    \n", + " Caught Error\n", + "
  • \n", + "
\n", + "
\n", + "\n", + " \n", + " Inspect in AWS Step Functions \n", + "
\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Executes the workflow\n", + "execution = branching_workflow.execute(\n", + " inputs={\n", + " \"PreprocessingJobName\": preprocessing_job_name,\n", + " \"TrainingJobName\": training_job_name\n", + " }\n", + ")\n", + "execution_output = execution.get_output(wait=False)\n", + "execution.render_progress()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "conda_python3", + "language": "python", + "name": "conda_python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.13" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/5_Sagemaker_StepFunctions_Inference.ipynb b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/5_Sagemaker_StepFunctions_Inference.ipynb new file mode 100644 index 0000000..1c9af76 --- /dev/null +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/5_Sagemaker_StepFunctions_Inference.ipynb @@ -0,0 +1,737 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Build machine learning workflow to predict new data with Amazon SageMaker and AWS Step Functions" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This script creates a Step Function state machine to preprocess the inference data and predict with the images in ECR." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Import modules" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import uuid\n", + "import boto3\n", + "import sagemaker\n", + "from sagemaker.amazon.amazon_estimator import get_image_uri\n", + "from sagemaker.s3 import S3Uploader\n", + "from sagemaker import get_execution_role\n", + "from sagemaker.sklearn.processing import SKLearnProcessor\n", + "from sagemaker.processing import Processor, ProcessingInput, ProcessingOutput\n", + "import stepfunctions\n", + "from stepfunctions.steps import (\n", + " Chain,\n", + " ProcessingStep,\n", + " TransformStep\n", + ")\n", + "from stepfunctions.inputs import ExecutionInput\n", + "from stepfunctions.workflow import Workflow" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Modify according to your configurations." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# Bucket name in S3\n", + "bucket = \"hermione-sagemaker\"" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# Set session\n", + "region_name=\"us-east-1\"\n", + "boto3.setup_default_session(region_name=region_name)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# Get user role\n", + "role = get_execution_role()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# Role to create and execute step functions\n", + "# paste the AmazonSageMaker-StepFunctionsWorkflowExecutionRole ARN\n", + "workflow_execution_role = \"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# SageMaker expects unique names for each job, model and endpoint.\n", + "# Otherwise, the execution will fail. The ExecutionInput creates\n", + "# dynamically names for each execution.\n", + "execution_input = ExecutionInput(\n", + " schema={\n", + " \"PreprocessingJobName\": str,\n", + " \"TransformJobName\": str \n", + " }\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "# Get AWS Account ID\n", + "account_number = boto3.client(\"sts\").get_caller_identity()[\"Account\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "# Processor image name previous uploaded in ECR\n", + "image_name_processor = \"hermione-processor\"" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "# Inference image name previous uploaded in ECR\n", + "image_name_inference = \"hermione-inference\"" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "# Input and output paths to execute train and inference\n", + "paths = {\n", + " 'expectations': f\"s3://{bucket}/PREPROCESSING/EXPECTATIONS\",\n", + " 'preprocessing': f\"s3://{bucket}/PREPROCESSING/PREPROCESSING\",\n", + " 'test_raw': f\"s3://{bucket}/TEST_RAW\",\n", + " 'inference_processed': f\"s3://{bucket}/PREPROCESSING/INFERENCE_PROCESSED\",\n", + " 'validations': f\"s3://{bucket}/PREPROCESSING/VALIDATIONS\",\n", + " 'model': f\"s3://{bucket}/PREPROCESSING/MODEL/Hermione-train-2021-05-26-12-41-29-505/output/model.tar.gz\",\n", + " 'output_path': f\"s3://{bucket}/PREPROCESSING/OUTPUT\"\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "# instance to run the code\n", + "instance_type_preprocessing=\"ml.t3.medium\"\n", + "instance_type_inference=\"ml.m5.large\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Preprocessing Step" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "# Processor image previous uploaded in ECR\n", + "image_uri_processor = f\"{account_number}.dkr.ecr.{region_name}.amazonaws.com/{image_name_processor}\"" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "# Creates the processor to access the ECR image\n", + "processor = Processor(image_uri=image_uri_processor,\n", + " role=role,\n", + " instance_count=1,\n", + " 
instance_type=instance_type_preprocessing)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "# Creates input and output objects for ProcessingStep\n", + "inputs=[\n", + " ProcessingInput(source=paths['test_raw'],\n", + " destination='/opt/ml/processing/input/raw_data', \n", + " input_name='raw_data'),\n", + " ProcessingInput(source=paths['preprocessing'], \n", + " destination='/opt/ml/processing/input/preprocessing', \n", + " input_name='preprocessing'),\n", + " ProcessingInput(source=paths['expectations'], \n", + " destination='/opt/ml/processing/input/expectations', \n", + " input_name='expectations')\n", + "]\n", + "outputs = [\n", + " ProcessingOutput(\n", + " source=\"/opt/ml/processing/output/processed/inference\",\n", + " destination=paths['inference_processed'],\n", + " output_name=\"inference_data\",\n", + " ),\n", + " ProcessingOutput(\n", + " source=\"/opt/ml/processing/output/validations\",\n", + " destination=paths['validations'],\n", + " output_name=\"validations\",\n", + " )\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "# Creates the ProcessingStep\n", + "processing_step = ProcessingStep(\n", + " \"SageMaker Preprocessing step\",\n", + " processor=processor,\n", + " job_name=execution_input[\"PreprocessingJobName\"],\n", + " inputs=inputs,\n", + " outputs=outputs,\n", + " container_arguments=[\"--step\", \"test\"]\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Inference Step" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "# Inference image previous uploaded in ECR\n", + "image_uri_inference = f\"{account_number}.dkr.ecr.{region_name}.amazonaws.com/{image_name_inference}\"" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "# Creates input and output objects for TransformStep\n", + "input_path = paths['inference_processed']\n", + "model_path = paths['model']\n", + "output_path = paths['output_path']" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "# Creates the model to access the ECR image\n", + "model = sagemaker.model.Model(\n", + " image_uri = image_uri_inference,\n", + " model_data=model_path,\n", + " role=role)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "# Creates a transformer object from the trained model\n", + "transformer = model.transformer(\n", + " instance_count=1,\n", + " instance_type=instance_type_inference, \n", + " output_path=output_path,\n", + " accept = 'text/csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "# Creates the TransformStep\n", + "transform_step = TransformStep(\n", + " \"Inference Step\",\n", + " transformer=transformer,\n", + " job_name=execution_input[\"TransformJobName\"],\n", + " data=input_path,\n", + " content_type='text/csv',\n", + " wait_for_completion=True,\n", + " model_name=model.name\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create Workflow and Execute" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "# Creates Fail state to mark the workflow failed in case any of the steps fail.\n", + 
"failed_state_sagemaker_processing_failure = stepfunctions.steps.states.Fail(\n", + " \"ML Workflow failed\", cause=\"SageMakerProcessingJobFailed\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "# Adds the Error handling in the workflow\n", + "catch_state_processing = stepfunctions.steps.states.Catch(\n", + " error_equals=[\"States.TaskFailed\"],\n", + " next_step=failed_state_sagemaker_processing_failure,\n", + ")\n", + "\n", + "processing_step.add_catch(catch_state_processing)\n", + "transform_step.add_catch(catch_state_processing)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Creates workflow with Pre-Processing Job and Transform Job\n", + "workflow_graph = Chain([processing_step, transform_step])\n", + "branching_workflow = Workflow(\n", + " name=\"SFN_Hermione_Inference\",\n", + " definition=workflow_graph,\n", + " role=workflow_execution_role,\n", + ")\n", + "branching_workflow.create()" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "# Generates unique names for Pre-Processing Job and Training Job\n", + "# Each job requires a unique name\n", + "preprocessing_job_name = \"Hermione-Preprocessing-{}\".format(\n", + " uuid.uuid1().hex\n", + ") \n", + "inference_job_name = \"Hermione-Inference-{}\".format(\n", + " uuid.uuid1().hex\n", + ") " + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + " \n", + " \n", + "
\n", + "
    \n", + "
  • \n", + "
    \n", + " Success\n", + "
  • \n", + "
  • \n", + "
    \n", + " Failed\n", + "
  • \n", + "
  • \n", + "
    \n", + " Cancelled\n", + "
  • \n", + "
  • \n", + "
    \n", + " In Progress\n", + "
  • \n", + "
  • \n", + "
    \n", + " Caught Error\n", + "
  • \n", + "
\n", + "
\n", + "\n", + " \n", + " Inspect in AWS Step Functions \n", + "
\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Executes the workflow\n", + "execution = branching_workflow.execute(\n", + " inputs={\n", + " \"PreprocessingJobName\": preprocessing_job_name,\n", + " \"TransformJobName\": inference_job_name\n", + " }\n", + ")\n", + "execution_output = execution.get_output(wait=False)\n", + "execution.render_progress()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Results" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SurvivedAgePclass_1Pclass_2Pclass_3Sex_1Sex_2predict
01.00.0072880.00.01.01.00.01.0
10.00.3717010.01.00.00.01.00.0
20.00.7612470.01.00.00.01.00.0
30.00.3340040.00.01.00.01.00.0
40.00.5727571.00.00.00.01.00.0
...........................
2170.00.2083440.00.01.00.01.00.0
2180.00.2334760.00.01.00.01.00.0
2190.00.0198540.00.01.01.00.01.0
2201.00.2209101.00.00.01.00.01.0
2211.00.6481530.01.00.01.00.01.0
\n", + "

222 rows × 8 columns

\n", + "
" + ], + "text/plain": [ + " Survived Age Pclass_1 Pclass_2 Pclass_3 Sex_1 Sex_2 predict\n", + "0 1.0 0.007288 0.0 0.0 1.0 1.0 0.0 1.0\n", + "1 0.0 0.371701 0.0 1.0 0.0 0.0 1.0 0.0\n", + "2 0.0 0.761247 0.0 1.0 0.0 0.0 1.0 0.0\n", + "3 0.0 0.334004 0.0 0.0 1.0 0.0 1.0 0.0\n", + "4 0.0 0.572757 1.0 0.0 0.0 0.0 1.0 0.0\n", + ".. ... ... ... ... ... ... ... ...\n", + "217 0.0 0.208344 0.0 0.0 1.0 0.0 1.0 0.0\n", + "218 0.0 0.233476 0.0 0.0 1.0 0.0 1.0 0.0\n", + "219 0.0 0.019854 0.0 0.0 1.0 1.0 0.0 1.0\n", + "220 1.0 0.220910 1.0 0.0 0.0 1.0 0.0 1.0\n", + "221 1.0 0.648153 0.0 1.0 0.0 1.0 0.0 1.0\n", + "\n", + "[222 rows x 8 columns]" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "pd.read_csv('s3://hermione-sagemaker/PREPROCESSING/OUTPUT/inference.csv.out')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "conda_python3", + "language": "python", + "name": "conda_python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.13" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/preprocessing/dataquality.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/preprocessing/dataquality.py new file mode 100644 index 0000000..68d8ad2 --- /dev/null +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/preprocessing/dataquality.py @@ -0,0 +1,61 @@ +import pandas as pd +import great_expectations as ge + +class DataQuality: + """ + Class to perform data quality before training + """ + def __init__(self, continuous_cols=None, discrete_cat_cols=None): + """ + Constructor + + Parameters + ---------- + continuous_cols : array + Receives an array with the name of the continuous columns + discrete_cat_cols : array + Receives an array with the name of the dicrete/categorical columns + Returns + ------- + DataQuality + """ + self.continuous_cols = continuous_cols + self.discrete_cat_cols = discrete_cat_cols + + def perform(self, df: pd.DataFrame, target=None, cut_off = 2): + """ + Perform data quality + + Parameters + ---------- + df : pd.Dataframe + Dataframe to be processed + + Returns + ------- + json + """ + if target != None: + df.drop(columns=[target], inplace=True) + df_ge = ge.dataset.PandasDataset(df) + cols = df_ge.columns + df_ge.expect_table_columns_to_match_ordered_list(cols) + for col in cols: + df_ge.expect_column_values_to_not_be_null(col) + cut_off = 2 + if self.continuous_cols != None: + for col in self.continuous_cols: + measures = df_ge[col].describe() + df_ge.expect_column_values_to_be_of_type(col, 'int64') + df_ge.expect_column_mean_to_be_between(col, measures['mean'] - cut_off * measures['std'], measures['mean'] + cut_off * measures['std']) + df_ge.expect_column_max_to_be_between(col, measures['max'] - cut_off * measures['std'], measures['max'] + cut_off * measures['std']) + df_ge.expect_column_min_to_be_between(col, measures['min'] - cut_off * measures['std'], measures['min'] + cut_off * measures['std']) + expected_partition = ge.dataset.util.continuous_partition_data(df_ge[col]) + df_ge.expect_column_bootstrapped_ks_test_p_value_to_be_greater_than(col, expected_partition) + if len(self.discrete_cat_cols) != None: + for col in self.discrete_cat_cols: + possible_cat = df_ge[col].unique() + 
df_ge.expect_column_values_to_be_in_set(col, possible_cat) + expected_partition = ge.dataset.util.categorical_partition_data(df_ge[col]) + df_ge.expect_column_chisquare_test_p_value_to_be_greater_than(col, expected_partition) + return df_ge \ No newline at end of file diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/preprocessing/normalization.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/preprocessing/normalization.py new file mode 100644 index 0000000..6d5822b --- /dev/null +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/preprocessing/normalization.py @@ -0,0 +1,159 @@ +import pandas as pd +import numpy as np +from sklearn.preprocessing import StandardScaler, normalize, MinMaxScaler +from scipy.stats import zscore + +class Normalizer: + + def __init__(self, norm_cols: dict): + """ + Constructor + + Parameters + ---------- + norm_cols : dict + Receives dict with the name of the normalization to be + performed and which are the columns + Ex: norm_cols = {'zscore': ['salary', 'price'], + 'min-max': ['heigth', 'age']} + + Returns + ------- + Normalization + """ + self.norm_cols = norm_cols + self.col_names = [name for norm in norm_cols for name in norm_cols[norm]] + self.norms = {'min-max': MinMaxScaler, + 'standard': StandardScaler} + self.fitted = False + + def statistics(self, df : pd.DataFrame): + """ + Calculates dataframe statistics + + Parameters + ---------- + df : dataframe to calculate the statistics for each column + + Returns + ------- + None + """ + zip_cols = lambda result: zip(result.index.values, result.values) + self.col_min = {col: value for col, value in zip_cols(df[self.col_names].min())} + self.col_max = {col: value for col, value in zip_cols(df[self.col_names].max())} + self.col_std = {col: value for col, value in zip_cols(df[self.col_names].std())} + self.col_mean = {col: value for col, value in zip_cols(df[self.col_names].mean())} + self.col_median = {col: value for col, value in zip_cols(df[self.col_names].median())} + + def __apply_func(self, X, normalization): + """ + Creates the normalization object + + Parameters + ---------- + X : array + Data to be normalized + normalization : Normalization + Normalization to be applied + + Returns + ------- + Normalization + """ + normalization.fit(X) + return normalization + + def fit(self, df: pd.DataFrame): + """ + Generates normalization object for each column + + Parameters + ---------- + df : pd.DataFrame + dataframe with columns to be normalized + + Returns + ------- + None + """ + self.statistics(df) + self.normalization = dict() + for norm in self.norm_cols: + if norm in ['zscore', 'log10']: + continue + for col in self.norm_cols[norm]: + self.normalization[col] = self.__apply_func(df[col].values.reshape(-1, 1), self.norms[norm]()) + self.fitted = True + + def transform(self, df: pd.DataFrame): + """ + Apply normalization to each column + + Parameters + ---------- + df : pd.DataFrame + dataframe with columns to be normalized + + Returns + ------- + pd.DataFrame + """ + if not self.fitted: + raise Exception("Not yet fitted.") + + for norm in self.norm_cols: + if norm == 'zscore': + for col in self.norm_cols[norm]: + df.loc[:,col] = (df[col].values - self.col_mean[col])/self.col_std[col] + elif norm == 'log10': + for col in self.norm_cols[norm]: + df.loc[:,col] = np.log10(df[col].values) + else: + for col in self.norm_cols[norm]: + df.loc[:,col] = self.normalization[col].transform(df[col].values.reshape(-1, 1)) + return df + + def inverse_transform(self, df: 
pd.DataFrame): + """ + Apply the denormalized to each column + + Parameters + ---------- + df : pd.DataFrame + dataframe with columns to be denormalized + + Returns + ------- + pd.DataFrame + """ + if not self.fitted: + raise Exception("Not yet trained.") + + for norm in self.norm_cols: + if norm == 'zscore': + for col in self.norm_cols[norm]: + df.loc[:,col] = df[col].apply(lambda z: self.col_std[col]*z + self.col_mean[col]) + elif norm == 'log10': + for col in self.norm_cols[norm]: + df.loc[:,col] = df[col].apply(lambda x: 10 ** x) + else: + for col in self.norm_cols[norm]: + df.loc[:,col] = self.normalization[col].inverse_transform(df[col].values.reshape(-1, 1)) + return df + + def fit_transform(self, df: pd.DataFrame): + """ + Creates object and apply it normalization + + Parameters + ---------- + df : pd.DataFrame + dataframe with columns to be normalized + + Returns + ------- + pd.DataFrame + """ + self.fit(df) + return self.transform(df) diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/preprocessing/preprocessing.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/preprocessing/preprocessing.py new file mode 100644 index 0000000..dea90fa --- /dev/null +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/preprocessing/preprocessing.py @@ -0,0 +1,141 @@ +import pandas as pd + +from ml.preprocessing.normalization import Normalizer +from sklearn.preprocessing import OneHotEncoder +from sklearn.model_selection import train_test_split +from category_encoders import * +import logging + +logging.getLogger().setLevel(logging.INFO) + +class Preprocessing: + """ + Class to perform data preprocessing before training + """ + + def __init__(self, norm_cols=None, oneHot_cols=None): + """ + Constructor + + Parameters + ---------- + norm_cols : dict + Receives dict with the name of the normalization to be + performed and which are the columns + Ex: norm_cols = {'zscore': ['salary', 'price'], + 'min-max': ['heigth', 'age']} + oneHot_cols : array + Receives an array with columns names to be categorized with One Hot Encoding + Returns + ------- + Preprocessing + """ + self.norm_cols = norm_cols + self.oneHot_cols = oneHot_cols + self.ohe = OneHotEncoder(handle_unknown='ignore') + + def clean_data(self, df: pd.DataFrame): + """ + Perform data cleansing. 
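+        Drops rows with missing values and casts 'Pclass' to object so it is treated as categorical.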
+ + Parameters + ---------- + df : pd.Dataframe + Dataframe to be processed + + Returns + ------- + pd.Dataframe + Cleaned Data Frame + """ + logging.info("Cleaning data") + df_copy = df.copy() + df_copy['Pclass'] = df_copy.Pclass.astype('object') + df_copy = df_copy.dropna() + return df_copy + + def categ_encoding_oneHot(self, df: pd.DataFrame, step_train = False): + """ + Perform encoding of the categorical variables using One Hot Encoding + + Parameters + ---------- + df : pd.Dataframe + Dataframe to be processed + step_train : bool + if True, the fit function is executed + + Returns + ------- + pd.Dataframe + Encoded Data Frame + """ + logging.info("One hot encoding") + df_copy = df.copy() + + if step_train: + self.ohe.fit(df_copy[self.oneHot_cols]) + + arr = self.ohe.transform(df_copy[self.oneHot_cols]) + df_copy = df_copy.join(arr).drop(self.oneHot_cols, axis=1) + return df_copy + + def normalize(self, df: pd.DataFrame, step_train = False): + """ + Apply normalization to the selected columns + + Parameters + ---------- + df : pd.DataFrame + dataframe with columns to be normalized + step_train : bool + if True, the Normalizer is created and applied, + otherwise it is only applied + + Returns + ------- + pd.DataFrame + Normalized dataframe + """ + logging.info("Normalizing") + if step_train: + self.norm = Normalizer(self.norm_cols) + df = self.norm.fit_transform(df) + else: + df = self.norm.transform(df.copy()) + return df + + def execute(self, df, step_train = False, val_size = 0.2): + """ + Apply all preprocessing steps on the Dataframe + + Parameters + ---------- + df : pd.DataFrame + dataframe with columns to be normalized + step_train : bool + if True, data is splited in train and val + step_train : val_size + Size of the validation dataset + + Returns + ------- + pd.DataFrame + - One Preprocessed dataframe, if step_train is False + - Two Preprocessed dataframes, if step_train is True + """ + df = self.clean_data(df) + df = self.categ_encoding_oneHot(df, step_train) + + if step_train: + logging.info("Divide train and test") + X_train, X_val = train_test_split(df, test_size=val_size, random_state=123) + X_train = self.normalize(X_train, step_train = True) + X_val = self.normalize(X_val, step_train = False) + logging.info(f"shape train {X_train.shape} val {X_val.shape}") + return X_train, X_val + else: + X = self.normalize(df, step_train = False) + logging.info(f"shape {X.shape}") + return X + diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/preprocessing/text_vectorizer.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/preprocessing/text_vectorizer.py new file mode 100644 index 0000000..674458e --- /dev/null +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/preprocessing/text_vectorizer.py @@ -0,0 +1,201 @@ +from sklearn.feature_extraction.text import CountVectorizer +from sklearn.feature_extraction.text import TfidfVectorizer +import numpy as np +import pandas as pd + +class TextVectorizer: + + def __init__(self, vectorizer_cols : dict, word2vec=None): + """ + Constructor + + Parameters + ---------- + vectorizer_cols : dict + Receives a dict with the name of the vectorizer to be + performed and which are the columns + Ex: vectorizer_cols = {'embedding_median': ['col'], + 'embedding_mean': ['col'], + 'tf_idf': ['col'], + 'bag_of_words' : [col]} + Returns + ------- + Normalization + """ + self.word2vec = word2vec + self.index_ini_fim = len(self.word2vec.index2word) if word2vec != None else 0 + self.vectorizer_cols = vectorizer_cols 
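+        # maps each vectorizer name to the method that fits it; the embedding
+        # options have no fit step and are handled directly in transform()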
+ self.vectorizer_vects = {'bag_of_words': self.bag_of_words, + 'tf_idf': self.tf_idf_vect} + self.fitted = False + + def fit(self, df: pd.DataFrame): + """ + Generates the vectorizer object for each column. The text must be preprocessed. + + Parameters + ---------- + df : pd.DataFrame + dataframe with columns to be vectorizer + + Returns + ------- + None + """ + self.vectorizers_fitted = dict() + for vectorizer in self.vectorizer_cols: + if vectorizer in ['index', 'embedding_median', 'embedding_mean']: + continue + for col in self.vectorizer_cols[vectorizer]: + self.vectorizers_fitted[vectorizer] = {} + self.vectorizers_fitted[vectorizer][col] = self.vectorizer_vects[vectorizer](df[col].values) + self.fitted = True + + def transform(self, df: pd.DataFrame): + """ + Apply the vectorizer object for each column. The text must be preprocessed. + + Parameters + ---------- + df : pd.DataFrame + dataframe with columns to be vectorizer + + Returns + ------- + pd.DataFrame + """ + if not self.fitted: + raise Exception("Not yet trained.") + + for vectorizer in self.vectorizer_cols: + if vectorizer == 'index': + for col in self.vectorizer_cols[vectorizer]: + df.loc[:, col+"_"+vectorizer] = df[col].apply(lambda x: self.embedding(x, 3)) + elif vectorizer == 'embedding_median': + for col in self.vectorizer_cols[vectorizer]: + df.loc[:, col+"_"+vectorizer] = df[col].apply(lambda x: self.embedding(x, 1)) + elif vectorizer == 'embedding_mean': + for col in self.vectorizer_cols[vectorizer]: + df.loc[:, col+"_"+vectorizer] = df[col].apply(lambda x: self.embedding(x, 2)) + elif (vectorizer == 'bag_of_words') | (vectorizer == 'tf_idf'): + for col in self.vectorizer_cols[vectorizer]: + values = self.vectorizers_fitted[vectorizer][col].transform(df[col]) + df.loc[:,col+"_"+vectorizer] = pd.Series(values.toarray().tolist()) + + return df + + def embedding(self, X, typ_transform=1): + """ + Apply the embedding in X. The text must be preprocessed. 
+
+        Parameters
+        ----------
+        X : pd.Series
+            row to be encoded
+        typ_transform : int
+            type of transformation
+            1 - apply embedding median
+            2 - apply embedding mean
+            3 - apply index
+
+        Returns
+        -------
+        pd.DataFrame
+        """
+        if X is None or type(X) == float:
+            return None
+        vector = []
+        if typ_transform == 1: # median
+            vector = np.median([self.word2vec[x] for x in X.split() if x in self.word2vec], axis=0)
+        elif typ_transform == 2: # mean
+            vector = np.mean([self.word2vec[x] for x in X.split() if x in self.word2vec], axis=0)#[0]
+        elif typ_transform == 3: # indexing
+            idx = self.word2vec.index2word
+            set_idx = set(idx)
+            indexes = [idx.index(token) for token in X.split() if token in set_idx]
+            indexes = [self.index_ini_fim] + indexes + [self.index_ini_fim]
+            # Create vector
+            X_length = len(indexes)
+            vector = np.zeros(X_length, dtype=np.int64)
+            vector[:len(indexes)] = indexes
+        else:
+            vector = []
+        return vector
+
+    def bag_of_words(self, corpus):
+        """
+        Generate object bag of words
+
+        Parameters
+        ----------
+        corpus : str
+            text to generate object bag of words
+        Returns
+        -------
+        model
+        """
+        vectorizer = CountVectorizer()
+        model = vectorizer.fit(corpus)
+        return model
+
+    def tf_idf_vect(self, corpus):
+        """
+        Generate object tf idf
+
+        Parameters
+        ----------
+        corpus : str
+            text to generate object tf idf
+        Returns
+        -------
+        model
+        """
+        vectorizer = TfidfVectorizer()
+        model = vectorizer.fit(corpus)
+        return model
+
+    def inverse_transform(self, df: pd.DataFrame):
+        """
+        Apply the inverse_transform of vectorizer to each column
+        Options: index, bag_of_words and tf_idf
+
+        Parameters
+        ----------
+        df : pd.DataFrame
+            dataframe with columns to be unvectorized
+
+        Returns
+        -------
+        pd.DataFrame
+        """
+        if not self.fitted:
+            raise Exception("Not yet trained.")
+
+        for vectorizer in self.vectorizer_cols:
+            if vectorizer == 'index':
+                for col in self.vectorizer_cols[vectorizer]:
+                    df.loc[:, col+"_remove_"+vectorizer] = df[col].apply(lambda x: self.unvectorize(x))
+            elif (vectorizer == 'bag_of_words') | (vectorizer == 'tf_idf'):
+                for col in self.vectorizer_cols[vectorizer]:
+                    values = self.vectorizers_fitted[vectorizer][col].inverse_transform(df[col])
+                    df.loc[:,col+"_remove_"+vectorizer] = pd.Series(values.toarray().tolist())
+
+        return df
+
+    def unvectorize(self, vector):
+        """
+        Apply unvectorize in vector index
+
+        Parameters
+        ----------
+        vector : array
+            array with index
+
+        Returns
+        -------
+        array
+        """
+        idx = self.word2vec.index2word
+        tokens = [idx[index] for index in vector if index != self.index_ini_fim]
+        X = " ".join(token for token in tokens)
+        return X
\ No newline at end of file
diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/visualization/app-streamlit-titanict.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/visualization/app-streamlit-titanict.py
new file mode 100644
index 0000000..727aca8
--- /dev/null
+++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/visualization/app-streamlit-titanict.py
@@ -0,0 +1,84 @@
+import streamlit as st
+from vega_datasets import data
+import pandas as pd
+import altair as alt
+import sys
+from pandas_profiling import ProfileReport
+from streamlit_pandas_profiling import st_profile_report
+
+@st.cache
+def load_data():
+    dataframe = pd.read_csv("../../../data/raw/train.csv")
+    dataframe["Survived"] = dataframe["Survived"].replace([0,1],["Died", "Survived"])
+    return dataframe
+
+def instructions():
+
+    st.markdown(
+        """
+        Sample streamlit page using the 
Titanic dataset. + This library is interesting for presenting the results and generating a web page. + + + ### Questions? + + Streamlit community -> https://discuss.streamlit.io + """) + +def dataset_analysis(df): + survived = ["All"] + survived.extend(df["Survived"].unique()) + + selected = st.selectbox("Survived:", survived) + if selected == "All": + st.write('## Dataset Titanic', df) + else: + st.write('## Dataset Titanic', df[df["Survived"] == selected]) + + if st.checkbox("Graphical Display", False): + st.subheader("Dataset Graphical Display") + + st.altair_chart(alt.Chart(df).mark_circle().encode( + alt.X('Age', scale=alt.Scale(zero=False)), + alt.Y('Fare', scale=alt.Scale(zero=False, padding=1)), + color='Survived', + size='Pclass', + tooltip=['Age','Survived', 'Sex', 'Pclass'], + ).interactive(), use_container_width=True) + if st.checkbox("Show Summary", False): + st.write(df.describe()) + + +def profilling_analysis(df): + try: + pr = ProfileReport(df, explorative=True) + st.title("Pandas Profiling in Streamlit") + st.write(df) + st_profile_report(pr) + except: + st.title("Error - Pandas profiling was not generated") + + +def main(): + st.title("Titanic Dataset") + + df = load_data() + + st.sidebar.title("What to do") + menu = ["Instructions", "DataSet Exploration - Profilling", "DataSet Exploration - General"] + app_mode = st.sidebar.selectbox("Select an option:", + menu) + if app_mode == menu[0]: + st.sidebar.success('Next "'+menu[1]+'".') + instructions() + elif app_mode == menu[1]: + st.sidebar.success('Next "'+menu[2]+'".') + profilling_analysis(df) + elif app_mode == menu[2]: + #st.sidebar.success('Para continuar selecione "'+menu[3]+'".') + dataset_analysis(df) + + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/visualization/visualization.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/visualization/visualization.py new file mode 100644 index 0000000..a516e9d --- /dev/null +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/visualization/visualization.py @@ -0,0 +1,428 @@ +import pandas as pd +import seaborn as sns +import matplotlib.pyplot as plt +import altair as alt +import numpy as np +from yellowbrick.target import FeatureCorrelation + +class Visualization: + + @staticmethod + def general_analysis(df): + """ + Plot function general analysis of graphs + + Parameters + ---------- + df : pd.DataFrame + dataframe to be analyzed + + Returns + ---- + None + """ + pass + + @staticmethod + def missing_analysis(df): + """ + Function plots the percentage of missings in all columns of the DataFrame + + Parameters + ---------- + df : pd.DataFrame + dataframe on which the missing will be analyzed + + Returns + ------- + None + """ + df_isnull = (df.isnull().sum() / len(df))*100 + df_isnull = df_isnull.drop(df_isnull[df_isnull ==0].index).sort_values(ascending = False) + missing_data = pd.DataFrame({'Percentual Missing': df_isnull}) + missing_data.plot.bar() + + @staticmethod + def count_values(df, feature, title): + """ + Plot of count of distinct values ​​of a feature + + Parameters + ---------- + df : pd.DataFrame + dataframe with the values + feature : str + name of the feature to be counted + title : str + chart title + + Returns + ---- + None + """ + g = sns.catplot(feature, data=df, aspect=4, kind="count") + g.set_xticklabels(rotation=90) + g = plt.title(title) + + @staticmethod + def regression_analysis( y_true, y_pred, path=None): + """ + Analysis of the 
real and predicted y of the regression model + + Parameters + ---------- + y_true : array + true values + y_pred : array + predicted values + path : str + path where the graphics will be saved + + Returns + ------- + None + """ + residual = y_true - y_pred + print("Histogram") + Visualization.histogram(residual, "Residual") + print("Scatter") + Visualization.scatter(y_pred, residual, "pred", "residual", path=path) + print("Scatter") + Visualization.scatter(y_true, y_pred, "y_test", "pred", path=path) + + @staticmethod + def histogram(values, title, fig_size=(4,3), path=None): + """ + Histogram plot of a set of values + + Parameters + ---------- + values : array + values + title : str + title + fig_size : tuple + figure size + path : str + path where the graphics will be saved + + Returns + ------- + None + """ + plt.clf() + f, ax = plt.subplots(1, figsize=fig_size) + ax.hist(values, bins=60) + ax.set_title(title) + f.tight_layout() + if(path != None): + f.savefig(path+'/hist_'+title+'.png') + + + @staticmethod + def correlation_analysis(df, fig_size=(5,4), path=None): + """ + Correlation of variables in the dataframe + + Parameters + ---------- + df : pd.DataFrame + dataframe + fig_size : tuple + figure size + path : str + path where the graphics will be saved + + Returns + ------- + None + """ + plt.clf() + f, ax = plt.subplots(1, figsize=fig_size) + corr = round(df.corr(), 4) + sns.heatmap(corr, + xticklabels=corr.columns.values, + yticklabels=corr.columns.values, ax=ax) + f.tight_layout() + if(path != None): + f.savefig(path+'/correlation.png') + + @staticmethod + def features_correlation(df, cols, target, fig_size=(6,6), path=None): + """ + Correlation of variables in the dataframe with respect to the target + + Parameters + ---------- + df : pd.Dataframe + dataframe with the data to calculate the correlation + cols : array + columns to be correlated with the target + target : str + target name + fig_size : tuple + figure size + path : str + path where the graphics will be saved + + Returns + ------- + None + """ + f, ax = plt.subplots(1, figsize=fig_size) + ax.set_xlabel("Feature Correlation") + visualizer = FeatureCorrelation(labels=list(cols)) + visualizer.fit(df[cols], df[target]) + f.tight_layout() + if(path != None): + f.savefig(path+'/features_correlation.png') + + @staticmethod + def scatter(x, y, xlabel, ylabel, fig_size=(5,4), groups=None, group_color=None, path=None): + """ + Plot scatter + + Parameters + ---------- + x : array + list of x-axis values + y : array + list of y-axis values + x_label : str + label x + y_label : array + label y + fig_size : tuple + figure size + groups : array + group list + group_color : dict + group colors + path : str + path where the graphics will be saved + + Returns + ------- + None + """ + f, ax = plt.subplots(1, figsize=fig_size) + sns.scatterplot(x, y, hue=groups, palette=group_color, legend="full", ax=ax) + ax.set_xlabel(xlabel) + ax.set_ylabel(ylabel) + f.tight_layout() + if(path != None): + f.savefig(path+'/scatter_'+xlabel+'_'+ylabel+'.png') + + @staticmethod + def bar(x, y, xlabel, ylabel, fig_size=(5,4), est=np.mean, groups=None, group_color=None, path=None): + """ + Plot bar + + Parameters + ---------- + x : array + list of x-axis values + y : array + list of y-axis values + x_label : str + label x + y_label : array + label y + fig_size : tuple + figure size + est : np + numpy function for aggregating the bars + groups : array + group list + group_color : dict + group colors + path : str + path where the graphics will be saved + + 
+        Returns
+        -------
+        None
+        """
+        f, ax = plt.subplots(1, figsize=fig_size)
+        sns.barplot(x=x, y=y, ax=ax, hue=groups, estimator=est, color=group_color)
+        ax.set_xlabel(xlabel)
+        ax.set_ylabel(ylabel)
+        f.tight_layout()
+        if path is not None:
+            f.savefig(path+'/bar_'+xlabel+'_'+ylabel+'.png')
+
+    @staticmethod
+    def line(x, y, xlabel, ylabel, fig_size=(5,4), est=np.mean, groups=None, group_color=None, path=None):
+        """
+        Line plot
+
+        Parameters
+        ----------
+        x : array
+            list of x-axis values
+        y : array
+            list of y-axis values
+        xlabel : str
+            x-axis label
+        ylabel : str
+            y-axis label
+        fig_size : tuple
+            figure size
+        est : callable
+            numpy function used to aggregate the values (default np.mean)
+        groups : array
+            group list
+        group_color : dict
+            group colors
+        path : str
+            path where the graphics will be saved
+
+        Returns
+        -------
+        None
+        """
+        f, ax = plt.subplots(1, figsize=fig_size)
+        sns.lineplot(x=x, y=y, hue=groups, estimator=est, color=group_color, ax=ax)
+        ax.set_xlabel(xlabel)
+        ax.set_ylabel(ylabel)
+        f.tight_layout()
+        if path is not None:
+            f.savefig(path+'/line_'+xlabel+'_'+ylabel+'.png')
+
+    @staticmethod
+    def box_plot(x, y, xlabel, ylabel, fig_size=(5,4), path=None):
+        """
+        Box plot
+
+        Parameters
+        ----------
+        x : array
+            list of x-axis values
+        y : array
+            list of y-axis values
+        xlabel : str
+            x-axis label
+        ylabel : str
+            y-axis label
+        fig_size : tuple
+            figure size
+        path : str
+            path where the graphics will be saved
+
+        Returns
+        -------
+        None
+        """
+        f, ax = plt.subplots(1, figsize=fig_size)
+        sns.boxplot(x=x, y=y, ax=ax)
+        ax.set_xlabel(xlabel)
+        ax.set_ylabel(ylabel)
+        f.tight_layout()
+        if path is not None:
+            f.savefig(path+'/boxplot_'+xlabel+'_'+ylabel+'.png')
+
+    @staticmethod
+    def scatter_interactive(df, col_name_x, col_name_y, xlabel, ylabel, hover, fig_size=(400,300), **kwargs):
+        """
+        Interactive scatter plot (Altair)
+
+        Parameters
+        ----------
+        df : pd.DataFrame
+            dataframe
+        col_name_x : str
+            name of the column plotted on the x-axis
+        col_name_y : str
+            name of the column plotted on the y-axis
+        xlabel : str
+            x-axis label
+        ylabel : str
+            y-axis label
+        hover : list
+            columns shown in the tooltip on mouse hover
+        fig_size : tuple
+            figure size
+        **kwargs : **kwargs
+            additional chart properties; for example,
+            pass color="blue" to set the mark color
+
+        Returns
+        -------
+        None
+        """
+        alt.Chart(df, width=fig_size[0], height=fig_size[1]).mark_circle().encode(
+            alt.X(col_name_x, title=xlabel),
+            alt.Y(col_name_y, title=ylabel),
+            tooltip=hover,
+            **kwargs
+        ).interactive().display()
+
+    @staticmethod
+    def bar_interactive(df, col_name_x, col_name_y, xlabel, ylabel, hover, fig_size=(400,300), **kwargs):
+        """
+        Interactive bar plot (Altair)
+
+        Parameters
+        ----------
+        df : pd.DataFrame
+            dataframe
+        col_name_x : str
+            name of the column plotted on the x-axis
+        col_name_y : str
+            name of the column plotted on the y-axis
+        xlabel : str
+            x-axis label
+        ylabel : str
+            y-axis label
+        hover : list
+            columns shown in the tooltip on mouse hover
+        fig_size : tuple
+            figure size
+        **kwargs : **kwargs
+            additional chart properties; for example,
+            pass color="blue" to set the mark color
+
+        Returns
+        -------
+        None
+        """
+        alt.Chart(df, width=fig_size[0], height=fig_size[1]).mark_bar().encode(
+            alt.X(col_name_x, title=xlabel),
+            alt.Y(col_name_y, title=ylabel),
+            tooltip=hover,
+            **kwargs
+        ).interactive().display()
+
+    @staticmethod
+    def line_interactive(df, col_name_x, col_name_y, xlabel, ylabel, hover, fig_size=(400,300), **kwargs):
+        """
+        Interactive line plot (Altair)
+
+        Parameters
+        ----------
+        df : pd.DataFrame
+            dataframe
+        col_name_x : str
+            name of the column plotted on the x-axis
+        col_name_y : str
+            name of the column plotted on the y-axis
+        xlabel : str
+            x-axis label
+        ylabel : str
+            y-axis label
+        hover : list
+            columns shown in the tooltip on mouse hover
+        fig_size : tuple
+            figure size
+        **kwargs : **kwargs
+            additional chart properties; for example,
+            pass color="blue" to set the mark color
+
+        Returns
+        -------
+        None
+        """
+        alt.Chart(df, width=fig_size[0], height=fig_size[1]).mark_line().encode(
+            alt.X(col_name_x, title=xlabel),
+            alt.Y(col_name_y, title=ylabel),
+            tooltip=hover,
+            **kwargs
+        ).interactive().display()
+
diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/tests/README.md b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/tests/README.md
new file mode 100644
index 0000000..ae81f9e
--- /dev/null
+++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/tests/README.md
@@ -0,0 +1,41 @@
+# Hermione test files
+
+In this folder, you can develop unit tests for your Data Science project.
+
+Unit testing is standard practice in software development but, unfortunately, not so common in Data Science projects. To ensure your code quality and that the project runs flawlessly at all times, it is extremely important that you write unit tests, especially if you are working in a Data Science team rather than alone.
+
+The tests shipped with the implemented example check, for instance, that the project has its minimum directory structure, that the dataset is imported correctly, that the dataset has no missing values, and that the columns expected after preprocessing are indeed there.
+
+There are no "written in stone" rules for good testing in Data Science. You just have to figure out which tests work best for your project.
+
+## How to run the tests
+
+When working locally, you should run your tests before pushing to a remote repository or sharing your code with others. To do that, **ensure that you are inside the `tests` folder**.
+
+```bash
+cd src/tests
+```
+
+Then, run the `pytest` command.
+
+```bash
+pytest
+```
+
+If you want a coverage report, run:
+
+```bash
+coverage run -m pytest
+coverage report -m
+```
+
+Both `coverage` and `pytest` libraries are already in the `requirements.txt` file.
+
+## Include tests in CI/CD files
+
+If you are working with a remote repository, it is a good practice to write a CI/CD `.yml` file that runs your tests on every push, as in the sketch below.
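+
+A minimal sketch, assuming GitHub Actions (the workflow name, action versions, and Python version are illustrative; adapt them to your CI provider and project layout):
+
+```yaml
+# .github/workflows/tests.yml (hypothetical file name)
+name: tests
+on: [push, pull_request]
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    steps:
+      # Check out the repository and set up a Python interpreter
+      - uses: actions/checkout@v2
+      - uses: actions/setup-python@v2
+        with:
+          python-version: '3.7'
+      # Install the project dependencies, then run the test suite
+      - run: pip install -r requirements.txt
+      - run: pytest
+        working-directory: src/tests
+```
+
+With GitHub Actions, the workflow file lives under `.github/workflows/`; other CI providers use an equivalent configuration file.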
+For more information, visit
+
+- [CI/CD for Machine Learning](https://www.infoq.com/presentations/ci-cd-ml/)
+- [CI/CD for Machine Learning & AI](https://blog.paperspace.com/ci-cd-for-machine-learning-ai/)
+- [Accelerate MLOps: using CI/CD with machine learning models](https://algorithmia.com/blog/accelerate-mlops-using-ci-cd-with-machine-learning-models)
diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/tests/test_project.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/tests/test_project.py
new file mode 100644
index 0000000..87a62d9
--- /dev/null
+++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/tests/test_project.py
@@ -0,0 +1,110 @@
+import os
+import pytest
+import pandas as pd
+import sys
+sys.path.append('..')
+
+@pytest.fixture(scope='module')
+def read_data_train():
+    from ml.data_source.spreadsheet import Spreadsheet
+    yield Spreadsheet().get_data('../../data/raw/raw_train.csv')
+
+@pytest.fixture(scope='module')
+def read_data_test():
+    from ml.data_source.spreadsheet import Spreadsheet
+    yield Spreadsheet().get_data('../../data/raw/raw_test.csv')
+
+@pytest.fixture(scope='module')
+def cleaned_data_train(read_data_train):
+    from ml.preprocessing.preprocessing import Preprocessing
+    p = Preprocessing()
+    yield p.clean_data(read_data_train)
+
+@pytest.fixture(scope='module')
+def cleaned_data_test(read_data_test):
+    from ml.preprocessing.preprocessing import Preprocessing
+    p = Preprocessing()
+    yield p.clean_data(read_data_test)
+
+def test_tree():
+    """
+    Test that the project has the minimum expected directory structure
+    """
+    assert os.path.exists(os.path.join('..','..', 'data', 'raw'))
+    assert os.path.exists(os.path.join('..','..', 'output'))
+    assert os.path.exists(os.path.join('..','..', 'src', 'api'))
+    assert os.path.exists(os.path.join('..','..', 'src', 'config'))
+    assert os.path.exists(os.path.join('..','..', 'src', 'ml', 'data_source'))
+    assert os.path.exists(os.path.join('..','..', 'src', 'ml', 'model'))
+    assert os.path.exists(os.path.join('..','..', 'src', 'ml', 'notebooks'))
+    assert os.path.exists(os.path.join('..','..', 'src', 'ml', 'preprocessing'))
+    assert os.path.exists(os.path.join('..','..', 'src', 'tests'))
+
+def test_spreadsheet(read_data_train):
+    """
+    Test that the spreadsheet is imported correctly
+    """
+    assert read_data_train.shape[0] > 1
+
+
+def test_clean_data(cleaned_data_train):
+    """
+    Test that the df is cleaned correctly
+    """
+    assert cleaned_data_train.Pclass.dtype == 'object'
+    assert pd.isnull(cleaned_data_train.Age).sum() == 0
+
+def all_columns(df, names):
+    """
+    Helper: check that df has all the given columns
+    """
+    array = [name in df.columns for name in names]
+    return sum(array) == len(array)
+
+def values_between(df, col, min_value, max_value):
+    """
+    Helper: check that every value in the column lies between min_value and max_value
+    """
+    array = [min_value <= value <= max_value for value in df[col]]
+    return sum(array) == len(array)
+
+def test_categ_encoding(cleaned_data_train,cleaned_data_test):
+    """
+    Test that the Pclass and Sex columns are one-hot encoded
+    """
+    from ml.preprocessing.preprocessing import Preprocessing
+    names = ['Survived', 'Age', 'Pclass_1', 'Pclass_2', 'Pclass_3', 'Sex_1', 'Sex_2']
+    p = Preprocessing(oneHot_cols=['Pclass','Sex'])
+    df_train = p.categ_encoding_oneHot(cleaned_data_train, step_train=True)
+    assert all_columns(df_train,names)
+    df_test = p.categ_encoding_oneHot(cleaned_data_test, step_train=False)
+    assert all_columns(df_test,names)
+
+def test_normalize(cleaned_data_train,cleaned_data_test):
+    """
+    Test that the Age column is
+    normalized
+    """
+    from ml.preprocessing.preprocessing import Preprocessing
+    p = Preprocessing(norm_cols={'min-max': ['Age']})
+    df_train = p.normalize(cleaned_data_train, step_train=True)
+    assert values_between(df_train,'Age',0,1)
+    df_test = p.normalize(cleaned_data_test, step_train=False)
+    assert values_between(df_test,'Age',0,1)
+
+def test_execute_train(read_data_train,read_data_test):
+    """
+    Test that the full preprocessing pipeline executes correctly
+    """
+    from ml.preprocessing.preprocessing import Preprocessing
+    names = ['Survived', 'Age', 'Pclass_1', 'Pclass_2', 'Pclass_3', 'Sex_1', 'Sex_2']
+    norm_cols={'min-max': ['Age']}
+    oneHot_cols=['Pclass','Sex']
+    p = Preprocessing(norm_cols,oneHot_cols)
+    X_train, X_val = p.execute(read_data_train, step_train=True)
+    assert all_columns(X_train,names)
+    assert values_between(X_train,'Age',0,1)
+    assert all_columns(X_val,names)
+    assert values_between(X_val,'Age',0,1)
+    X_test = p.execute(read_data_test, step_train=False)
+    assert all_columns(X_test,names)
+    assert values_between(X_test,'Age',0,1)
\ No newline at end of file
diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/util.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/util.py
new file mode 100644
index 0000000..02c7bde
--- /dev/null
+++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/util.py
@@ -0,0 +1,49 @@
+import os
+import json
+import pandas as pd
+import yaml
+
+def create_dirs(dirpath):
+    """Create the directory if it does not exist yet."""
+    if not os.path.exists(dirpath):
+        os.makedirs(dirpath)
+
+def load_yaml(filepath):
+    """Load a yaml file."""
+    with open(filepath, 'r') as stream:
+        return yaml.safe_load(stream)
+
+
+def load_json(filepath):
+    """Load a json file."""
+    with open(filepath, "r", encoding='utf8') as fp:
+        obj = json.load(fp)
+    return obj
+
+
+def save_json(obj, filepath):
+    """Save a dictionary to a json file."""
+    with open(filepath, "w") as fp:
+        json.dump(obj, fp, indent=4)
+
+def wrap_text(text):
+    """Pretty box print."""
+    box_width = len(text) + 2
+    print('\n╒{}╕'.format('═' * box_width))
+    print('│ {} │'.format(text.upper()))
+    print('╘{}╛'.format('═' * box_width))
+
+
+def load_data(data_csv):
+    """Load data from CSV into a pandas DataFrame."""
+    df = pd.read_csv(data_csv, header=0)
+    wrap_text("Raw data")
+    print(df.head(5))
+    return df
diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/train/Dockerfile b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/train/Dockerfile
new file mode 100644
index 0000000..207b1f7
--- /dev/null
+++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/train/Dockerfile
@@ -0,0 +1,66 @@
+FROM ubuntu:latest
+# Set a docker label to advertise multi-model support on the container
+LABEL com.amazonaws.sagemaker.capabilities.multi-models=false
+# Set a docker label to enable the container to use the SAGEMAKER_BIND_TO_PORT environment variable if present
+LABEL com.amazonaws.sagemaker.capabilities.accept-bind-to-port=true
+
+# No question/dialog is asked during apt-get install
+ARG DEBIAN_FRONTEND=noninteractive
+
+# Set the timezone environment variable
+ENV TZ=America/Sao_Paulo
+
+# Install Ubuntu libraries
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential \
+    gcc \
+    python3.7 \
+    python3-dev \
+    python3-pip \
+    ca-certificates \
+    git \
+    curl \
+    nginx \
+    openjdk-8-jre-headless \
+    wget && \
+    rm -rf /var/lib/apt/lists/*
+
+# Create folders for the code
+RUN mkdir /opt/ml && \
+    mkdir /opt/ml/output && \
+    mkdir /opt/ml/code && \
+    mkdir /opt/ml/code/train && \
+    mkdir /opt/ml/code/src
+
+# Install requirements
+COPY requirements.txt /opt/ml/code/src/requirements.txt
+RUN pip3 install --no-cache -r /opt/ml/code/src/requirements.txt
+
+# Install the SageMaker Training Toolkit
+RUN pip3 install --no-cache \
+    boto3 \
+    sagemaker \
+    sagemaker-training
+
+# Copy the code folders
+COPY src/config/ /opt/ml/code/src/config/
+COPY src/ml/ /opt/ml/code/src/ml/
+COPY src/util.py /opt/ml/code/src/util.py
+COPY train/train.py /opt/ml/code/train.py
+
+# Set the working directory
+WORKDIR /opt/ml/code
+
+# Environment variables
+ENV PYTHONDONTWRITEBYTECODE=1 \
+    PYTHONUNBUFFERED=1 \
+    PYTHONIOENCODING=UTF-8 \
+    LANG=C.UTF-8 \
+    LC_ALL=C.UTF-8
+
+# Set PYTHONPATH to access the copied code
+ENV PYTHONPATH="/opt/ml/code:${PYTHONPATH}"
+
+# Make the training script executable and set it as the SageMaker entry point
+RUN chmod +x train.py
+ENV SAGEMAKER_PROGRAM train.py
diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/train/train.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/train/train.py
new file mode 100644
index 0000000..183ee0a
--- /dev/null
+++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/train/train.py
@@ -0,0 +1,102 @@
+import sys
+sys.path.append("src/")
+
+import os
+from util import *
+import traceback
+
+import logging
+import pandas as pd
+
+from sklearn.metrics import *
+from ml.model.trainer import TrainerSklearn
+from sklearn.ensemble import RandomForestClassifier
+
+logging.getLogger().setLevel('INFO')
+
+# Paths to access the datasets and save the model
+prefix = '/opt/ml/'
+
+training_path = os.environ["SM_CHANNEL_TRAIN"]
+val_path = os.environ["SM_CHANNEL_VALIDATION"]
+
+error_path = os.path.join(prefix, 'output')
+model_path = os.environ['SM_MODEL_DIR']
+
+
+def read_input(file_path):
+    """
+    Take the set of train files and read them all
+    into a single pandas dataframe
+
+    Parameters
+    ----------
+    file_path : string
+        Path of the files
+
+    Returns
+    -------
+    pd.DataFrame : pandas DataFrame
+    """
+    input_files = [os.path.join(file_path, file)
+                   for file in os.listdir(file_path)]
+    if len(input_files) == 0:
+        raise ValueError(('There are no files in {}.\n'
+                          'This usually indicates that the channel was '
+                          'incorrectly specified,\n'
+                          'the data specification in S3 was incorrectly '
+                          'specified, or the role specified\n'
+                          'does not have permission to access '
+                          'the data.').format(file_path))
+    raw_data = [pd.read_csv(file) for file in input_files]
+    return pd.concat(raw_data)
+
+
+def train():
+    """
+    Execute the train step in the virtual environment
+    """
+    logging.info('Starting the training')
+    try:
+        logging.info('Reading the inputs')
+        train = read_input(training_path)
+        val = read_input(val_path)
+
+        # Define the target and the columns to be used in training
+        target = "Survived"
+        columns = train.columns.drop(target)
+
+        logging.info("Training the model")
+        model = TrainerSklearn().train(train, val, target, classification=True,
+                                       algorithm=RandomForestClassifier,
+                                       columns=columns)
+
+        # Save the model and metrics
+        logging.info("Saving")
+        model.save_model(os.path.join(model_path, 'model.pkl'))
+        metrics = model.artifacts["metrics"]
+        logging.info(f"accuracy={metrics['accuracy']}; "
+                     f"f1={metrics['f1']}; "
+                     f"precision={metrics['precision']}; "
+                     f"recall={metrics['recall']}")
+        pd.DataFrame(model.artifacts["metrics"].items(),
+                     columns=['Metric', 'Value']).to_csv(
+                         os.path.join(model_path, 'metrics.csv'), index=False)
+        logging.info('Training complete.')
+
+    except Exception as e:
+        # Write out an error file; SageMaker surfaces it in the job description
+        trc = traceback.format_exc()
+        with open(os.path.join(error_path, 'failure'), 'w') as s:
+            s.write('Exception during training: ' + str(e) + '\n' + trc)
+        logging.error('Exception during training: ' + str(e) + '\n' + trc)
+        # A non-zero exit code causes the training job to be marked as Failed
+        sys.exit(255)
+
+
+if __name__ == '__main__':
+    train()
+    # A zero exit code causes the job to be marked as Succeeded
+    sys.exit(0)
diff --git a/hermione/tests/test_hermione.py b/hermione/tests/test_hermione.py
index 5cb2214..3842a19 100644
--- a/hermione/tests/test_hermione.py
+++ b/hermione/tests/test_hermione.py
@@ -19,4 +19,5 @@ def test_info():
 
 def test_implementation_script_folders():
     assert os.path.exists(os.path.join(os.getcwd(), 'hermione', 'module_templates', '__IMPLEMENTED_BASE__'))
+    assert os.path.exists(os.path.join(os.getcwd(), 'hermione', 'module_templates', '__IMPLEMENTED_SAGEMAKER__'))
     assert os.path.exists(os.path.join(os.getcwd(), 'hermione', 'module_templates', '__NOT_IMPLEMENTED_BASE__'))