From 07ca419d7864c384c8e55c147be4fef40dc9515e Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Wed, 26 May 2021 19:34:22 +0000 Subject: [PATCH 01/10] Sagemaker implemented version --- hermione/.ipynb_checkpoints/cli-checkpoint.py | 136 +++ hermione/cli.py | 12 +- .../__IMPLEMENTED_SAGEMAKER__-checkpoint.json | 7 + .../README.tpl-checkpoint.md | 6 + .../__IMPLEMENTED_SAGEMAKER__.json | 7 + .../README.tpl-checkpoint.md | 241 +++++ .../build_and_push-checkpoint.sh | 52 + .../requirements-checkpoint.txt | 28 + .../__IMPLEMENTED_SAGEMAKER__/README.tpl.md | 241 +++++ .../build_and_push.sh | 52 + .../data/raw/train.csv | 892 ++++++++++++++++++ .../.ipynb_checkpoints/Dockerfile-checkpoint | 59 ++ .../.ipynb_checkpoints/handler-checkpoint.py | 65 ++ .../.ipynb_checkpoints/main-checkpoint.py | 12 + .../inference/Dockerfile | 59 ++ .../inference/handler.py | 65 ++ .../inference/main.py | 12 + .../.ipynb_checkpoints/Dockerfile-checkpoint | 60 ++ .../preprocessor-checkpoint.py | 68 ++ .../processor/Dockerfile | 60 ++ .../processor/preprocessor.py | 68 ++ .../requirements.txt | 28 + .../__IMPLEMENTED_SAGEMAKER__/src/api/app.py | 41 + .../src/api/myrequests.py | 17 + .../__IMPLEMENTED_SAGEMAKER__/src/api/wsgi.py | 4 + .../.ipynb_checkpoints/config-checkpoint.json | 7 + .../src/config/config.json | 7 + .../.ipynb_checkpoints/cluster-checkpoint.py | 166 ++++ .../feature_selection-checkpoint.py | 387 ++++++++ .../.ipynb_checkpoints/pca-checkpoint.py | 149 +++ .../.ipynb_checkpoints/vif-checkpoint.py | 48 + .../src/ml/analysis/cluster.py | 166 ++++ .../src/ml/analysis/feature_selection.py | 387 ++++++++ .../src/ml/analysis/pca.py | 149 +++ .../src/ml/analysis/vif.py | 48 + .../src/ml/data_source/base.py | 12 + .../src/ml/data_source/database.py | 70 ++ .../src/ml/data_source/spreadsheet.py | 24 + .../.ipynb_checkpoints/metrics-checkpoint.py | 212 +++++ .../.ipynb_checkpoints/trainer-checkpoint.py | 104 ++ .../.ipynb_checkpoints/wrapper-checkpoint.py | 252 +++++ .../src/ml/model/metrics.py | 212 +++++ .../src/ml/model/trainer.py | 104 ++ .../src/ml/model/wrapper.py | 252 +++++ .../Sagemaker_Inference-checkpoint.ipynb | 322 +++++++ .../Sagemaker_Processor-checkpoint.ipynb | 396 ++++++++ ...r_StepFunctions_Inference-checkpoint.ipynb | 737 +++++++++++++++ ...maker_StepFunctions_Train-checkpoint.ipynb | 540 +++++++++++ .../Sagemaker_Train-checkpoint.ipynb | 393 ++++++++ .../ml/notebooks/Sagemaker_Inference.ipynb | 322 +++++++ .../ml/notebooks/Sagemaker_Processor.ipynb | 396 ++++++++ .../Sagemaker_StepFunctions_Inference.ipynb | 737 +++++++++++++++ .../Sagemaker_StepFunctions_Train.ipynb | 540 +++++++++++ .../src/ml/notebooks/Sagemaker_Train.ipynb | 393 ++++++++ .../dataquality-checkpoint.py | 60 ++ .../normalization-checkpoint.py | 159 ++++ .../preprocessing-checkpoint.py | 141 +++ .../text_vectorizer-checkpoint.py | 201 ++++ .../src/ml/preprocessing/dataquality.py | 60 ++ .../src/ml/preprocessing/normalization.py | 159 ++++ .../src/ml/preprocessing/preprocessing.py | 141 +++ .../src/ml/preprocessing/text_vectorizer.py | 201 ++++ .../visualization/app-streamlit-titanict.py | 84 ++ .../src/ml/visualization/visualization.py | 428 +++++++++ .../.ipynb_checkpoints/README-checkpoint.md | 41 + .../test_project-checkpoint.py | 54 ++ .../src/tests/README.md | 41 + .../src/tests/test_project.py | 54 ++ .../.ipynb_checkpoints/Dockerfile-checkpoint | 66 ++ .../.ipynb_checkpoints/train-checkpoint.py | 84 ++ .../train/Dockerfile | 66 ++ .../__IMPLEMENTED_SAGEMAKER__/train/train.py | 84 ++ 72 files changed, 11946 
insertions(+), 2 deletions(-) create mode 100644 hermione/.ipynb_checkpoints/cli-checkpoint.py create mode 100644 hermione/module_templates/.ipynb_checkpoints/__IMPLEMENTED_SAGEMAKER__-checkpoint.json create mode 100644 hermione/module_templates/__IMPLEMENTED_BASE__/.ipynb_checkpoints/README.tpl-checkpoint.md create mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__.json create mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/.ipynb_checkpoints/README.tpl-checkpoint.md create mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/.ipynb_checkpoints/build_and_push-checkpoint.sh create mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/.ipynb_checkpoints/requirements-checkpoint.txt create mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/README.tpl.md create mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/build_and_push.sh create mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/data/raw/train.csv create mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/inference/.ipynb_checkpoints/Dockerfile-checkpoint create mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/inference/.ipynb_checkpoints/handler-checkpoint.py create mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/inference/.ipynb_checkpoints/main-checkpoint.py create mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/inference/Dockerfile create mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/inference/handler.py create mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/inference/main.py create mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/processor/.ipynb_checkpoints/Dockerfile-checkpoint create mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/processor/.ipynb_checkpoints/preprocessor-checkpoint.py create mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/processor/Dockerfile create mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/processor/preprocessor.py create mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/requirements.txt create mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/api/app.py create mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/api/myrequests.py create mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/api/wsgi.py create mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/config/.ipynb_checkpoints/config-checkpoint.json create mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/config/config.json create mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/analysis/.ipynb_checkpoints/cluster-checkpoint.py create mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/analysis/.ipynb_checkpoints/feature_selection-checkpoint.py create mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/analysis/.ipynb_checkpoints/pca-checkpoint.py create mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/analysis/.ipynb_checkpoints/vif-checkpoint.py create mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/analysis/cluster.py create mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/analysis/feature_selection.py create mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/analysis/pca.py create mode 100644 
hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/analysis/vif.py create mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/data_source/base.py create mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/data_source/database.py create mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/data_source/spreadsheet.py create mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/model/.ipynb_checkpoints/metrics-checkpoint.py create mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/model/.ipynb_checkpoints/trainer-checkpoint.py create mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/model/.ipynb_checkpoints/wrapper-checkpoint.py create mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/model/metrics.py create mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/model/trainer.py create mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/model/wrapper.py create mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/.ipynb_checkpoints/Sagemaker_Inference-checkpoint.ipynb create mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/.ipynb_checkpoints/Sagemaker_Processor-checkpoint.ipynb create mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/.ipynb_checkpoints/Sagemaker_StepFunctions_Inference-checkpoint.ipynb create mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/.ipynb_checkpoints/Sagemaker_StepFunctions_Train-checkpoint.ipynb create mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/.ipynb_checkpoints/Sagemaker_Train-checkpoint.ipynb create mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/Sagemaker_Inference.ipynb create mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/Sagemaker_Processor.ipynb create mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/Sagemaker_StepFunctions_Inference.ipynb create mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/Sagemaker_StepFunctions_Train.ipynb create mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/Sagemaker_Train.ipynb create mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/preprocessing/.ipynb_checkpoints/dataquality-checkpoint.py create mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/preprocessing/.ipynb_checkpoints/normalization-checkpoint.py create mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/preprocessing/.ipynb_checkpoints/preprocessing-checkpoint.py create mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/preprocessing/.ipynb_checkpoints/text_vectorizer-checkpoint.py create mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/preprocessing/dataquality.py create mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/preprocessing/normalization.py create mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/preprocessing/preprocessing.py create mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/preprocessing/text_vectorizer.py create mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/visualization/app-streamlit-titanict.py create mode 100644 
hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/visualization/visualization.py create mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/tests/.ipynb_checkpoints/README-checkpoint.md create mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/tests/.ipynb_checkpoints/test_project-checkpoint.py create mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/tests/README.md create mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/tests/test_project.py create mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/train/.ipynb_checkpoints/Dockerfile-checkpoint create mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/train/.ipynb_checkpoints/train-checkpoint.py create mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/train/Dockerfile create mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/train/train.py diff --git a/hermione/.ipynb_checkpoints/cli-checkpoint.py b/hermione/.ipynb_checkpoints/cli-checkpoint.py new file mode 100644 index 0000000..73b67ef --- /dev/null +++ b/hermione/.ipynb_checkpoints/cli-checkpoint.py @@ -0,0 +1,136 @@ +import click +import os +import re +import sys +from .writer import * +from .module_writer import modules_autocomplete, write_module +from .__init__ import __version__ as version + +LOCAL_PATH = os.getcwd() + +# Correct LOCAL_PATH in case of empty spaces #21 + +logo = r""" + _ _ +| |__ ___ _ __ _ __ ___ (_) ___ _ __ ___ +| '_ \ / _ \ '__| '_ ` _ \| |/ _ \| '_ \ / _ \ +| | | | __/ | | | | | | | | (_) | | | | __/ +|_| |_|\___|_| |_| |_| |_|_|\___/|_| |_|\___| +v{} +""".format(version) + + +@click.group() +def cli(): + pass + +@cli.command() +def info(): + """ + Checks that hermione is correctly installed + """ + click.echo(logo) + +@cli.command() +@click.argument('project_name') +@click.option('-imp', '--implemented', 'implemented', prompt='Do you want to start with an implemented example (recommended) [y/n]?', + default='y', show_default=True) +def new(project_name, implemented): + """ + Create a new hermione project + """ + if implemented in ['yes', 'ye', 'y', 'Yes', 'YES', 'Y']: + is_imp = True + else: + is_imp = False + + click.echo(f"Creating project {project_name}") + + + custom_inputs = { + 'project_name':project_name, + "project_start_date": datetime.today().strftime("%B %d, %Y") + } + os.makedirs(os.path.join(LOCAL_PATH, project_name)) + if is_imp: + option = click.prompt('Do you want to start with: \n\t(1) Sagemaker \n\t(2) Local version \n', type=int, default=2) + implemented_version_type(project_name,custom_inputs,option) + else: + write_module(os.path.join(LOCAL_PATH, project_name), '__NOT_IMPLEMENTED_BASE__', True, custom_inputs) + + print(f'Creating virtual environment {project_name}_env') + os.chdir(project_name) + env_name = f"{project_name}_env" + os.system(f"python -m venv {env_name}") + + # Create git repo + os.system('git init') + print("A git repository was created. You should add your files and make your first commit.\n") + +def implemented_version_type(project_name,custom_inputs,option): + """ + Create a new hermione project + """ + if option == 1: + write_module(os.path.join(LOCAL_PATH, project_name), '__IMPLEMENTED_SAGEMAKER__', True, custom_inputs) + else: + write_module(os.path.join(LOCAL_PATH, project_name), '__IMPLEMENTED_BASE__', True, custom_inputs) + +@cli.command() +def train(): + """ + Execute the script in train.py. 
You should be in the src directory + """ + if not os.path.exists('./train.py'): + click.echo("You gotta have an src/train.py file") + else: + os.system('python ./train.py') + print("\nModel trained. For MLFlow logging control, type:\nmlflow ui\nand visit http://localhost:5000/") + + +@cli.command() +def predict(): + """ + Execute the script in predict.py to make batch predictions. + You should be in the src directory + """ + if not os.path.exists('./predict.py'): + click.echo("You gotta have an src/predict.py file") + else: + print("Making predictions: ") + os.system('python ./predict.py') + + +@click.argument('image_name') +@click.option('-t', '--tag', 'tag', default='latest', show_default=True) +@cli.command() +def build(image_name, tag): + """ + Build a Docker image with the given image_name. Only run this if you have Docker installed. + You should be at the project's root directory. + """ + if not os.path.exists('src/Dockerfile'): + click.echo("You gotta have an src/Dockerfile file. You must be at the project's root folder.") + else: + os.system(f'docker build -f src/Dockerfile -t {image_name}:{tag} .') + + +@click.argument('image_name') +@click.option('-t', '--tag', 'tag', default='latest', show_default=True) +@cli.command() +def run(image_name, tag): + """ + Run a container with the given image_name. + Only run this if you have Docker installed. + """ + if not os.path.exists('src/Dockerfile'): + click.echo("You gotta have an src/Dockerfile file. You must be at the project's root folder.") + else: + os.system(f'docker run --rm -p 5000:5000 {image_name}:{tag}') + + +@click.argument("module_name", type = click.STRING, autocompletion=modules_autocomplete) +@cli.command() +@click.option('-y','--autoconfirm', is_flag=True) +def add_module(module_name, autoconfirm): + write_module(LOCAL_PATH, module_name, autoconfirm) \ No newline at end of file diff --git a/hermione/cli.py b/hermione/cli.py index 31a6609..73b67ef 100644 --- a/hermione/cli.py +++ b/hermione/cli.py @@ -53,7 +53,8 @@ def new(project_name, implemented): } os.makedirs(os.path.join(LOCAL_PATH, project_name)) if is_imp: - write_module(os.path.join(LOCAL_PATH, project_name), '__IMPLEMENTED_BASE__', True, custom_inputs) + option = click.prompt('Do you want to start with: \n\t(1) Sagemaker \n\t(2) Local version \n', type=int, default=2) + implemented_version_type(project_name,custom_inputs,option) else: write_module(os.path.join(LOCAL_PATH, project_name), '__NOT_IMPLEMENTED_BASE__', True, custom_inputs) @@ -66,7 +67,14 @@ def new(project_name, implemented): os.system('git init') print("A git repository was created.
You should add your files and make your first commit.\n") - +def implemented_version_type(project_name,custom_inputs,option): + """ + Create a new hermione project + """ + if option == 1: + write_module(os.path.join(LOCAL_PATH, project_name), '__IMPLEMENTED_SAGEMAKER__', True, custom_inputs) + else: + write_module(os.path.join(LOCAL_PATH, project_name), '__IMPLEMENTED_BASE__', True, custom_inputs) @cli.command() def train(): diff --git a/hermione/module_templates/.ipynb_checkpoints/__IMPLEMENTED_SAGEMAKER__-checkpoint.json b/hermione/module_templates/.ipynb_checkpoints/__IMPLEMENTED_SAGEMAKER__-checkpoint.json new file mode 100644 index 0000000..aa8798f --- /dev/null +++ b/hermione/module_templates/.ipynb_checkpoints/__IMPLEMENTED_SAGEMAKER__-checkpoint.json @@ -0,0 +1,7 @@ +{ + "info": "Base files with implemented example", + "input_info": [ + ["project_name", "My Project", "Enter your project name"], + ["project_start_date", "01/01/21", "Enter the date your project started"] + ] +} \ No newline at end of file diff --git a/hermione/module_templates/__IMPLEMENTED_BASE__/.ipynb_checkpoints/README.tpl-checkpoint.md b/hermione/module_templates/__IMPLEMENTED_BASE__/.ipynb_checkpoints/README.tpl-checkpoint.md new file mode 100644 index 0000000..98bb4e6 --- /dev/null +++ b/hermione/module_templates/__IMPLEMENTED_BASE__/.ipynb_checkpoints/README.tpl-checkpoint.md @@ -0,0 +1,6 @@ +# {{ inputs['project_name'] }} + +Project started in {{ inputs['project_start_date'] }}. + + +**Please complete this section with information on using and testing this project.** diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__.json b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__.json new file mode 100644 index 0000000..aa8798f --- /dev/null +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__.json @@ -0,0 +1,7 @@ +{ + "info": "Base files with implemented example", + "input_info": [ + ["project_name", "My Project", "Enter your project name"], + ["project_start_date", "01/01/21", "Enter the date your project started"] + ] +} \ No newline at end of file diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/.ipynb_checkpoints/README.tpl-checkpoint.md b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/.ipynb_checkpoints/README.tpl-checkpoint.md new file mode 100644 index 0000000..a14d02f --- /dev/null +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/.ipynb_checkpoints/README.tpl-checkpoint.md @@ -0,0 +1,241 @@ +# Hermione Sagemaker + +This document explains how to execute the Titanic project example. + + +## Sagemaker + +Our code is divided into three steps: Processor, Train and Inference. In the Processor step, we preprocess the training, validation and inference data. The Train step receives the preprocessed training and validation data, and uses them to train and validate a new model. The Inference step receives the inference data and model, and generates the prediction for the data. + +### Permissions + +If you are running this code on a SageMaker notebook instance, do the following to provide IAM permissions to the notebook: + +1. Open the Amazon [SageMaker console](https://console.aws.amazon.com/sagemaker/). +2. Select Notebook instances and choose the name of your notebook instance. +3. Under Permissions and encryption select the role ARN to view the role on the IAM console. +4. Under the Permissions tab, choose Attach policies and search for AmazonS3FullAccess. +5. Select the check box next to AmazonS3FullAccess. +6.
Search for AmazonSageMakerFullAccess and AWSStepFunctionsFullAccess and select their check boxes. +7. Choose Attach policy. You will then be redirected to the details page for the role. +8. Copy and save the IAM role ARN for later use. + +Next, we will create a new policy to attach. + +9. Click Attach policies again and then Create policy. +10. Enter the following in the JSON tab: + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Sid": "VisualEditor0", + "Effect": "Allow", + "Action": [ + "s3:PutObject", + "s3:GetObject", + "logs:CreateLogStream", + "codebuild:DeleteProject", + "codebuild:StartBuild", + "s3:DeleteObject", + "codebuild:CreateProject", + "codebuild:BatchGetBuilds" + ], + "Resource": [ + "arn:aws:s3:::sagemaker-*/*", + "arn:aws:codebuild:*:*:project/sagemaker-studio*", + "arn:aws:logs:*:*:log-group:/aws/codebuild/sagemaker-studio*" + ] + }, + { + "Sid": "VisualEditor1", + "Effect": "Allow", + "Action": [ + "logs:GetLogEvents", + "s3:CreateBucket", + "logs:PutLogEvents" + ], + "Resource": [ + "arn:aws:logs:*:*:log-group:/aws/codebuild/sagemaker-studio*:log-stream:*", + "arn:aws:s3:::sagemaker*" + ] + }, + { + "Sid": "VisualEditor2", + "Effect": "Allow", + "Action": [ + "iam:GetRole", + "ecr:CreateRepository", + "iam:ListRoles", + "ecr:GetAuthorizationToken", + "ecr:UploadLayerPart", + "ecr:ListImages", + "logs:CreateLogGroup", + "ecr:PutImage", + "iam:PassRole", + "sagemaker:*", + "ecr:BatchGetImage", + "ecr:CompleteLayerUpload", + "ecr:DescribeImages", + "ecr:DescribeRepositories", + "ecr:InitiateLayerUpload", + "ecr:BatchCheckLayerAvailability" + ], + "Resource": "*" + } + ] +} +``` + +11. Choose Next:Tags and add a tag, if you want to. +12. Choose Next:Review and add a name such as AmazonSageMaker-ExecutionPolicy. +13. Choose Create policy. +14. Select Roles and search for your role. +15. Under the Permissions tab, click Attach policies. +16. Search for your newly created policy and select the check box next to it. +17. Choose Attach policy. + +### Docker images + +First, we need to create an image for each of the steps and upload it to ECR. To do that, execute the following commands in the terminal: + +```bash +cd Sagemaker/project-name +bash build_and_push.sh processor hermione-processor +bash build_and_push.sh train hermione-train +bash build_and_push.sh inference hermione-inference +``` + +The script uses the Dockerfile in the given folder to build the image and then pushes it to ECR under the specified name. + +### Notebooks + +To test the images in ECR, execute the following notebooks: + +- project-name/src/ml/notebooks/Sagemaker_Processor.ipynb +- project-name/src/ml/notebooks/Sagemaker_Train.ipynb +- project-name/src/ml/notebooks/Sagemaker_Inference.ipynb + +## Stepfunctions + +We also create two Step Functions state machines to execute the whole process: the first machine processes the training data and creates the model, and the second one processes the inference data and generates its predictions. + +### Permissions + +The Step Functions workflow requires an IAM role to interact with other services in the AWS environment. To create it, follow these [AWS steps](https://github.com/aws/amazon-sagemaker-examples/blob/master/step-functions-data-science-sdk/step_functions_mlworkflow_processing/step_functions_mlworkflow_scikit_learn_data_processing_and_model_evaluation.ipynb): + + +1. Go to the [IAM console](https://console.aws.amazon.com/iam/). +2. Select Roles and then Create role. +3. Under Choose the service that will use this role, select Step Functions.
+4. Choose Next until you can enter a Role name. +5. Enter a name such as AmazonSageMaker-StepFunctionsWorkflowExecutionRole and then select Create role. +6. Search for and click on the IAM Role you just created. +7. Click Attach policies and then select CloudWatchEventsFullAccess. +8. Click Attach policy. + + +Next, create and attach another new policy to the role you created: + +9. Click Attach policies again and then Create policy. +10. Enter the following in the JSON tab: + + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Sid": "VisualEditor0", + "Effect": "Allow", + "Action": [ + "events:PutTargets", + "events:DescribeRule", + "events:PutRule" + ], + "Resource": [ + "arn:aws:events:*:*:rule/StepFunctionsGetEventsForSageMakerTrainingJobsRule", + "arn:aws:events:*:*:rule/StepFunctionsGetEventsForSageMakerTransformJobsRule", + "arn:aws:events:*:*:rule/StepFunctionsGetEventsForSageMakerTuningJobsRule", + "arn:aws:events:*:*:rule/StepFunctionsGetEventsForECSTaskRule", + "arn:aws:events:*:*:rule/StepFunctionsGetEventsForBatchJobsRule", + "arn:aws:events:*:*:rule/StepFunctionsGetEventsForStepFunctionsExecutionRule", + "arn:aws:events:*:*:rule/StepFunctionsGetEventsForSageMakerProcessingJobsRule" + ] + }, + { + "Sid": "VisualEditor1", + "Effect": "Allow", + "Action": "iam:PassRole", + "Resource": "NOTEBOOK_ROLE_ARN", + "Condition": { + "StringEquals": { + "iam:PassedToService": "sagemaker.amazonaws.com" + } + } + }, + { + "Sid": "VisualEditor2", + "Effect": "Allow", + "Action": [ + "batch:DescribeJobs", + "batch:SubmitJob", + "batch:TerminateJob", + "dynamodb:DeleteItem", + "dynamodb:GetItem", + "dynamodb:PutItem", + "dynamodb:UpdateItem", + "ecs:DescribeTasks", + "ecs:RunTask", + "ecs:StopTask", + "glue:BatchStopJobRun", + "glue:GetJobRun", + "glue:GetJobRuns", + "glue:StartJobRun", + "lambda:InvokeFunction", + "sagemaker:CreateEndpoint", + "sagemaker:CreateEndpointConfig", + "sagemaker:CreateHyperParameterTuningJob", + "sagemaker:CreateModel", + "sagemaker:CreateProcessingJob", + "sagemaker:CreateTrainingJob", + "sagemaker:CreateTransformJob", + "sagemaker:DeleteEndpoint", + "sagemaker:DeleteEndpointConfig", + "sagemaker:DescribeHyperParameterTuningJob", + "sagemaker:DescribeProcessingJob", + "sagemaker:DescribeTrainingJob", + "sagemaker:DescribeTransformJob", + "sagemaker:ListProcessingJobs", + "sagemaker:ListTags", + "sagemaker:StopHyperParameterTuningJob", + "sagemaker:StopProcessingJob", + "sagemaker:StopTrainingJob", + "sagemaker:StopTransformJob", + "sagemaker:UpdateEndpoint", + "sns:Publish", + "sqs:SendMessage" + ], + "Resource": "*" + } + ] +} +``` + +11. Replace NOTEBOOK_ROLE_ARN with the notebook role ARN that you saved in the Sagemaker Permissions section above. +12. Choose Review policy and give the policy a name such as AmazonSageMaker-StepFunctionsWorkflowExecutionPolicy. +13. Choose Create policy. +14. Select Roles and search for your AmazonSageMaker-StepFunctionsWorkflowExecutionRole role. +15. Click Attach policies. +16. Search for your newly created AmazonSageMaker-StepFunctionsWorkflowExecutionPolicy policy and select the check box next to it. +17. Choose Attach policy. +18. Copy the AmazonSageMaker-StepFunctionsWorkflowExecutionRole Role ARN at the top of the Summary. You will use it in the next step (see the sketch below).
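If you would rather fetch this ARN programmatically than copy it from the console, the lookup below is a minimal sketch; it assumes the role name chosen in step 5 and working AWS credentials.

```python
import boto3

# Look up the workflow execution role created above.
# The role name is an assumption: adjust it if you chose a different one in step 5.
iam = boto3.client("iam")
role = iam.get_role(RoleName="AmazonSageMaker-StepFunctionsWorkflowExecutionRole")
workflow_execution_role = role["Role"]["Arn"]
print(workflow_execution_role)
```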
+ + +### Notebooks + +To create and test the Step Functions state machines, execute the following notebooks: + +- project-name/src/ml/notebooks/Sagemaker_StepFunctions_Train.ipynb +- project-name/src/ml/notebooks/Sagemaker_StepFunctions_Inference.ipynb \ No newline at end of file diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/.ipynb_checkpoints/build_and_push-checkpoint.sh b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/.ipynb_checkpoints/build_and_push-checkpoint.sh new file mode 100644 index 0000000..b1ea715 --- /dev/null +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/.ipynb_checkpoints/build_and_push-checkpoint.sh @@ -0,0 +1,52 @@ +#!/usr/bin/env bash + +# This script shows how to build the Docker image and push it to ECR to be ready for use +# by SageMaker. + +# The arguments to this script are the mode (processor, train or inference) and the image name. The image name will be used on the local +# machine and combined with the account and region to form the repository name for ECR. +mode=$1 +image=$2 + + +if [ "$mode" == "" ] || [ "$image" == "" ] +then + echo "Usage: $0 <mode> <image-name>" + exit 1 +fi + + +# Get the account number associated with the current IAM credentials +account=$(aws sts get-caller-identity --query Account --output text) + +if [ $? -ne 0 ] +then + exit 255 +fi + + +# Get the region defined in the current configuration (default to us-east-1 if none defined) +region=$(aws configure get region) +region=${region:-us-east-1} + + +fullname="${account}.dkr.ecr.${region}.amazonaws.com/${image}:latest" + +# If the repository doesn't exist in ECR, create it. + +aws ecr describe-repositories --repository-names "${image}" > /dev/null 2>&1 + +if [ $? -ne 0 ] +then + aws ecr create-repository --repository-name "${image}" > /dev/null +fi + +# Get the login command from ECR and execute it directly +aws ecr get-login-password --region "${region}" | docker login --username AWS --password-stdin "${account}".dkr.ecr."${region}".amazonaws.com + +# Build the docker image locally with the image name and then push it to ECR +# with the full name. +docker build -f ${mode}/Dockerfile -t ${image} . +docker tag ${image} ${fullname} + +docker push ${fullname} \ No newline at end of file diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/.ipynb_checkpoints/requirements-checkpoint.txt b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/.ipynb_checkpoints/requirements-checkpoint.txt new file mode 100644 index 0000000..a9d480f --- /dev/null +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/.ipynb_checkpoints/requirements-checkpoint.txt @@ -0,0 +1,28 @@ +category-encoders +coverage +datetime +Flask +gunicorn +hermione-ml +matplotlib +mlflow +mlxtend +numpy +pandas +plotly +pytest +seaborn +scikit-learn +scipy +statsmodels +tqdm +yellowbrick +vega_datasets +altair +pandas_profiling +streamlit_pandas_profiling +interpret-community +lime +lightgbm +great_expectations +stepfunctions diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/README.tpl.md b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/README.tpl.md new file mode 100644 index 0000000..a14d02f --- /dev/null +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/README.tpl.md @@ -0,0 +1,241 @@ +# Hermione Sagemaker + +This document explains how to execute the Titanic project example. + + +## Sagemaker + +Our code is divided into three steps: Processor, Train and Inference. In the Processor step, we preprocess the training, validation and inference data.
The Train step receives the preprocessed training and validation data, and uses them to train and validate a new model. The Inference step receives the inference data and model, and generates the prediction for the data. + +### Permissions + +If you are running this code on a SageMaker notebook instance, do the following to provide IAM permissions to the notebook: + +1. Open the Amazon [SageMaker console](https://console.aws.amazon.com/sagemaker/). +2. Select Notebook instances and choose the name of your notebook instance. +3. Under Permissions and encryption select the role ARN to view the role on the IAM console. +4. Under the Permissions tab, choose Attach policies and search for AmazonS3FullAccess. +5. Select the check box next to AmazonS3FullAccess. +6. Search for AmazonSageMakerFullAccess and AWSStepFunctionsFullAccess and select their check boxes. +7. Choose Attach policy. You will then be redirected to the details page for the role. +8. Copy and save the IAM role ARN for later use. + +Next, we will create a new policy to attach. + +9. Click Attach policies again and then Create policy. +10. Enter the following in the JSON tab: + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Sid": "VisualEditor0", + "Effect": "Allow", + "Action": [ + "s3:PutObject", + "s3:GetObject", + "logs:CreateLogStream", + "codebuild:DeleteProject", + "codebuild:StartBuild", + "s3:DeleteObject", + "codebuild:CreateProject", + "codebuild:BatchGetBuilds" + ], + "Resource": [ + "arn:aws:s3:::sagemaker-*/*", + "arn:aws:codebuild:*:*:project/sagemaker-studio*", + "arn:aws:logs:*:*:log-group:/aws/codebuild/sagemaker-studio*" + ] + }, + { + "Sid": "VisualEditor1", + "Effect": "Allow", + "Action": [ + "logs:GetLogEvents", + "s3:CreateBucket", + "logs:PutLogEvents" + ], + "Resource": [ + "arn:aws:logs:*:*:log-group:/aws/codebuild/sagemaker-studio*:log-stream:*", + "arn:aws:s3:::sagemaker*" + ] + }, + { + "Sid": "VisualEditor2", + "Effect": "Allow", + "Action": [ + "iam:GetRole", + "ecr:CreateRepository", + "iam:ListRoles", + "ecr:GetAuthorizationToken", + "ecr:UploadLayerPart", + "ecr:ListImages", + "logs:CreateLogGroup", + "ecr:PutImage", + "iam:PassRole", + "sagemaker:*", + "ecr:BatchGetImage", + "ecr:CompleteLayerUpload", + "ecr:DescribeImages", + "ecr:DescribeRepositories", + "ecr:InitiateLayerUpload", + "ecr:BatchCheckLayerAvailability" + ], + "Resource": "*" + } + ] +} +``` + +11. Choose Next:Tags and add a tag, if you want to. +12. Choose Next:Review and add a name such as AmazonSageMaker-ExecutionPolicy. +13. Choose Create policy. +14. Select Roles and search for your role. +15. Under the Permissions tab, click Attach policies. +16. Search for your newly created policy and select the check box next to it. +17. Choose Attach policy. + +### Docker images + +First, we need to create an image for each of the steps and upload it to ECR.
To do that, execute the following commands in the terminal: + +```bash +cd Sagemaker/project-name +bash build_and_push.sh processor hermione-processor +bash build_and_push.sh train hermione-train +bash build_and_push.sh inference hermione-inference +``` + +The script uses the Dockerfile in the given folder to build the image and then pushes it to ECR under the specified name. + +### Notebooks + +To test the images in ECR, execute the following notebooks: + +- project-name/src/ml/notebooks/Sagemaker_Processor.ipynb +- project-name/src/ml/notebooks/Sagemaker_Train.ipynb +- project-name/src/ml/notebooks/Sagemaker_Inference.ipynb + +## Stepfunctions + +We also create two Step Functions state machines to execute the whole process: the first machine processes the training data and creates the model, and the second one processes the inference data and generates its predictions. + +### Permissions + +The Step Functions workflow requires an IAM role to interact with other services in the AWS environment. To create it, follow these [AWS steps](https://github.com/aws/amazon-sagemaker-examples/blob/master/step-functions-data-science-sdk/step_functions_mlworkflow_processing/step_functions_mlworkflow_scikit_learn_data_processing_and_model_evaluation.ipynb): + + +1. Go to the [IAM console](https://console.aws.amazon.com/iam/). +2. Select Roles and then Create role. +3. Under Choose the service that will use this role, select Step Functions. +4. Choose Next until you can enter a Role name. +5. Enter a name such as AmazonSageMaker-StepFunctionsWorkflowExecutionRole and then select Create role. +6. Search for and click on the IAM Role you just created. +7. Click Attach policies and then select CloudWatchEventsFullAccess. +8. Click Attach policy. + + +Next, create and attach another new policy to the role you created: + +9. Click Attach policies again and then Create policy. +10.
Enter the following in the JSON tab: + + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Sid": "VisualEditor0", + "Effect": "Allow", + "Action": [ + "events:PutTargets", + "events:DescribeRule", + "events:PutRule" + ], + "Resource": [ + "arn:aws:events:*:*:rule/StepFunctionsGetEventsForSageMakerTrainingJobsRule", + "arn:aws:events:*:*:rule/StepFunctionsGetEventsForSageMakerTransformJobsRule", + "arn:aws:events:*:*:rule/StepFunctionsGetEventsForSageMakerTuningJobsRule", + "arn:aws:events:*:*:rule/StepFunctionsGetEventsForECSTaskRule", + "arn:aws:events:*:*:rule/StepFunctionsGetEventsForBatchJobsRule", + "arn:aws:events:*:*:rule/StepFunctionsGetEventsForStepFunctionsExecutionRule", + "arn:aws:events:*:*:rule/StepFunctionsGetEventsForSageMakerProcessingJobsRule" + ] + }, + { + "Sid": "VisualEditor1", + "Effect": "Allow", + "Action": "iam:PassRole", + "Resource": "NOTEBOOK_ROLE_ARN", + "Condition": { + "StringEquals": { + "iam:PassedToService": "sagemaker.amazonaws.com" + } + } + }, + { + "Sid": "VisualEditor2", + "Effect": "Allow", + "Action": [ + "batch:DescribeJobs", + "batch:SubmitJob", + "batch:TerminateJob", + "dynamodb:DeleteItem", + "dynamodb:GetItem", + "dynamodb:PutItem", + "dynamodb:UpdateItem", + "ecs:DescribeTasks", + "ecs:RunTask", + "ecs:StopTask", + "glue:BatchStopJobRun", + "glue:GetJobRun", + "glue:GetJobRuns", + "glue:StartJobRun", + "lambda:InvokeFunction", + "sagemaker:CreateEndpoint", + "sagemaker:CreateEndpointConfig", + "sagemaker:CreateHyperParameterTuningJob", + "sagemaker:CreateModel", + "sagemaker:CreateProcessingJob", + "sagemaker:CreateTrainingJob", + "sagemaker:CreateTransformJob", + "sagemaker:DeleteEndpoint", + "sagemaker:DeleteEndpointConfig", + "sagemaker:DescribeHyperParameterTuningJob", + "sagemaker:DescribeProcessingJob", + "sagemaker:DescribeTrainingJob", + "sagemaker:DescribeTransformJob", + "sagemaker:ListProcessingJobs", + "sagemaker:ListTags", + "sagemaker:StopHyperParameterTuningJob", + "sagemaker:StopProcessingJob", + "sagemaker:StopTrainingJob", + "sagemaker:StopTransformJob", + "sagemaker:UpdateEndpoint", + "sns:Publish", + "sqs:SendMessage" + ], + "Resource": "*" + } + ] +} +``` + +11. Replace NOTEBOOK_ROLE_ARN with the notebook role ARN that you saved in the Sagemaker Permissions section above. +12. Choose Review policy and give the policy a name such as AmazonSageMaker-StepFunctionsWorkflowExecutionPolicy. +13. Choose Create policy. +14. Select Roles and search for your AmazonSageMaker-StepFunctionsWorkflowExecutionRole role. +15. Click Attach policies. +16. Search for your newly created AmazonSageMaker-StepFunctionsWorkflowExecutionPolicy policy and select the check box next to it. +17. Choose Attach policy. +18. Copy the AmazonSageMaker-StepFunctionsWorkflowExecutionRole Role ARN at the top of the Summary. You will use it in the next step (see the sketch below).
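To see how the pieces fit together, the snippet below is a minimal sketch of how the role ARN from step 18 and the hermione-train image pushed earlier could be wired into a one-step workflow with the Step Functions Data Science SDK (the `stepfunctions` package from requirements.txt). The account ID, region, bucket paths and job name are hypothetical placeholders; the real state machines are built in the notebooks listed below.

```python
import sagemaker
from sagemaker.estimator import Estimator
from stepfunctions.steps import Chain, TrainingStep
from stepfunctions.workflow import Workflow

# Role ARN copied in step 18 (placeholder account id)
workflow_execution_role = "arn:aws:iam::123456789012:role/AmazonSageMaker-StepFunctionsWorkflowExecutionRole"

# Estimator that runs the hermione-train image pushed by build_and_push.sh
# (image URI and S3 paths are placeholders)
train_estimator = Estimator(
    image_uri="123456789012.dkr.ecr.us-east-1.amazonaws.com/hermione-train:latest",
    role=sagemaker.get_execution_role(),
    instance_count=1,
    instance_type="ml.m5.large",
    output_path="s3://my-bucket/titanic/model",
)

# One-step state machine: run a SageMaker training job on preprocessed data
train_step = TrainingStep(
    "SageMaker Training Step",
    estimator=train_estimator,
    data={"train": "s3://my-bucket/titanic/preprocessed/train"},
    job_name="hermione-train-job",  # static name: rerunning needs a new, unique name
)

workflow = Workflow(
    name="hermione-train-workflow",
    definition=Chain([train_step]),
    role=workflow_execution_role,
)
workflow.create()   # registers the state machine in Step Functions
workflow.execute()  # starts an execution
```

The notebooks below build the full versions of these state machines.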
+ + +### Notebooks + +To create and test the Step Functions state machines, execute the following notebooks: + +- project-name/src/ml/notebooks/Sagemaker_StepFunctions_Train.ipynb +- project-name/src/ml/notebooks/Sagemaker_StepFunctions_Inference.ipynb \ No newline at end of file diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/build_and_push.sh b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/build_and_push.sh new file mode 100644 index 0000000..b1ea715 --- /dev/null +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/build_and_push.sh @@ -0,0 +1,52 @@ +#!/usr/bin/env bash + +# This script shows how to build the Docker image and push it to ECR to be ready for use +# by SageMaker. + +# The arguments to this script are the mode (processor, train or inference) and the image name. The image name will be used on the local +# machine and combined with the account and region to form the repository name for ECR. +mode=$1 +image=$2 + + +if [ "$mode" == "" ] || [ "$image" == "" ] +then + echo "Usage: $0 <mode> <image-name>" + exit 1 +fi + + +# Get the account number associated with the current IAM credentials +account=$(aws sts get-caller-identity --query Account --output text) + +if [ $? -ne 0 ] +then + exit 255 +fi + + +# Get the region defined in the current configuration (default to us-east-1 if none defined) +region=$(aws configure get region) +region=${region:-us-east-1} + + +fullname="${account}.dkr.ecr.${region}.amazonaws.com/${image}:latest" + +# If the repository doesn't exist in ECR, create it. + +aws ecr describe-repositories --repository-names "${image}" > /dev/null 2>&1 + +if [ $? -ne 0 ] +then + aws ecr create-repository --repository-name "${image}" > /dev/null +fi + +# Get the login command from ECR and execute it directly +aws ecr get-login-password --region "${region}" | docker login --username AWS --password-stdin "${account}".dkr.ecr."${region}".amazonaws.com + +# Build the docker image locally with the image name and then push it to ECR +# with the full name. +docker build -f ${mode}/Dockerfile -t ${image} . +docker tag ${image} ${fullname} + +docker push ${fullname} \ No newline at end of file diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/data/raw/train.csv b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/data/raw/train.csv new file mode 100644 index 0000000..b0ee013 --- /dev/null +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/data/raw/train.csv @@ -0,0 +1,892 @@ +PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked +1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S +2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38,1,0,PC 17599,71.2833,C85,C +3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S +4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S +5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,,S +6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q +7,0,1,"McCarthy, Mr. Timothy J",male,54,0,0,17463,51.8625,E46,S +8,0,3,"Palsson, Master. Gosta Leonard",male,2,3,1,349909,21.075,,S +9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27,0,2,347742,11.1333,,S +10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14,1,0,237736,30.0708,,C +11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4,1,1,PP 9549,16.7,G6,S +12,1,1,"Bonnell, Miss. Elizabeth",female,58,0,0,113783,26.55,C103,S +13,0,3,"Saundercock, Mr. William Henry",male,20,0,0,A/5. 2151,8.05,,S +14,0,3,"Andersson, Mr. Anders Johan",male,39,1,5,347082,31.275,,S +15,0,3,"Vestrom, Miss.
Hulda Amanda Adolfina",female,14,0,0,350406,7.8542,,S +16,1,2,"Hewlett, Mrs. (Mary D Kingcome) ",female,55,0,0,248706,16,,S +17,0,3,"Rice, Master. Eugene",male,2,4,1,382652,29.125,,Q +18,1,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13,,S +19,0,3,"Vander Planke, Mrs. Julius (Emelia Maria Vandemoortele)",female,31,1,0,345763,18,,S +20,1,3,"Masselmani, Mrs. Fatima",female,,0,0,2649,7.225,,C +21,0,2,"Fynney, Mr. Joseph J",male,35,0,0,239865,26,,S +22,1,2,"Beesley, Mr. Lawrence",male,34,0,0,248698,13,D56,S +23,1,3,"McGowan, Miss. Anna ""Annie""",female,15,0,0,330923,8.0292,,Q +24,1,1,"Sloper, Mr. William Thompson",male,28,0,0,113788,35.5,A6,S +25,0,3,"Palsson, Miss. Torborg Danira",female,8,3,1,349909,21.075,,S +26,1,3,"Asplund, Mrs. Carl Oscar (Selma Augusta Emilia Johansson)",female,38,1,5,347077,31.3875,,S +27,0,3,"Emir, Mr. Farred Chehab",male,,0,0,2631,7.225,,C +28,0,1,"Fortune, Mr. Charles Alexander",male,19,3,2,19950,263,C23 C25 C27,S +29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,,0,0,330959,7.8792,,Q +30,0,3,"Todoroff, Mr. Lalio",male,,0,0,349216,7.8958,,S +31,0,1,"Uruchurtu, Don. Manuel E",male,40,0,0,PC 17601,27.7208,,C +32,1,1,"Spencer, Mrs. William Augustus (Marie Eugenie)",female,,1,0,PC 17569,146.5208,B78,C +33,1,3,"Glynn, Miss. Mary Agatha",female,,0,0,335677,7.75,,Q +34,0,2,"Wheadon, Mr. Edward H",male,66,0,0,C.A. 24579,10.5,,S +35,0,1,"Meyer, Mr. Edgar Joseph",male,28,1,0,PC 17604,82.1708,,C +36,0,1,"Holverson, Mr. Alexander Oskar",male,42,1,0,113789,52,,S +37,1,3,"Mamee, Mr. Hanna",male,,0,0,2677,7.2292,,C +38,0,3,"Cann, Mr. Ernest Charles",male,21,0,0,A./5. 2152,8.05,,S +39,0,3,"Vander Planke, Miss. Augusta Maria",female,18,2,0,345764,18,,S +40,1,3,"Nicola-Yarred, Miss. Jamila",female,14,1,0,2651,11.2417,,C +41,0,3,"Ahlin, Mrs. Johan (Johanna Persdotter Larsson)",female,40,1,0,7546,9.475,,S +42,0,2,"Turpin, Mrs. William John Robert (Dorothy Ann Wonnacott)",female,27,1,0,11668,21,,S +43,0,3,"Kraeff, Mr. Theodor",male,,0,0,349253,7.8958,,C +44,1,2,"Laroche, Miss. Simonne Marie Anne Andree",female,3,1,2,SC/Paris 2123,41.5792,,C +45,1,3,"Devaney, Miss. Margaret Delia",female,19,0,0,330958,7.8792,,Q +46,0,3,"Rogers, Mr. William John",male,,0,0,S.C./A.4. 23567,8.05,,S +47,0,3,"Lennon, Mr. Denis",male,,1,0,370371,15.5,,Q +48,1,3,"O'Driscoll, Miss. Bridget",female,,0,0,14311,7.75,,Q +49,0,3,"Samaan, Mr. Youssef",male,,2,0,2662,21.6792,,C +50,0,3,"Arnold-Franchi, Mrs. Josef (Josefine Franchi)",female,18,1,0,349237,17.8,,S +51,0,3,"Panula, Master. Juha Niilo",male,7,4,1,3101295,39.6875,,S +52,0,3,"Nosworthy, Mr. Richard Cater",male,21,0,0,A/4. 39886,7.8,,S +53,1,1,"Harper, Mrs. Henry Sleeper (Myna Haxtun)",female,49,1,0,PC 17572,76.7292,D33,C +54,1,2,"Faunthorpe, Mrs. Lizzie (Elizabeth Anne Wilkinson)",female,29,1,0,2926,26,,S +55,0,1,"Ostby, Mr. Engelhart Cornelius",male,65,0,1,113509,61.9792,B30,C +56,1,1,"Woolner, Mr. Hugh",male,,0,0,19947,35.5,C52,S +57,1,2,"Rugg, Miss. Emily",female,21,0,0,C.A. 31026,10.5,,S +58,0,3,"Novel, Mr. Mansouer",male,28.5,0,0,2697,7.2292,,C +59,1,2,"West, Miss. Constance Mirium",female,5,1,2,C.A. 34651,27.75,,S +60,0,3,"Goodwin, Master. William Frederick",male,11,5,2,CA 2144,46.9,,S +61,0,3,"Sirayanian, Mr. Orsen",male,22,0,0,2669,7.2292,,C +62,1,1,"Icard, Miss. Amelie",female,38,0,0,113572,80,B28, +63,0,1,"Harris, Mr. Henry Birkhardt",male,45,1,0,36973,83.475,C83,S +64,0,3,"Skoog, Master. Harald",male,4,3,2,347088,27.9,,S +65,0,1,"Stewart, Mr. Albert A",male,,0,0,PC 17605,27.7208,,C +66,1,3,"Moubarek, Master. 
Gerios",male,,1,1,2661,15.2458,,C +67,1,2,"Nye, Mrs. (Elizabeth Ramell)",female,29,0,0,C.A. 29395,10.5,F33,S +68,0,3,"Crease, Mr. Ernest James",male,19,0,0,S.P. 3464,8.1583,,S +69,1,3,"Andersson, Miss. Erna Alexandra",female,17,4,2,3101281,7.925,,S +70,0,3,"Kink, Mr. Vincenz",male,26,2,0,315151,8.6625,,S +71,0,2,"Jenkin, Mr. Stephen Curnow",male,32,0,0,C.A. 33111,10.5,,S +72,0,3,"Goodwin, Miss. Lillian Amy",female,16,5,2,CA 2144,46.9,,S +73,0,2,"Hood, Mr. Ambrose Jr",male,21,0,0,S.O.C. 14879,73.5,,S +74,0,3,"Chronopoulos, Mr. Apostolos",male,26,1,0,2680,14.4542,,C +75,1,3,"Bing, Mr. Lee",male,32,0,0,1601,56.4958,,S +76,0,3,"Moen, Mr. Sigurd Hansen",male,25,0,0,348123,7.65,F G73,S +77,0,3,"Staneff, Mr. Ivan",male,,0,0,349208,7.8958,,S +78,0,3,"Moutal, Mr. Rahamin Haim",male,,0,0,374746,8.05,,S +79,1,2,"Caldwell, Master. Alden Gates",male,0.83,0,2,248738,29,,S +80,1,3,"Dowdell, Miss. Elizabeth",female,30,0,0,364516,12.475,,S +81,0,3,"Waelens, Mr. Achille",male,22,0,0,345767,9,,S +82,1,3,"Sheerlinck, Mr. Jan Baptist",male,29,0,0,345779,9.5,,S +83,1,3,"McDermott, Miss. Brigdet Delia",female,,0,0,330932,7.7875,,Q +84,0,1,"Carrau, Mr. Francisco M",male,28,0,0,113059,47.1,,S +85,1,2,"Ilett, Miss. Bertha",female,17,0,0,SO/C 14885,10.5,,S +86,1,3,"Backstrom, Mrs. Karl Alfred (Maria Mathilda Gustafsson)",female,33,3,0,3101278,15.85,,S +87,0,3,"Ford, Mr. William Neal",male,16,1,3,W./C. 6608,34.375,,S +88,0,3,"Slocovski, Mr. Selman Francis",male,,0,0,SOTON/OQ 392086,8.05,,S +89,1,1,"Fortune, Miss. Mabel Helen",female,23,3,2,19950,263,C23 C25 C27,S +90,0,3,"Celotti, Mr. Francesco",male,24,0,0,343275,8.05,,S +91,0,3,"Christmann, Mr. Emil",male,29,0,0,343276,8.05,,S +92,0,3,"Andreasson, Mr. Paul Edvin",male,20,0,0,347466,7.8542,,S +93,0,1,"Chaffee, Mr. Herbert Fuller",male,46,1,0,W.E.P. 5734,61.175,E31,S +94,0,3,"Dean, Mr. Bertram Frank",male,26,1,2,C.A. 2315,20.575,,S +95,0,3,"Coxon, Mr. Daniel",male,59,0,0,364500,7.25,,S +96,0,3,"Shorney, Mr. Charles Joseph",male,,0,0,374910,8.05,,S +97,0,1,"Goldschmidt, Mr. George B",male,71,0,0,PC 17754,34.6542,A5,C +98,1,1,"Greenfield, Mr. William Bertram",male,23,0,1,PC 17759,63.3583,D10 D12,C +99,1,2,"Doling, Mrs. John T (Ada Julia Bone)",female,34,0,1,231919,23,,S +100,0,2,"Kantor, Mr. Sinai",male,34,1,0,244367,26,,S +101,0,3,"Petranec, Miss. Matilda",female,28,0,0,349245,7.8958,,S +102,0,3,"Petroff, Mr. Pastcho (""Pentcho"")",male,,0,0,349215,7.8958,,S +103,0,1,"White, Mr. Richard Frasar",male,21,0,1,35281,77.2875,D26,S +104,0,3,"Johansson, Mr. Gustaf Joel",male,33,0,0,7540,8.6542,,S +105,0,3,"Gustafsson, Mr. Anders Vilhelm",male,37,2,0,3101276,7.925,,S +106,0,3,"Mionoff, Mr. Stoytcho",male,28,0,0,349207,7.8958,,S +107,1,3,"Salkjelsvik, Miss. Anna Kristine",female,21,0,0,343120,7.65,,S +108,1,3,"Moss, Mr. Albert Johan",male,,0,0,312991,7.775,,S +109,0,3,"Rekic, Mr. Tido",male,38,0,0,349249,7.8958,,S +110,1,3,"Moran, Miss. Bertha",female,,1,0,371110,24.15,,Q +111,0,1,"Porter, Mr. Walter Chamberlain",male,47,0,0,110465,52,C110,S +112,0,3,"Zabour, Miss. Hileni",female,14.5,1,0,2665,14.4542,,C +113,0,3,"Barton, Mr. David John",male,22,0,0,324669,8.05,,S +114,0,3,"Jussila, Miss. Katriina",female,20,1,0,4136,9.825,,S +115,0,3,"Attalah, Miss. Malake",female,17,0,0,2627,14.4583,,C +116,0,3,"Pekoniemi, Mr. Edvard",male,21,0,0,STON/O 2. 3101294,7.925,,S +117,0,3,"Connors, Mr. Patrick",male,70.5,0,0,370369,7.75,,Q +118,0,2,"Turpin, Mr. William John Robert",male,29,1,0,11668,21,,S +119,0,1,"Baxter, Mr. 
Quigg Edmond",male,24,0,1,PC 17558,247.5208,B58 B60,C +120,0,3,"Andersson, Miss. Ellis Anna Maria",female,2,4,2,347082,31.275,,S +121,0,2,"Hickman, Mr. Stanley George",male,21,2,0,S.O.C. 14879,73.5,,S +122,0,3,"Moore, Mr. Leonard Charles",male,,0,0,A4. 54510,8.05,,S +123,0,2,"Nasser, Mr. Nicholas",male,32.5,1,0,237736,30.0708,,C +124,1,2,"Webber, Miss. Susan",female,32.5,0,0,27267,13,E101,S +125,0,1,"White, Mr. Percival Wayland",male,54,0,1,35281,77.2875,D26,S +126,1,3,"Nicola-Yarred, Master. Elias",male,12,1,0,2651,11.2417,,C +127,0,3,"McMahon, Mr. Martin",male,,0,0,370372,7.75,,Q +128,1,3,"Madsen, Mr. Fridtjof Arne",male,24,0,0,C 17369,7.1417,,S +129,1,3,"Peter, Miss. Anna",female,,1,1,2668,22.3583,F E69,C +130,0,3,"Ekstrom, Mr. Johan",male,45,0,0,347061,6.975,,S +131,0,3,"Drazenoic, Mr. Jozef",male,33,0,0,349241,7.8958,,C +132,0,3,"Coelho, Mr. Domingos Fernandeo",male,20,0,0,SOTON/O.Q. 3101307,7.05,,S +133,0,3,"Robins, Mrs. Alexander A (Grace Charity Laury)",female,47,1,0,A/5. 3337,14.5,,S +134,1,2,"Weisz, Mrs. Leopold (Mathilde Francoise Pede)",female,29,1,0,228414,26,,S +135,0,2,"Sobey, Mr. Samuel James Hayden",male,25,0,0,C.A. 29178,13,,S +136,0,2,"Richard, Mr. Emile",male,23,0,0,SC/PARIS 2133,15.0458,,C +137,1,1,"Newsom, Miss. Helen Monypeny",female,19,0,2,11752,26.2833,D47,S +138,0,1,"Futrelle, Mr. Jacques Heath",male,37,1,0,113803,53.1,C123,S +139,0,3,"Osen, Mr. Olaf Elon",male,16,0,0,7534,9.2167,,S +140,0,1,"Giglio, Mr. Victor",male,24,0,0,PC 17593,79.2,B86,C +141,0,3,"Boulos, Mrs. Joseph (Sultana)",female,,0,2,2678,15.2458,,C +142,1,3,"Nysten, Miss. Anna Sofia",female,22,0,0,347081,7.75,,S +143,1,3,"Hakkarainen, Mrs. Pekka Pietari (Elin Matilda Dolck)",female,24,1,0,STON/O2. 3101279,15.85,,S +144,0,3,"Burke, Mr. Jeremiah",male,19,0,0,365222,6.75,,Q +145,0,2,"Andrew, Mr. Edgardo Samuel",male,18,0,0,231945,11.5,,S +146,0,2,"Nicholls, Mr. Joseph Charles",male,19,1,1,C.A. 33112,36.75,,S +147,1,3,"Andersson, Mr. August Edvard (""Wennerstrom"")",male,27,0,0,350043,7.7958,,S +148,0,3,"Ford, Miss. Robina Maggie ""Ruby""",female,9,2,2,W./C. 6608,34.375,,S +149,0,2,"Navratil, Mr. Michel (""Louis M Hoffman"")",male,36.5,0,2,230080,26,F2,S +150,0,2,"Byles, Rev. Thomas Roussel Davids",male,42,0,0,244310,13,,S +151,0,2,"Bateman, Rev. Robert James",male,51,0,0,S.O.P. 1166,12.525,,S +152,1,1,"Pears, Mrs. Thomas (Edith Wearne)",female,22,1,0,113776,66.6,C2,S +153,0,3,"Meo, Mr. Alfonzo",male,55.5,0,0,A.5. 11206,8.05,,S +154,0,3,"van Billiard, Mr. Austin Blyler",male,40.5,0,2,A/5. 851,14.5,,S +155,0,3,"Olsen, Mr. Ole Martin",male,,0,0,Fa 265302,7.3125,,S +156,0,1,"Williams, Mr. Charles Duane",male,51,0,1,PC 17597,61.3792,,C +157,1,3,"Gilnagh, Miss. Katherine ""Katie""",female,16,0,0,35851,7.7333,,Q +158,0,3,"Corn, Mr. Harry",male,30,0,0,SOTON/OQ 392090,8.05,,S +159,0,3,"Smiljanic, Mr. Mile",male,,0,0,315037,8.6625,,S +160,0,3,"Sage, Master. Thomas Henry",male,,8,2,CA. 2343,69.55,,S +161,0,3,"Cribb, Mr. John Hatfield",male,44,0,1,371362,16.1,,S +162,1,2,"Watt, Mrs. James (Elizabeth ""Bessie"" Inglis Milne)",female,40,0,0,C.A. 33595,15.75,,S +163,0,3,"Bengtsson, Mr. John Viktor",male,26,0,0,347068,7.775,,S +164,0,3,"Calic, Mr. Jovo",male,17,0,0,315093,8.6625,,S +165,0,3,"Panula, Master. Eino Viljami",male,1,4,1,3101295,39.6875,,S +166,1,3,"Goldsmith, Master. Frank John William ""Frankie""",male,9,0,2,363291,20.525,,S +167,1,1,"Chibnall, Mrs. (Edith Martha Bowerman)",female,,0,1,113505,55,E33,S +168,0,3,"Skoog, Mrs. 
William (Anna Bernhardina Karlsson)",female,45,1,4,347088,27.9,,S +169,0,1,"Baumann, Mr. John D",male,,0,0,PC 17318,25.925,,S +170,0,3,"Ling, Mr. Lee",male,28,0,0,1601,56.4958,,S +171,0,1,"Van der hoef, Mr. Wyckoff",male,61,0,0,111240,33.5,B19,S +172,0,3,"Rice, Master. Arthur",male,4,4,1,382652,29.125,,Q +173,1,3,"Johnson, Miss. Eleanor Ileen",female,1,1,1,347742,11.1333,,S +174,0,3,"Sivola, Mr. Antti Wilhelm",male,21,0,0,STON/O 2. 3101280,7.925,,S +175,0,1,"Smith, Mr. James Clinch",male,56,0,0,17764,30.6958,A7,C +176,0,3,"Klasen, Mr. Klas Albin",male,18,1,1,350404,7.8542,,S +177,0,3,"Lefebre, Master. Henry Forbes",male,,3,1,4133,25.4667,,S +178,0,1,"Isham, Miss. Ann Elizabeth",female,50,0,0,PC 17595,28.7125,C49,C +179,0,2,"Hale, Mr. Reginald",male,30,0,0,250653,13,,S +180,0,3,"Leonard, Mr. Lionel",male,36,0,0,LINE,0,,S +181,0,3,"Sage, Miss. Constance Gladys",female,,8,2,CA. 2343,69.55,,S +182,0,2,"Pernot, Mr. Rene",male,,0,0,SC/PARIS 2131,15.05,,C +183,0,3,"Asplund, Master. Clarence Gustaf Hugo",male,9,4,2,347077,31.3875,,S +184,1,2,"Becker, Master. Richard F",male,1,2,1,230136,39,F4,S +185,1,3,"Kink-Heilmann, Miss. Luise Gretchen",female,4,0,2,315153,22.025,,S +186,0,1,"Rood, Mr. Hugh Roscoe",male,,0,0,113767,50,A32,S +187,1,3,"O'Brien, Mrs. Thomas (Johanna ""Hannah"" Godfrey)",female,,1,0,370365,15.5,,Q +188,1,1,"Romaine, Mr. Charles Hallace (""Mr C Rolmane"")",male,45,0,0,111428,26.55,,S +189,0,3,"Bourke, Mr. John",male,40,1,1,364849,15.5,,Q +190,0,3,"Turcin, Mr. Stjepan",male,36,0,0,349247,7.8958,,S +191,1,2,"Pinsky, Mrs. (Rosa)",female,32,0,0,234604,13,,S +192,0,2,"Carbines, Mr. William",male,19,0,0,28424,13,,S +193,1,3,"Andersen-Jensen, Miss. Carla Christine Nielsine",female,19,1,0,350046,7.8542,,S +194,1,2,"Navratil, Master. Michel M",male,3,1,1,230080,26,F2,S +195,1,1,"Brown, Mrs. James Joseph (Margaret Tobin)",female,44,0,0,PC 17610,27.7208,B4,C +196,1,1,"Lurette, Miss. Elise",female,58,0,0,PC 17569,146.5208,B80,C +197,0,3,"Mernagh, Mr. Robert",male,,0,0,368703,7.75,,Q +198,0,3,"Olsen, Mr. Karl Siegwart Andreas",male,42,0,1,4579,8.4042,,S +199,1,3,"Madigan, Miss. Margaret ""Maggie""",female,,0,0,370370,7.75,,Q +200,0,2,"Yrois, Miss. Henriette (""Mrs Harbeck"")",female,24,0,0,248747,13,,S +201,0,3,"Vande Walle, Mr. Nestor Cyriel",male,28,0,0,345770,9.5,,S +202,0,3,"Sage, Mr. Frederick",male,,8,2,CA. 2343,69.55,,S +203,0,3,"Johanson, Mr. Jakob Alfred",male,34,0,0,3101264,6.4958,,S +204,0,3,"Youseff, Mr. Gerious",male,45.5,0,0,2628,7.225,,C +205,1,3,"Cohen, Mr. Gurshon ""Gus""",male,18,0,0,A/5 3540,8.05,,S +206,0,3,"Strom, Miss. Telma Matilda",female,2,0,1,347054,10.4625,G6,S +207,0,3,"Backstrom, Mr. Karl Alfred",male,32,1,0,3101278,15.85,,S +208,1,3,"Albimona, Mr. Nassef Cassem",male,26,0,0,2699,18.7875,,C +209,1,3,"Carr, Miss. Helen ""Ellen""",female,16,0,0,367231,7.75,,Q +210,1,1,"Blank, Mr. Henry",male,40,0,0,112277,31,A31,C +211,0,3,"Ali, Mr. Ahmed",male,24,0,0,SOTON/O.Q. 3101311,7.05,,S +212,1,2,"Cameron, Miss. Clear Annie",female,35,0,0,F.C.C. 13528,21,,S +213,0,3,"Perkin, Mr. John Henry",male,22,0,0,A/5 21174,7.25,,S +214,0,2,"Givard, Mr. Hans Kristensen",male,30,0,0,250646,13,,S +215,0,3,"Kiernan, Mr. Philip",male,,1,0,367229,7.75,,Q +216,1,1,"Newell, Miss. Madeleine",female,31,1,0,35273,113.275,D36,C +217,1,3,"Honkanen, Miss. Eliina",female,27,0,0,STON/O2. 3101283,7.925,,S +218,0,2,"Jacobsohn, Mr. Sidney Samuel",male,42,1,0,243847,27,,S +219,1,1,"Bazzani, Miss. Albina",female,32,0,0,11813,76.2917,D15,C +220,0,2,"Harris, Mr. 
Walter",male,30,0,0,W/C 14208,10.5,,S +221,1,3,"Sunderland, Mr. Victor Francis",male,16,0,0,SOTON/OQ 392089,8.05,,S +222,0,2,"Bracken, Mr. James H",male,27,0,0,220367,13,,S +223,0,3,"Green, Mr. George Henry",male,51,0,0,21440,8.05,,S +224,0,3,"Nenkoff, Mr. Christo",male,,0,0,349234,7.8958,,S +225,1,1,"Hoyt, Mr. Frederick Maxfield",male,38,1,0,19943,90,C93,S +226,0,3,"Berglund, Mr. Karl Ivar Sven",male,22,0,0,PP 4348,9.35,,S +227,1,2,"Mellors, Mr. William John",male,19,0,0,SW/PP 751,10.5,,S +228,0,3,"Lovell, Mr. John Hall (""Henry"")",male,20.5,0,0,A/5 21173,7.25,,S +229,0,2,"Fahlstrom, Mr. Arne Jonas",male,18,0,0,236171,13,,S +230,0,3,"Lefebre, Miss. Mathilde",female,,3,1,4133,25.4667,,S +231,1,1,"Harris, Mrs. Henry Birkhardt (Irene Wallach)",female,35,1,0,36973,83.475,C83,S +232,0,3,"Larsson, Mr. Bengt Edvin",male,29,0,0,347067,7.775,,S +233,0,2,"Sjostedt, Mr. Ernst Adolf",male,59,0,0,237442,13.5,,S +234,1,3,"Asplund, Miss. Lillian Gertrud",female,5,4,2,347077,31.3875,,S +235,0,2,"Leyson, Mr. Robert William Norman",male,24,0,0,C.A. 29566,10.5,,S +236,0,3,"Harknett, Miss. Alice Phoebe",female,,0,0,W./C. 6609,7.55,,S +237,0,2,"Hold, Mr. Stephen",male,44,1,0,26707,26,,S +238,1,2,"Collyer, Miss. Marjorie ""Lottie""",female,8,0,2,C.A. 31921,26.25,,S +239,0,2,"Pengelly, Mr. Frederick William",male,19,0,0,28665,10.5,,S +240,0,2,"Hunt, Mr. George Henry",male,33,0,0,SCO/W 1585,12.275,,S +241,0,3,"Zabour, Miss. Thamine",female,,1,0,2665,14.4542,,C +242,1,3,"Murphy, Miss. Katherine ""Kate""",female,,1,0,367230,15.5,,Q +243,0,2,"Coleridge, Mr. Reginald Charles",male,29,0,0,W./C. 14263,10.5,,S +244,0,3,"Maenpaa, Mr. Matti Alexanteri",male,22,0,0,STON/O 2. 3101275,7.125,,S +245,0,3,"Attalah, Mr. Sleiman",male,30,0,0,2694,7.225,,C +246,0,1,"Minahan, Dr. William Edward",male,44,2,0,19928,90,C78,Q +247,0,3,"Lindahl, Miss. Agda Thorilda Viktoria",female,25,0,0,347071,7.775,,S +248,1,2,"Hamalainen, Mrs. William (Anna)",female,24,0,2,250649,14.5,,S +249,1,1,"Beckwith, Mr. Richard Leonard",male,37,1,1,11751,52.5542,D35,S +250,0,2,"Carter, Rev. Ernest Courtenay",male,54,1,0,244252,26,,S +251,0,3,"Reed, Mr. James George",male,,0,0,362316,7.25,,S +252,0,3,"Strom, Mrs. Wilhelm (Elna Matilda Persson)",female,29,1,1,347054,10.4625,G6,S +253,0,1,"Stead, Mr. William Thomas",male,62,0,0,113514,26.55,C87,S +254,0,3,"Lobb, Mr. William Arthur",male,30,1,0,A/5. 3336,16.1,,S +255,0,3,"Rosblom, Mrs. Viktor (Helena Wilhelmina)",female,41,0,2,370129,20.2125,,S +256,1,3,"Touma, Mrs. Darwis (Hanne Youssef Razi)",female,29,0,2,2650,15.2458,,C +257,1,1,"Thorne, Mrs. Gertrude Maybelle",female,,0,0,PC 17585,79.2,,C +258,1,1,"Cherry, Miss. Gladys",female,30,0,0,110152,86.5,B77,S +259,1,1,"Ward, Miss. Anna",female,35,0,0,PC 17755,512.3292,,C +260,1,2,"Parrish, Mrs. (Lutie Davis)",female,50,0,1,230433,26,,S +261,0,3,"Smith, Mr. Thomas",male,,0,0,384461,7.75,,Q +262,1,3,"Asplund, Master. Edvin Rojj Felix",male,3,4,2,347077,31.3875,,S +263,0,1,"Taussig, Mr. Emil",male,52,1,1,110413,79.65,E67,S +264,0,1,"Harrison, Mr. William",male,40,0,0,112059,0,B94,S +265,0,3,"Henry, Miss. Delia",female,,0,0,382649,7.75,,Q +266,0,2,"Reeves, Mr. David",male,36,0,0,C.A. 17248,10.5,,S +267,0,3,"Panula, Mr. Ernesti Arvid",male,16,4,1,3101295,39.6875,,S +268,1,3,"Persson, Mr. Ernst Ulrik",male,25,1,0,347083,7.775,,S +269,1,1,"Graham, Mrs. William Thompson (Edith Junkins)",female,58,0,1,PC 17582,153.4625,C125,S +270,1,1,"Bissette, Miss. Amelia",female,35,0,0,PC 17760,135.6333,C99,S +271,0,1,"Cairns, Mr. 
Alexander",male,,0,0,113798,31,,S +272,1,3,"Tornquist, Mr. William Henry",male,25,0,0,LINE,0,,S +273,1,2,"Mellinger, Mrs. (Elizabeth Anne Maidment)",female,41,0,1,250644,19.5,,S +274,0,1,"Natsch, Mr. Charles H",male,37,0,1,PC 17596,29.7,C118,C +275,1,3,"Healy, Miss. Hanora ""Nora""",female,,0,0,370375,7.75,,Q +276,1,1,"Andrews, Miss. Kornelia Theodosia",female,63,1,0,13502,77.9583,D7,S +277,0,3,"Lindblom, Miss. Augusta Charlotta",female,45,0,0,347073,7.75,,S +278,0,2,"Parkes, Mr. Francis ""Frank""",male,,0,0,239853,0,,S +279,0,3,"Rice, Master. Eric",male,7,4,1,382652,29.125,,Q +280,1,3,"Abbott, Mrs. Stanton (Rosa Hunt)",female,35,1,1,C.A. 2673,20.25,,S +281,0,3,"Duane, Mr. Frank",male,65,0,0,336439,7.75,,Q +282,0,3,"Olsson, Mr. Nils Johan Goransson",male,28,0,0,347464,7.8542,,S +283,0,3,"de Pelsmaeker, Mr. Alfons",male,16,0,0,345778,9.5,,S +284,1,3,"Dorking, Mr. Edward Arthur",male,19,0,0,A/5. 10482,8.05,,S +285,0,1,"Smith, Mr. Richard William",male,,0,0,113056,26,A19,S +286,0,3,"Stankovic, Mr. Ivan",male,33,0,0,349239,8.6625,,C +287,1,3,"de Mulder, Mr. Theodore",male,30,0,0,345774,9.5,,S +288,0,3,"Naidenoff, Mr. Penko",male,22,0,0,349206,7.8958,,S +289,1,2,"Hosono, Mr. Masabumi",male,42,0,0,237798,13,,S +290,1,3,"Connolly, Miss. Kate",female,22,0,0,370373,7.75,,Q +291,1,1,"Barber, Miss. Ellen ""Nellie""",female,26,0,0,19877,78.85,,S +292,1,1,"Bishop, Mrs. Dickinson H (Helen Walton)",female,19,1,0,11967,91.0792,B49,C +293,0,2,"Levy, Mr. Rene Jacques",male,36,0,0,SC/Paris 2163,12.875,D,C +294,0,3,"Haas, Miss. Aloisia",female,24,0,0,349236,8.85,,S +295,0,3,"Mineff, Mr. Ivan",male,24,0,0,349233,7.8958,,S +296,0,1,"Lewy, Mr. Ervin G",male,,0,0,PC 17612,27.7208,,C +297,0,3,"Hanna, Mr. Mansour",male,23.5,0,0,2693,7.2292,,C +298,0,1,"Allison, Miss. Helen Loraine",female,2,1,2,113781,151.55,C22 C26,S +299,1,1,"Saalfeld, Mr. Adolphe",male,,0,0,19988,30.5,C106,S +300,1,1,"Baxter, Mrs. James (Helene DeLaudeniere Chaput)",female,50,0,1,PC 17558,247.5208,B58 B60,C +301,1,3,"Kelly, Miss. Anna Katherine ""Annie Kate""",female,,0,0,9234,7.75,,Q +302,1,3,"McCoy, Mr. Bernard",male,,2,0,367226,23.25,,Q +303,0,3,"Johnson, Mr. William Cahoone Jr",male,19,0,0,LINE,0,,S +304,1,2,"Keane, Miss. Nora A",female,,0,0,226593,12.35,E101,Q +305,0,3,"Williams, Mr. Howard Hugh ""Harry""",male,,0,0,A/5 2466,8.05,,S +306,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.55,C22 C26,S +307,1,1,"Fleming, Miss. Margaret",female,,0,0,17421,110.8833,,C +308,1,1,"Penasco y Castellana, Mrs. Victor de Satode (Maria Josefa Perez de Soto y Vallejo)",female,17,1,0,PC 17758,108.9,C65,C +309,0,2,"Abelson, Mr. Samuel",male,30,1,0,P/PP 3381,24,,C +310,1,1,"Francatelli, Miss. Laura Mabel",female,30,0,0,PC 17485,56.9292,E36,C +311,1,1,"Hays, Miss. Margaret Bechstein",female,24,0,0,11767,83.1583,C54,C +312,1,1,"Ryerson, Miss. Emily Borie",female,18,2,2,PC 17608,262.375,B57 B59 B63 B66,C +313,0,2,"Lahtinen, Mrs. William (Anna Sylfven)",female,26,1,1,250651,26,,S +314,0,3,"Hendekovic, Mr. Ignjac",male,28,0,0,349243,7.8958,,S +315,0,2,"Hart, Mr. Benjamin",male,43,1,1,F.C.C. 13529,26.25,,S +316,1,3,"Nilsson, Miss. Helmina Josefina",female,26,0,0,347470,7.8542,,S +317,1,2,"Kantor, Mrs. Sinai (Miriam Sternin)",female,24,1,0,244367,26,,S +318,0,2,"Moraweck, Dr. Ernest",male,54,0,0,29011,14,,S +319,1,1,"Wick, Miss. Mary Natalie",female,31,0,2,36928,164.8667,C7,S +320,1,1,"Spedden, Mrs. Frederic Oakley (Margaretta Corning Stone)",female,40,1,1,16966,134.5,E34,C +321,0,3,"Dennis, Mr. 
Samuel",male,22,0,0,A/5 21172,7.25,,S +322,0,3,"Danoff, Mr. Yoto",male,27,0,0,349219,7.8958,,S +323,1,2,"Slayter, Miss. Hilda Mary",female,30,0,0,234818,12.35,,Q +324,1,2,"Caldwell, Mrs. Albert Francis (Sylvia Mae Harbaugh)",female,22,1,1,248738,29,,S +325,0,3,"Sage, Mr. George John Jr",male,,8,2,CA. 2343,69.55,,S +326,1,1,"Young, Miss. Marie Grice",female,36,0,0,PC 17760,135.6333,C32,C +327,0,3,"Nysveen, Mr. Johan Hansen",male,61,0,0,345364,6.2375,,S +328,1,2,"Ball, Mrs. (Ada E Hall)",female,36,0,0,28551,13,D,S +329,1,3,"Goldsmith, Mrs. Frank John (Emily Alice Brown)",female,31,1,1,363291,20.525,,S +330,1,1,"Hippach, Miss. Jean Gertrude",female,16,0,1,111361,57.9792,B18,C +331,1,3,"McCoy, Miss. Agnes",female,,2,0,367226,23.25,,Q +332,0,1,"Partner, Mr. Austen",male,45.5,0,0,113043,28.5,C124,S +333,0,1,"Graham, Mr. George Edward",male,38,0,1,PC 17582,153.4625,C91,S +334,0,3,"Vander Planke, Mr. Leo Edmondus",male,16,2,0,345764,18,,S +335,1,1,"Frauenthal, Mrs. Henry William (Clara Heinsheimer)",female,,1,0,PC 17611,133.65,,S +336,0,3,"Denkoff, Mr. Mitto",male,,0,0,349225,7.8958,,S +337,0,1,"Pears, Mr. Thomas Clinton",male,29,1,0,113776,66.6,C2,S +338,1,1,"Burns, Miss. Elizabeth Margaret",female,41,0,0,16966,134.5,E40,C +339,1,3,"Dahl, Mr. Karl Edwart",male,45,0,0,7598,8.05,,S +340,0,1,"Blackwell, Mr. Stephen Weart",male,45,0,0,113784,35.5,T,S +341,1,2,"Navratil, Master. Edmond Roger",male,2,1,1,230080,26,F2,S +342,1,1,"Fortune, Miss. Alice Elizabeth",female,24,3,2,19950,263,C23 C25 C27,S +343,0,2,"Collander, Mr. Erik Gustaf",male,28,0,0,248740,13,,S +344,0,2,"Sedgwick, Mr. Charles Frederick Waddington",male,25,0,0,244361,13,,S +345,0,2,"Fox, Mr. Stanley Hubert",male,36,0,0,229236,13,,S +346,1,2,"Brown, Miss. Amelia ""Mildred""",female,24,0,0,248733,13,F33,S +347,1,2,"Smith, Miss. Marion Elsie",female,40,0,0,31418,13,,S +348,1,3,"Davison, Mrs. Thomas Henry (Mary E Finck)",female,,1,0,386525,16.1,,S +349,1,3,"Coutts, Master. William Loch ""William""",male,3,1,1,C.A. 37671,15.9,,S +350,0,3,"Dimic, Mr. Jovan",male,42,0,0,315088,8.6625,,S +351,0,3,"Odahl, Mr. Nils Martin",male,23,0,0,7267,9.225,,S +352,0,1,"Williams-Lambert, Mr. Fletcher Fellows",male,,0,0,113510,35,C128,S +353,0,3,"Elias, Mr. Tannous",male,15,1,1,2695,7.2292,,C +354,0,3,"Arnold-Franchi, Mr. Josef",male,25,1,0,349237,17.8,,S +355,0,3,"Yousif, Mr. Wazli",male,,0,0,2647,7.225,,C +356,0,3,"Vanden Steen, Mr. Leo Peter",male,28,0,0,345783,9.5,,S +357,1,1,"Bowerman, Miss. Elsie Edith",female,22,0,1,113505,55,E33,S +358,0,2,"Funk, Miss. Annie Clemmer",female,38,0,0,237671,13,,S +359,1,3,"McGovern, Miss. Mary",female,,0,0,330931,7.8792,,Q +360,1,3,"Mockler, Miss. Helen Mary ""Ellie""",female,,0,0,330980,7.8792,,Q +361,0,3,"Skoog, Mr. Wilhelm",male,40,1,4,347088,27.9,,S +362,0,2,"del Carlo, Mr. Sebastiano",male,29,1,0,SC/PARIS 2167,27.7208,,C +363,0,3,"Barbara, Mrs. (Catherine David)",female,45,0,1,2691,14.4542,,C +364,0,3,"Asim, Mr. Adola",male,35,0,0,SOTON/O.Q. 3101310,7.05,,S +365,0,3,"O'Brien, Mr. Thomas",male,,1,0,370365,15.5,,Q +366,0,3,"Adahl, Mr. Mauritz Nils Martin",male,30,0,0,C 7076,7.25,,S +367,1,1,"Warren, Mrs. Frank Manley (Anna Sophia Atkinson)",female,60,1,0,110813,75.25,D37,C +368,1,3,"Moussa, Mrs. (Mantoura Boulos)",female,,0,0,2626,7.2292,,C +369,1,3,"Jermyn, Miss. Annie",female,,0,0,14313,7.75,,Q +370,1,1,"Aubart, Mme. Leontine Pauline",female,24,0,0,PC 17477,69.3,B35,C +371,1,1,"Harder, Mr. George Achilles",male,25,1,0,11765,55.4417,E50,C +372,0,3,"Wiklund, Mr. 
Jakob Alfred",male,18,1,0,3101267,6.4958,,S +373,0,3,"Beavan, Mr. William Thomas",male,19,0,0,323951,8.05,,S +374,0,1,"Ringhini, Mr. Sante",male,22,0,0,PC 17760,135.6333,,C +375,0,3,"Palsson, Miss. Stina Viola",female,3,3,1,349909,21.075,,S +376,1,1,"Meyer, Mrs. Edgar Joseph (Leila Saks)",female,,1,0,PC 17604,82.1708,,C +377,1,3,"Landergren, Miss. Aurora Adelia",female,22,0,0,C 7077,7.25,,S +378,0,1,"Widener, Mr. Harry Elkins",male,27,0,2,113503,211.5,C82,C +379,0,3,"Betros, Mr. Tannous",male,20,0,0,2648,4.0125,,C +380,0,3,"Gustafsson, Mr. Karl Gideon",male,19,0,0,347069,7.775,,S +381,1,1,"Bidois, Miss. Rosalie",female,42,0,0,PC 17757,227.525,,C +382,1,3,"Nakid, Miss. Maria (""Mary"")",female,1,0,2,2653,15.7417,,C +383,0,3,"Tikkanen, Mr. Juho",male,32,0,0,STON/O 2. 3101293,7.925,,S +384,1,1,"Holverson, Mrs. Alexander Oskar (Mary Aline Towner)",female,35,1,0,113789,52,,S +385,0,3,"Plotcharsky, Mr. Vasil",male,,0,0,349227,7.8958,,S +386,0,2,"Davies, Mr. Charles Henry",male,18,0,0,S.O.C. 14879,73.5,,S +387,0,3,"Goodwin, Master. Sidney Leonard",male,1,5,2,CA 2144,46.9,,S +388,1,2,"Buss, Miss. Kate",female,36,0,0,27849,13,,S +389,0,3,"Sadlier, Mr. Matthew",male,,0,0,367655,7.7292,,Q +390,1,2,"Lehmann, Miss. Bertha",female,17,0,0,SC 1748,12,,C +391,1,1,"Carter, Mr. William Ernest",male,36,1,2,113760,120,B96 B98,S +392,1,3,"Jansson, Mr. Carl Olof",male,21,0,0,350034,7.7958,,S +393,0,3,"Gustafsson, Mr. Johan Birger",male,28,2,0,3101277,7.925,,S +394,1,1,"Newell, Miss. Marjorie",female,23,1,0,35273,113.275,D36,C +395,1,3,"Sandstrom, Mrs. Hjalmar (Agnes Charlotta Bengtsson)",female,24,0,2,PP 9549,16.7,G6,S +396,0,3,"Johansson, Mr. Erik",male,22,0,0,350052,7.7958,,S +397,0,3,"Olsson, Miss. Elina",female,31,0,0,350407,7.8542,,S +398,0,2,"McKane, Mr. Peter David",male,46,0,0,28403,26,,S +399,0,2,"Pain, Dr. Alfred",male,23,0,0,244278,10.5,,S +400,1,2,"Trout, Mrs. William H (Jessie L)",female,28,0,0,240929,12.65,,S +401,1,3,"Niskanen, Mr. Juha",male,39,0,0,STON/O 2. 3101289,7.925,,S +402,0,3,"Adams, Mr. John",male,26,0,0,341826,8.05,,S +403,0,3,"Jussila, Miss. Mari Aina",female,21,1,0,4137,9.825,,S +404,0,3,"Hakkarainen, Mr. Pekka Pietari",male,28,1,0,STON/O2. 3101279,15.85,,S +405,0,3,"Oreskovic, Miss. Marija",female,20,0,0,315096,8.6625,,S +406,0,2,"Gale, Mr. Shadrach",male,34,1,0,28664,21,,S +407,0,3,"Widegren, Mr. Carl/Charles Peter",male,51,0,0,347064,7.75,,S +408,1,2,"Richards, Master. William Rowe",male,3,1,1,29106,18.75,,S +409,0,3,"Birkeland, Mr. Hans Martin Monsen",male,21,0,0,312992,7.775,,S +410,0,3,"Lefebre, Miss. Ida",female,,3,1,4133,25.4667,,S +411,0,3,"Sdycoff, Mr. Todor",male,,0,0,349222,7.8958,,S +412,0,3,"Hart, Mr. Henry",male,,0,0,394140,6.8583,,Q +413,1,1,"Minahan, Miss. Daisy E",female,33,1,0,19928,90,C78,Q +414,0,2,"Cunningham, Mr. Alfred Fleming",male,,0,0,239853,0,,S +415,1,3,"Sundman, Mr. Johan Julian",male,44,0,0,STON/O 2. 3101269,7.925,,S +416,0,3,"Meek, Mrs. Thomas (Annie Louise Rowley)",female,,0,0,343095,8.05,,S +417,1,2,"Drew, Mrs. James Vivian (Lulu Thorne Christian)",female,34,1,1,28220,32.5,,S +418,1,2,"Silven, Miss. Lyyli Karoliina",female,18,0,2,250652,13,,S +419,0,2,"Matthews, Mr. William John",male,30,0,0,28228,13,,S +420,0,3,"Van Impe, Miss. Catharina",female,10,0,2,345773,24.15,,S +421,0,3,"Gheorgheff, Mr. Stanio",male,,0,0,349254,7.8958,,C +422,0,3,"Charters, Mr. David",male,21,0,0,A/5. 13032,7.7333,,Q +423,0,3,"Zimmerman, Mr. Leo",male,29,0,0,315082,7.875,,S +424,0,3,"Danbom, Mrs. 
Ernst Gilbert (Anna Sigrid Maria Brogren)",female,28,1,1,347080,14.4,,S +425,0,3,"Rosblom, Mr. Viktor Richard",male,18,1,1,370129,20.2125,,S +426,0,3,"Wiseman, Mr. Phillippe",male,,0,0,A/4. 34244,7.25,,S +427,1,2,"Clarke, Mrs. Charles V (Ada Maria Winfield)",female,28,1,0,2003,26,,S +428,1,2,"Phillips, Miss. Kate Florence (""Mrs Kate Louise Phillips Marshall"")",female,19,0,0,250655,26,,S +429,0,3,"Flynn, Mr. James",male,,0,0,364851,7.75,,Q +430,1,3,"Pickard, Mr. Berk (Berk Trembisky)",male,32,0,0,SOTON/O.Q. 392078,8.05,E10,S +431,1,1,"Bjornstrom-Steffansson, Mr. Mauritz Hakan",male,28,0,0,110564,26.55,C52,S +432,1,3,"Thorneycroft, Mrs. Percival (Florence Kate White)",female,,1,0,376564,16.1,,S +433,1,2,"Louch, Mrs. Charles Alexander (Alice Adelaide Slow)",female,42,1,0,SC/AH 3085,26,,S +434,0,3,"Kallio, Mr. Nikolai Erland",male,17,0,0,STON/O 2. 3101274,7.125,,S +435,0,1,"Silvey, Mr. William Baird",male,50,1,0,13507,55.9,E44,S +436,1,1,"Carter, Miss. Lucile Polk",female,14,1,2,113760,120,B96 B98,S +437,0,3,"Ford, Miss. Doolina Margaret ""Daisy""",female,21,2,2,W./C. 6608,34.375,,S +438,1,2,"Richards, Mrs. Sidney (Emily Hocking)",female,24,2,3,29106,18.75,,S +439,0,1,"Fortune, Mr. Mark",male,64,1,4,19950,263,C23 C25 C27,S +440,0,2,"Kvillner, Mr. Johan Henrik Johannesson",male,31,0,0,C.A. 18723,10.5,,S +441,1,2,"Hart, Mrs. Benjamin (Esther Ada Bloomfield)",female,45,1,1,F.C.C. 13529,26.25,,S +442,0,3,"Hampe, Mr. Leon",male,20,0,0,345769,9.5,,S +443,0,3,"Petterson, Mr. Johan Emil",male,25,1,0,347076,7.775,,S +444,1,2,"Reynaldo, Ms. Encarnacion",female,28,0,0,230434,13,,S +445,1,3,"Johannesen-Bratthammer, Mr. Bernt",male,,0,0,65306,8.1125,,S +446,1,1,"Dodge, Master. Washington",male,4,0,2,33638,81.8583,A34,S +447,1,2,"Mellinger, Miss. Madeleine Violet",female,13,0,1,250644,19.5,,S +448,1,1,"Seward, Mr. Frederic Kimber",male,34,0,0,113794,26.55,,S +449,1,3,"Baclini, Miss. Marie Catherine",female,5,2,1,2666,19.2583,,C +450,1,1,"Peuchen, Major. Arthur Godfrey",male,52,0,0,113786,30.5,C104,S +451,0,2,"West, Mr. Edwy Arthur",male,36,1,2,C.A. 34651,27.75,,S +452,0,3,"Hagland, Mr. Ingvald Olai Olsen",male,,1,0,65303,19.9667,,S +453,0,1,"Foreman, Mr. Benjamin Laventall",male,30,0,0,113051,27.75,C111,C +454,1,1,"Goldenberg, Mr. Samuel L",male,49,1,0,17453,89.1042,C92,C +455,0,3,"Peduzzi, Mr. Joseph",male,,0,0,A/5 2817,8.05,,S +456,1,3,"Jalsevac, Mr. Ivan",male,29,0,0,349240,7.8958,,C +457,0,1,"Millet, Mr. Francis Davis",male,65,0,0,13509,26.55,E38,S +458,1,1,"Kenyon, Mrs. Frederick R (Marion)",female,,1,0,17464,51.8625,D21,S +459,1,2,"Toomey, Miss. Ellen",female,50,0,0,F.C.C. 13531,10.5,,S +460,0,3,"O'Connor, Mr. Maurice",male,,0,0,371060,7.75,,Q +461,1,1,"Anderson, Mr. Harry",male,48,0,0,19952,26.55,E12,S +462,0,3,"Morley, Mr. William",male,34,0,0,364506,8.05,,S +463,0,1,"Gee, Mr. Arthur H",male,47,0,0,111320,38.5,E63,S +464,0,2,"Milling, Mr. Jacob Christian",male,48,0,0,234360,13,,S +465,0,3,"Maisner, Mr. Simon",male,,0,0,A/S 2816,8.05,,S +466,0,3,"Goncalves, Mr. Manuel Estanslas",male,38,0,0,SOTON/O.Q. 3101306,7.05,,S +467,0,2,"Campbell, Mr. William",male,,0,0,239853,0,,S +468,0,1,"Smart, Mr. John Montgomery",male,56,0,0,113792,26.55,,S +469,0,3,"Scanlan, Mr. James",male,,0,0,36209,7.725,,Q +470,1,3,"Baclini, Miss. Helene Barbara",female,0.75,2,1,2666,19.2583,,C +471,0,3,"Keefe, Mr. Arthur",male,,0,0,323592,7.25,,S +472,0,3,"Cacic, Mr. Luka",male,38,0,0,315089,8.6625,,S +473,1,2,"West, Mrs. Edwy Arthur (Ada Mary Worth)",female,33,1,2,C.A. 34651,27.75,,S +474,1,2,"Jerwan, Mrs. 
Amin S (Marie Marthe Thuillard)",female,23,0,0,SC/AH Basle 541,13.7917,D,C +475,0,3,"Strandberg, Miss. Ida Sofia",female,22,0,0,7553,9.8375,,S +476,0,1,"Clifford, Mr. George Quincy",male,,0,0,110465,52,A14,S +477,0,2,"Renouf, Mr. Peter Henry",male,34,1,0,31027,21,,S +478,0,3,"Braund, Mr. Lewis Richard",male,29,1,0,3460,7.0458,,S +479,0,3,"Karlsson, Mr. Nils August",male,22,0,0,350060,7.5208,,S +480,1,3,"Hirvonen, Miss. Hildur E",female,2,0,1,3101298,12.2875,,S +481,0,3,"Goodwin, Master. Harold Victor",male,9,5,2,CA 2144,46.9,,S +482,0,2,"Frost, Mr. Anthony Wood ""Archie""",male,,0,0,239854,0,,S +483,0,3,"Rouse, Mr. Richard Henry",male,50,0,0,A/5 3594,8.05,,S +484,1,3,"Turkula, Mrs. (Hedwig)",female,63,0,0,4134,9.5875,,S +485,1,1,"Bishop, Mr. Dickinson H",male,25,1,0,11967,91.0792,B49,C +486,0,3,"Lefebre, Miss. Jeannie",female,,3,1,4133,25.4667,,S +487,1,1,"Hoyt, Mrs. Frederick Maxfield (Jane Anne Forby)",female,35,1,0,19943,90,C93,S +488,0,1,"Kent, Mr. Edward Austin",male,58,0,0,11771,29.7,B37,C +489,0,3,"Somerton, Mr. Francis William",male,30,0,0,A.5. 18509,8.05,,S +490,1,3,"Coutts, Master. Eden Leslie ""Neville""",male,9,1,1,C.A. 37671,15.9,,S +491,0,3,"Hagland, Mr. Konrad Mathias Reiersen",male,,1,0,65304,19.9667,,S +492,0,3,"Windelov, Mr. Einar",male,21,0,0,SOTON/OQ 3101317,7.25,,S +493,0,1,"Molson, Mr. Harry Markland",male,55,0,0,113787,30.5,C30,S +494,0,1,"Artagaveytia, Mr. Ramon",male,71,0,0,PC 17609,49.5042,,C +495,0,3,"Stanley, Mr. Edward Roland",male,21,0,0,A/4 45380,8.05,,S +496,0,3,"Yousseff, Mr. Gerious",male,,0,0,2627,14.4583,,C +497,1,1,"Eustis, Miss. Elizabeth Mussey",female,54,1,0,36947,78.2667,D20,C +498,0,3,"Shellard, Mr. Frederick William",male,,0,0,C.A. 6212,15.1,,S +499,0,1,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25,1,2,113781,151.55,C22 C26,S +500,0,3,"Svensson, Mr. Olof",male,24,0,0,350035,7.7958,,S +501,0,3,"Calic, Mr. Petar",male,17,0,0,315086,8.6625,,S +502,0,3,"Canavan, Miss. Mary",female,21,0,0,364846,7.75,,Q +503,0,3,"O'Sullivan, Miss. Bridget Mary",female,,0,0,330909,7.6292,,Q +504,0,3,"Laitinen, Miss. Kristina Sofia",female,37,0,0,4135,9.5875,,S +505,1,1,"Maioni, Miss. Roberta",female,16,0,0,110152,86.5,B79,S +506,0,1,"Penasco y Castellana, Mr. Victor de Satode",male,18,1,0,PC 17758,108.9,C65,C +507,1,2,"Quick, Mrs. Frederick Charles (Jane Richards)",female,33,0,2,26360,26,,S +508,1,1,"Bradley, Mr. George (""George Arthur Brayton"")",male,,0,0,111427,26.55,,S +509,0,3,"Olsen, Mr. Henry Margido",male,28,0,0,C 4001,22.525,,S +510,1,3,"Lang, Mr. Fang",male,26,0,0,1601,56.4958,,S +511,1,3,"Daly, Mr. Eugene Patrick",male,29,0,0,382651,7.75,,Q +512,0,3,"Webber, Mr. James",male,,0,0,SOTON/OQ 3101316,8.05,,S +513,1,1,"McGough, Mr. James Robert",male,36,0,0,PC 17473,26.2875,E25,S +514,1,1,"Rothschild, Mrs. Martin (Elizabeth L. Barrett)",female,54,1,0,PC 17603,59.4,,C +515,0,3,"Coleff, Mr. Satio",male,24,0,0,349209,7.4958,,S +516,0,1,"Walker, Mr. William Anderson",male,47,0,0,36967,34.0208,D46,S +517,1,2,"Lemore, Mrs. (Amelia Milley)",female,34,0,0,C.A. 34260,10.5,F33,S +518,0,3,"Ryan, Mr. Patrick",male,,0,0,371110,24.15,,Q +519,1,2,"Angle, Mrs. William A (Florence ""Mary"" Agnes Hughes)",female,36,1,0,226875,26,,S +520,0,3,"Pavlovic, Mr. Stefo",male,32,0,0,349242,7.8958,,S +521,1,1,"Perreault, Miss. Anne",female,30,0,0,12749,93.5,B73,S +522,0,3,"Vovk, Mr. Janko",male,22,0,0,349252,7.8958,,S +523,0,3,"Lahoud, Mr. Sarkis",male,,0,0,2624,7.225,,C +524,1,1,"Hippach, Mrs. 
Louis Albert (Ida Sophia Fischer)",female,44,0,1,111361,57.9792,B18,C +525,0,3,"Kassem, Mr. Fared",male,,0,0,2700,7.2292,,C +526,0,3,"Farrell, Mr. James",male,40.5,0,0,367232,7.75,,Q +527,1,2,"Ridsdale, Miss. Lucy",female,50,0,0,W./C. 14258,10.5,,S +528,0,1,"Farthing, Mr. John",male,,0,0,PC 17483,221.7792,C95,S +529,0,3,"Salonen, Mr. Johan Werner",male,39,0,0,3101296,7.925,,S +530,0,2,"Hocking, Mr. Richard George",male,23,2,1,29104,11.5,,S +531,1,2,"Quick, Miss. Phyllis May",female,2,1,1,26360,26,,S +532,0,3,"Toufik, Mr. Nakli",male,,0,0,2641,7.2292,,C +533,0,3,"Elias, Mr. Joseph Jr",male,17,1,1,2690,7.2292,,C +534,1,3,"Peter, Mrs. Catherine (Catherine Rizk)",female,,0,2,2668,22.3583,,C +535,0,3,"Cacic, Miss. Marija",female,30,0,0,315084,8.6625,,S +536,1,2,"Hart, Miss. Eva Miriam",female,7,0,2,F.C.C. 13529,26.25,,S +537,0,1,"Butt, Major. Archibald Willingham",male,45,0,0,113050,26.55,B38,S +538,1,1,"LeRoy, Miss. Bertha",female,30,0,0,PC 17761,106.425,,C +539,0,3,"Risien, Mr. Samuel Beard",male,,0,0,364498,14.5,,S +540,1,1,"Frolicher, Miss. Hedwig Margaritha",female,22,0,2,13568,49.5,B39,C +541,1,1,"Crosby, Miss. Harriet R",female,36,0,2,WE/P 5735,71,B22,S +542,0,3,"Andersson, Miss. Ingeborg Constanzia",female,9,4,2,347082,31.275,,S +543,0,3,"Andersson, Miss. Sigrid Elisabeth",female,11,4,2,347082,31.275,,S +544,1,2,"Beane, Mr. Edward",male,32,1,0,2908,26,,S +545,0,1,"Douglas, Mr. Walter Donald",male,50,1,0,PC 17761,106.425,C86,C +546,0,1,"Nicholson, Mr. Arthur Ernest",male,64,0,0,693,26,,S +547,1,2,"Beane, Mrs. Edward (Ethel Clarke)",female,19,1,0,2908,26,,S +548,1,2,"Padro y Manent, Mr. Julian",male,,0,0,SC/PARIS 2146,13.8625,,C +549,0,3,"Goldsmith, Mr. Frank John",male,33,1,1,363291,20.525,,S +550,1,2,"Davies, Master. John Morgan Jr",male,8,1,1,C.A. 33112,36.75,,S +551,1,1,"Thayer, Mr. John Borland Jr",male,17,0,2,17421,110.8833,C70,C +552,0,2,"Sharp, Mr. Percival James R",male,27,0,0,244358,26,,S +553,0,3,"O'Brien, Mr. Timothy",male,,0,0,330979,7.8292,,Q +554,1,3,"Leeni, Mr. Fahim (""Philip Zenni"")",male,22,0,0,2620,7.225,,C +555,1,3,"Ohman, Miss. Velin",female,22,0,0,347085,7.775,,S +556,0,1,"Wright, Mr. George",male,62,0,0,113807,26.55,,S +557,1,1,"Duff Gordon, Lady. (Lucille Christiana Sutherland) (""Mrs Morgan"")",female,48,1,0,11755,39.6,A16,C +558,0,1,"Robbins, Mr. Victor",male,,0,0,PC 17757,227.525,,C +559,1,1,"Taussig, Mrs. Emil (Tillie Mandelbaum)",female,39,1,1,110413,79.65,E67,S +560,1,3,"de Messemaeker, Mrs. Guillaume Joseph (Emma)",female,36,1,0,345572,17.4,,S +561,0,3,"Morrow, Mr. Thomas Rowan",male,,0,0,372622,7.75,,Q +562,0,3,"Sivic, Mr. Husein",male,40,0,0,349251,7.8958,,S +563,0,2,"Norman, Mr. Robert Douglas",male,28,0,0,218629,13.5,,S +564,0,3,"Simmons, Mr. John",male,,0,0,SOTON/OQ 392082,8.05,,S +565,0,3,"Meanwell, Miss. (Marion Ogden)",female,,0,0,SOTON/O.Q. 392087,8.05,,S +566,0,3,"Davies, Mr. Alfred J",male,24,2,0,A/4 48871,24.15,,S +567,0,3,"Stoytcheff, Mr. Ilia",male,19,0,0,349205,7.8958,,S +568,0,3,"Palsson, Mrs. Nils (Alma Cornelia Berglund)",female,29,0,4,349909,21.075,,S +569,0,3,"Doharr, Mr. Tannous",male,,0,0,2686,7.2292,,C +570,1,3,"Jonsson, Mr. Carl",male,32,0,0,350417,7.8542,,S +571,1,2,"Harris, Mr. George",male,62,0,0,S.W./PP 752,10.5,,S +572,1,1,"Appleton, Mrs. Edward Dale (Charlotte Lamson)",female,53,2,0,11769,51.4792,C101,S +573,1,1,"Flynn, Mr. John Irwin (""Irving"")",male,36,0,0,PC 17474,26.3875,E25,S +574,1,3,"Kelly, Miss. Mary",female,,0,0,14312,7.75,,Q +575,0,3,"Rush, Mr. Alfred George John",male,16,0,0,A/4. 
20589,8.05,,S +576,0,3,"Patchett, Mr. George",male,19,0,0,358585,14.5,,S +577,1,2,"Garside, Miss. Ethel",female,34,0,0,243880,13,,S +578,1,1,"Silvey, Mrs. William Baird (Alice Munger)",female,39,1,0,13507,55.9,E44,S +579,0,3,"Caram, Mrs. Joseph (Maria Elias)",female,,1,0,2689,14.4583,,C +580,1,3,"Jussila, Mr. Eiriik",male,32,0,0,STON/O 2. 3101286,7.925,,S +581,1,2,"Christy, Miss. Julie Rachel",female,25,1,1,237789,30,,S +582,1,1,"Thayer, Mrs. John Borland (Marian Longstreth Morris)",female,39,1,1,17421,110.8833,C68,C +583,0,2,"Downton, Mr. William James",male,54,0,0,28403,26,,S +584,0,1,"Ross, Mr. John Hugo",male,36,0,0,13049,40.125,A10,C +585,0,3,"Paulner, Mr. Uscher",male,,0,0,3411,8.7125,,C +586,1,1,"Taussig, Miss. Ruth",female,18,0,2,110413,79.65,E68,S +587,0,2,"Jarvis, Mr. John Denzil",male,47,0,0,237565,15,,S +588,1,1,"Frolicher-Stehli, Mr. Maxmillian",male,60,1,1,13567,79.2,B41,C +589,0,3,"Gilinski, Mr. Eliezer",male,22,0,0,14973,8.05,,S +590,0,3,"Murdlin, Mr. Joseph",male,,0,0,A./5. 3235,8.05,,S +591,0,3,"Rintamaki, Mr. Matti",male,35,0,0,STON/O 2. 3101273,7.125,,S +592,1,1,"Stephenson, Mrs. Walter Bertram (Martha Eustis)",female,52,1,0,36947,78.2667,D20,C +593,0,3,"Elsbury, Mr. William James",male,47,0,0,A/5 3902,7.25,,S +594,0,3,"Bourke, Miss. Mary",female,,0,2,364848,7.75,,Q +595,0,2,"Chapman, Mr. John Henry",male,37,1,0,SC/AH 29037,26,,S +596,0,3,"Van Impe, Mr. Jean Baptiste",male,36,1,1,345773,24.15,,S +597,1,2,"Leitch, Miss. Jessie Wills",female,,0,0,248727,33,,S +598,0,3,"Johnson, Mr. Alfred",male,49,0,0,LINE,0,,S +599,0,3,"Boulos, Mr. Hanna",male,,0,0,2664,7.225,,C +600,1,1,"Duff Gordon, Sir. Cosmo Edmund (""Mr Morgan"")",male,49,1,0,PC 17485,56.9292,A20,C +601,1,2,"Jacobsohn, Mrs. Sidney Samuel (Amy Frances Christy)",female,24,2,1,243847,27,,S +602,0,3,"Slabenoff, Mr. Petco",male,,0,0,349214,7.8958,,S +603,0,1,"Harrington, Mr. Charles H",male,,0,0,113796,42.4,,S +604,0,3,"Torber, Mr. Ernst William",male,44,0,0,364511,8.05,,S +605,1,1,"Homer, Mr. Harry (""Mr E Haven"")",male,35,0,0,111426,26.55,,C +606,0,3,"Lindell, Mr. Edvard Bengtsson",male,36,1,0,349910,15.55,,S +607,0,3,"Karaic, Mr. Milan",male,30,0,0,349246,7.8958,,S +608,1,1,"Daniel, Mr. Robert Williams",male,27,0,0,113804,30.5,,S +609,1,2,"Laroche, Mrs. Joseph (Juliette Marie Louise Lafargue)",female,22,1,2,SC/Paris 2123,41.5792,,C +610,1,1,"Shutes, Miss. Elizabeth W",female,40,0,0,PC 17582,153.4625,C125,S +611,0,3,"Andersson, Mrs. Anders Johan (Alfrida Konstantia Brogren)",female,39,1,5,347082,31.275,,S +612,0,3,"Jardin, Mr. Jose Neto",male,,0,0,SOTON/O.Q. 3101305,7.05,,S +613,1,3,"Murphy, Miss. Margaret Jane",female,,1,0,367230,15.5,,Q +614,0,3,"Horgan, Mr. John",male,,0,0,370377,7.75,,Q +615,0,3,"Brocklebank, Mr. William Alfred",male,35,0,0,364512,8.05,,S +616,1,2,"Herman, Miss. Alice",female,24,1,2,220845,65,,S +617,0,3,"Danbom, Mr. Ernst Gilbert",male,34,1,1,347080,14.4,,S +618,0,3,"Lobb, Mrs. William Arthur (Cordelia K Stanlick)",female,26,1,0,A/5. 3336,16.1,,S +619,1,2,"Becker, Miss. Marion Louise",female,4,2,1,230136,39,F4,S +620,0,2,"Gavey, Mr. Lawrence",male,26,0,0,31028,10.5,,S +621,0,3,"Yasbeck, Mr. Antoni",male,27,1,0,2659,14.4542,,C +622,1,1,"Kimball, Mr. Edwin Nelson Jr",male,42,1,0,11753,52.5542,D19,S +623,1,3,"Nakid, Mr. Sahid",male,20,1,1,2653,15.7417,,C +624,0,3,"Hansen, Mr. Henry Damsgaard",male,21,0,0,350029,7.8542,,S +625,0,3,"Bowen, Mr. David John ""Dai""",male,21,0,0,54636,16.1,,S +626,0,1,"Sutton, Mr. Frederick",male,61,0,0,36963,32.3208,D50,S +627,0,2,"Kirkland, Rev. 
Charles Leonard",male,57,0,0,219533,12.35,,Q +628,1,1,"Longley, Miss. Gretchen Fiske",female,21,0,0,13502,77.9583,D9,S +629,0,3,"Bostandyeff, Mr. Guentcho",male,26,0,0,349224,7.8958,,S +630,0,3,"O'Connell, Mr. Patrick D",male,,0,0,334912,7.7333,,Q +631,1,1,"Barkworth, Mr. Algernon Henry Wilson",male,80,0,0,27042,30,A23,S +632,0,3,"Lundahl, Mr. Johan Svensson",male,51,0,0,347743,7.0542,,S +633,1,1,"Stahelin-Maeglin, Dr. Max",male,32,0,0,13214,30.5,B50,C +634,0,1,"Parr, Mr. William Henry Marsh",male,,0,0,112052,0,,S +635,0,3,"Skoog, Miss. Mabel",female,9,3,2,347088,27.9,,S +636,1,2,"Davis, Miss. Mary",female,28,0,0,237668,13,,S +637,0,3,"Leinonen, Mr. Antti Gustaf",male,32,0,0,STON/O 2. 3101292,7.925,,S +638,0,2,"Collyer, Mr. Harvey",male,31,1,1,C.A. 31921,26.25,,S +639,0,3,"Panula, Mrs. Juha (Maria Emilia Ojala)",female,41,0,5,3101295,39.6875,,S +640,0,3,"Thorneycroft, Mr. Percival",male,,1,0,376564,16.1,,S +641,0,3,"Jensen, Mr. Hans Peder",male,20,0,0,350050,7.8542,,S +642,1,1,"Sagesser, Mlle. Emma",female,24,0,0,PC 17477,69.3,B35,C +643,0,3,"Skoog, Miss. Margit Elizabeth",female,2,3,2,347088,27.9,,S +644,1,3,"Foo, Mr. Choong",male,,0,0,1601,56.4958,,S +645,1,3,"Baclini, Miss. Eugenie",female,0.75,2,1,2666,19.2583,,C +646,1,1,"Harper, Mr. Henry Sleeper",male,48,1,0,PC 17572,76.7292,D33,C +647,0,3,"Cor, Mr. Liudevit",male,19,0,0,349231,7.8958,,S +648,1,1,"Simonius-Blumer, Col. Oberst Alfons",male,56,0,0,13213,35.5,A26,C +649,0,3,"Willey, Mr. Edward",male,,0,0,S.O./P.P. 751,7.55,,S +650,1,3,"Stanley, Miss. Amy Zillah Elsie",female,23,0,0,CA. 2314,7.55,,S +651,0,3,"Mitkoff, Mr. Mito",male,,0,0,349221,7.8958,,S +652,1,2,"Doling, Miss. Elsie",female,18,0,1,231919,23,,S +653,0,3,"Kalvik, Mr. Johannes Halvorsen",male,21,0,0,8475,8.4333,,S +654,1,3,"O'Leary, Miss. Hanora ""Norah""",female,,0,0,330919,7.8292,,Q +655,0,3,"Hegarty, Miss. Hanora ""Nora""",female,18,0,0,365226,6.75,,Q +656,0,2,"Hickman, Mr. Leonard Mark",male,24,2,0,S.O.C. 14879,73.5,,S +657,0,3,"Radeff, Mr. Alexander",male,,0,0,349223,7.8958,,S +658,0,3,"Bourke, Mrs. John (Catherine)",female,32,1,1,364849,15.5,,Q +659,0,2,"Eitemiller, Mr. George Floyd",male,23,0,0,29751,13,,S +660,0,1,"Newell, Mr. Arthur Webster",male,58,0,2,35273,113.275,D48,C +661,1,1,"Frauenthal, Dr. Henry William",male,50,2,0,PC 17611,133.65,,S +662,0,3,"Badt, Mr. Mohamed",male,40,0,0,2623,7.225,,C +663,0,1,"Colley, Mr. Edward Pomeroy",male,47,0,0,5727,25.5875,E58,S +664,0,3,"Coleff, Mr. Peju",male,36,0,0,349210,7.4958,,S +665,1,3,"Lindqvist, Mr. Eino William",male,20,1,0,STON/O 2. 3101285,7.925,,S +666,0,2,"Hickman, Mr. Lewis",male,32,2,0,S.O.C. 14879,73.5,,S +667,0,2,"Butler, Mr. Reginald Fenton",male,25,0,0,234686,13,,S +668,0,3,"Rommetvedt, Mr. Knud Paust",male,,0,0,312993,7.775,,S +669,0,3,"Cook, Mr. Jacob",male,43,0,0,A/5 3536,8.05,,S +670,1,1,"Taylor, Mrs. Elmer Zebley (Juliet Cummins Wright)",female,,1,0,19996,52,C126,S +671,1,2,"Brown, Mrs. Thomas William Solomon (Elizabeth Catherine Ford)",female,40,1,1,29750,39,,S +672,0,1,"Davidson, Mr. Thornton",male,31,1,0,F.C. 12750,52,B71,S +673,0,2,"Mitchell, Mr. Henry Michael",male,70,0,0,C.A. 24580,10.5,,S +674,1,2,"Wilhelms, Mr. Charles",male,31,0,0,244270,13,,S +675,0,2,"Watson, Mr. Ennis Hastings",male,,0,0,239856,0,,S +676,0,3,"Edvardsson, Mr. Gustaf Hjalmar",male,18,0,0,349912,7.775,,S +677,0,3,"Sawyer, Mr. Frederick Charles",male,24.5,0,0,342826,8.05,,S +678,1,3,"Turja, Miss. Anna Sofia",female,18,0,0,4138,9.8417,,S +679,0,3,"Goodwin, Mrs. 
Frederick (Augusta Tyler)",female,43,1,6,CA 2144,46.9,,S +680,1,1,"Cardeza, Mr. Thomas Drake Martinez",male,36,0,1,PC 17755,512.3292,B51 B53 B55,C +681,0,3,"Peters, Miss. Katie",female,,0,0,330935,8.1375,,Q +682,1,1,"Hassab, Mr. Hammad",male,27,0,0,PC 17572,76.7292,D49,C +683,0,3,"Olsvigen, Mr. Thor Anderson",male,20,0,0,6563,9.225,,S +684,0,3,"Goodwin, Mr. Charles Edward",male,14,5,2,CA 2144,46.9,,S +685,0,2,"Brown, Mr. Thomas William Solomon",male,60,1,1,29750,39,,S +686,0,2,"Laroche, Mr. Joseph Philippe Lemercier",male,25,1,2,SC/Paris 2123,41.5792,,C +687,0,3,"Panula, Mr. Jaako Arnold",male,14,4,1,3101295,39.6875,,S +688,0,3,"Dakic, Mr. Branko",male,19,0,0,349228,10.1708,,S +689,0,3,"Fischer, Mr. Eberhard Thelander",male,18,0,0,350036,7.7958,,S +690,1,1,"Madill, Miss. Georgette Alexandra",female,15,0,1,24160,211.3375,B5,S +691,1,1,"Dick, Mr. Albert Adrian",male,31,1,0,17474,57,B20,S +692,1,3,"Karun, Miss. Manca",female,4,0,1,349256,13.4167,,C +693,1,3,"Lam, Mr. Ali",male,,0,0,1601,56.4958,,S +694,0,3,"Saad, Mr. Khalil",male,25,0,0,2672,7.225,,C +695,0,1,"Weir, Col. John",male,60,0,0,113800,26.55,,S +696,0,2,"Chapman, Mr. Charles Henry",male,52,0,0,248731,13.5,,S +697,0,3,"Kelly, Mr. James",male,44,0,0,363592,8.05,,S +698,1,3,"Mullens, Miss. Katherine ""Katie""",female,,0,0,35852,7.7333,,Q +699,0,1,"Thayer, Mr. John Borland",male,49,1,1,17421,110.8833,C68,C +700,0,3,"Humblen, Mr. Adolf Mathias Nicolai Olsen",male,42,0,0,348121,7.65,F G63,S +701,1,1,"Astor, Mrs. John Jacob (Madeleine Talmadge Force)",female,18,1,0,PC 17757,227.525,C62 C64,C +702,1,1,"Silverthorne, Mr. Spencer Victor",male,35,0,0,PC 17475,26.2875,E24,S +703,0,3,"Barbara, Miss. Saiide",female,18,0,1,2691,14.4542,,C +704,0,3,"Gallagher, Mr. Martin",male,25,0,0,36864,7.7417,,Q +705,0,3,"Hansen, Mr. Henrik Juul",male,26,1,0,350025,7.8542,,S +706,0,2,"Morley, Mr. Henry Samuel (""Mr Henry Marshall"")",male,39,0,0,250655,26,,S +707,1,2,"Kelly, Mrs. Florence ""Fannie""",female,45,0,0,223596,13.5,,S +708,1,1,"Calderhead, Mr. Edward Pennington",male,42,0,0,PC 17476,26.2875,E24,S +709,1,1,"Cleaver, Miss. Alice",female,22,0,0,113781,151.55,,S +710,1,3,"Moubarek, Master. Halim Gonios (""William George"")",male,,1,1,2661,15.2458,,C +711,1,1,"Mayne, Mlle. Berthe Antonine (""Mrs de Villiers"")",female,24,0,0,PC 17482,49.5042,C90,C +712,0,1,"Klaber, Mr. Herman",male,,0,0,113028,26.55,C124,S +713,1,1,"Taylor, Mr. Elmer Zebley",male,48,1,0,19996,52,C126,S +714,0,3,"Larsson, Mr. August Viktor",male,29,0,0,7545,9.4833,,S +715,0,2,"Greenberg, Mr. Samuel",male,52,0,0,250647,13,,S +716,0,3,"Soholt, Mr. Peter Andreas Lauritz Andersen",male,19,0,0,348124,7.65,F G73,S +717,1,1,"Endres, Miss. Caroline Louise",female,38,0,0,PC 17757,227.525,C45,C +718,1,2,"Troutt, Miss. Edwina Celia ""Winnie""",female,27,0,0,34218,10.5,E101,S +719,0,3,"McEvoy, Mr. Michael",male,,0,0,36568,15.5,,Q +720,0,3,"Johnson, Mr. Malkolm Joackim",male,33,0,0,347062,7.775,,S +721,1,2,"Harper, Miss. Annie Jessie ""Nina""",female,6,0,1,248727,33,,S +722,0,3,"Jensen, Mr. Svend Lauritz",male,17,1,0,350048,7.0542,,S +723,0,2,"Gillespie, Mr. William Henry",male,34,0,0,12233,13,,S +724,0,2,"Hodges, Mr. Henry Price",male,50,0,0,250643,13,,S +725,1,1,"Chambers, Mr. Norman Campbell",male,27,1,0,113806,53.1,E8,S +726,0,3,"Oreskovic, Mr. Luka",male,20,0,0,315094,8.6625,,S +727,1,2,"Renouf, Mrs. Peter Henry (Lillian Jefferys)",female,30,3,0,31027,21,,S +728,1,3,"Mannion, Miss. Margareth",female,,0,0,36866,7.7375,,Q +729,0,2,"Bryhl, Mr. 
Kurt Arnold Gottfrid",male,25,1,0,236853,26,,S +730,0,3,"Ilmakangas, Miss. Pieta Sofia",female,25,1,0,STON/O2. 3101271,7.925,,S +731,1,1,"Allen, Miss. Elisabeth Walton",female,29,0,0,24160,211.3375,B5,S +732,0,3,"Hassan, Mr. Houssein G N",male,11,0,0,2699,18.7875,,C +733,0,2,"Knight, Mr. Robert J",male,,0,0,239855,0,,S +734,0,2,"Berriman, Mr. William John",male,23,0,0,28425,13,,S +735,0,2,"Troupiansky, Mr. Moses Aaron",male,23,0,0,233639,13,,S +736,0,3,"Williams, Mr. Leslie",male,28.5,0,0,54636,16.1,,S +737,0,3,"Ford, Mrs. Edward (Margaret Ann Watson)",female,48,1,3,W./C. 6608,34.375,,S +738,1,1,"Lesurer, Mr. Gustave J",male,35,0,0,PC 17755,512.3292,B101,C +739,0,3,"Ivanoff, Mr. Kanio",male,,0,0,349201,7.8958,,S +740,0,3,"Nankoff, Mr. Minko",male,,0,0,349218,7.8958,,S +741,1,1,"Hawksford, Mr. Walter James",male,,0,0,16988,30,D45,S +742,0,1,"Cavendish, Mr. Tyrell William",male,36,1,0,19877,78.85,C46,S +743,1,1,"Ryerson, Miss. Susan Parker ""Suzette""",female,21,2,2,PC 17608,262.375,B57 B59 B63 B66,C +744,0,3,"McNamee, Mr. Neal",male,24,1,0,376566,16.1,,S +745,1,3,"Stranden, Mr. Juho",male,31,0,0,STON/O 2. 3101288,7.925,,S +746,0,1,"Crosby, Capt. Edward Gifford",male,70,1,1,WE/P 5735,71,B22,S +747,0,3,"Abbott, Mr. Rossmore Edward",male,16,1,1,C.A. 2673,20.25,,S +748,1,2,"Sinkkonen, Miss. Anna",female,30,0,0,250648,13,,S +749,0,1,"Marvin, Mr. Daniel Warner",male,19,1,0,113773,53.1,D30,S +750,0,3,"Connaghton, Mr. Michael",male,31,0,0,335097,7.75,,Q +751,1,2,"Wells, Miss. Joan",female,4,1,1,29103,23,,S +752,1,3,"Moor, Master. Meier",male,6,0,1,392096,12.475,E121,S +753,0,3,"Vande Velde, Mr. Johannes Joseph",male,33,0,0,345780,9.5,,S +754,0,3,"Jonkoff, Mr. Lalio",male,23,0,0,349204,7.8958,,S +755,1,2,"Herman, Mrs. Samuel (Jane Laver)",female,48,1,2,220845,65,,S +756,1,2,"Hamalainen, Master. Viljo",male,0.67,1,1,250649,14.5,,S +757,0,3,"Carlsson, Mr. August Sigfrid",male,28,0,0,350042,7.7958,,S +758,0,2,"Bailey, Mr. Percy Andrew",male,18,0,0,29108,11.5,,S +759,0,3,"Theobald, Mr. Thomas Leonard",male,34,0,0,363294,8.05,,S +760,1,1,"Rothes, the Countess. of (Lucy Noel Martha Dyer-Edwards)",female,33,0,0,110152,86.5,B77,S +761,0,3,"Garfirth, Mr. John",male,,0,0,358585,14.5,,S +762,0,3,"Nirva, Mr. Iisakki Antino Aijo",male,41,0,0,SOTON/O2 3101272,7.125,,S +763,1,3,"Barah, Mr. Hanna Assi",male,20,0,0,2663,7.2292,,C +764,1,1,"Carter, Mrs. William Ernest (Lucile Polk)",female,36,1,2,113760,120,B96 B98,S +765,0,3,"Eklund, Mr. Hans Linus",male,16,0,0,347074,7.775,,S +766,1,1,"Hogeboom, Mrs. John C (Anna Andrews)",female,51,1,0,13502,77.9583,D11,S +767,0,1,"Brewe, Dr. Arthur Jackson",male,,0,0,112379,39.6,,C +768,0,3,"Mangan, Miss. Mary",female,30.5,0,0,364850,7.75,,Q +769,0,3,"Moran, Mr. Daniel J",male,,1,0,371110,24.15,,Q +770,0,3,"Gronnestad, Mr. Daniel Danielsen",male,32,0,0,8471,8.3625,,S +771,0,3,"Lievens, Mr. Rene Aime",male,24,0,0,345781,9.5,,S +772,0,3,"Jensen, Mr. Niels Peder",male,48,0,0,350047,7.8542,,S +773,0,2,"Mack, Mrs. (Mary)",female,57,0,0,S.O./P.P. 3,10.5,E77,S +774,0,3,"Elias, Mr. Dibo",male,,0,0,2674,7.225,,C +775,1,2,"Hocking, Mrs. Elizabeth (Eliza Needs)",female,54,1,3,29105,23,,S +776,0,3,"Myhrman, Mr. Pehr Fabian Oliver Malkolm",male,18,0,0,347078,7.75,,S +777,0,3,"Tobin, Mr. Roger",male,,0,0,383121,7.75,F38,Q +778,1,3,"Emanuel, Miss. Virginia Ethel",female,5,0,0,364516,12.475,,S +779,0,3,"Kilgannon, Mr. Thomas J",male,,0,0,36865,7.7375,,Q +780,1,1,"Robert, Mrs. Edward Scott (Elisabeth Walton McMillan)",female,43,0,1,24160,211.3375,B3,S +781,1,3,"Ayoub, Miss. 
Banoura",female,13,0,0,2687,7.2292,,C +782,1,1,"Dick, Mrs. Albert Adrian (Vera Gillespie)",female,17,1,0,17474,57,B20,S +783,0,1,"Long, Mr. Milton Clyde",male,29,0,0,113501,30,D6,S +784,0,3,"Johnston, Mr. Andrew G",male,,1,2,W./C. 6607,23.45,,S +785,0,3,"Ali, Mr. William",male,25,0,0,SOTON/O.Q. 3101312,7.05,,S +786,0,3,"Harmer, Mr. Abraham (David Lishin)",male,25,0,0,374887,7.25,,S +787,1,3,"Sjoblom, Miss. Anna Sofia",female,18,0,0,3101265,7.4958,,S +788,0,3,"Rice, Master. George Hugh",male,8,4,1,382652,29.125,,Q +789,1,3,"Dean, Master. Bertram Vere",male,1,1,2,C.A. 2315,20.575,,S +790,0,1,"Guggenheim, Mr. Benjamin",male,46,0,0,PC 17593,79.2,B82 B84,C +791,0,3,"Keane, Mr. Andrew ""Andy""",male,,0,0,12460,7.75,,Q +792,0,2,"Gaskell, Mr. Alfred",male,16,0,0,239865,26,,S +793,0,3,"Sage, Miss. Stella Anna",female,,8,2,CA. 2343,69.55,,S +794,0,1,"Hoyt, Mr. William Fisher",male,,0,0,PC 17600,30.6958,,C +795,0,3,"Dantcheff, Mr. Ristiu",male,25,0,0,349203,7.8958,,S +796,0,2,"Otter, Mr. Richard",male,39,0,0,28213,13,,S +797,1,1,"Leader, Dr. Alice (Farnham)",female,49,0,0,17465,25.9292,D17,S +798,1,3,"Osman, Mrs. Mara",female,31,0,0,349244,8.6833,,S +799,0,3,"Ibrahim Shawah, Mr. Yousseff",male,30,0,0,2685,7.2292,,C +800,0,3,"Van Impe, Mrs. Jean Baptiste (Rosalie Paula Govaert)",female,30,1,1,345773,24.15,,S +801,0,2,"Ponesell, Mr. Martin",male,34,0,0,250647,13,,S +802,1,2,"Collyer, Mrs. Harvey (Charlotte Annie Tate)",female,31,1,1,C.A. 31921,26.25,,S +803,1,1,"Carter, Master. William Thornton II",male,11,1,2,113760,120,B96 B98,S +804,1,3,"Thomas, Master. Assad Alexander",male,0.42,0,1,2625,8.5167,,C +805,1,3,"Hedman, Mr. Oskar Arvid",male,27,0,0,347089,6.975,,S +806,0,3,"Johansson, Mr. Karl Johan",male,31,0,0,347063,7.775,,S +807,0,1,"Andrews, Mr. Thomas Jr",male,39,0,0,112050,0,A36,S +808,0,3,"Pettersson, Miss. Ellen Natalia",female,18,0,0,347087,7.775,,S +809,0,2,"Meyer, Mr. August",male,39,0,0,248723,13,,S +810,1,1,"Chambers, Mrs. Norman Campbell (Bertha Griggs)",female,33,1,0,113806,53.1,E8,S +811,0,3,"Alexander, Mr. William",male,26,0,0,3474,7.8875,,S +812,0,3,"Lester, Mr. James",male,39,0,0,A/4 48871,24.15,,S +813,0,2,"Slemen, Mr. Richard James",male,35,0,0,28206,10.5,,S +814,0,3,"Andersson, Miss. Ebba Iris Alfrida",female,6,4,2,347082,31.275,,S +815,0,3,"Tomlin, Mr. Ernest Portage",male,30.5,0,0,364499,8.05,,S +816,0,1,"Fry, Mr. Richard",male,,0,0,112058,0,B102,S +817,0,3,"Heininen, Miss. Wendla Maria",female,23,0,0,STON/O2. 3101290,7.925,,S +818,0,2,"Mallet, Mr. Albert",male,31,1,1,S.C./PARIS 2079,37.0042,,C +819,0,3,"Holm, Mr. John Fredrik Alexander",male,43,0,0,C 7075,6.45,,S +820,0,3,"Skoog, Master. Karl Thorsten",male,10,3,2,347088,27.9,,S +821,1,1,"Hays, Mrs. Charles Melville (Clara Jennings Gregg)",female,52,1,1,12749,93.5,B69,S +822,1,3,"Lulic, Mr. Nikola",male,27,0,0,315098,8.6625,,S +823,0,1,"Reuchlin, Jonkheer. John George",male,38,0,0,19972,0,,S +824,1,3,"Moor, Mrs. (Beila)",female,27,0,1,392096,12.475,E121,S +825,0,3,"Panula, Master. Urho Abraham",male,2,4,1,3101295,39.6875,,S +826,0,3,"Flynn, Mr. John",male,,0,0,368323,6.95,,Q +827,0,3,"Lam, Mr. Len",male,,0,0,1601,56.4958,,S +828,1,2,"Mallet, Master. Andre",male,1,0,2,S.C./PARIS 2079,37.0042,,C +829,1,3,"McCormack, Mr. Thomas Joseph",male,,0,0,367228,7.75,,Q +830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62,0,0,113572,80,B28, +831,1,3,"Yasbeck, Mrs. Antoni (Selini Alexander)",female,15,1,0,2659,14.4542,,C +832,1,2,"Richards, Master. George Sibley",male,0.83,1,1,29106,18.75,,S +833,0,3,"Saad, Mr. 
Amin",male,,0,0,2671,7.2292,,C +834,0,3,"Augustsson, Mr. Albert",male,23,0,0,347468,7.8542,,S +835,0,3,"Allum, Mr. Owen George",male,18,0,0,2223,8.3,,S +836,1,1,"Compton, Miss. Sara Rebecca",female,39,1,1,PC 17756,83.1583,E49,C +837,0,3,"Pasic, Mr. Jakob",male,21,0,0,315097,8.6625,,S +838,0,3,"Sirota, Mr. Maurice",male,,0,0,392092,8.05,,S +839,1,3,"Chip, Mr. Chang",male,32,0,0,1601,56.4958,,S +840,1,1,"Marechal, Mr. Pierre",male,,0,0,11774,29.7,C47,C +841,0,3,"Alhomaki, Mr. Ilmari Rudolf",male,20,0,0,SOTON/O2 3101287,7.925,,S +842,0,2,"Mudd, Mr. Thomas Charles",male,16,0,0,S.O./P.P. 3,10.5,,S +843,1,1,"Serepeca, Miss. Augusta",female,30,0,0,113798,31,,C +844,0,3,"Lemberopolous, Mr. Peter L",male,34.5,0,0,2683,6.4375,,C +845,0,3,"Culumovic, Mr. Jeso",male,17,0,0,315090,8.6625,,S +846,0,3,"Abbing, Mr. Anthony",male,42,0,0,C.A. 5547,7.55,,S +847,0,3,"Sage, Mr. Douglas Bullen",male,,8,2,CA. 2343,69.55,,S +848,0,3,"Markoff, Mr. Marin",male,35,0,0,349213,7.8958,,C +849,0,2,"Harper, Rev. John",male,28,0,1,248727,33,,S +850,1,1,"Goldenberg, Mrs. Samuel L (Edwiga Grabowska)",female,,1,0,17453,89.1042,C92,C +851,0,3,"Andersson, Master. Sigvard Harald Elias",male,4,4,2,347082,31.275,,S +852,0,3,"Svensson, Mr. Johan",male,74,0,0,347060,7.775,,S +853,0,3,"Boulos, Miss. Nourelain",female,9,1,1,2678,15.2458,,C +854,1,1,"Lines, Miss. Mary Conover",female,16,0,1,PC 17592,39.4,D28,S +855,0,2,"Carter, Mrs. Ernest Courtenay (Lilian Hughes)",female,44,1,0,244252,26,,S +856,1,3,"Aks, Mrs. Sam (Leah Rosen)",female,18,0,1,392091,9.35,,S +857,1,1,"Wick, Mrs. George Dennick (Mary Hitchcock)",female,45,1,1,36928,164.8667,,S +858,1,1,"Daly, Mr. Peter Denis ",male,51,0,0,113055,26.55,E17,S +859,1,3,"Baclini, Mrs. Solomon (Latifa Qurban)",female,24,0,3,2666,19.2583,,C +860,0,3,"Razi, Mr. Raihed",male,,0,0,2629,7.2292,,C +861,0,3,"Hansen, Mr. Claus Peter",male,41,2,0,350026,14.1083,,S +862,0,2,"Giles, Mr. Frederick Edward",male,21,1,0,28134,11.5,,S +863,1,1,"Swift, Mrs. Frederick Joel (Margaret Welles Barron)",female,48,0,0,17466,25.9292,D17,S +864,0,3,"Sage, Miss. Dorothy Edith ""Dolly""",female,,8,2,CA. 2343,69.55,,S +865,0,2,"Gill, Mr. John William",male,24,0,0,233866,13,,S +866,1,2,"Bystrom, Mrs. (Karolina)",female,42,0,0,236852,13,,S +867,1,2,"Duran y More, Miss. Asuncion",female,27,1,0,SC/PARIS 2149,13.8583,,C +868,0,1,"Roebling, Mr. Washington Augustus II",male,31,0,0,PC 17590,50.4958,A24,S +869,0,3,"van Melkebeke, Mr. Philemon",male,,0,0,345777,9.5,,S +870,1,3,"Johnson, Master. Harold Theodor",male,4,1,1,347742,11.1333,,S +871,0,3,"Balkic, Mr. Cerin",male,26,0,0,349248,7.8958,,S +872,1,1,"Beckwith, Mrs. Richard Leonard (Sallie Monypeny)",female,47,1,1,11751,52.5542,D35,S +873,0,1,"Carlsson, Mr. Frans Olof",male,33,0,0,695,5,B51 B53 B55,S +874,0,3,"Vander Cruyssen, Mr. Victor",male,47,0,0,345765,9,,S +875,1,2,"Abelson, Mrs. Samuel (Hannah Wizosky)",female,28,1,0,P/PP 3381,24,,C +876,1,3,"Najib, Miss. Adele Kiamie ""Jane""",female,15,0,0,2667,7.225,,C +877,0,3,"Gustafsson, Mr. Alfred Ossian",male,20,0,0,7534,9.8458,,S +878,0,3,"Petroff, Mr. Nedelio",male,19,0,0,349212,7.8958,,S +879,0,3,"Laleff, Mr. Kristo",male,,0,0,349217,7.8958,,S +880,1,1,"Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)",female,56,0,1,11767,83.1583,C50,C +881,1,2,"Shelley, Mrs. William (Imanita Parrish Hall)",female,25,0,1,230433,26,,S +882,0,3,"Markun, Mr. Johann",male,33,0,0,349257,7.8958,,S +883,0,3,"Dahlberg, Miss. Gerda Ulrika",female,22,0,0,7552,10.5167,,S +884,0,2,"Banfield, Mr. 
Frederick James",male,28,0,0,C.A./SOTON 34068,10.5,,S +885,0,3,"Sutehall, Mr. Henry Jr",male,25,0,0,SOTON/OQ 392076,7.05,,S +886,0,3,"Rice, Mrs. William (Margaret Norton)",female,39,0,5,382652,29.125,,Q +887,0,2,"Montvila, Rev. Juozas",male,27,0,0,211536,13,,S +888,1,1,"Graham, Miss. Margaret Edith",female,19,0,0,112053,30,B42,S +889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S +890,1,1,"Behr, Mr. Karl Howell",male,26,0,0,111369,30,C148,C +891,0,3,"Dooley, Mr. Patrick",male,32,0,0,370376,7.75,,Q \ No newline at end of file diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/inference/.ipynb_checkpoints/Dockerfile-checkpoint b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/inference/.ipynb_checkpoints/Dockerfile-checkpoint new file mode 100644 index 0000000..b9524cc --- /dev/null +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/inference/.ipynb_checkpoints/Dockerfile-checkpoint @@ -0,0 +1,59 @@ +FROM ubuntu:latest +# Set a docker label to advertise multi-model support on the container +LABEL com.amazonaws.sagemaker.capabilities.multi-models=false +# Set a docker label to enable container to use SAGEMAKER_BIND_TO_PORT environment variable if present +LABEL com.amazonaws.sagemaker.capabilities.accept-bind-to-port=true + +# Install some handful libraries like curl, wget, git, build-essential, zlib +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + gcc \ + python3.7 \ + python3-dev \ + python3-pip \ + ca-certificates \ + git \ + curl \ + openjdk-8-jre-headless\ + wget &&\ + rm -rf /var/lib/apt/lists/* + +# install the SageMaker Inference Toolkit +RUN pip3 install --no-cache \ + multi-model-server \ + sagemaker-inference \ + retrying + +# Change working directory +WORKDIR / + +# Install requirements +COPY requirements.txt /opt/ml/code/src/requirements.txt +RUN pip3 install --no-cache -r /opt/ml/code/src/requirements.txt + +# set some environment variables +ENV PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 \ + PYTHONIOENCODING=UTF-8 \ + LANG=C.UTF-8 \ + LC_ALL=C.UTF-8 + +# copy folders for code +COPY src/config/ /opt/ml/code/config/ +COPY src/ml/ /opt/ml/code/ml/ +COPY src/util.py /opt/ml/code/util.py + +# Copy entrypoint script to the image and make it executable +COPY inference/main.py /opt/ml/code/main.py +COPY inference/handler.py /opt/ml/code/serving/handler.py + +# install sagemaker training +RUN pip3 install --no-cache --upgrade \ + boto3 \ + sagemaker + +# Setting PYTHONPATH to access the copied code +ENV PYTHONPATH="/opt/ml/code:${PATH}" + +# Add a Python script and configure Docker to run it +ENTRYPOINT ["python3", "/opt/ml/code/main.py"] diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/inference/.ipynb_checkpoints/handler-checkpoint.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/inference/.ipynb_checkpoints/handler-checkpoint.py new file mode 100644 index 0000000..b6bdc50 --- /dev/null +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/inference/.ipynb_checkpoints/handler-checkpoint.py @@ -0,0 +1,65 @@ +import sys +sys.path.append("..") + +import os +import logging +import pandas as pd +from joblib import load +from six import StringIO + +from ml.model.wrapper import Wrapper +from sagemaker_inference.default_inference_handler import DefaultInferenceHandler +from sagemaker_inference.default_handler_service import DefaultHandlerService +from sagemaker_inference import content_types, errors, transformer, encoder, decoder + 
+logging.getLogger().setLevel('INFO') + +# Path to access the model +MODEL_DIR = '/opt/ml/model' + +def _csv_to_pandas(string_like): # type: (str) -> pd.DataFrame + """Convert a CSV object to a pandas DataFrame. + Args: + string_like (str): CSV string. + + Returns: + (pd.DataFrame): pandas DataFrame + """ + stream = StringIO(string_like) + res = pd.read_csv(stream) + return res + +class HandlerService(DefaultHandlerService, DefaultInferenceHandler): + """ + Execute the inference step in the virtual environment + + """ + def __init__(self): + op = transformer.Transformer(default_inference_handler=self) + super(HandlerService, self).__init__(transformer=op) + + # Loads the model from the disk + def default_model_fn(self, model_dir): + logging.info('Loading the model') + return load(os.path.join(MODEL_DIR, "model.pkl")) + + # Parse and check the format of the input data + def default_input_fn(self, input_data, content_type): + global colunas + if content_type != "text/csv": + raise Exception("Invalid content-type: %s" % content_type) + return _csv_to_pandas(input_data) + + # Run our model and do the prediction + def default_predict_fn(self, df, model): + logging.info('Predicting...') + resultados = model.predict(df,included_input=True) + logging.info('Prediction Complete') + return resultados.reset_index(drop=True).T.reset_index().T + + # Gets the prediction output and format it to be returned to the user + def default_output_fn(self, prediction, accept): + logging.info('Saving') + if accept != "text/csv": + raise Exception("Invalid accept: %s" % accept) + return encoder.encode(prediction, accept) \ No newline at end of file diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/inference/.ipynb_checkpoints/main-checkpoint.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/inference/.ipynb_checkpoints/main-checkpoint.py new file mode 100644 index 0000000..9ff9b2a --- /dev/null +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/inference/.ipynb_checkpoints/main-checkpoint.py @@ -0,0 +1,12 @@ +import argparse +import sys +import os +import logging +from sagemaker_inference import model_server + +logging.getLogger().setLevel(logging.INFO) + + +if __name__ == "__main__": + + model_server.start_model_server(handler_service="serving.handler") \ No newline at end of file diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/inference/Dockerfile b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/inference/Dockerfile new file mode 100644 index 0000000..b9524cc --- /dev/null +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/inference/Dockerfile @@ -0,0 +1,60 @@ +FROM ubuntu:latest +# Set a docker label to advertise multi-model support on the container +LABEL com.amazonaws.sagemaker.capabilities.multi-models=false +# Set a docker label to enable container to use SAGEMAKER_BIND_TO_PORT environment variable if present +LABEL com.amazonaws.sagemaker.capabilities.accept-bind-to-port=true + +# Install some useful OS packages such as curl, wget, git and build-essential +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + gcc \ + python3.7 \ + python3-dev \ + python3-pip \ + ca-certificates \ + git \ + curl \ + openjdk-8-jre-headless \ + wget && \ + rm -rf /var/lib/apt/lists/* + +# install the SageMaker Inference Toolkit +RUN pip3 install --no-cache \ + multi-model-server \ + sagemaker-inference \ + retrying + +# Change working directory +WORKDIR / + +# Install requirements +COPY requirements.txt /opt/ml/code/src/requirements.txt
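+# Installing the requirements in a separate layer, before the source code is copied in, lets Docker reuse the cached layer when only the code changes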
+RUN pip3 install --no-cache -r /opt/ml/code/src/requirements.txt + +# Set some environment variables +ENV PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 \ + PYTHONIOENCODING=UTF-8 \ + LANG=C.UTF-8 \ + LC_ALL=C.UTF-8 + +# Copy the source code folders +COPY src/config/ /opt/ml/code/config/ +COPY src/ml/ /opt/ml/code/ml/ +COPY src/util.py /opt/ml/code/util.py + +# Copy the entrypoint and handler scripts into the image +COPY inference/main.py /opt/ml/code/main.py +COPY inference/handler.py /opt/ml/code/serving/handler.py + +# Install boto3 and the SageMaker SDK +RUN pip3 install --no-cache --upgrade \ + boto3 \ + sagemaker + +# Set PYTHONPATH so Python can import the copied code +ENV PYTHONPATH="/opt/ml/code:${PYTHONPATH}" + +# Run the entrypoint script when the container starts +ENTRYPOINT ["python3", "/opt/ml/code/main.py"] diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/inference/handler.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/inference/handler.py new file mode 100644 index 0000000..b6bdc50 --- /dev/null +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/inference/handler.py @@ -0,0 +1,64 @@ +import sys +sys.path.append("..") + +import os +import logging +import pandas as pd +from joblib import load +from six import StringIO + +from ml.model.wrapper import Wrapper +from sagemaker_inference.default_inference_handler import DefaultInferenceHandler +from sagemaker_inference.default_handler_service import DefaultHandlerService +from sagemaker_inference import content_types, errors, transformer, encoder, decoder + +logging.getLogger().setLevel('INFO') + +# Path to access the model +MODEL_DIR = '/opt/ml/model' + +def _csv_to_pandas(string_like): # type: (str) -> pd.DataFrame + """Convert a CSV string to a pandas DataFrame. + Args: + string_like (str): CSV string.
+ + Returns: + (pd.DataFrame): pandas DataFrame + """ + stream = StringIO(string_like) + res = pd.read_csv(stream) + return res + +class HandlerService(DefaultHandlerService, DefaultInferenceHandler): + """ + Handler service that executes the inference step inside the model serving container + + """ + def __init__(self): + op = transformer.Transformer(default_inference_handler=self) + super(HandlerService, self).__init__(transformer=op) + + # Loads the model from the disk + def default_model_fn(self, model_dir): + logging.info('Loading the model') + return load(os.path.join(MODEL_DIR, "model.pkl")) + + # Parse and check the format of the input data + def default_input_fn(self, input_data, content_type): + if content_type != "text/csv": + raise Exception("Invalid content-type: %s" % content_type) + return _csv_to_pandas(input_data) + + # Run our model and do the prediction + def default_predict_fn(self, df, model): + logging.info('Predicting...') + resultados = model.predict(df, included_input=True) + logging.info('Prediction Complete') + return resultados.reset_index(drop=True).T.reset_index().T + + # Format the prediction output to be returned to the user + def default_output_fn(self, prediction, accept): + logging.info('Saving') + if accept != "text/csv": + raise Exception("Invalid accept: %s" % accept) + return encoder.encode(prediction, accept) \ No newline at end of file diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/inference/main.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/inference/main.py new file mode 100644 index 0000000..9ff9b2a --- /dev/null +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/inference/main.py @@ -0,0 +1,9 @@ +import logging +from sagemaker_inference import model_server + +logging.getLogger().setLevel(logging.INFO) + + +if __name__ == "__main__": + + model_server.start_model_server(handler_service="serving.handler") \ No newline at end of file diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/processor/.ipynb_checkpoints/Dockerfile-checkpoint b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/processor/.ipynb_checkpoints/Dockerfile-checkpoint new file mode 100644 index 0000000..38fa906 --- /dev/null +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/processor/.ipynb_checkpoints/Dockerfile-checkpoint @@ -0,0 +1,60 @@ +FROM ubuntu:latest +# Set a docker label to advertise multi-model support on the container +LABEL com.amazonaws.sagemaker.capabilities.multi-models=false +# Set a docker label to enable container to use SAGEMAKER_BIND_TO_PORT environment variable if present +LABEL com.amazonaws.sagemaker.capabilities.accept-bind-to-port=true + +# No question/dialog is asked during apt-get install +ARG DEBIAN_FRONTEND=noninteractive + +# Setting the Timezone Environment Variable +ENV TZ=America/Sao_Paulo + +# install ubuntu libraries +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + gcc \ + python3.7 \ + python3-dev \ + python3-pip \ + ca-certificates \ + git \ + curl \ + nginx \ + openjdk-8-jre-headless\ + wget &&\ + rm -rf /var/lib/apt/lists/* + +# Create folders for code +RUN mkdir /opt/ml && \ + mkdir /opt/ml/processing && \ + mkdir /opt/ml/processing/input && \ + mkdir /opt/ml/processing/input/raw_data && \ + mkdir /opt/ml/processing/input/preprocessing && \ + mkdir /opt/ml/processing/input/expectations && \ + mkdir /opt/ml/processing/output && \ + mkdir /opt/ml/processing/output/processed && \ + mkdir 
/opt/ml/processing/output/processed/train && \ + mkdir /opt/ml/processing/output/processed/val && \ + mkdir /opt/ml/processing/output/processed/inference && \ + mkdir /opt/ml/processing/output/expectations && \ + mkdir /opt/ml/processing/output/validations + +# Install requirements +COPY requirements.txt /opt/ml/code/src/requirements.txt +RUN pip3 install --no-cache -r /opt/ml/code/src/requirements.txt + +# Copy entrypoint script to the image and make it executable +COPY src/config/ /opt/ml/code/src/config/ +COPY src/ml/ /opt/ml/processing/ml/ +COPY src/util.py /opt/ml/processing/util.py +COPY processor/preprocessor.py /opt/ml/processing/preprocessor.py + +# Change working directory +WORKDIR /opt/ml/processing + +# Setting PYTHONPATH to access the copied code +ENV PYTHONPATH="/opt/ml/processing:${PATH}" + +# Add a Python script and configure Docker to run it +ENTRYPOINT ["python3", "preprocessor.py"] diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/processor/.ipynb_checkpoints/preprocessor-checkpoint.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/processor/.ipynb_checkpoints/preprocessor-checkpoint.py new file mode 100644 index 0000000..1920dbd --- /dev/null +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/processor/.ipynb_checkpoints/preprocessor-checkpoint.py @@ -0,0 +1,68 @@ +from ml.preprocessing.preprocessing import Preprocessing +from ml.preprocessing.dataquality import DataQuality +import great_expectations as ge +from datetime import date +import pandas as pd +import argparse +import logging +import glob +import json +from joblib import dump, load + +logging.getLogger().setLevel('INFO') + +if __name__=='__main__': + """ + Execute the processor step in the virtual environment + + """ + logging.info('Starting the preprocessing') + + # Read the step argument (train or test) + parser = argparse.ArgumentParser() + parser.add_argument('--step', type=str, default='train') + args = parser.parse_args() + step_train = True if args.step == "train" else False + logging.info(f'step_train: {step_train}') + + logging.info('Reading the inputs') + file = glob.glob("/opt/ml/processing/input/raw_data/*.csv")[0] + logging.info(f'Reading file: {file}') + df = pd.read_csv(file) + + + logging.info("Data Quality") + # If True, it creates the DataQuality object, otherwise it loads an existing one + if step_train: + dq = DataQuality(discrete_cat_cols=['Sex', 'Pclass','Survived']) + df_ge = dq.perform(df) + df_ge.save_expectation_suite('/opt/ml/processing/output/expectations/expectations.json') + else: + date = date.today().strftime('%Y%m%d') + df_ge = ge.dataset.PandasDataset(df) + ge_val = df_ge.validate(expectation_suite='/opt/ml/processing/input/expectations/expectations.json', only_return_failures=False) + with open(f'/opt/ml/processing/output/validations/{date}.json', 'w') as f: + json.dump(ge_val.to_json_dict(), f) + + logging.info("Preprocessing") + # If True, it creates the Preprocessing object, otherwise it loads an existing one + if step_train: + norm_cols = {'min-max': ['Age']} + oneHot_cols = ['Pclass','Sex'] + p = Preprocessing(norm_cols, oneHot_cols) + train, test_train = p.execute(df, step_train = True, val_size = 0.2) + else: + p = load("/opt/ml/processing/input/preprocessing/preprocessing.pkl") + test = p.execute(df, step_train = False) + + logging.info("Saving") + # If True, it saves the Preprocessing to be used later in the inference step + if step_train: + dump(p, '/opt/ml/processing/output/preprocessing/preprocessing.pkl') + + # If True, it saves the 
train and val files, otherwise it saves only the inference file
+    if step_train:
+        train.to_csv('/opt/ml/processing/output/processed/train/train.csv', index=False)
+        test_train.to_csv('/opt/ml/processing/output/processed/val/val.csv', index=False)
+    else:
+        test.to_csv('/opt/ml/processing/output/processed/inference/inference.csv', index=False)
\ No newline at end of file
diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/processor/Dockerfile b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/processor/Dockerfile
new file mode 100644
index 0000000..38fa906
--- /dev/null
+++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/processor/Dockerfile
@@ -0,0 +1,61 @@
+FROM ubuntu:latest
+# Set a docker label to advertise multi-model support on the container
+LABEL com.amazonaws.sagemaker.capabilities.multi-models=false
+# Set a docker label to enable the container to use the SAGEMAKER_BIND_TO_PORT environment variable if present
+LABEL com.amazonaws.sagemaker.capabilities.accept-bind-to-port=true
+
+# No question/dialog is asked during apt-get install
+ARG DEBIAN_FRONTEND=noninteractive
+
+# Setting the Timezone Environment Variable
+ENV TZ=America/Sao_Paulo
+
+# Install Ubuntu libraries
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential \
+    gcc \
+    python3.7 \
+    python3-dev \
+    python3-pip \
+    ca-certificates \
+    git \
+    curl \
+    nginx \
+    openjdk-8-jre-headless \
+    wget && \
+    rm -rf /var/lib/apt/lists/*
+
+# Create folders for code
+RUN mkdir /opt/ml && \
+    mkdir /opt/ml/processing && \
+    mkdir /opt/ml/processing/input && \
+    mkdir /opt/ml/processing/input/raw_data && \
+    mkdir /opt/ml/processing/input/preprocessing && \
+    mkdir /opt/ml/processing/input/expectations && \
+    mkdir /opt/ml/processing/output && \
+    mkdir /opt/ml/processing/output/processed && \
+    mkdir /opt/ml/processing/output/processed/train && \
+    mkdir /opt/ml/processing/output/processed/val && \
+    mkdir /opt/ml/processing/output/processed/inference && \
+    mkdir /opt/ml/processing/output/preprocessing && \
+    mkdir /opt/ml/processing/output/expectations && \
+    mkdir /opt/ml/processing/output/validations
+
+# Install requirements
+COPY requirements.txt /opt/ml/code/src/requirements.txt
+RUN pip3 install --no-cache -r /opt/ml/code/src/requirements.txt
+
+# Copy the source code and the entrypoint script into the image
+COPY src/config/ /opt/ml/code/src/config/
+COPY src/ml/ /opt/ml/processing/ml/
+COPY src/util.py /opt/ml/processing/util.py
+COPY processor/preprocessor.py /opt/ml/processing/preprocessor.py
+
+# Change working directory
+WORKDIR /opt/ml/processing
+
+# Setting PYTHONPATH to access the copied code
+ENV PYTHONPATH="/opt/ml/processing:${PYTHONPATH}"
+
+# Add a Python script and configure Docker to run it
+ENTRYPOINT ["python3", "preprocessor.py"]
diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/processor/preprocessor.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/processor/preprocessor.py
new file mode 100644
index 0000000..1920dbd
--- /dev/null
+++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/processor/preprocessor.py
@@ -0,0 +1,68 @@
+from ml.preprocessing.preprocessing import Preprocessing
+from ml.preprocessing.dataquality import DataQuality
+import great_expectations as ge
+from datetime import date
+import pandas as pd
+import argparse
+import logging
+import glob
+import json
+from joblib import dump, load
+
+logging.getLogger().setLevel('INFO')
+
+if __name__=='__main__':
+    """
+    Execute the processor step in the virtual environment
+
+    """
+    logging.info('Starting the preprocessing')
+
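This image is meant to be driven by SageMaker Processing, which mounts S3 data under /opt/ml/processing/... and forwards --step through the job arguments. A hedged sketch of launching it with the SageMaker Python SDK; image_uri, role and the S3 URIs are placeholders to adapt:

    from sagemaker.processing import Processor, ProcessingInput, ProcessingOutput

    processor = Processor(
        image_uri="<account>.dkr.ecr.<region>.amazonaws.com/hermione-processor:latest",  # placeholder
        role="<execution-role-arn>",  # placeholder
        instance_count=1,
        instance_type="ml.m5.large",
    )
    processor.run(
        arguments=["--step", "train"],
        inputs=[ProcessingInput(source="s3://<bucket>/raw_data",
                                destination="/opt/ml/processing/input/raw_data")],
        outputs=[ProcessingOutput(source="/opt/ml/processing/output/processed/train",
                                  destination="s3://<bucket>/processed/train"),
                 ProcessingOutput(source="/opt/ml/processing/output/processed/val",
                                  destination="s3://<bucket>/processed/val"),
                 ProcessingOutput(source="/opt/ml/processing/output/preprocessing",
                                  destination="s3://<bucket>/preprocessing")],
    )

+    # 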
Read the step argument (train or test) + parser = argparse.ArgumentParser() + parser.add_argument('--step', type=str, default='train') + args = parser.parse_args() + step_train = True if args.step == "train" else False + logging.info(f'step_train: {step_train}') + + logging.info('Reading the inputs') + file = glob.glob("/opt/ml/processing/input/raw_data/*.csv")[0] + logging.info(f'Reading file: {file}') + df = pd.read_csv(file) + + + logging.info("Data Quality") + # If True, it creates the DataQuality object, otherwise it loads an existing one + if step_train: + dq = DataQuality(discrete_cat_cols=['Sex', 'Pclass','Survived']) + df_ge = dq.perform(df) + df_ge.save_expectation_suite('/opt/ml/processing/output/expectations/expectations.json') + else: + date = date.today().strftime('%Y%m%d') + df_ge = ge.dataset.PandasDataset(df) + ge_val = df_ge.validate(expectation_suite='/opt/ml/processing/input/expectations/expectations.json', only_return_failures=False) + with open(f'/opt/ml/processing/output/validations/{date}.json', 'w') as f: + json.dump(ge_val.to_json_dict(), f) + + logging.info("Preprocessing") + # If True, it creates the Preprocessing object, otherwise it loads an existing one + if step_train: + norm_cols = {'min-max': ['Age']} + oneHot_cols = ['Pclass','Sex'] + p = Preprocessing(norm_cols, oneHot_cols) + train, test_train = p.execute(df, step_train = True, val_size = 0.2) + else: + p = load("/opt/ml/processing/input/preprocessing/preprocessing.pkl") + test = p.execute(df, step_train = False) + + logging.info("Saving") + # If True, it saves the Preprocessing to be used later in the inference step + if step_train: + dump(p, '/opt/ml/processing/output/preprocessing/preprocessing.pkl') + + # If True, it saves the train and val files, otherwise it saves only the inference file + if step_train: + train.to_csv('/opt/ml/processing/output/processed/train/train.csv', index=False) + test_train.to_csv('/opt/ml/processing/output/processed/val/val.csv', index=False) + else: + test.to_csv('/opt/ml/processing/output/processed/inference/inference.csv', index=False) \ No newline at end of file diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/requirements.txt b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/requirements.txt new file mode 100644 index 0000000..a9d480f --- /dev/null +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/requirements.txt @@ -0,0 +1,28 @@ +category-encoders +coverage +datetime +Flask +gunicorn +hermione-ml +matplotlib +mlflow +mlxtend +numpy +pandas +plotly +pytest +seaborn +scikit-learn +scipy +statsmodels +tqdm +yellowbrick +vega_datasets +altair +pandas_profiling +streamlit_pandas_profiling +interpret-community +lime +lightgbm +great_expectations +stepfunctions diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/api/app.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/api/app.py new file mode 100644 index 0000000..7441f7c --- /dev/null +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/api/app.py @@ -0,0 +1,41 @@ +from flask import Flask, request, redirect, url_for, flash, jsonify +import numpy as np +import pandas as pd +from joblib import load +import json +import logging + +logging.getLogger().setLevel(logging.INFO) + +app = Flask(__name__) + +def predict_new(X, probs=True): + model = load('model/titanic_model_rf.pkl') + p = model.get_preprocessing() + + X = p.clean_data(X) + X = p.categ_encoding(X) + + columns = model.get_columns() + for col in columns: + if col not in X.columns: + X[col] = 0 + if 
probs: + return model.predict_proba(X)[:,1] + else: + return model.predict(X) + +@app.route('/invocations', methods=['POST']) +def predict(): + data = pd.read_json(request.json) + predictions = np.array2string(predict_new(data, probs=True)) + return jsonify(predictions) + +@app.route('/health', methods=['GET']) +def health_check(): + resp = jsonify(success=True) + return resp + + +if __name__ == "__main__": + app.run(host='0.0.0.0') \ No newline at end of file diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/api/myrequests.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/api/myrequests.py new file mode 100644 index 0000000..46fda29 --- /dev/null +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/api/myrequests.py @@ -0,0 +1,17 @@ +import requests +import json + +url = 'http://localhost:5000/invocations' + +data = { + 'Pclass':[3,3,3], + 'Sex': ['male', 'female', 'male'], + 'Age':[4, 22, 28] + } +j_data = json.dumps(data) + +headers = {'Content-Type': 'application/json'} +print("Sending request for model...") +print(f"Data: {j_data}") +r = requests.post(url, json=j_data, headers=headers) +print(f"Response: {r.text}") \ No newline at end of file diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/api/wsgi.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/api/wsgi.py new file mode 100644 index 0000000..9e83905 --- /dev/null +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/api/wsgi.py @@ -0,0 +1,4 @@ +from app import app + +if __name__ == "__main__": + app.run(use_reloader=True, debug=True) diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/config/.ipynb_checkpoints/config-checkpoint.json b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/config/.ipynb_checkpoints/config-checkpoint.json new file mode 100644 index 0000000..c34a7bc --- /dev/null +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/config/.ipynb_checkpoints/config-checkpoint.json @@ -0,0 +1,7 @@ +{ +"project_name": "hermione-sagemaker", + "env_path": "hermione-sagemaker/hermione-sagemaker_env", + "files_path": "../data/raw/", + "key": "<<<>>>", + "user": "<<<>>>" + } \ No newline at end of file diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/config/config.json b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/config/config.json new file mode 100644 index 0000000..c34a7bc --- /dev/null +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/config/config.json @@ -0,0 +1,7 @@ +{ +"project_name": "hermione-sagemaker", + "env_path": "hermione-sagemaker/hermione-sagemaker_env", + "files_path": "../data/raw/", + "key": "<<<>>>", + "user": "<<<>>>" + } \ No newline at end of file diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/analysis/.ipynb_checkpoints/cluster-checkpoint.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/analysis/.ipynb_checkpoints/cluster-checkpoint.py new file mode 100644 index 0000000..5e5f7a6 --- /dev/null +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/analysis/.ipynb_checkpoints/cluster-checkpoint.py @@ -0,0 +1,166 @@ +from sklearn.mixture import GaussianMixture +from sklearn.cluster import KMeans +from sklearn import metrics +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt + +class Cluster: + + @classmethod + def analyzeK(cls, X, k_min = 2, k_max = 20): + """ + Plot the result of the methods (elbow, silhouette and calinski_harabas) to find the best k + + Parameters + ---------- + X : 
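Beyond the standalone request script above, the Flask app can be exercised in-process with Flask's built-in test client. Note that, to match the pd.read_json call in the predict route, the body is a double-encoded JSON string, exactly as myrequests.py sends it. A small sketch, assuming model/titanic_model_rf.pkl exists relative to the working directory:

    import json
    from app import app

    client = app.test_client()
    assert client.get("/health").status_code == 200

    records = {"Pclass": [3], "Sex": ["male"], "Age": [4]}
    resp = client.post("/invocations", json=json.dumps(records))  # string payload on purpose
    print(resp.get_json())
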
array + values ​​that will be used to find the best k + k_min : int + minimum interval for K + k_max : int + maximum range for K + + Returns + ------- + None + """ + + if X is None: + raise Exception("Error: X is None.") + if k_min is None or k_max is None: + raise Exception("Error: Range is None.") + if k_min < 2: + raise Exception("Error: k_min < 2") + + wss = [] + s_gmm = [] + s_kmeans = [] + ch_gmm = [] + ch_kmeans = [] + + K = range(k_min, k_max) + + for k in K: + kmeans = KMeans(n_clusters=k) + kmeans.fit(X) + gmm = GaussianMixture(n_components=k, covariance_type='full') + gmm.fit(X) + + labels_kmeans = kmeans.predict(X) + labels_gmm = gmm.predict(X) + + s_kmeans.append(metrics.silhouette_score(X, labels_kmeans, metric='euclidean')) + s_gmm.append(metrics.silhouette_score(X, labels_gmm, metric='euclidean')) + + ch_kmeans.append(metrics.calinski_harabasz_score(X, labels_kmeans)) + ch_gmm.append(metrics.calinski_harabasz_score(X, labels_gmm)) + + wss.append(kmeans.inertia_) + + cls._elbow(K, wss) + cls._silhouette_coefficient(K, s_kmeans, s_gmm) + cls._calinski_harabaz(K, ch_kmeans, ch_gmm) + + @classmethod + def _elbow(cls, K, wss): + """ + Function plots the result of the elbow method + + Parameters + ---------- + k : array + possible k values + k_min : array + Total WSS measures cluster compression and we want it to be as small as possible + Returns + ------- + None + """ + plt.plot(K, wss, 'bx-') + plt.xlabel('k') + plt.ylabel('WSS') + plt.title('The Elbow Method showing the optimal k') + plt.show() + + @classmethod + def _silhouette_coefficient(cls, K, s_kmeans, s_gmm): + """ + Function plots the result of the silhouette method for kmeans and Gaussian Mixture Models + + Parameters + ---------- + k : array + k values + s_kmeans : array + Silhouette kmeans values + s_gmm : array + Silhouette Gaussian Mixture Models values + + Returns + ---- + None + """ + plt.plot(K, s_kmeans, 'xr-') # plotting t, a separately + plt.plot(K, s_gmm, 'ob-') + plt.legend(["kmeans", "gmm"]) + plt.xlabel('k') + plt.ylabel('Mean Silhouette Coefficient') + plt.title('Mean Silhouette Coefficient for each k') + plt.show() + + @classmethod + def _calinski_harabaz(cls, K, ch_kmeans, ch_gmm): + """ + Function plots the result of the calinski_harabaz method for kmeans and Gaussian Mixture Models + + Parameters + ---------- + k : array + possible k values + s_kmeans : array + calinski_harabaz kmeans values + s_gmm : array + Gaussian Mixture Models values + + Returns + ------- + None + """ + plt.plot(K, ch_kmeans, 'xr-') # plotting t, a separately + plt.plot(K, ch_gmm, 'ob-') + plt.legend(["kmeans", "gmm"]) + plt.xlabel('k') + plt.ylabel('Calinski and Harabaz score') + plt.title('Calinski and Harabaz score for each k') + plt.show() + + @classmethod + def plot_cluster(cls, df_res_algorithm, algorithm_name = "K-means"): + """ + Function that plots clusters + + Parameters + ---------- + df_res_algoritmo : pd.DataFrame + Dataframe must have the following columns (x, y, cluster) + algorithm_name : str + algorithm name + Return + ------- + None + """ + # verifica quantos clusters tem + qtde_cluster = df_res_algorithm.cluster.max()+1 + plots = [] + for cluster in range(qtde_cluster): + p = plt.scatter(df_res_algorithm[df_res_algorithm['cluster'] == cluster].x, + df_res_algorithm[df_res_algorithm['cluster'] == cluster].y) + plots.append(p) + plt.legend(tuple(plots), + (tuple(["Cluster {}".format(c) for c in range(1, qtde_cluster+1)])), + loc=2, fontsize=8, bbox_to_anchor=(1.05, 1)) + plt.xlabel("X") + plt.ylabel("Y") + 
plt.title("Clusters created by "+algorithm_name) + plt.show() \ No newline at end of file diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/analysis/.ipynb_checkpoints/feature_selection-checkpoint.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/analysis/.ipynb_checkpoints/feature_selection-checkpoint.py new file mode 100644 index 0000000..4b3a7bf --- /dev/null +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/analysis/.ipynb_checkpoints/feature_selection-checkpoint.py @@ -0,0 +1,387 @@ +from sklearn.feature_selection import VarianceThreshold +from sklearn.feature_selection import SelectKBest +from sklearn.feature_selection import SelectPercentile +from sklearn.feature_selection import RFE +from sklearn.feature_selection import SelectFromModel +from sklearn.feature_selection import SequentialFeatureSelector +from mlxtend.feature_selection import ExhaustiveFeatureSelector +from abc import ABC, abstractmethod +import numpy as np +import pandas as pd + +class SelectAlgorithm(ABC): + """ + Abstract class for feature selection algorithms + """ + def transform(self, df: pd.DataFrame): + """ + Select features based on fit + + Parameters + ---------- + df : pd.DataFrame + dataframe with features to be selected + + Returns + ------- + pd.DataFrame + dataframe with selected features only + """ + return df[df.columns[self.selected_columns]] + + def get_support(self): + """ + Get a mask, or integer index, of the features selected + + Parameters + ---------- + + Returns + ------- + np.array + """ + return self.selected_columns + + @abstractmethod + def fit(self) -> None: + """ + Abstract method that is implemented in classes that inherit it + """ + pass + +class SelectCoefficients(SelectAlgorithm): + """ + Class to select features based on model coefficients + """ + def __init__(self, model, num_feat = None): + """ + Constructor + + Parameters + ---------- + model : + should be an instance of a classification or regression model class from scikit-learn and have coef_.ravel method + + num_feats : int + number of features to be selected + Returns + ------- + SelectCoefficients + """ + self.model = model + self.num_feat = num_feat + + def fit(self, X: pd.DataFrame, y = None): + """ + Identify the features to be selected. + + Parameters + ---------- + X : pd.DataFrame + features to be selected + + y : pd.DataFrame + target values + + Returns + ------- + None + """ + self.num_feat = int(X.shape[1]/2) if self.num_feat == None else self.num_feat + trained_model = self.model.fit(X,y) + self.selected_columns = np.argsort(np.abs(trained_model.coef_.ravel()))[-self.num_feat:] + +class SelectCorrelation(SelectAlgorithm): + """ + Class to select features based on correlation between features + """ + def __init__(self, threshold = 1.0): + """ + Constructor + + Parameters + ---------- + threshold : float + correlation threshold + Returns + ------- + SelectCorrelation + """ + self.threshold = threshold + def fit(self, X: pd.DataFrame, y = None): + """ + Identify the features to be selected. 
+ + Parameters + ---------- + X : pd.DataFrame + features to be selected + + y : pd.DataFrame + target values + + Returns + ------- + None + """ + corr = X.corr() + self.selected_columns = np.full((corr.shape[0],), True, dtype=bool) + [self.check_correlation(corr.iloc[i,j],j) for i in range(corr.shape[0]) for j in range(i+1, corr.shape[0])] + + def check_correlation(self,corr,j): + """ + Auxiliar method to check if correlation between features is above threshold + Parameters + ---------- + corr : float + correlation between two atributes + + j : int + index of column to be removed in case corr >= self.threshold + + Returns + ------- + None + """ + if np.abs(corr) >= self.threshold and self.selected_columns[j]: + self.selected_columns[j] = False + +class MyExhaustiveFeatureSelector(ExhaustiveFeatureSelector): + """ + Class that inherits from ExhaustiveFeatureSelector (from mlxtend) and implements get_support method for + compatibility issues + """ + def get_support(self): + return list(self.best_idx_) + +class SelectEnsemble(SelectAlgorithm): + """ + Class to select features based on ensemble of methods + """ + def __init__(self, dic_selection: dict, num_feat = None): + """ + Constructor + + Parameters + ---------- + dic_selection : dict + dict with name of the algorithm as keys and dicts of parameters as values + Ex: dic_selection = { 'variance': {'threshold' : 0.3}, + 'recursive': {'estimator' : LinearSVC(), 'n_features_to_select' : 2}} + num_feats : int + number of features to be selected + Returns + ------- + SelectCoefficients + """ + self.dic_selection = dic_selection + self.num_feat = num_feat + + def fit(self, X: pd.DataFrame, y = None): + """ + Identify the features to be selected. + + Parameters + ---------- + X : pd.DataFrame + features to be selected + + y : pd.DataFrame + target values + + Returns + ------- + None + """ + self.num_feat = int(X.shape[1]/2) if self.num_feat == None else self.num_feat + self.column_dic = {} + for i,column in enumerate(X.columns): + self.column_dic[column] = i + self.column_count = [0 for column in X.columns] + selections = [FeatureSelector(selector,**self.dic_selection[selector]) for selector in self.dic_selection] + [selection.fit(X,y) for selection in selections] + [self.increment_count(column) for selection in selections for column in selection.selected_columns] + self.selected_columns = np.argsort(self.column_count)[-self.num_feat:] + + def increment_count(self,column): + """ + Auxiliar method to increment the count of a column + Parameters + ---------- + column : int + column which the count will be incremented + + Returns + ------- + None + """ + self.column_count[self.column_dic[column]]+=1 + +class FeatureSelector: + + def __init__(self, selector, **kwargs): + """ + Constructor + + Parameters + ---------- + selector : str + name of algorithm to be applied + **kwargs : + optional and positional arguments of the choosen algorithm (selector) + Returns + ------- + FeatureSelector + Examples + --------- + variance thresholding: f = FeatureSelector('variance', threshold=0.3) #Instantiating + f.fit(X[,y]) #fitting (y is optional for variance thresholding) + X = f.transform(X) #transforming + + filter-based, k best (MAD): f = FeatureSelector('univariate_kbest', score_func=FeatureSelector.mean_abs_diff, k=2) #Instantiating + #score_func can be any function f: R^n -> R^n (n = number of columns) + f.fit(X,y) #fitting + X = f.transform(X) #transforming + + wrapper, recursive: f = FeatureSelector('recursive', estimator = LinearSVC(), 
n_features_to_select=2) #Instantiating + #estimator should be an instance of a classification or regression model class from scikit-learn + #one can use a custom class but it must be compatible with scikit-learn arquitecture + f.fit(X,y) #fitting + X = f.transform(X) #transforming + + wrapper, sequential: f = FeatureSelector('sequential', estimator = LinearSVC(), direction='forward') #Instantiating + #estimator should be an instance of a classification or regression model class from scikit-learn + #one can use a custom class but it must be compatible with scikit-learn arquitecture + f.fit(X,y) #fitting + X = f.transform(X) #transforming + + to better understand the optional arguments of each algorithm see: https://scikit-learn.org/stable/modules/feature_selection.html + """ + self.selector = selector + self.selectors = {'variance': VarianceThreshold, + 'univariate_kbest': SelectKBest, + 'univariate_percentile': SelectPercentile, + 'recursive': RFE, + 'model':SelectFromModel, + 'sequential':SequentialFeatureSelector, + 'exaustive':MyExhaustiveFeatureSelector, + 'correlation':SelectCorrelation, + 'coefficients':SelectCoefficients, + 'ensemble':SelectEnsemble} + self.kwargs = kwargs + self.fitted = False + + def fit(self, X: pd.DataFrame, y = None): + """ + Identify the features to be selected. + + Parameters + ---------- + X : pd.DataFrame + features to be selected + + y : pd.DataFrame + target values + + Returns + ------- + None + """ + self.columns = X.columns + self.selection = self.selectors[self.selector](**self.kwargs) + self.selection.fit(X,y) + self.selected_columns = self.columns[self.selection.get_support()] + self.fitted = True + + def transform(self, df: pd.DataFrame): + """ + Select features based on fit + + Parameters + ---------- + pd.DataFrame + dataframe with features to be selected + + Returns + ------- + df : pd.DataFrame + dataframe with selected features only + """ + if not self.fitted: + raise Exception("Not yet trained.") + + + #return self.selection.transform(df) + return df[self.selected_columns] + + def inverse_transform(self, df: pd.DataFrame): + """ + Apply the invese_transform of vectorizer to each column + Options: index, bag_of_words and tf_idf + + Parameters + ---------- + df : pd.DataFrame + dataframe with columns to be unvectorizer + + Returns + ------- + pd.DataFrame + """ + pass + + #return df + + @staticmethod + def mean_abs_diff(X, y=None): + """ + method to compute the mean absolute difference (MAD) of all atributes of X + + Parameters + ---------- + X : pd.DataFrame + dataframe + y: any type + not necessary, used only for compatibility issues + + Returns + ------- + pd.DataFrame + """ + return np.sum(np.abs(X - np.mean(X, axis = 0)), axis = 0)/X.shape[0] + + @staticmethod + def variance(X, y=None): + """ + method to compute the mean variance of all atributes of X + + Parameters + ---------- + X : pd.DataFrame + dataframe + y: any type + not necessary, used only for compatibility issues + + Returns + ------- + pd.DataFrame + """ + return np.sum((X - np.mean(X, axis = 0)**2), axis = 0)/X.shape[0] + + @staticmethod + def disp_ratio(X, y=None): + """ + method to compute the dispersion ratio of all atributes od X + + Parameters + ---------- + X : pd.DataFrame + dataframe + y: any type + not necessary, used only for compatibility issues + + Returns + ------- + pd.DataFrame + """ + return np.mean(X, axis = 0)/np.power(np.prod(X, axis = 0),1/X.shape[0]) diff --git 
a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/analysis/.ipynb_checkpoints/pca-checkpoint.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/analysis/.ipynb_checkpoints/pca-checkpoint.py new file mode 100644 index 0000000..2596a64 --- /dev/null +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/analysis/.ipynb_checkpoints/pca-checkpoint.py @@ -0,0 +1,149 @@ +import pandas as pd +from sklearn.decomposition import PCA as PCA_sklearn +from sklearn import metrics + +class PCA: + + def __init__(self, columns, prefix="prefix", k=2): + """ + Constructor + + Parameters + ---------- + columns : list + Columns for dimensionality reduction + prefix : bool + column prefix + k : int + Number of dimensions + + Returns + ------- + PCA + """ + self.columns = columns + self.prefix = prefix + self.k = k + + + def __find_k(self, df, threshold): + """ + Find how many k dimensions will be reduced + + Parameters + ---------- + df : pd.Dataframe + dataframe to be reduced + + Returns + ------- + int + """ + self.pca = PCA_sklearn(n_components=len(self.columns)) + self.pca.fit(df[ self.columns ].values) + for i in range(len(self.columns)-1): + if self.pca.explained_variance_ratio_[i]+self.pca.explained_variance_ratio_[i+1] < threshold: + if i == 0: + raise Expecption("Not reduced by poor explicability") + return i+1 + + def __check(self, df: pd.DataFrame): + """ + Check dataframe contains all columns + + Parameters + ---------- + df : pd.Dataframe + dataframe to be reduced + + Returns + ------- + bool + """ + if not all(col in list(df.columns) for col in self.columns): + raise Exception('Missing columns') + return True + + + def transform(self, df: pd.DataFrame): + """ + Transform the data + + Parameters + ---------- + df : pd.Dataframe + dataframe to be reduced + + Returns + ------- + None + """ + self.__check(df) + if self.pca is None: + raise Exception("Error - object not fitted") + reduced = self.pca.transform(df[self.columns].values) + for col in range(self.k): + df[self.prefix+"_"+str(col)] = [line[col] for line in reduced] + df.drop(self.columns, axis=1, inplace=True) + + + def fit(self, df : pd.DataFrame, threshold=0.4): + """ + Compute PCA object + + Parameters + ---------- + df : pd.Dataframe + dataframe to be reduced + + Returns + ------- + None + """ + self.__check(df) + if self.k is None: + self.k = self.__find_k(df,threshold) + self.pca = PCA_sklearn(n_components=self.k) + self.pca.fit(df[ self.columns ].values) + + + def fit_transform (self, df : pd.DataFrame, threshold=0.4): + """ + Fit to data, then transform it. + + Parameters + ---------- + df : pd.Dataframe + dataframe to be reduced + + Returns + ------- + None + """ + self.__check(df) + if self.k is None: + self.k = self.__find_k(df,threshold) + self.pca = PCA_sklearn(n_components=self.k) + self.pca.fit(df[ self.columns ].values) + self.transform(df) + self.report() + + + + + def report(self): + """ + Returns explained variance + + Parameters + ---------- + None + + Returns + ------- + None + """ + for col in range(self.k): + print("Explained variance ({col}): {ratio}". 
+ format(col = self.prefix+"_"+str(col), + ratio = str(self.pca.explained_variance_ratio_[col]))) \ No newline at end of file diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/analysis/.ipynb_checkpoints/vif-checkpoint.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/analysis/.ipynb_checkpoints/vif-checkpoint.py new file mode 100644 index 0000000..79535f8 --- /dev/null +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/analysis/.ipynb_checkpoints/vif-checkpoint.py @@ -0,0 +1,48 @@ +import pandas as pd +from statsmodels.stats.outliers_influence import variance_inflation_factor + +class VIF: + + @classmethod + def analyze(cls, df: pd.DataFrame, thresh=5.0, verbose=True): + """ + Multicollinearity analysis + + Parameters + ---------- + df : pd.DataFrame + Dataframe must have the following columns (x, y, cluster) + thresh : int + value of cut + verbose : bool + if true prints possible variables to be removed + + + Return + ------- + pd.DataFrame + """ + variables = list(range(df.shape[1])) + dropped = True + while dropped: + dropped = False + vif = [variance_inflation_factor(df.iloc[:, variables].values, ix) + for ix in range(df.iloc[:, variables].shape[1])] + + maxloc = vif.index(max(vif)) + if max(vif) > thresh: + m = max(vif) + index_max = [i for i, j in enumerate(vif) if j == m] + if verbose: + cols_possibles_remove = [str(df.iloc[:, variables].columns[i]) for i in index_max] + print("Columns that can be removed -> " + ", ".join(cols_possibles_remove)) + print("------") + print('dropping \'' + str(df.iloc[:, variables].columns[maxloc]) + + '\' at index: ' + str(maxloc)) + print("_____________________________________________________________") + del variables[maxloc] + dropped = True + + print('Remaining variables:') + print(df.columns[variables]) + return df.iloc[:, variables] \ No newline at end of file diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/analysis/cluster.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/analysis/cluster.py new file mode 100644 index 0000000..5e5f7a6 --- /dev/null +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/analysis/cluster.py @@ -0,0 +1,166 @@ +from sklearn.mixture import GaussianMixture +from sklearn.cluster import KMeans +from sklearn import metrics +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt + +class Cluster: + + @classmethod + def analyzeK(cls, X, k_min = 2, k_max = 20): + """ + Plot the result of the methods (elbow, silhouette and calinski_harabas) to find the best k + + Parameters + ---------- + X : array + values ​​that will be used to find the best k + k_min : int + minimum interval for K + k_max : int + maximum range for K + + Returns + ------- + None + """ + + if X is None: + raise Exception("Error: X is None.") + if k_min is None or k_max is None: + raise Exception("Error: Range is None.") + if k_min < 2: + raise Exception("Error: k_min < 2") + + wss = [] + s_gmm = [] + s_kmeans = [] + ch_gmm = [] + ch_kmeans = [] + + K = range(k_min, k_max) + + for k in K: + kmeans = KMeans(n_clusters=k) + kmeans.fit(X) + gmm = GaussianMixture(n_components=k, covariance_type='full') + gmm.fit(X) + + labels_kmeans = kmeans.predict(X) + labels_gmm = gmm.predict(X) + + s_kmeans.append(metrics.silhouette_score(X, labels_kmeans, metric='euclidean')) + s_gmm.append(metrics.silhouette_score(X, labels_gmm, metric='euclidean')) + + ch_kmeans.append(metrics.calinski_harabasz_score(X, labels_kmeans)) + 
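A usage sketch for the VIF analysis above, on toy data; the 5.0 cutoff is the conventional rule of thumb used as the default:

    import numpy as np
    import pandas as pd
    from ml.analysis.vif import VIF  # assumes src/ is on PYTHONPATH

    rng = np.random.default_rng(0)
    a = rng.normal(size=200)
    df_num = pd.DataFrame({
        "a": a,
        "b": a * 2 + rng.normal(scale=0.01, size=200),  # nearly collinear with a
        "c": rng.normal(size=200),
    })
    df_reduced = VIF.analyze(df_num, thresh=5.0)  # drops one of the collinear pair
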
ch_gmm.append(metrics.calinski_harabasz_score(X, labels_gmm)) + + wss.append(kmeans.inertia_) + + cls._elbow(K, wss) + cls._silhouette_coefficient(K, s_kmeans, s_gmm) + cls._calinski_harabaz(K, ch_kmeans, ch_gmm) + + @classmethod + def _elbow(cls, K, wss): + """ + Function plots the result of the elbow method + + Parameters + ---------- + k : array + possible k values + k_min : array + Total WSS measures cluster compression and we want it to be as small as possible + Returns + ------- + None + """ + plt.plot(K, wss, 'bx-') + plt.xlabel('k') + plt.ylabel('WSS') + plt.title('The Elbow Method showing the optimal k') + plt.show() + + @classmethod + def _silhouette_coefficient(cls, K, s_kmeans, s_gmm): + """ + Function plots the result of the silhouette method for kmeans and Gaussian Mixture Models + + Parameters + ---------- + k : array + k values + s_kmeans : array + Silhouette kmeans values + s_gmm : array + Silhouette Gaussian Mixture Models values + + Returns + ---- + None + """ + plt.plot(K, s_kmeans, 'xr-') # plotting t, a separately + plt.plot(K, s_gmm, 'ob-') + plt.legend(["kmeans", "gmm"]) + plt.xlabel('k') + plt.ylabel('Mean Silhouette Coefficient') + plt.title('Mean Silhouette Coefficient for each k') + plt.show() + + @classmethod + def _calinski_harabaz(cls, K, ch_kmeans, ch_gmm): + """ + Function plots the result of the calinski_harabaz method for kmeans and Gaussian Mixture Models + + Parameters + ---------- + k : array + possible k values + s_kmeans : array + calinski_harabaz kmeans values + s_gmm : array + Gaussian Mixture Models values + + Returns + ------- + None + """ + plt.plot(K, ch_kmeans, 'xr-') # plotting t, a separately + plt.plot(K, ch_gmm, 'ob-') + plt.legend(["kmeans", "gmm"]) + plt.xlabel('k') + plt.ylabel('Calinski and Harabaz score') + plt.title('Calinski and Harabaz score for each k') + plt.show() + + @classmethod + def plot_cluster(cls, df_res_algorithm, algorithm_name = "K-means"): + """ + Function that plots clusters + + Parameters + ---------- + df_res_algoritmo : pd.DataFrame + Dataframe must have the following columns (x, y, cluster) + algorithm_name : str + algorithm name + Return + ------- + None + """ + # verifica quantos clusters tem + qtde_cluster = df_res_algorithm.cluster.max()+1 + plots = [] + for cluster in range(qtde_cluster): + p = plt.scatter(df_res_algorithm[df_res_algorithm['cluster'] == cluster].x, + df_res_algorithm[df_res_algorithm['cluster'] == cluster].y) + plots.append(p) + plt.legend(tuple(plots), + (tuple(["Cluster {}".format(c) for c in range(1, qtde_cluster+1)])), + loc=2, fontsize=8, bbox_to_anchor=(1.05, 1)) + plt.xlabel("X") + plt.ylabel("Y") + plt.title("Clusters created by "+algorithm_name) + plt.show() \ No newline at end of file diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/analysis/feature_selection.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/analysis/feature_selection.py new file mode 100644 index 0000000..4b3a7bf --- /dev/null +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/analysis/feature_selection.py @@ -0,0 +1,387 @@ +from sklearn.feature_selection import VarianceThreshold +from sklearn.feature_selection import SelectKBest +from sklearn.feature_selection import SelectPercentile +from sklearn.feature_selection import RFE +from sklearn.feature_selection import SelectFromModel +from sklearn.feature_selection import SequentialFeatureSelector +from mlxtend.feature_selection import ExhaustiveFeatureSelector +from abc import ABC, abstractmethod +import numpy as np 
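For reference, a usage sketch for the Cluster helper on synthetic data; make_blobs and the 2-10 range are illustrative:

    from sklearn.datasets import make_blobs
    from ml.analysis.cluster import Cluster  # assumes src/ is on PYTHONPATH

    X, _ = make_blobs(n_samples=300, centers=4, random_state=42)
    # Plots the elbow, silhouette and Calinski-Harabasz curves for k in [2, 10)
    Cluster.analyzeK(X, k_min=2, k_max=10)
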
+import pandas as pd + +class SelectAlgorithm(ABC): + """ + Abstract class for feature selection algorithms + """ + def transform(self, df: pd.DataFrame): + """ + Select features based on fit + + Parameters + ---------- + df : pd.DataFrame + dataframe with features to be selected + + Returns + ------- + pd.DataFrame + dataframe with selected features only + """ + return df[df.columns[self.selected_columns]] + + def get_support(self): + """ + Get a mask, or integer index, of the features selected + + Parameters + ---------- + + Returns + ------- + np.array + """ + return self.selected_columns + + @abstractmethod + def fit(self) -> None: + """ + Abstract method that is implemented in classes that inherit it + """ + pass + +class SelectCoefficients(SelectAlgorithm): + """ + Class to select features based on model coefficients + """ + def __init__(self, model, num_feat = None): + """ + Constructor + + Parameters + ---------- + model : + should be an instance of a classification or regression model class from scikit-learn and have coef_.ravel method + + num_feats : int + number of features to be selected + Returns + ------- + SelectCoefficients + """ + self.model = model + self.num_feat = num_feat + + def fit(self, X: pd.DataFrame, y = None): + """ + Identify the features to be selected. + + Parameters + ---------- + X : pd.DataFrame + features to be selected + + y : pd.DataFrame + target values + + Returns + ------- + None + """ + self.num_feat = int(X.shape[1]/2) if self.num_feat == None else self.num_feat + trained_model = self.model.fit(X,y) + self.selected_columns = np.argsort(np.abs(trained_model.coef_.ravel()))[-self.num_feat:] + +class SelectCorrelation(SelectAlgorithm): + """ + Class to select features based on correlation between features + """ + def __init__(self, threshold = 1.0): + """ + Constructor + + Parameters + ---------- + threshold : float + correlation threshold + Returns + ------- + SelectCorrelation + """ + self.threshold = threshold + def fit(self, X: pd.DataFrame, y = None): + """ + Identify the features to be selected. 
+ + Parameters + ---------- + X : pd.DataFrame + features to be selected + + y : pd.DataFrame + target values + + Returns + ------- + None + """ + corr = X.corr() + self.selected_columns = np.full((corr.shape[0],), True, dtype=bool) + [self.check_correlation(corr.iloc[i,j],j) for i in range(corr.shape[0]) for j in range(i+1, corr.shape[0])] + + def check_correlation(self,corr,j): + """ + Auxiliar method to check if correlation between features is above threshold + Parameters + ---------- + corr : float + correlation between two atributes + + j : int + index of column to be removed in case corr >= self.threshold + + Returns + ------- + None + """ + if np.abs(corr) >= self.threshold and self.selected_columns[j]: + self.selected_columns[j] = False + +class MyExhaustiveFeatureSelector(ExhaustiveFeatureSelector): + """ + Class that inherits from ExhaustiveFeatureSelector (from mlxtend) and implements get_support method for + compatibility issues + """ + def get_support(self): + return list(self.best_idx_) + +class SelectEnsemble(SelectAlgorithm): + """ + Class to select features based on ensemble of methods + """ + def __init__(self, dic_selection: dict, num_feat = None): + """ + Constructor + + Parameters + ---------- + dic_selection : dict + dict with name of the algorithm as keys and dicts of parameters as values + Ex: dic_selection = { 'variance': {'threshold' : 0.3}, + 'recursive': {'estimator' : LinearSVC(), 'n_features_to_select' : 2}} + num_feats : int + number of features to be selected + Returns + ------- + SelectCoefficients + """ + self.dic_selection = dic_selection + self.num_feat = num_feat + + def fit(self, X: pd.DataFrame, y = None): + """ + Identify the features to be selected. + + Parameters + ---------- + X : pd.DataFrame + features to be selected + + y : pd.DataFrame + target values + + Returns + ------- + None + """ + self.num_feat = int(X.shape[1]/2) if self.num_feat == None else self.num_feat + self.column_dic = {} + for i,column in enumerate(X.columns): + self.column_dic[column] = i + self.column_count = [0 for column in X.columns] + selections = [FeatureSelector(selector,**self.dic_selection[selector]) for selector in self.dic_selection] + [selection.fit(X,y) for selection in selections] + [self.increment_count(column) for selection in selections for column in selection.selected_columns] + self.selected_columns = np.argsort(self.column_count)[-self.num_feat:] + + def increment_count(self,column): + """ + Auxiliar method to increment the count of a column + Parameters + ---------- + column : int + column which the count will be incremented + + Returns + ------- + None + """ + self.column_count[self.column_dic[column]]+=1 + +class FeatureSelector: + + def __init__(self, selector, **kwargs): + """ + Constructor + + Parameters + ---------- + selector : str + name of algorithm to be applied + **kwargs : + optional and positional arguments of the choosen algorithm (selector) + Returns + ------- + FeatureSelector + Examples + --------- + variance thresholding: f = FeatureSelector('variance', threshold=0.3) #Instantiating + f.fit(X[,y]) #fitting (y is optional for variance thresholding) + X = f.transform(X) #transforming + + filter-based, k best (MAD): f = FeatureSelector('univariate_kbest', score_func=FeatureSelector.mean_abs_diff, k=2) #Instantiating + #score_func can be any function f: R^n -> R^n (n = number of columns) + f.fit(X,y) #fitting + X = f.transform(X) #transforming + + wrapper, recursive: f = FeatureSelector('recursive', estimator = LinearSVC(), 
n_features_to_select=2) #Instantiating
+                            #estimator should be an instance of a classification or regression model class from scikit-learn
+                            #one can use a custom class but it must be compatible with the scikit-learn architecture
+                            f.fit(X,y) #fitting
+                            X = f.transform(X) #transforming
+
+        wrapper, sequential: f = FeatureSelector('sequential', estimator = LinearSVC(), direction='forward') #Instantiating
+                            #estimator should be an instance of a classification or regression model class from scikit-learn
+                            #one can use a custom class but it must be compatible with the scikit-learn architecture
+                            f.fit(X,y) #fitting
+                            X = f.transform(X) #transforming
+
+        to better understand the optional arguments of each algorithm see: https://scikit-learn.org/stable/modules/feature_selection.html
+        """
+        self.selector = selector
+        self.selectors = {'variance': VarianceThreshold,
+                          'univariate_kbest': SelectKBest,
+                          'univariate_percentile': SelectPercentile,
+                          'recursive': RFE,
+                          'model':SelectFromModel,
+                          'sequential':SequentialFeatureSelector,
+                          'exaustive':MyExhaustiveFeatureSelector,
+                          'correlation':SelectCorrelation,
+                          'coefficients':SelectCoefficients,
+                          'ensemble':SelectEnsemble}
+        self.kwargs = kwargs
+        self.fitted = False
+
+    def fit(self, X: pd.DataFrame, y = None):
+        """
+        Identify the features to be selected.
+
+        Parameters
+        ----------
+        X : pd.DataFrame
+            features to be selected
+
+        y : pd.DataFrame
+            target values
+
+        Returns
+        -------
+        None
+        """
+        self.columns = X.columns
+        self.selection = self.selectors[self.selector](**self.kwargs)
+        self.selection.fit(X,y)
+        self.selected_columns = self.columns[self.selection.get_support()]
+        self.fitted = True
+
+    def transform(self, df: pd.DataFrame):
+        """
+        Select features based on fit
+
+        Parameters
+        ----------
+        df : pd.DataFrame
+            dataframe with features to be selected
+
+        Returns
+        -------
+        pd.DataFrame
+            dataframe with selected features only
+        """
+        if not self.fitted:
+            raise Exception("Not yet trained.")
+        return df[self.selected_columns]
+
+    def inverse_transform(self, df: pd.DataFrame):
+        """
+        Not implemented: feature selection discards columns, so it cannot be
+        reversed in general. Kept only for interface compatibility.
+
+        Parameters
+        ----------
+        df : pd.DataFrame
+            dataframe with selected features
+
+        Returns
+        -------
+        pd.DataFrame
+        """
+        pass
+
+    @staticmethod
+    def mean_abs_diff(X, y=None):
+        """
+        Method to compute the mean absolute difference (MAD) of all attributes of X
+
+        Parameters
+        ----------
+        X : pd.DataFrame
+            dataframe
+        y: any type
+            not necessary, used only for compatibility issues
+
+        Returns
+        -------
+        np.array
+        """
+        return np.sum(np.abs(X - np.mean(X, axis = 0)), axis = 0)/X.shape[0]
+
+    @staticmethod
+    def variance(X, y=None):
+        """
+        Method to compute the variance of all attributes of X
+
+        Parameters
+        ----------
+        X : pd.DataFrame
+            dataframe
+        y: any type
+            not necessary, used only for compatibility issues
+
+        Returns
+        -------
+        np.array
+        """
+        return np.sum((X - np.mean(X, axis = 0))**2, axis = 0)/X.shape[0]
+
+    @staticmethod
+    def disp_ratio(X, y=None):
+        """
+        Method to compute the dispersion ratio of all attributes of X
+
+        Parameters
+        ----------
+        X : pd.DataFrame
+            dataframe
+        y: any type
+            not necessary, used only for compatibility issues
+
+        Returns
+        -------
+        np.array
+        """
+        return np.mean(X, axis = 0)/np.power(np.prod(X, axis = 0),1/X.shape[0])
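A usage sketch for the FeatureSelector facade with the correlation strategy; the toy frame and the 0.95 threshold are illustrative:

    import pandas as pd
    from ml.analysis.feature_selection import FeatureSelector  # assumes src/ is on PYTHONPATH

    df = pd.DataFrame({"x1": [1, 2, 3, 4], "x2": [2, 4, 6, 8], "x3": [4, 1, 3, 2]})
    f = FeatureSelector("correlation", threshold=0.95)
    f.fit(df)  # x2 duplicates x1, so it is flagged for removal
    print(f.transform(df).columns.tolist())  # ['x1', 'x3']

diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/analysis/pca.py 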
b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/analysis/pca.py
new file mode 100644
index 0000000..2596a64
--- /dev/null
+++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/analysis/pca.py
@@ -0,0 +1,150 @@
+import pandas as pd
+from sklearn.decomposition import PCA as PCA_sklearn
+from sklearn import metrics
+
+class PCA:
+
+    def __init__(self, columns, prefix="prefix", k=2):
+        """
+        Constructor
+
+        Parameters
+        ----------
+        columns : list
+            Columns for dimensionality reduction
+        prefix : str
+            column prefix
+        k : int
+            Number of dimensions (pass k=None to pick it from the explained variance)
+
+        Returns
+        -------
+        PCA
+        """
+        self.columns = columns
+        self.prefix = prefix
+        self.k = k
+        self.pca = None
+
+    def __find_k(self, df, threshold):
+        """
+        Find how many dimensions k the data will be reduced to
+
+        Parameters
+        ----------
+        df : pd.Dataframe
+            dataframe to be reduced
+
+        Returns
+        -------
+        int
+        """
+        self.pca = PCA_sklearn(n_components=len(self.columns))
+        self.pca.fit(df[ self.columns ].values)
+        for i in range(len(self.columns)-1):
+            if self.pca.explained_variance_ratio_[i]+self.pca.explained_variance_ratio_[i+1] < threshold:
+                if i == 0:
+                    raise Exception("Not reduced: the leading components explain too little variance")
+                return i+1
+
+    def __check(self, df: pd.DataFrame):
+        """
+        Check dataframe contains all columns
+
+        Parameters
+        ----------
+        df : pd.Dataframe
+            dataframe to be reduced
+
+        Returns
+        -------
+        bool
+        """
+        if not all(col in list(df.columns) for col in self.columns):
+            raise Exception('Missing columns')
+        return True
+
+
+    def transform(self, df: pd.DataFrame):
+        """
+        Transform the data
+
+        Parameters
+        ----------
+        df : pd.Dataframe
+            dataframe to be reduced
+
+        Returns
+        -------
+        None
+        """
+        self.__check(df)
+        if self.pca is None:
+            raise Exception("Error - object not fitted")
+        reduced = self.pca.transform(df[self.columns].values)
+        for col in range(self.k):
+            df[self.prefix+"_"+str(col)] = [line[col] for line in reduced]
+        df.drop(self.columns, axis=1, inplace=True)
+
+
+    def fit(self, df : pd.DataFrame, threshold=0.4):
+        """
+        Compute PCA object
+
+        Parameters
+        ----------
+        df : pd.Dataframe
+            dataframe to be reduced
+
+        Returns
+        -------
+        None
+        """
+        self.__check(df)
+        if self.k is None:
+            self.k = self.__find_k(df,threshold)
+        self.pca = PCA_sklearn(n_components=self.k)
+        self.pca.fit(df[ self.columns ].values)
+
+
+    def fit_transform (self, df : pd.DataFrame, threshold=0.4):
+        """
+        Fit to data, then transform it.
+
+        Parameters
+        ----------
+        df : pd.Dataframe
+            dataframe to be reduced
+
+        Returns
+        -------
+        None
+        """
+        self.__check(df)
+        if self.k is None:
+            self.k = self.__find_k(df,threshold)
+        self.pca = PCA_sklearn(n_components=self.k)
+        self.pca.fit(df[ self.columns ].values)
+        self.transform(df)
+        self.report()
+
+
+
+
+    def report(self):
+        """
+        Prints the explained variance of each kept component
+
+        Parameters
+        ----------
+        None
+
+        Returns
+        -------
+        None
+        """
+        for col in range(self.k):
+            print("Explained variance ({col}): {ratio}".
+                  format(col = self.prefix+"_"+str(col),
+                         ratio = str(self.pca.explained_variance_ratio_[col])))
\ No newline at end of file
diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/analysis/vif.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/analysis/vif.py
new file mode 100644
index 0000000..79535f8
--- /dev/null
+++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/analysis/vif.py
@@ -0,0 +1,48 @@
+import pandas as pd
+from statsmodels.stats.outliers_influence import variance_inflation_factor
+
+class VIF:
+
+    @classmethod
+    def analyze(cls, df: pd.DataFrame, thresh=5.0, verbose=True):
+        """
+        Multicollinearity analysis
+
+        Parameters
+        ----------
+        df : pd.DataFrame
+            numeric dataframe with the variables to be analyzed
+        thresh : float
+            VIF cutoff value
+        verbose : bool
+            if true, prints the variables that could be removed
+
+
+        Returns
+        -------
+        pd.DataFrame
+        """
+        variables = list(range(df.shape[1]))
+        dropped = True
+        while dropped:
+            dropped = False
+            vif = [variance_inflation_factor(df.iloc[:, variables].values, ix)
+                   for ix in range(df.iloc[:, variables].shape[1])]
+
+            maxloc = vif.index(max(vif))
+            if max(vif) > thresh:
+                m = max(vif)
+                index_max = [i for i, j in enumerate(vif) if j == m]
+                if verbose:
+                    cols_possibles_remove = [str(df.iloc[:, variables].columns[i]) for i in index_max]
+                    print("Columns that can be removed -> " + ", ".join(cols_possibles_remove))
+                    print("------")
+                print('dropping \'' + str(df.iloc[:, variables].columns[maxloc]) +
+                      '\' at index: ' + str(maxloc))
+                print("_____________________________________________________________")
+                del variables[maxloc]
+                dropped = True
+
+        print('Remaining variables:')
+        print(df.columns[variables])
+        return df.iloc[:, variables]
\ No newline at end of file
diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/data_source/base.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/data_source/base.py
new file mode 100644
index 0000000..82d4d10
--- /dev/null
+++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/data_source/base.py
@@ -0,0 +1,12 @@
+from abc import ABC, abstractmethod
+import pandas as pd
+
+class DataSource(ABC):
+
+    @abstractmethod
+    def get_data(self) -> pd.DataFrame:
+        """
+        Abstract method that is implemented in classes that inherit it
+        """
+        pass
+
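A usage sketch for the PCA wrapper above on a toy frame; the column names, prefix and k are illustrative:

    import numpy as np
    import pandas as pd
    from ml.analysis.pca import PCA  # assumes src/ is on PYTHONPATH

    df = pd.DataFrame(np.random.rand(100, 3), columns=["f1", "f2", "f3"])
    pca = PCA(columns=["f1", "f2", "f3"], prefix="comp", k=2)
    pca.fit_transform(df)  # adds comp_0/comp_1 in place, drops f1-f3, prints explained variance

diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/data_source/database.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/data_source/database.py
new file mode 100644
index 0000000..a5554d8
--- /dev/null
+++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/data_source/database.py
@@ -0,0 +1,70 @@
+import pandas as pd
+
+from ml.data_source.base import DataSource
+
+class DataBase(DataSource):
+
+    def __init__(self):
+        """
+        Constructor.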
+ + Parameters + ----------- + arg : type + description + + Returns + ------- + class Object + """ + pass + + def get_data(self)->pd.DataFrame: + """ + Returns a flat table in Dataframe + + Parameters + ----------- + arg : type + description + + Returns + ------- + pd.DataFrame + Dataframe with data + """ + pass + + def open_connection(self, connection): + """ + Opens the connection to the database + + Parameters + ----------- + connection : string + Connection with database + + Returns + ------- + bool + Check if connection is open or not + + """ + pass + + def close_connection(self, connection ): + """ + Close the connection database + + Parameters + ----------- + connection : string + Connection with database + + Returns + ------- + bool + Check if connection was closed + + """ + pass diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/data_source/spreadsheet.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/data_source/spreadsheet.py new file mode 100644 index 0000000..7f48cff --- /dev/null +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/data_source/spreadsheet.py @@ -0,0 +1,24 @@ +import pandas as pd + +from ml.data_source.base import DataSource + +class Spreadsheet(DataSource): + """ + Class to read files from spreadsheets or raw text files + """ + + def get_data(self, path)->pd.DataFrame: + """ + Returns a flat table in Dataframe + + Parameters + ---------- + arg : type + description + + Returns + ------- + pd.DataFrame + Dataframe with data + """ + return pd.read_csv(path)[['Survived', 'Pclass', 'Sex', 'Age']] \ No newline at end of file diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/model/.ipynb_checkpoints/metrics-checkpoint.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/model/.ipynb_checkpoints/metrics-checkpoint.py new file mode 100644 index 0000000..34cd079 --- /dev/null +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/model/.ipynb_checkpoints/metrics-checkpoint.py @@ -0,0 +1,212 @@ +from sklearn.metrics import * +import numpy as np +from sklearn.metrics import make_scorer +from sklearn.model_selection import cross_validate + +class Metrics: + + @classmethod + def smape(cls, A, F): + """ + Calculates the smape value between the real and the predicted + + Parameters + ---------- + A : array + Target values + F : array + Predicted values + + Returns + ------- + float: smape value + """ + return 100/len(A) * np.sum(np.abs(F - A) / (np.abs(A) + np.abs(F))) + + @classmethod + def __custom_score(cls, y_true, y_pred): + """ + Creates a custom metric + + Parameters + ---------- + y_true : array + Target values + y_pred : array + Predicted values + + Returns + ------- + sklearn.metrics + """ + #return sklearn.metrics.fbeta_score(y_true, y_pred, 2) + pass + + @classmethod + def customized(cls, y_true, y_pred): + """ + Creates a custom metric + + Parameters + ---------- + y_true : array + Target values + y_pred : array + Predicted values + + Returns + ------- + float + """ + custom_metric = make_scorer(cls.__custom_score, greater_is_better=True) + return custom_metric + + @classmethod + def mape(cls, y_true, y_pred): + """ + Calculates the map value between the real and the predicted + + Parameters + ---------- + y_true : array + Target values + y_pred : array + Predicted values + + Returns + ------- + float : value of mape + """ + y_true, y_pred = np.array(y_true), np.array(y_pred) + return np.mean(np.abs(((y_true+1) - (y_pred+1)) / (y_true+1))) * 100 + + + @classmethod + def 
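The data_source package shown above (base.py, database.py, spreadsheet.py) is extended by subclassing DataSource. A hedged sketch with a hypothetical ParquetSource that is not part of the template:

    import pandas as pd
    from ml.data_source.base import DataSource  # assumes src/ is on PYTHONPATH

    class ParquetSource(DataSource):
        """Hypothetical source: reads a local parquet file into a flat DataFrame."""

        def get_data(self, path) -> pd.DataFrame:
            # pd.read_parquet requires pyarrow or fastparquet to be installed
            return pd.read_parquet(path)

    df = ParquetSource().get_data("../data/raw/train.parquet")  # illustrative path
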
regression(cls, y_true, y_pred): + """ + Calculates some metrics for regression problems + + Parameters + ---------- + y_true : array + Target values + y_pred : array + Predicted values + + Returns + ------- + dict : metrics results + """ + results = {'mean_absolute_error': round(mean_absolute_error(y_true, y_pred), 7), + 'root_mean_squared_error': round(np.sqrt(mean_squared_error(y_true, y_pred)), 7), + 'r2': round(r2_score(y_true, y_pred), 7), + 'smape': round(cls.smape(y_true, y_pred), 7), + 'mape': round(cls.mape(y_true, y_pred), 7) + } + return results + + @classmethod + def crossvalidation(cls, model, X, y, classification: bool, cv=5, agg=np.mean): + if classification: + if len(set(y)) > 2: + metrics = ['accuracy','f1_weighted', 'recall_weighted','precision_weighted'] + else: + metrics = ['accuracy','f1', 'recall','precision', 'roc_auc'] + else: + metrics = ['mean_absolute_error', 'r2', 'root_mean_squared_error', 'smape', 'mape'] + res_metrics = cross_validate(model, X, y, cv=cv, return_train_score=False, scoring=metrics) + results = {metric.replace("test_", ""): round(agg(res_metrics[metric]),7) for metric in res_metrics} + return results + + @classmethod + def __multiclass_classification(cls, y_true, y_pred): + """ + Calculates some metrics for multiclass classification problems + + Parameters + ---------- + y_true : array + Target values + y_pred : array + Predicted values + + Returns + ------- + dict : metrics results + """ + results = {'accuracy': accuracy_score(y_true, y_pred), + 'f1': f1_score(y_true, y_pred, average='weighted'), + 'precision': precision_score(y_true, y_pred, average='weighted'), + 'recall': recall_score(y_true, y_pred, average='weighted'), + } + return results + + @classmethod + def __binary_classification(cls, y_true, y_pred, y_probs): + """ + Calculates some metrics for binary classification problems + + Parameters + ---------- + y_true : array + Target values + y_pred : array + Predicted values + + Returns + ------- + dict : metrics results + """ + results = {'accuracy': accuracy_score(y_true, y_pred), + 'f1': f1_score(y_true, y_pred), + 'precision': precision_score(y_true, y_pred), + 'recall': recall_score(y_true, y_pred), + 'roc_auc': roc_auc_score(y_true, y_probs) + } + return results + + @classmethod + def classification(cls, y_true, y_pred, y_probs): + """ + Checks which classification method will be applied: binary or multiclass + + Parameters + ---------- + y_true : array + Target values + y_pred : array + Predicted values + y_probs : array + Probabilities values + + Returns + ------- + dict: metrics results + """ + if len(set(y_true)) > 2: + results = cls.__multiclass_classification(y_true, y_pred) + else: + results = cls.__binary_classification(y_true, y_pred, y_probs) + return results + + + @classmethod + def clusterization(cls, X, labels): + """ + Calculates some metrics on clustering quality + + Parameters + ---------- + X : array[array], shape (n_linha, n_colunas) + Matrix with the values that were used in the cluster + labels : array, shape (n_linha, 1) + Vector with labels selected by the clustering method (eg KMeans) + + Returns + ------- + dict : metrics results + """ + results = {'silhouette': silhouette_score(X, labels, metric='euclidean'), + 'calinski_harabaz': calinski_harabaz_score(X, labels) + } + return results \ No newline at end of file diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/model/.ipynb_checkpoints/trainer-checkpoint.py 
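A usage sketch for the Metrics helpers above, with toy binary labels; classification() dispatches to the binary branch because there are only two classes:

    from ml.model.metrics import Metrics  # assumes src/ is on PYTHONPATH

    y_true = [0, 1, 1, 0]
    y_pred = [0, 1, 0, 0]
    y_probs = [0.2, 0.9, 0.4, 0.1]
    # Returns accuracy, f1, precision, recall and roc_auc in a dict
    print(Metrics.classification(y_true, y_pred, y_probs))
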
b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/model/.ipynb_checkpoints/trainer-checkpoint.py new file mode 100644 index 0000000..1266611 --- /dev/null +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/model/.ipynb_checkpoints/trainer-checkpoint.py @@ -0,0 +1,104 @@ +from abc import ABC, abstractmethod +from ml.model.wrapper import Wrapper +from ml.model.metrics import Metrics +import statsmodels.formula.api as smf +from sklearn.model_selection import train_test_split +import numpy as np + +class Trainer(ABC): + def __init__(self): + """ + Constructor + + Parameters + ---------- + None + + Returns + ------- + Trainer + """ + + @abstractmethod + def train(self): + """ + Abstract method that should be implemented in every class that inherits TrainerModel + Parameters + ---------- + None + + Returns + ------- + None + """ + pass + +class TrainerSklearn(Trainer): + + def train(self, train, val, y_name, + classification: bool, + algorithm, + columns = None, + **params): + """ + Method that builds the Sklearn model + + Parameters + ---------- + train : pd.Dataframe + data to train the model + val : pd.Dataframe + data to validate the model + y_name : str + target name + algorithm : Sklearn algorithm + algorithm to be trained + classification : bool + if True, classification model training takes place, otherwise Regression + columns : array + columns name to be used in the train + + Returns + ------- + Wrapper + """ + model = algorithm(**params) #model + y_train = train[y_name] + y_val = val[y_name] + X_train = train[columns] + X_val = val[columns] + model.fit(X_train,y_train) + y_pred = model.predict(X_val) + y_probs = model.predict_proba(X_val)[:,1] + if classification: + res_metrics = Metrics.classification(y_val.values, y_pred, y_probs) + else: + res_metrics = Metrics.regression(y_val.values, y_pred) + model = Wrapper(model, res_metrics, X_train.columns) + return model + + +class TrainerSklearnUnsupervised(Trainer): + + def train(self, X, + algorithm, + **params): + """ + Method that builds the Sklearn model + + Parameters + ---------- + model_name : str + model name + + Returns + ------- + Wrapper + """ + model = algorithm(**params) #model + columns = list(X.columns) + model.fit(X) + labels = model.predict(X) + res_metrics = Metrics.clusterization(X, labels) + model = Wrapper(model, res_metrics, columns) + return model diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/model/.ipynb_checkpoints/wrapper-checkpoint.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/model/.ipynb_checkpoints/wrapper-checkpoint.py new file mode 100644 index 0000000..8f812cf --- /dev/null +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/model/.ipynb_checkpoints/wrapper-checkpoint.py @@ -0,0 +1,252 @@ +from joblib import dump, load +from datetime import date +import mlflow.pyfunc +from mlflow import pyfunc +from interpret.ext.blackbox import TabularExplainer, MimicExplainer +from interpret.ext.glassbox import * +import pandas as pd + +from util import load_yaml, load_json + + +class Wrapper(mlflow.pyfunc.PythonModel): + def __init__(self, model=None, metrics=None, columns=None): + """ + Constructor + + Parameters + ---------- + model : object + If it's just a model: enter all parameters + if it is more than one model: do not enter parameters and use + the add method to add each of the models + metrics : dict + Dictionary with the metrics of the result of the model + columns : list + list with columns names + Returns + ------- + 
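A usage sketch for TrainerSklearn, assuming the processed train/val CSVs produced by the preprocessing step and the Titanic target used throughout the template; the paths mirror the processor's output layout:

    import pandas as pd
    from sklearn.ensemble import RandomForestClassifier
    from ml.model.trainer import TrainerSklearn  # assumes src/ is on PYTHONPATH

    train = pd.read_csv("/opt/ml/processing/output/processed/train/train.csv")
    val = pd.read_csv("/opt/ml/processing/output/processed/val/val.csv")
    cols = [c for c in train.columns if c != "Survived"]
    model = TrainerSklearn().train(train, val, y_name="Survived",
                                   classification=True,
                                   algorithm=RandomForestClassifier,
                                   columns=cols,
                                   n_estimators=100)  # forwarded to RandomForestClassifier
    print(model.get_metrics())  # metrics stored on the returned Wrapper
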
WrapperModel + """ + self.artifacts = dict() + self.artifacts["model"] = model + self.artifacts["metrics"] = metrics + self.artifacts["columns"] = columns + self.artifacts["creation_date"] = date.today() + + def predict(self, model_input, included_input=False): + """ + Method that returns the result of the prediction on a dataset + + Parameters + ---------- + df : pd.DataFrame + Data to be predicted + + Returns + ------- + list + """ + df_processed = model_input.copy() + model = self.artifacts["model"] + columns = self.artifacts["columns"] + result = model.predict(df_processed[columns]) + if included_input: + model_input['predict'] = result + result = model_input + return result + + def predict_proba(self, model_input, binary=False): + """ + Method that returns the result of the prediction on a dataset + + Parameters + ---------- + df : pd.DataFrame + data to be predicted + + Returns + ------- + list + """ + df_processed = model_input.copy() + model = self.artifacts["model"] + columns = self.artifacts["columns"] + if binary: + return model.predict_proba(df_processed[columns])[:, 1] + else: + return model.predict_proba(df_processed[columns]) + + def save_model(self, path): + """ + Saves the model object to a specific path + + Parameters + ---------- + path : str + path where the model object will be saved + + Returns + ------- + None + """ + dump(self, path) + + @staticmethod + def load_model(path): + """ + Loads the model object in a specific path + + Parameters + ---------- + path : str + path where the model object will be loaded. + + Returns + ------- + None + """ + model = load(path) + return model + + def save(self, path): + """ + Save model as a Wrapper class + + Parameters + ---------- + path : str + path where the model object will be loaded. 
+ + Returns + ------- + None + """ + path_artifacts = path + "_artifacts.pkl" + dump(self.artifacts, path_artifacts) + content = load_json("config/arquivos.json") + conda_env = load_yaml(content["path_yaml"]) + mlflow.pyfunc.save_model( + path=path, + python_model=self, + artifacts={"model": path_artifacts}, + conda_env=conda_env, + ) + + def get_metrics(self): + """ + Return metrics + + Parameters + ---------- + self : object Wrapper + + Returns + ------- + dict + """ + return self.artifacts["metrics"] + + def get_columns(self): + """ + Return columns + + Parameters + ---------- + self : object Wrapper + + Returns + ------- + list + """ + return self.artifacts["columns"] + + def get_model(self): + """ + Return model + + Parameters + ---------- + self : object Wrapper + + Returns + ------- + dict + """ + return self.artifacts["model"] + + def train_interpret(self, X, model="tabular"): + """ + Train a interpret model + + Parameters + ---------- + self : object Wrapper + X : pd.DataFrame + Data that were used in the train for interpret + model : string, optional + Model to use for the interpret [tabular,mimic_LGBME, + mimic_Linear,mimic_SGDE,mimic_Dec_Tree] + Returns + ------- + None + """ + mimic_models = { + "mimic_LGBME": LGBMExplainableModel, + "mimic_Linear": LinearExplainableModel, + "mimic_SGDE": SGDExplainableModel, + "mimic_Dec_Tree": DecisionTreeExplainableModel, + } + if model == "tabular": + explainer = TabularExplainer( + self.artifacts["model"], X, features=self.artifacts["columns"] + ) + else: + explainer = MimicExplainer( + self.artifacts["model"], + X, + mimic_models[model], + augment_data=True, + max_num_of_augmentations=10, + features=self.artifacts["columns"], + ) + self.artifacts["explainer"] = explainer + + def local_interpret(self, X, n_feat=3, norm=True): + """ + Return a local interpret for each row in data + + Parameters + ---------- + self : object Wrapper + X : array[array], shape (n_linha, n_colunas) + Matrix with the data that were used to return interpret + n_feat : int, optional + Number of features to return + norm : bool, optional + if True, do normalization in the features importances + + Returns + ------- + pd.DataFrame + """ + local_explanation = self.artifacts["explainer"].explain_local(X) + n_obs = X.shape[0] + predictions = self.artifacts["model"].predict(X) + local_values = local_explanation.get_ranked_local_values() + local_values = [local_values[predictions[i]][i] for i in range(n_obs)] + local_names = local_explanation.get_ranked_local_names() + local_names = [local_names[predictions[i]][i] for i in range(n_obs)] + if norm: + local_values = [ + [(i - min(l)) / (max(l) - min(l)) for i in l] for l in local_values + ] + result = [ + (local_names[i][:n_feat] + local_values[i][:n_feat]) for i in range(n_obs) + ] + column_names = [ + f"Importance_{item}_{str(i)}" + for item in ["Name", "Value"] + for i in range(n_feat) + ] + return pd.DataFrame(result, columns=column_names) \ No newline at end of file diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/model/metrics.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/model/metrics.py new file mode 100644 index 0000000..34cd079 --- /dev/null +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/model/metrics.py @@ -0,0 +1,212 @@ +from sklearn.metrics import * +import numpy as np +from sklearn.metrics import make_scorer +from sklearn.model_selection import cross_validate + +class Metrics: + + @classmethod + def smape(cls, A, F): + """ + Calculates the smape value 
between the real and the predicted
+
+        Parameters
+        ----------
+        A : array
+            Target values
+        F : array
+            Predicted values
+
+        Returns
+        -------
+        float : smape value
+        """
+        return 100/len(A) * np.sum(np.abs(F - A) / (np.abs(A) + np.abs(F)))
+
+    @classmethod
+    def __custom_score(cls, y_true, y_pred):
+        """
+        Placeholder for a project-specific custom metric
+
+        Parameters
+        ----------
+        y_true : array
+            Target values
+        y_pred : array
+            Predicted values
+
+        Returns
+        -------
+        float
+        """
+        #return sklearn.metrics.fbeta_score(y_true, y_pred, 2)
+        pass
+
+    @classmethod
+    def customized(cls, y_true, y_pred):
+        """
+        Creates a scorer from the custom metric above
+
+        Parameters
+        ----------
+        y_true : array
+            Target values
+        y_pred : array
+            Predicted values
+
+        Returns
+        -------
+        sklearn scorer
+        """
+        custom_metric = make_scorer(cls.__custom_score, greater_is_better=True)
+        return custom_metric
+
+    @classmethod
+    def mape(cls, y_true, y_pred):
+        """
+        Calculates the mape value between the real and the predicted
+
+        Parameters
+        ----------
+        y_true : array
+            Target values
+        y_pred : array
+            Predicted values
+
+        Returns
+        -------
+        float : value of mape
+        """
+        y_true, y_pred = np.array(y_true), np.array(y_pred)
+        # the +1 in the denominator avoids division by zero when y_true contains zeros
+        return np.mean(np.abs(y_true - y_pred) / (y_true + 1)) * 100
+
+    @classmethod
+    def regression(cls, y_true, y_pred):
+        """
+        Calculates some metrics for regression problems
+
+        Parameters
+        ----------
+        y_true : array
+            Target values
+        y_pred : array
+            Predicted values
+
+        Returns
+        -------
+        dict : metrics results
+        """
+        results = {'mean_absolute_error': round(mean_absolute_error(y_true, y_pred), 7),
+                   'root_mean_squared_error': round(np.sqrt(mean_squared_error(y_true, y_pred)), 7),
+                   'r2': round(r2_score(y_true, y_pred), 7),
+                   'smape': round(cls.smape(y_true, y_pred), 7),
+                   'mape': round(cls.mape(y_true, y_pred), 7)
+                   }
+        return results
+
+    @classmethod
+    def crossvalidation(cls, model, X, y, classification: bool, cv=5, agg=np.mean):
+        """
+        Cross-validates a model and aggregates the scores of each fold
+        """
+        if classification:
+            if len(set(y)) > 2:
+                metrics = ['accuracy', 'f1_weighted', 'recall_weighted', 'precision_weighted']
+            else:
+                metrics = ['accuracy', 'f1', 'recall', 'precision', 'roc_auc']
+        else:
+            # sklearn has no 'root_mean_squared_error', 'smape' or 'mape' scoring
+            # strings, so use the negated built-in scorers (reported as negative
+            # values) plus custom scorers built from the methods above
+            metrics = {'mean_absolute_error': 'neg_mean_absolute_error',
+                       'r2': 'r2',
+                       'root_mean_squared_error': 'neg_root_mean_squared_error',
+                       'smape': make_scorer(cls.smape, greater_is_better=False),
+                       'mape': make_scorer(cls.mape, greater_is_better=False)}
+        res_metrics = cross_validate(model, X, y, cv=cv, return_train_score=False, scoring=metrics)
+        results = {metric.replace("test_", ""): round(agg(res_metrics[metric]), 7)
+                   for metric in res_metrics if metric.startswith("test_")}
+        return results
+
+    @classmethod
+    def __multiclass_classification(cls, y_true, y_pred):
+        """
+        Calculates some metrics for multiclass classification problems
+
+        Parameters
+        ----------
+        y_true : array
+            Target values
+        y_pred : array
+            Predicted values
+
+        Returns
+        -------
+        dict : metrics results
+        """
+        results = {'accuracy': accuracy_score(y_true, y_pred),
+                   'f1': f1_score(y_true, y_pred, average='weighted'),
+                   'precision': precision_score(y_true, y_pred, average='weighted'),
+                   'recall': recall_score(y_true, y_pred, average='weighted'),
+                   }
+        return results
+
+    @classmethod
+    def __binary_classification(cls, y_true, y_pred, y_probs):
+        """
+        Calculates some metrics for binary classification problems
+
+        Parameters
+        ----------
+        y_true : array
+            Target values
+        y_pred : array
+            Predicted values
+        y_probs : array
+            Predicted probabilities of the positive class
+
+        Returns
+        -------
+        dict : metrics results
+        """
+        results = {'accuracy': accuracy_score(y_true, y_pred),
+                   'f1': f1_score(y_true, y_pred),
+                   'precision': precision_score(y_true, y_pred),
+                   'recall': recall_score(y_true, y_pred),
+                   'roc_auc': roc_auc_score(y_true, y_probs)
+                   }
+        return results
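+
+    # Illustrative usage sketch (not part of the original template): evaluating a
+    # binary classifier with the crossvalidation helper above. `X` (a pd.DataFrame)
+    # and `y` (a pd.Series) are assumed to be defined by the caller.
+    #
+    #   from sklearn.linear_model import LogisticRegression
+    #   scores = Metrics.crossvalidation(LogisticRegression(), X, y,
+    #                                    classification=True, cv=5)
+    #   # scores -> {'accuracy': ..., 'f1': ..., 'recall': ..., 'precision': ..., 'roc_auc': ...}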
+    @classmethod
+    def classification(cls, y_true, y_pred, y_probs):
+        """
+        Checks which classification method will be applied: binary or multiclass
+
+        Parameters
+        ----------
+        y_true : array
+            Target values
+        y_pred : array
+            Predicted values
+        y_probs : array
+            Probabilities values
+
+        Returns
+        -------
+        dict : metrics results
+        """
+        if len(set(y_true)) > 2:
+            results = cls.__multiclass_classification(y_true, y_pred)
+        else:
+            results = cls.__binary_classification(y_true, y_pred, y_probs)
+        return results
+
+    @classmethod
+    def clusterization(cls, X, labels):
+        """
+        Calculates some metrics on clustering quality
+
+        Parameters
+        ----------
+        X : array[array], shape (n_rows, n_cols)
+            Matrix with the values that were used in the clustering
+        labels : array, shape (n_rows, 1)
+            Vector with labels selected by the clustering method (e.g. KMeans)
+
+        Returns
+        -------
+        dict : metrics results
+        """
+        results = {'silhouette': silhouette_score(X, labels, metric='euclidean'),
+                   'calinski_harabasz': calinski_harabasz_score(X, labels)
+                   }
+        return results
\ No newline at end of file
diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/model/trainer.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/model/trainer.py
new file mode 100644
index 0000000..1266611
--- /dev/null
+++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/model/trainer.py
@@ -0,0 +1,104 @@
+from abc import ABC, abstractmethod
+from ml.model.wrapper import Wrapper
+from ml.model.metrics import Metrics
+from sklearn.model_selection import train_test_split
+import numpy as np
+
+class Trainer(ABC):
+    def __init__(self):
+        """
+        Constructor
+
+        Parameters
+        ----------
+        None
+
+        Returns
+        -------
+        Trainer
+        """
+
+    @abstractmethod
+    def train(self):
+        """
+        Abstract method that must be implemented in every class that inherits Trainer
+
+        Parameters
+        ----------
+        None
+
+        Returns
+        -------
+        None
+        """
+        pass
+
+class TrainerSklearn(Trainer):
+
+    def train(self, train, val, y_name,
+              classification: bool,
+              algorithm,
+              columns=None,
+              **params):
+        """
+        Method that builds the Sklearn model
+
+        Parameters
+        ----------
+        train : pd.DataFrame
+            data to train the model
+        val : pd.DataFrame
+            data to validate the model
+        y_name : str
+            target name
+        algorithm : Sklearn algorithm
+            algorithm to be trained
+        classification : bool
+            if True, a classification model is trained; otherwise, regression
+        columns : array
+            column names to be used in the train
+
+        Returns
+        -------
+        Wrapper
+        """
+        model = algorithm(**params)  # model
+        y_train = train[y_name]
+        y_val = val[y_name]
+        X_train = train[columns]
+        X_val = val[columns]
+        model.fit(X_train, y_train)
+        y_pred = model.predict(X_val)
+        if classification:
+            # predict_proba only exists for classifiers, so call it here
+            y_probs = model.predict_proba(X_val)[:, 1]
+            res_metrics = Metrics.classification(y_val.values, y_pred, y_probs)
+        else:
+            res_metrics = Metrics.regression(y_val.values, y_pred)
+        model = Wrapper(model, res_metrics, X_train.columns)
+        return model
+
+
+class TrainerSklearnUnsupervised(Trainer):
+
+    def train(self, X,
+              algorithm,
+              **params):
+        """
+        Method that builds an unsupervised Sklearn model
+
+        Parameters
+        ----------
+        X : pd.DataFrame
+            data to fit the model
+        algorithm : Sklearn algorithm
+            algorithm to be trained
+
+        Returns
+        -------
+        Wrapper
+        """
+        model = algorithm(**params)  # model
+        columns = list(X.columns)
+        model.fit(X)
+        labels = model.predict(X)
+        res_metrics = Metrics.clusterization(X, labels)
+        model = Wrapper(model, res_metrics, columns)
+        return model
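+
+# Illustrative usage sketch (not part of the original template): training a
+# classifier and persisting the resulting Wrapper. `df_train` and `df_val` are
+# assumed to be pandas DataFrames containing the target column 'Survived' and
+# the listed feature columns from the Titanic template data.
+#
+#   from sklearn.ensemble import RandomForestClassifier
+#   wrapper = TrainerSklearn().train(df_train, df_val, y_name='Survived',
+#                                    classification=True,
+#                                    algorithm=RandomForestClassifier,
+#                                    columns=['Age', 'Pclass_1', 'Pclass_2'])
+#   wrapper.save_model('model.pkl')            # joblib dump of the whole Wrapper
+#   reloaded = Wrapper.load_model('model.pkl')
+#   preds = reloaded.predict(df_val)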
diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/model/wrapper.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/model/wrapper.py
new file mode 100644
index 0000000..8f812cf
--- /dev/null
+++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/model/wrapper.py
@@ -0,0 +1,252 @@
+from joblib import dump, load
+from datetime import date
+import mlflow.pyfunc
+from interpret.ext.blackbox import TabularExplainer, MimicExplainer
+from interpret.ext.glassbox import *
+import pandas as pd
+
+from util import load_yaml, load_json
+
+
+class Wrapper(mlflow.pyfunc.PythonModel):
+    def __init__(self, model=None, metrics=None, columns=None):
+        """
+        Constructor
+
+        Parameters
+        ----------
+        model : object
+            If it is a single model, pass all parameters;
+            if it is more than one model, pass no parameters and use
+            the add method to add each of the models
+        metrics : dict
+            Dictionary with the metrics of the result of the model
+        columns : list
+            list with columns names
+
+        Returns
+        -------
+        WrapperModel
+        """
+        self.artifacts = dict()
+        self.artifacts["model"] = model
+        self.artifacts["metrics"] = metrics
+        self.artifacts["columns"] = columns
+        self.artifacts["creation_date"] = date.today()
+
+    def predict(self, model_input, included_input=False):
+        """
+        Method that returns the result of the prediction on a dataset
+
+        Parameters
+        ----------
+        model_input : pd.DataFrame
+            Data to be predicted
+        included_input : bool
+            if True, returns the input data with a 'predict' column appended
+
+        Returns
+        -------
+        list
+        """
+        df_processed = model_input.copy()
+        model = self.artifacts["model"]
+        columns = self.artifacts["columns"]
+        result = model.predict(df_processed[columns])
+        if included_input:
+            model_input['predict'] = result
+            result = model_input
+        return result
+
+    def predict_proba(self, model_input, binary=False):
+        """
+        Method that returns the class probabilities predicted on a dataset
+
+        Parameters
+        ----------
+        model_input : pd.DataFrame
+            data to be predicted
+        binary : bool
+            if True, returns only the probability of the positive class
+
+        Returns
+        -------
+        list
+        """
+        df_processed = model_input.copy()
+        model = self.artifacts["model"]
+        columns = self.artifacts["columns"]
+        if binary:
+            return model.predict_proba(df_processed[columns])[:, 1]
+        else:
+            return model.predict_proba(df_processed[columns])
+
+    def save_model(self, path):
+        """
+        Saves the model object to a specific path
+
+        Parameters
+        ----------
+        path : str
+            path where the model object will be saved
+
+        Returns
+        -------
+        None
+        """
+        dump(self, path)
+
+    @staticmethod
+    def load_model(path):
+        """
+        Loads the model object from a specific path
+
+        Parameters
+        ----------
+        path : str
+            path from where the model object will be loaded
+
+        Returns
+        -------
+        Wrapper
+        """
+        model = load(path)
+        return model
+
+    def save(self, path):
+        """
+        Save model as a Wrapper class
+
+        Parameters
+        ----------
+        path : str
+            path where the model object will be saved
+ + Returns + ------- + None + """ + path_artifacts = path + "_artifacts.pkl" + dump(self.artifacts, path_artifacts) + content = load_json("config/arquivos.json") + conda_env = load_yaml(content["path_yaml"]) + mlflow.pyfunc.save_model( + path=path, + python_model=self, + artifacts={"model": path_artifacts}, + conda_env=conda_env, + ) + + def get_metrics(self): + """ + Return metrics + + Parameters + ---------- + self : object Wrapper + + Returns + ------- + dict + """ + return self.artifacts["metrics"] + + def get_columns(self): + """ + Return columns + + Parameters + ---------- + self : object Wrapper + + Returns + ------- + list + """ + return self.artifacts["columns"] + + def get_model(self): + """ + Return model + + Parameters + ---------- + self : object Wrapper + + Returns + ------- + dict + """ + return self.artifacts["model"] + + def train_interpret(self, X, model="tabular"): + """ + Train a interpret model + + Parameters + ---------- + self : object Wrapper + X : pd.DataFrame + Data that were used in the train for interpret + model : string, optional + Model to use for the interpret [tabular,mimic_LGBME, + mimic_Linear,mimic_SGDE,mimic_Dec_Tree] + Returns + ------- + None + """ + mimic_models = { + "mimic_LGBME": LGBMExplainableModel, + "mimic_Linear": LinearExplainableModel, + "mimic_SGDE": SGDExplainableModel, + "mimic_Dec_Tree": DecisionTreeExplainableModel, + } + if model == "tabular": + explainer = TabularExplainer( + self.artifacts["model"], X, features=self.artifacts["columns"] + ) + else: + explainer = MimicExplainer( + self.artifacts["model"], + X, + mimic_models[model], + augment_data=True, + max_num_of_augmentations=10, + features=self.artifacts["columns"], + ) + self.artifacts["explainer"] = explainer + + def local_interpret(self, X, n_feat=3, norm=True): + """ + Return a local interpret for each row in data + + Parameters + ---------- + self : object Wrapper + X : array[array], shape (n_linha, n_colunas) + Matrix with the data that were used to return interpret + n_feat : int, optional + Number of features to return + norm : bool, optional + if True, do normalization in the features importances + + Returns + ------- + pd.DataFrame + """ + local_explanation = self.artifacts["explainer"].explain_local(X) + n_obs = X.shape[0] + predictions = self.artifacts["model"].predict(X) + local_values = local_explanation.get_ranked_local_values() + local_values = [local_values[predictions[i]][i] for i in range(n_obs)] + local_names = local_explanation.get_ranked_local_names() + local_names = [local_names[predictions[i]][i] for i in range(n_obs)] + if norm: + local_values = [ + [(i - min(l)) / (max(l) - min(l)) for i in l] for l in local_values + ] + result = [ + (local_names[i][:n_feat] + local_values[i][:n_feat]) for i in range(n_obs) + ] + column_names = [ + f"Importance_{item}_{str(i)}" + for item in ["Name", "Value"] + for i in range(n_feat) + ] + return pd.DataFrame(result, columns=column_names) \ No newline at end of file diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/.ipynb_checkpoints/Sagemaker_Inference-checkpoint.ipynb b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/.ipynb_checkpoints/Sagemaker_Inference-checkpoint.ipynb new file mode 100644 index 0000000..aa21796 --- /dev/null +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/.ipynb_checkpoints/Sagemaker_Inference-checkpoint.ipynb @@ -0,0 +1,322 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "616d65aa", + "metadata": {}, + 
"source": [ + "# Sagemaker Inference" + ] + }, + { + "cell_type": "markdown", + "id": "aee7320a", + "metadata": {}, + "source": [ + "This script predicts new data with the uploaded image in ECR." + ] + }, + { + "cell_type": "markdown", + "id": "ea32612e", + "metadata": {}, + "source": [ + "## Import modules" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "3f188c9f", + "metadata": {}, + "outputs": [], + "source": [ + "import time\n", + "import boto3\n", + "import sagemaker\n", + "from sagemaker import get_execution_role" + ] + }, + { + "cell_type": "markdown", + "id": "430e1eb4", + "metadata": {}, + "source": [ + "## Setup" + ] + }, + { + "cell_type": "markdown", + "id": "ebe50488", + "metadata": {}, + "source": [ + "Modify according to your configurations." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "8893b148", + "metadata": {}, + "outputs": [], + "source": [ + "# Bucket name in S3\n", + "bucket = \"hermione-sagemaker\"" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "a6ba2451", + "metadata": {}, + "outputs": [], + "source": [ + "# Set session\n", + "region_name=\"us-east-1\"\n", + "boto3.setup_default_session(region_name=region_name)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "797c5fa6", + "metadata": {}, + "outputs": [], + "source": [ + "# Get user role\n", + "role = get_execution_role()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "d8148140", + "metadata": {}, + "outputs": [], + "source": [ + "# Get AWS Account ID\n", + "account_number = boto3.client(\"sts\").get_caller_identity()[\"Account\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "1b1fba48", + "metadata": {}, + "outputs": [], + "source": [ + "# Image previous uploaded in ECR\n", + "image_name = \"hermione-inference\"\n", + "image_uri = f\"{account_number}.dkr.ecr.{region_name}.amazonaws.com/{image_name}\"" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "f907e610", + "metadata": {}, + "outputs": [], + "source": [ + "# Input and output paths to execute inference\n", + "paths = {\n", + " 'inference_processed': f\"s3://{bucket}/PREPROCESSING/INFERENCE_PROCESSED/inference.csv\",\n", + " 'model': f\"s3://{bucket}/PREPROCESSING/MODEL/Hermione-train-2021-05-26-12-41-29-505/output/model.tar.gz\",\n", + " 'output_path': f\"s3://{bucket}/PREPROCESSING/OUTPUT\"\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "f5fdfdd8", + "metadata": {}, + "outputs": [], + "source": [ + "# instance to run the code\n", + "instance_type=\"ml.m5.large\"" + ] + }, + { + "cell_type": "markdown", + "id": "55fe64d7", + "metadata": {}, + "source": [ + "## Inference" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "60b7dc56", + "metadata": {}, + "outputs": [], + "source": [ + "# Receives the processed inference data in S3\n", + "input_path = paths['inference_processed']" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "e3dc913c", + "metadata": {}, + "outputs": [], + "source": [ + "# Receives the model created during the training in S3\n", + "model_path = paths['model']" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "5b69f31c", + "metadata": {}, + "outputs": [], + "source": [ + "# Saves the prediction in S3\n", + "output_path = paths['output_path']" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "29f7ce88", + "metadata": {}, + "outputs": [], + "source": [ + "# Creates the model to access the 
ECR image\n", + "model = sagemaker.model.Model(\n", + " image_uri= image_uri,\n", + " model_data=model_path,\n", + " role=role)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "aacdf22a", + "metadata": {}, + "outputs": [], + "source": [ + "# Creates a transformer object from the trained model\n", + "transformer = model.transformer(\n", + " instance_count=1,\n", + " instance_type=instance_type, \n", + " output_path=output_path,\n", + " accept = 'text/csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "6452e276", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ".........................\u001b[34m2021-05-26 12:57:00,312 [INFO ] main com.amazonaws.ml.mms.ModelServer - \u001b[0m\n", + "\u001b[34mMMS Home: /usr/local/lib/python3.8/dist-packages\u001b[0m\n", + "\u001b[34mCurrent directory: /\u001b[0m\n", + "\u001b[34mTemp directory: /tmp\u001b[0m\n", + "\u001b[34mNumber of GPUs: 0\u001b[0m\n", + "\u001b[34mNumber of CPUs: 2\u001b[0m\n", + "\u001b[34mMax heap size: 857 M\u001b[0m\n", + "\u001b[34mPython executable: /usr/bin/python3\u001b[0m\n", + "\u001b[34mConfig file: /etc/sagemaker-mms.properties\u001b[0m\n", + "\u001b[34mInference address: http://0.0.0.0:8080\u001b[0m\n", + "\u001b[34mManagement address: http://0.0.0.0:8080\u001b[0m\n", + "\u001b[34mModel Store: /.sagemaker/mms/models\u001b[0m\n", + "\u001b[34mInitial Models: ALL\u001b[0m\n", + "\u001b[34mLog dir: /logs\u001b[0m\n", + "\u001b[34mMetrics dir: /logs\u001b[0m\n", + "\u001b[34mNetty threads: 0\u001b[0m\n", + "\u001b[34mNetty client threads: 0\u001b[0m\n", + "\u001b[34mDefault workers per model: 2\u001b[0m\n", + "\u001b[34mBlacklist Regex: N/A\u001b[0m\n", + "\u001b[34mMaximum Response Size: 6553500\u001b[0m\n", + "\u001b[34mMaximum Request Size: 6553500\u001b[0m\n", + "\u001b[34mPreload model: false\u001b[0m\n", + "\u001b[34mPrefer direct buffer: false\u001b[0m\n", + "\u001b[34m2021-05-26 12:57:00,419 [WARN ] W-9000-model com.amazonaws.ml.mms.wlm.WorkerLifeCycle - attachIOStreams() threadName=W-9000-model\u001b[0m\n", + "\u001b[34m2021-05-26 12:57:00,506 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - model_service_worker started with args: --sock-type unix --sock-name /tmp/.mms.sock.9000 --handler serving.handler --model-path /.sagemaker/mms/models/model --model-name model --preload-model false --tmp-dir /tmp\u001b[0m\n", + "\u001b[34m2021-05-26 12:57:00,508 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Listening on port: /tmp/.mms.sock.9000\u001b[0m\n", + "\u001b[34m2021-05-26 12:57:00,509 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - [PID] 23\u001b[0m\n", + "\u001b[34m2021-05-26 12:57:00,509 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - MMS worker started.\u001b[0m\n", + "\u001b[34m2021-05-26 12:57:00,509 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Python runtime: 3.8.5\u001b[0m\n", + "\u001b[34m2021-05-26 12:57:00,512 [INFO ] main com.amazonaws.ml.mms.wlm.ModelManager - Model model loaded.\u001b[0m\n", + "\u001b[34m2021-05-26 12:57:00,517 [INFO ] main com.amazonaws.ml.mms.ModelServer - Initialize Inference server with: EpollServerSocketChannel.\u001b[0m\n", + "\u001b[34m2021-05-26 12:57:00,536 [INFO ] W-9000-model com.amazonaws.ml.mms.wlm.WorkerThread - Connecting to: /tmp/.mms.sock.9000\u001b[0m\n", + "\u001b[34m2021-05-26 12:57:00,536 [INFO ] W-9000-model com.amazonaws.ml.mms.wlm.WorkerThread - Connecting 
to: /tmp/.mms.sock.9000\u001b[0m\n", + "\u001b[34m2021-05-26 12:57:00,607 [INFO ] main com.amazonaws.ml.mms.ModelServer - Inference API bind to: http://0.0.0.0:8080\u001b[0m\n", + "\u001b[34m2021-05-26 12:57:00,613 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Connection accepted: /tmp/.mms.sock.9000.\u001b[0m\n", + "\u001b[34m2021-05-26 12:57:00,614 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Connection accepted: /tmp/.mms.sock.9000.\u001b[0m\n", + "\u001b[34mModel server started.\u001b[0m\n", + "\u001b[34m2021-05-26 12:57:00,636 [WARN ] pool-2-thread-1 com.amazonaws.ml.mms.metrics.MetricCollector - worker pid is not available yet.\u001b[0m\n", + "\u001b[34m2021-05-26 12:57:02,508 [WARN ] W-9000-model-stderr com.amazonaws.ml.mms.wlm.WorkerLifeCycle - /usr/local/lib/python3.8/dist-packages/interpret_community/common/gpu_kmeans.py:30: UserWarning: cuML is required to use GPU explainers. Check https://rapids.ai/start.html for more information on how to install it.\u001b[0m\n", + "\u001b[34m2021-05-26 12:57:02,510 [WARN ] W-9000-model-stderr com.amazonaws.ml.mms.wlm.WorkerLifeCycle - warnings.warn(\u001b[0m\n", + "\u001b[34m2021-05-26 12:57:02,510 [WARN ] W-9000-model-stderr com.amazonaws.ml.mms.wlm.WorkerLifeCycle - /usr/local/lib/python3.8/dist-packages/interpret_community/common/gpu_kmeans.py:30: UserWarning: cuML is required to use GPU explainers. Check https://rapids.ai/start.html for more information on how to install it.\u001b[0m\n", + "\u001b[34m2021-05-26 12:57:02,510 [WARN ] W-9000-model-stderr com.amazonaws.ml.mms.wlm.WorkerLifeCycle - warnings.warn(\u001b[0m\n", + "\u001b[34m2021-05-26 12:57:03,375 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - generated new fontManager\u001b[0m\n", + "\u001b[34m2021-05-26 12:57:03,393 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - generated new fontManager\u001b[0m\n", + "\u001b[34m2021-05-26 12:57:03,635 [WARN ] W-9000-model-stderr com.amazonaws.ml.mms.wlm.WorkerLifeCycle - cuML is required to use GPU explainers. Check https://rapids.ai/start.html for more information on how to install it.\u001b[0m\n", + "\u001b[34m2021-05-26 12:57:03,658 [WARN ] W-9000-model-stderr com.amazonaws.ml.mms.wlm.WorkerLifeCycle - cuML is required to use GPU explainers. 
Check https://rapids.ai/start.html for more information on how to install it.\u001b[0m\n", + "\u001b[34m2021-05-26 12:57:03,690 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Loading the model\u001b[0m\n", + "\u001b[34m2021-05-26 12:57:03,715 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Loading the model\u001b[0m\n", + "\u001b[34m2021-05-26 12:57:03,741 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Model model loaded io_fd=0242a9fffefeff83-00000009-00000002-e6c9db643cbfeb7b-a47635f7\u001b[0m\n", + "\u001b[34m2021-05-26 12:57:03,750 [INFO ] W-9000-model com.amazonaws.ml.mms.wlm.WorkerThread - Backend response time: 3046\u001b[0m\n", + "\u001b[34m2021-05-26 12:57:03,752 [WARN ] W-9000-model com.amazonaws.ml.mms.wlm.WorkerLifeCycle - attachIOStreams() threadName=W-model-1\u001b[0m\n", + "\u001b[34m2021-05-26 12:57:03,768 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Model model loaded io_fd=0242a9fffefeff83-00000009-00000001-f549db643cbfeb7b-e2a66100\u001b[0m\n", + "\u001b[34m2021-05-26 12:57:03,768 [INFO ] W-9000-model com.amazonaws.ml.mms.wlm.WorkerThread - Backend response time: 3065\u001b[0m\n", + "\u001b[34m2021-05-26 12:57:03,769 [WARN ] W-9000-model com.amazonaws.ml.mms.wlm.WorkerLifeCycle - attachIOStreams() threadName=W-model-2\u001b[0m\n", + "\u001b[34m2021-05-26 12:57:09,272 [INFO ] pool-1-thread-4 ACCESS_LOG - /169.254.255.130:59054 \"GET /ping HTTP/1.1\" 200 11\u001b[0m\n", + "\u001b[35m2021-05-26 12:57:09,272 [INFO ] pool-1-thread-4 ACCESS_LOG - /169.254.255.130:59054 \"GET /ping HTTP/1.1\" 200 11\u001b[0m\n", + "\u001b[34m2021-05-26 12:57:09,353 [INFO ] epollEventLoopGroup-3-2 ACCESS_LOG - /169.254.255.130:59058 \"GET /execution-parameters HTTP/1.1\" 404 2\u001b[0m\n", + "\u001b[34m2021-05-26 12:57:09,462 [INFO ] W-model-1-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Predicting...\u001b[0m\n", + "\u001b[34m2021-05-26 12:57:09,486 [INFO ] W-model-1-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Prediction Complete\u001b[0m\n", + "\u001b[34m2021-05-26 12:57:09,491 [INFO ] W-model-1-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Saving\u001b[0m\n", + "\u001b[34m2021-05-26 12:57:09,494 [INFO ] W-9000-model com.amazonaws.ml.mms.wlm.WorkerThread - Backend response time: 37\u001b[0m\n", + "\u001b[34m2021-05-26 12:57:09,494 [INFO ] W-9000-model ACCESS_LOG - /169.254.255.130:59068 \"POST /invocations HTTP/1.1\" 200 42\u001b[0m\n", + "\u001b[35m2021-05-26 12:57:09,353 [INFO ] epollEventLoopGroup-3-2 ACCESS_LOG - /169.254.255.130:59058 \"GET /execution-parameters HTTP/1.1\" 404 2\u001b[0m\n", + "\u001b[35m2021-05-26 12:57:09,462 [INFO ] W-model-1-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Predicting...\u001b[0m\n", + "\u001b[35m2021-05-26 12:57:09,486 [INFO ] W-model-1-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Prediction Complete\u001b[0m\n", + "\u001b[35m2021-05-26 12:57:09,491 [INFO ] W-model-1-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Saving\u001b[0m\n", + "\u001b[35m2021-05-26 12:57:09,494 [INFO ] W-9000-model com.amazonaws.ml.mms.wlm.WorkerThread - Backend response time: 37\u001b[0m\n", + "\u001b[35m2021-05-26 12:57:09,494 [INFO ] W-9000-model ACCESS_LOG - /169.254.255.130:59068 \"POST /invocations HTTP/1.1\" 200 42\u001b[0m\n", + "\u001b[32m2021-05-26T12:57:09.364:[sagemaker logs]: MaxConcurrentTransforms=1, MaxPayloadInMB=6, BatchStrategy=MULTI_RECORD\u001b[0m\n", + "\n", + "CPU times: user 547 ms, sys: 59 ms, total: 606 ms\n", + "Wall time: 4min 
43s\n" + ] + } + ], + "source": [ + "%%time\n", + "# Predicts the data\n", + "transformer.transform(data=input_path, data_type='S3Prefix', content_type='text/csv', split_type='Line')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "conda_python3", + "language": "python", + "name": "conda_python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/.ipynb_checkpoints/Sagemaker_Processor-checkpoint.ipynb b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/.ipynb_checkpoints/Sagemaker_Processor-checkpoint.ipynb new file mode 100644 index 0000000..ad85e0f --- /dev/null +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/.ipynb_checkpoints/Sagemaker_Processor-checkpoint.ipynb @@ -0,0 +1,396 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "b5264128", + "metadata": {}, + "source": [ + "# Sagemaker Processor" + ] + }, + { + "cell_type": "markdown", + "id": "5bd7a5cd", + "metadata": {}, + "source": [ + "This script generates the train, val and inference files with the processor previous uploaded in ECR." + ] + }, + { + "cell_type": "markdown", + "id": "0488ed05", + "metadata": {}, + "source": [ + "## Import modules" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "e7b20785", + "metadata": {}, + "outputs": [], + "source": [ + "import boto3\n", + "import time\n", + "from datetime import datetime\n", + "from sagemaker import get_execution_role\n", + "from sagemaker.processing import Processor, ProcessingInput, ProcessingOutput" + ] + }, + { + "cell_type": "markdown", + "id": "7f3fd305", + "metadata": {}, + "source": [ + "## Setup" + ] + }, + { + "cell_type": "markdown", + "id": "6528a20b", + "metadata": {}, + "source": [ + "Modify according to your configurations." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "d5cdd5d1", + "metadata": {}, + "outputs": [], + "source": [ + "# Bucket name in S3\n", + "bucket = \"hermione-sagemaker\"" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "5ec68bf7", + "metadata": {}, + "outputs": [], + "source": [ + "# Set session\n", + "region_name=\"us-east-1\"\n", + "boto3.setup_default_session(region_name=region_name)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "4d011a47", + "metadata": {}, + "outputs": [], + "source": [ + "# Get user role\n", + "role = get_execution_role()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "25f76666", + "metadata": {}, + "outputs": [], + "source": [ + "# Get AWS Account ID\n", + "account_number = boto3.client(\"sts\").get_caller_identity()[\"Account\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "fafb5f18", + "metadata": {}, + "outputs": [], + "source": [ + "# Image previous uploaded in ECR\n", + "image_name = \"hermione-processor\"\n", + "image_uri = f\"{account_number}.dkr.ecr.{region_name}.amazonaws.com/{image_name}\"" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "2ef594d3", + "metadata": {}, + "outputs": [], + "source": [ + "# Input and output paths to execute train and inference\n", + "paths = {\n", + " 'train_raw': f\"s3://{bucket}/TRAIN_RAW\",\n", + " 'expectations': f\"s3://{bucket}/PREPROCESSING/EXPECTATIONS\",\n", + " 'preprocessing': f\"s3://{bucket}/PREPROCESSING/PREPROCESSING\",\n", + " 'train_processed': f\"s3://{bucket}/PREPROCESSING/TRAIN_PROCESSED\",\n", + " 'val_processed': f\"s3://{bucket}/PREPROCESSING/VAL_PROCESSED\",\n", + " 'test_raw': f\"s3://{bucket}/TEST_RAW\",\n", + " 'inference_processed': f\"s3://{bucket}/PREPROCESSING/INFERENCE_PROCESSED\",\n", + " 'validations': f\"s3://{bucket}/PREPROCESSING/VALIDATIONS\"\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "2b625b74", + "metadata": {}, + "outputs": [], + "source": [ + "# instance to run the code\n", + "instance_type_train=\"ml.t3.medium\"\n", + "instance_type_inference=\"ml.t3.medium\"" + ] + }, + { + "cell_type": "markdown", + "id": "6e8e92ba", + "metadata": {}, + "source": [ + "## Processor - Train" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "e1b41ed1", + "metadata": {}, + "outputs": [], + "source": [ + "# Receives a raw data in S3\n", + "inputs=[\n", + " ProcessingInput(source=paths['train_raw'], \n", + " destination='/opt/ml/processing/input/raw_data', \n", + " input_name=\"raw_data\")\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "cd67446b", + "metadata": {}, + "outputs": [], + "source": [ + "# Returns the great expectation object, preprocessing object, \n", + "# processed training data and processed validation data, and saves them in S3\n", + "outputs = [\n", + " ProcessingOutput(\n", + " source=\"/opt/ml/processing/output/expectations\",\n", + " destination=paths['expectations'],\n", + " output_name=\"expectations\",\n", + " ),\n", + " ProcessingOutput(\n", + " source=\"/opt/ml/processing/output/preprocessing\",\n", + " destination=paths['preprocessing'],\n", + " output_name=\"preprocessing\",\n", + " ),\n", + " ProcessingOutput(\n", + " source=\"/opt/ml/processing/output/processed/train\",\n", + " destination=paths['train_processed'],\n", + " output_name=\"train_data\",\n", + " ),\n", + " ProcessingOutput(\n", + " source=\"/opt/ml/processing/output/processed/val\",\n", + " 
destination=paths['val_processed'],\n", + " output_name=\"val_data\",\n", + " )\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "902f8e4f", + "metadata": {}, + "outputs": [], + "source": [ + "# Creates the processor to access the ECR image\n", + "processor = Processor(image_uri=image_uri,\n", + " role=role,\n", + " instance_count=1,\n", + " instance_type=instance_type_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "fd8a28a1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Job Name: hermione-processor-2021-05-25-21-03-59-873\n", + "Inputs: [{'InputName': 'raw_data', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://hermione-sagemaker/TRAIN_RAW', 'LocalPath': '/opt/ml/processing/input/raw_data', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]\n", + "Outputs: [{'OutputName': 'expectations', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://hermione-sagemaker/PREPROCESSING/EXPECTATIONS', 'LocalPath': '/opt/ml/processing/output/expectations', 'S3UploadMode': 'EndOfJob'}}, {'OutputName': 'preprocessing', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://hermione-sagemaker/PREPROCESSING/PREPROCESSING', 'LocalPath': '/opt/ml/processing/output/preprocessing', 'S3UploadMode': 'EndOfJob'}}, {'OutputName': 'train_data', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://hermione-sagemaker/PREPROCESSING/TRAIN_PROCESSED', 'LocalPath': '/opt/ml/processing/output/processed/train', 'S3UploadMode': 'EndOfJob'}}, {'OutputName': 'val_data', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://hermione-sagemaker/PREPROCESSING/VAL_PROCESSED', 'LocalPath': '/opt/ml/processing/output/processed/val', 'S3UploadMode': 'EndOfJob'}}]\n", + "......................................................\n", + "\u001b[34mINFO:root:Starting the preprocessing\u001b[0m\n", + "\u001b[34mINFO:root:step_train: True\u001b[0m\n", + "\u001b[34mINFO:root:Reading the inputs\u001b[0m\n", + "\u001b[34mINFO:root:Reading file: /opt/ml/processing/input/raw_data/raw_train.csv\u001b[0m\n", + "\u001b[34mINFO:root:Data Quality\u001b[0m\n", + "\u001b[34mINFO:great_expectations.data_asset.data_asset:#01110 expectation(s) included in expectation_suite. Omitting 1 expectation(s) that failed when last run; set discard_failed_expectations=False to include them. result_format settings filtered.\u001b[0m\n", + "\u001b[34mINFO:root:Preprocessing\u001b[0m\n", + "\u001b[34mINFO:root:Cleaning data\u001b[0m\n", + "\u001b[34mINFO:root:One hot encoding\u001b[0m\n", + "\u001b[34mWARNING:py.warnings:/usr/local/lib/python3.8/dist-packages/category_encoders/utils.py:21: FutureWarning: is_categorical is deprecated and will be removed in a future version. 
Use is_categorical_dtype instead\n", + " elif pd.api.types.is_categorical(cols):\n", + "\u001b[0m\n", + "\u001b[34mINFO:root:Divide train and test\u001b[0m\n", + "\u001b[34mINFO:root:Normalizing\u001b[0m\n", + "\u001b[34mWARNING:py.warnings:/usr/local/lib/python3.8/dist-packages/pandas/core/indexing.py:1738: SettingWithCopyWarning: \u001b[0m\n", + "\u001b[34mA value is trying to be set on a copy of a slice from a DataFrame.\u001b[0m\n", + "\u001b[34mTry using .loc[row_indexer,col_indexer] = value instead\n", + "\u001b[0m\n", + "\u001b[34mSee the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " self._setitem_single_column(loc, value[:, i].tolist(), pi)\n", + "\u001b[0m\n", + "\u001b[34mINFO:root:Normalizing\u001b[0m\n", + "\u001b[34mINFO:root:shape train (393, 7) val (99, 7)\u001b[0m\n", + "\u001b[34mINFO:root:Saving\u001b[0m\n", + "CPU times: user 1.02 s, sys: 104 ms, total: 1.13 s\n", + "Wall time: 9min 14s\n" + ] + } + ], + "source": [ + "%%time\n", + "# Runs the processor to access the ECR image and process the training data\n", + "processor.run(inputs=inputs,\n", + " outputs= outputs,\n", + " arguments=[\"--step\", \"train\"] \n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "a0b0636e", + "metadata": {}, + "source": [ + "## Processor - Inference" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "4e1df020", + "metadata": {}, + "outputs": [], + "source": [ + "# Receives a raw data in S3, the preprocessing and great expectation objects created in the training\n", + "inputs=[\n", + " ProcessingInput(source=paths['test_raw'],\n", + " destination='/opt/ml/processing/input/raw_data', \n", + " input_name='raw_data'),\n", + " ProcessingInput(source=paths['preprocessing'], \n", + " destination='/opt/ml/processing/input/preprocessing', \n", + " input_name='preprocessing'),\n", + " ProcessingInput(source=paths['expectations'], \n", + " destination='/opt/ml/processing/input/expectations', \n", + " input_name='expectations')\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "4fa3439a", + "metadata": {}, + "outputs": [], + "source": [ + "# Returns the processed inference data and validations, and saves them in S3\n", + "outputs = [\n", + " ProcessingOutput(\n", + " source=\"/opt/ml/processing/output/processed/inference\",\n", + " destination=paths['inference_processed'],\n", + " output_name=\"inference_data\",\n", + " ),\n", + " ProcessingOutput(\n", + " source=\"/opt/ml/processing/output/validations\",\n", + " destination=paths['validations'],\n", + " output_name=\"validations\",\n", + " )\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "c399b969", + "metadata": {}, + "outputs": [], + "source": [ + "# Creates the processor to access the ECR image\n", + "processor = Processor(image_uri=image_uri,\n", + " role=role,\n", + " instance_count=1,\n", + " instance_type=instance_type_inference)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "8cb61e97", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Job Name: hermione-processor-2021-05-25-21-13-13-987\n", + "Inputs: [{'InputName': 'raw_data', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://hermione-sagemaker/TEST_RAW', 'LocalPath': '/opt/ml/processing/input/raw_data', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, 
{'InputName': 'preprocessing', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://hermione-sagemaker/PREPROCESSING/PREPROCESSING', 'LocalPath': '/opt/ml/processing/input/preprocessing', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'expectations', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://hermione-sagemaker/PREPROCESSING/EXPECTATIONS', 'LocalPath': '/opt/ml/processing/input/expectations', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]\n", + "Outputs: [{'OutputName': 'inference_data', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://hermione-sagemaker/PREPROCESSING/INFERENCE_PROCESSED', 'LocalPath': '/opt/ml/processing/output/processed/inference', 'S3UploadMode': 'EndOfJob'}}, {'OutputName': 'validations', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://hermione-sagemaker/PREPROCESSING/VALIDATIONS', 'LocalPath': '/opt/ml/processing/output/validations', 'S3UploadMode': 'EndOfJob'}}]\n", + "............................................................\n", + "\u001b[34mINFO:root:Starting the preprocessing\u001b[0m\n", + "\u001b[34mINFO:root:step_train: False\u001b[0m\n", + "\u001b[34mINFO:root:Reading the inputs\u001b[0m\n", + "\u001b[34mINFO:root:Reading file: /opt/ml/processing/input/raw_data/raw_test.csv\u001b[0m\n", + "\u001b[34mINFO:root:Data Quality\u001b[0m\n", + "\u001b[34mINFO:root:Preprocessing\u001b[0m\n", + "\u001b[34mINFO:root:Cleaning data\u001b[0m\n", + "\u001b[34mINFO:root:One hot encoding\u001b[0m\n", + "\u001b[34mINFO:root:Normalizing\u001b[0m\n", + "\u001b[34mINFO:root:shape (222, 7)\u001b[0m\n", + "\u001b[34mINFO:root:Saving\u001b[0m\n", + "CPU times: user 1.19 s, sys: 38.4 ms, total: 1.23 s\n", + "Wall time: 10min 14s\n" + ] + } + ], + "source": [ + "%%time\n", + "# Runs the processor to access the ECR image and process the inference data\n", + "processor.run(inputs=inputs,\n", + " outputs= outputs,\n", + " arguments=[\"--step\", \"test\"] \n", + " )" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "conda_python3", + "language": "python", + "name": "conda_python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/.ipynb_checkpoints/Sagemaker_StepFunctions_Inference-checkpoint.ipynb b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/.ipynb_checkpoints/Sagemaker_StepFunctions_Inference-checkpoint.ipynb new file mode 100644 index 0000000..1c9af76 --- /dev/null +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/.ipynb_checkpoints/Sagemaker_StepFunctions_Inference-checkpoint.ipynb @@ -0,0 +1,737 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Build machine learning workflow to predict new data with Amazon SageMaker and AWS Step Functions" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This script creates a Step Function state machine to preprocess the inference data and predict with the images in ECR." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Import modules" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import uuid\n", + "import boto3\n", + "import sagemaker\n", + "from sagemaker.amazon.amazon_estimator import get_image_uri\n", + "from sagemaker.s3 import S3Uploader\n", + "from sagemaker import get_execution_role\n", + "from sagemaker.sklearn.processing import SKLearnProcessor\n", + "from sagemaker.processing import Processor, ProcessingInput, ProcessingOutput\n", + "import stepfunctions\n", + "from stepfunctions.steps import (\n", + " Chain,\n", + " ProcessingStep,\n", + " TransformStep\n", + ")\n", + "from stepfunctions.inputs import ExecutionInput\n", + "from stepfunctions.workflow import Workflow" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Modify according to your configurations." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# Bucket name in S3\n", + "bucket = \"hermione-sagemaker\"" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# Set session\n", + "region_name=\"us-east-1\"\n", + "boto3.setup_default_session(region_name=region_name)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# Get user role\n", + "role = get_execution_role()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# Role to create and execute step functions\n", + "# paste the AmazonSageMaker-StepFunctionsWorkflowExecutionRole ARN\n", + "workflow_execution_role = \"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# SageMaker expects unique names for each job, model and endpoint.\n", + "# Otherwise, the execution will fail. 
The ExecutionInput creates\n", + "# dynamically names for each execution.\n", + "execution_input = ExecutionInput(\n", + " schema={\n", + " \"PreprocessingJobName\": str,\n", + " \"TransformJobName\": str \n", + " }\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "# Get AWS Account ID\n", + "account_number = boto3.client(\"sts\").get_caller_identity()[\"Account\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "# Processor image name previous uploaded in ECR\n", + "image_name_processor = \"hermione-processor\"" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "# Inference image name previous uploaded in ECR\n", + "image_name_inference = \"hermione-inference\"" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "# Input and output paths to execute train and inference\n", + "paths = {\n", + " 'expectations': f\"s3://{bucket}/PREPROCESSING/EXPECTATIONS\",\n", + " 'preprocessing': f\"s3://{bucket}/PREPROCESSING/PREPROCESSING\",\n", + " 'test_raw': f\"s3://{bucket}/TEST_RAW\",\n", + " 'inference_processed': f\"s3://{bucket}/PREPROCESSING/INFERENCE_PROCESSED\",\n", + " 'validations': f\"s3://{bucket}/PREPROCESSING/VALIDATIONS\",\n", + " 'model': f\"s3://{bucket}/PREPROCESSING/MODEL/Hermione-train-2021-05-26-12-41-29-505/output/model.tar.gz\",\n", + " 'output_path': f\"s3://{bucket}/PREPROCESSING/OUTPUT\"\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "# instance to run the code\n", + "instance_type_preprocessing=\"ml.t3.medium\"\n", + "instance_type_inference=\"ml.m5.large\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Preprocessing Step" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "# Processor image previous uploaded in ECR\n", + "image_uri_processor = f\"{account_number}.dkr.ecr.{region_name}.amazonaws.com/{image_name_processor}\"" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "# Creates the processor to access the ECR image\n", + "processor = Processor(image_uri=image_uri_processor,\n", + " role=role,\n", + " instance_count=1,\n", + " instance_type=instance_type_preprocessing)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "# Creates input and output objects for ProcessingStep\n", + "inputs=[\n", + " ProcessingInput(source=paths['test_raw'],\n", + " destination='/opt/ml/processing/input/raw_data', \n", + " input_name='raw_data'),\n", + " ProcessingInput(source=paths['preprocessing'], \n", + " destination='/opt/ml/processing/input/preprocessing', \n", + " input_name='preprocessing'),\n", + " ProcessingInput(source=paths['expectations'], \n", + " destination='/opt/ml/processing/input/expectations', \n", + " input_name='expectations')\n", + "]\n", + "outputs = [\n", + " ProcessingOutput(\n", + " source=\"/opt/ml/processing/output/processed/inference\",\n", + " destination=paths['inference_processed'],\n", + " output_name=\"inference_data\",\n", + " ),\n", + " ProcessingOutput(\n", + " source=\"/opt/ml/processing/output/validations\",\n", + " destination=paths['validations'],\n", + " output_name=\"validations\",\n", + " )\n", + "]" + ] + 
}, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "# Creates the ProcessingStep\n", + "processing_step = ProcessingStep(\n", + " \"SageMaker Preprocessing step\",\n", + " processor=processor,\n", + " job_name=execution_input[\"PreprocessingJobName\"],\n", + " inputs=inputs,\n", + " outputs=outputs,\n", + " container_arguments=[\"--step\", \"test\"]\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Inference Step" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "# Inference image previous uploaded in ECR\n", + "image_uri_inference = f\"{account_number}.dkr.ecr.{region_name}.amazonaws.com/{image_name_inference}\"" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "# Creates input and output objects for TransformStep\n", + "input_path = paths['inference_processed']\n", + "model_path = paths['model']\n", + "output_path = paths['output_path']" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "# Creates the model to access the ECR image\n", + "model = sagemaker.model.Model(\n", + " image_uri = image_uri_inference,\n", + " model_data=model_path,\n", + " role=role)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "# Creates a transformer object from the trained model\n", + "transformer = model.transformer(\n", + " instance_count=1,\n", + " instance_type=instance_type_inference, \n", + " output_path=output_path,\n", + " accept = 'text/csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "# Creates the TransformStep\n", + "transform_step = TransformStep(\n", + " \"Inference Step\",\n", + " transformer=transformer,\n", + " job_name=execution_input[\"TransformJobName\"],\n", + " data=input_path,\n", + " content_type='text/csv',\n", + " wait_for_completion=True,\n", + " model_name=model.name\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create Workflow and Execute" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "# Creates Fail state to mark the workflow failed in case any of the steps fail.\n", + "failed_state_sagemaker_processing_failure = stepfunctions.steps.states.Fail(\n", + " \"ML Workflow failed\", cause=\"SageMakerProcessingJobFailed\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "# Adds the Error handling in the workflow\n", + "catch_state_processing = stepfunctions.steps.states.Catch(\n", + " error_equals=[\"States.TaskFailed\"],\n", + " next_step=failed_state_sagemaker_processing_failure,\n", + ")\n", + "\n", + "processing_step.add_catch(catch_state_processing)\n", + "transform_step.add_catch(catch_state_processing)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Creates workflow with Pre-Processing Job and Transform Job\n", + "workflow_graph = Chain([processing_step, transform_step])\n", + "branching_workflow = Workflow(\n", + " name=\"SFN_Hermione_Inference\",\n", + " definition=workflow_graph,\n", + " role=workflow_execution_role,\n", + ")\n", + "branching_workflow.create()" + ] + }, + { + "cell_type": "code", + "execution_count": 24, 
+ "metadata": {}, + "outputs": [], + "source": [ + "# Generates unique names for Pre-Processing Job and Training Job\n", + "# Each job requires a unique name\n", + "preprocessing_job_name = \"Hermione-Preprocessing-{}\".format(\n", + " uuid.uuid1().hex\n", + ") \n", + "inference_job_name = \"Hermione-Inference-{}\".format(\n", + " uuid.uuid1().hex\n", + ") " + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + " \n", + " \n", + "
\n", + "
    \n", + "
  • \n", + "
    \n", + " Success\n", + "
  • \n", + "
  • \n", + "
    \n", + " Failed\n", + "
  • \n", + "
  • \n", + "
    \n", + " Cancelled\n", + "
  • \n", + "
  • \n", + "
    \n", + " In Progress\n", + "
  • \n", + "
  • \n", + "
    \n", + " Caught Error\n", + "
  • \n", + "
\n", + "
\n", + "\n", + " \n", + " Inspect in AWS Step Functions \n", + "
\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Executes the workflow\n", + "execution = branching_workflow.execute(\n", + " inputs={\n", + " \"PreprocessingJobName\": preprocessing_job_name,\n", + " \"TransformJobName\": inference_job_name\n", + " }\n", + ")\n", + "execution_output = execution.get_output(wait=False)\n", + "execution.render_progress()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Results" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SurvivedAgePclass_1Pclass_2Pclass_3Sex_1Sex_2predict
01.00.0072880.00.01.01.00.01.0
10.00.3717010.01.00.00.01.00.0
20.00.7612470.01.00.00.01.00.0
30.00.3340040.00.01.00.01.00.0
40.00.5727571.00.00.00.01.00.0
...........................
2170.00.2083440.00.01.00.01.00.0
2180.00.2334760.00.01.00.01.00.0
2190.00.0198540.00.01.01.00.01.0
2201.00.2209101.00.00.01.00.01.0
2211.00.6481530.01.00.01.00.01.0
\n", + "

222 rows × 8 columns

\n", + "
" + ], + "text/plain": [ + " Survived Age Pclass_1 Pclass_2 Pclass_3 Sex_1 Sex_2 predict\n", + "0 1.0 0.007288 0.0 0.0 1.0 1.0 0.0 1.0\n", + "1 0.0 0.371701 0.0 1.0 0.0 0.0 1.0 0.0\n", + "2 0.0 0.761247 0.0 1.0 0.0 0.0 1.0 0.0\n", + "3 0.0 0.334004 0.0 0.0 1.0 0.0 1.0 0.0\n", + "4 0.0 0.572757 1.0 0.0 0.0 0.0 1.0 0.0\n", + ".. ... ... ... ... ... ... ... ...\n", + "217 0.0 0.208344 0.0 0.0 1.0 0.0 1.0 0.0\n", + "218 0.0 0.233476 0.0 0.0 1.0 0.0 1.0 0.0\n", + "219 0.0 0.019854 0.0 0.0 1.0 1.0 0.0 1.0\n", + "220 1.0 0.220910 1.0 0.0 0.0 1.0 0.0 1.0\n", + "221 1.0 0.648153 0.0 1.0 0.0 1.0 0.0 1.0\n", + "\n", + "[222 rows x 8 columns]" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "pd.read_csv('s3://hermione-sagemaker/PREPROCESSING/OUTPUT/inference.csv.out')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "conda_python3", + "language": "python", + "name": "conda_python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.13" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/.ipynb_checkpoints/Sagemaker_StepFunctions_Train-checkpoint.ipynb b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/.ipynb_checkpoints/Sagemaker_StepFunctions_Train-checkpoint.ipynb new file mode 100644 index 0000000..a4c655a --- /dev/null +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/.ipynb_checkpoints/Sagemaker_StepFunctions_Train-checkpoint.ipynb @@ -0,0 +1,540 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Build machine learning workflow to train a model with Amazon SageMaker and AWS Step Functions" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This script creates a Step Function state machine to preprocess the training data and train a model with the images in ECR." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Import modules" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import uuid\n", + "import boto3\n", + "import sagemaker\n", + "from sagemaker import get_execution_role\n", + "from sagemaker.processing import Processor, ProcessingInput, ProcessingOutput\n", + "import stepfunctions\n", + "from stepfunctions.inputs import ExecutionInput\n", + "from stepfunctions.workflow import Workflow\n", + "from stepfunctions.steps import (\n", + " TrainingStep, \n", + " Chain,\n", + " ProcessingStep,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Modify according to your configurations." 
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Bucket name in S3\n",
+ "bucket = \"hermione-sagemaker\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Set session\n",
+ "region_name=\"us-east-1\"\n",
+ "boto3.setup_default_session(region_name=region_name)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Get user role\n",
+ "role = get_execution_role()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Role to create and execute step functions\n",
+ "# Paste the AmazonSageMaker-StepFunctionsWorkflowExecutionRole ARN\n",
+ "workflow_execution_role = \"\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# SageMaker expects unique names for each job, model and endpoint;\n",
+ "# otherwise, the execution will fail. The ExecutionInput creates\n",
+ "# dynamic names for each execution.\n",
+ "execution_input = ExecutionInput(\n",
+ " schema={\n",
+ " \"PreprocessingJobName\": str,\n",
+ " \"TrainingJobName\": str\n",
+ " }\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Get AWS Account ID\n",
+ "account_number = boto3.client(\"sts\").get_caller_identity()[\"Account\"]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Processor image name previously uploaded to ECR\n",
+ "image_name_processor = \"hermione-processor\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Training image name previously uploaded to ECR\n",
+ "image_name_train = \"hermione-train\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Input and output paths for the execution\n",
+ "paths = {\n",
+ " 'train_raw': f\"s3://{bucket}/TRAIN_RAW\",\n",
+ " 'expectations': f\"s3://{bucket}/PREPROCESSING/EXPECTATIONS\",\n",
+ " 'preprocessing': f\"s3://{bucket}/PREPROCESSING/PREPROCESSING\",\n",
+ " 'train_processed': f\"s3://{bucket}/PREPROCESSING/TRAIN_PROCESSED\",\n",
+ " 'val_processed': f\"s3://{bucket}/PREPROCESSING/VAL_PROCESSED\",\n",
+ " 'model': f\"s3://{bucket}/PREPROCESSING/MODEL\"\n",
+ "}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Instance types to run the code\n",
+ "instance_type_preprocessing=\"ml.t3.medium\"\n",
+ "instance_type_train=\"ml.m5.large\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Preprocessing Step"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Processor image previously uploaded to ECR\n",
+ "image_uri_processor = f\"{account_number}.dkr.ecr.{region_name}.amazonaws.com/{image_name_processor}\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Creates the processor to access the ECR image\n",
+ "processor = Processor(image_uri=image_uri_processor,\n",
+ " role=role,\n",
+ " instance_count=1,\n",
+ " instance_type=instance_type_preprocessing)"
+ ]
+ },
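+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Optional sanity check (a sketch, not part of the original flow): confirm the processor image actually exists in ECR before wiring it into the workflow. `describe_images` raises an error if the repository is missing:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sketch: verify the processor image was pushed to ECR\n",
+ "ecr = boto3.client(\"ecr\")\n",
+ "images = ecr.describe_images(repositoryName=image_name_processor)\n",
+ "print(len(images[\"imageDetails\"]), \"image version(s) found\")"
+ ]
+ },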
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Creates input and output objects for ProcessingStep\n",
+ "inputs=[\n",
+ " ProcessingInput(source=paths['train_raw'], \n",
+ " destination='/opt/ml/processing/input/raw_data', \n",
+ " input_name=\"raw_data\")\n",
+ "]\n",
+ "outputs = [\n",
+ " ProcessingOutput(\n",
+ " source=\"/opt/ml/processing/output/expectations\",\n",
+ " destination=paths['expectations'],\n",
+ " output_name=\"expectations\",\n",
+ " ),\n",
+ " ProcessingOutput(\n",
+ " source=\"/opt/ml/processing/output/preprocessing\",\n",
+ " destination=paths['preprocessing'],\n",
+ " output_name=\"preprocessing\",\n",
+ " ),\n",
+ " ProcessingOutput(\n",
+ " source=\"/opt/ml/processing/output/processed/train\",\n",
+ " destination=paths['train_processed'],\n",
+ " output_name=\"train_data\",\n",
+ " ),\n",
+ " ProcessingOutput(\n",
+ " source=\"/opt/ml/processing/output/processed/val\",\n",
+ " destination=paths['val_processed'],\n",
+ " output_name=\"val_data\",\n",
+ " )\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Creates the ProcessingStep\n",
+ "processing_step = ProcessingStep(\n",
+ " \"Preprocessing step\",\n",
+ " processor=processor,\n",
+ " job_name=execution_input[\"PreprocessingJobName\"],\n",
+ " inputs=inputs,\n",
+ " outputs=outputs,\n",
+ " container_arguments=[\"--step\", \"train\"]\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## TrainingStep"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Training image previously uploaded to ECR\n",
+ "image_uri_train = f\"{account_number}.dkr.ecr.{region_name}.amazonaws.com/{image_name_train}\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Creates input and output objects for TrainingStep\n",
+ "train_config = sagemaker.inputs.TrainingInput(\n",
+ " paths['train_processed'],\n",
+ " content_type='text/csv',\n",
+ ")\n",
+ "val_config = sagemaker.inputs.TrainingInput(\n",
+ " paths['val_processed'],\n",
+ " content_type='text/csv'\n",
+ ")\n",
+ "output_path = paths['model']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Creates the estimator to access the ECR image\n",
+ "est = sagemaker.estimator.Estimator(\n",
+ " image_uri_train,\n",
+ " role, \n",
+ " instance_count=1, \n",
+ " instance_type=instance_type_train,\n",
+ " volume_size = 30,\n",
+ " output_path = output_path,\n",
+ " base_job_name = \"Hermione-Train\",\n",
+ " use_spot_instances=True, # Use Spot instances\n",
+ " max_run = 24*60*60,\n",
+ " max_wait = 24*60*60 # Timeout in seconds. Required if use_spot_instances == True\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Creates the TrainingStep\n",
+ "training_step = TrainingStep(\n",
+ " 'TrainStep',\n",
+ " estimator=est,\n",
+ " data={\n",
+ " 'train': train_config,\n",
+ " 'validation': val_config\n",
+ " }, \n",
+ " job_name=execution_input[\"TrainingJobName\"] \n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Create Workflow and Execute"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Creates a Fail state to mark the workflow as failed if any of the steps fail\n",
+ "failed_state_sagemaker_processing_failure = stepfunctions.steps.states.Fail(\n",
+ " \"ML Workflow failed\", cause=\"SageMakerProcessingJobFailed\"\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Adds error handling to the workflow\n",
+ "catch_state_processing = stepfunctions.steps.states.Catch(\n",
+ " error_equals=[\"States.TaskFailed\"],\n",
+ " next_step=failed_state_sagemaker_processing_failure,\n",
+ ")\n",
+ "\n",
+ "processing_step.add_catch(catch_state_processing)\n",
+ "training_step.add_catch(catch_state_processing)"
+ ]
+ },
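+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Besides the Catch above, transient task failures can be retried before the workflow is marked as failed. A minimal sketch with the SDK's `Retry` field (the interval and attempt values are arbitrary assumptions):"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sketch: retry transient failures before falling through to the Catch\n",
+ "retry = stepfunctions.steps.states.Retry(\n",
+ " error_equals=[\"States.TaskFailed\"],\n",
+ " interval_seconds=30,\n",
+ " max_attempts=2,\n",
+ " backoff_rate=2.0\n",
+ ")\n",
+ "processing_step.add_retry(retry)\n",
+ "training_step.add_retry(retry)"
+ ]
+ },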
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Creates the workflow with the Pre-Processing Job and Training Job\n",
+ "workflow_graph = Chain([processing_step, training_step])\n",
+ "branching_workflow = Workflow(\n",
+ " name=\"SFN_Hermione_Train\",\n",
+ " definition=workflow_graph,\n",
+ " role=workflow_execution_role,\n",
+ ")\n",
+ "branching_workflow.create()"
+ ]
+ },
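+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "A quick way to review what was just built: the generated Amazon States Language definition can be printed before executing (a sketch):"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sketch: inspect the state machine definition as ASL JSON\n",
+ "print(branching_workflow.definition.to_json(pretty=True))"
+ ]
+ },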
\n", + " \n", + " \n", + "
\n", + "
    \n", + "
  • \n", + "
    \n", + " Success\n", + "
  • \n", + "
  • \n", + "
    \n", + " Failed\n", + "
  • \n", + "
  • \n", + "
    \n", + " Cancelled\n", + "
  • \n", + "
  • \n", + "
    \n", + " In Progress\n", + "
  • \n", + "
  • \n", + "
    \n", + " Caught Error\n", + "
  • \n", + "
\n", + "
\n", + "\n", + " \n", + " Inspect in AWS Step Functions \n", + "
\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Executes the workflow\n", + "execution = branching_workflow.execute(\n", + " inputs={\n", + " \"PreprocessingJobName\": preprocessing_job_name,\n", + " \"TrainingJobName\": training_job_name\n", + " }\n", + ")\n", + "execution_output = execution.get_output(wait=False)\n", + "execution.render_progress()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "conda_python3", + "language": "python", + "name": "conda_python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.13" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/.ipynb_checkpoints/Sagemaker_Train-checkpoint.ipynb b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/.ipynb_checkpoints/Sagemaker_Train-checkpoint.ipynb new file mode 100644 index 0000000..b0a796f --- /dev/null +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/.ipynb_checkpoints/Sagemaker_Train-checkpoint.ipynb @@ -0,0 +1,393 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "577c4f6b", + "metadata": {}, + "source": [ + "# Sagemaker Train" + ] + }, + { + "cell_type": "markdown", + "id": "501ef5b6", + "metadata": {}, + "source": [ + "This script creates and trains the model with the uploaded image in ECR." + ] + }, + { + "cell_type": "markdown", + "id": "e66b3975", + "metadata": {}, + "source": [ + "## Import modules" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "d658fb44", + "metadata": {}, + "outputs": [], + "source": [ + "import time\n", + "import boto3\n", + "import sagemaker\n", + "from sagemaker import get_execution_role" + ] + }, + { + "cell_type": "markdown", + "id": "64036230", + "metadata": {}, + "source": [ + "## Setup" + ] + }, + { + "cell_type": "markdown", + "id": "28411012", + "metadata": {}, + "source": [ + "Modify according to your configurations." 
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "id": "4ad41d36",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Creates the estimator to access the ECR image\n",
+ "est = sagemaker.estimator.Estimator(\n",
+ " image_uri,\n",
+ " role, \n",
+ " instance_count=1, \n",
+ " 
instance_type=instance_type,\n", + " volume_size = 30,\n", + " output_path = output_path,\n", + " base_job_name = \"Hermione-train\",\n", + " use_spot_instances=True,\n", + " max_run = 24*60*60,\n", + " max_wait = 24*60*60, # timeout in seconds. Required if use_spot_instances == True\n", + " metric_definitions=metrics\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "62c1894f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2021-05-26 12:41:29 Starting - Starting the training job...\n", + "2021-05-26 12:41:52 Starting - Launching requested ML instancesProfilerReport-1622032889: InProgress\n", + "......\n", + "2021-05-26 12:42:52 Starting - Preparing the instances for training......\n", + "2021-05-26 12:43:52 Downloading - Downloading input data\n", + "2021-05-26 12:43:52 Training - Downloading the training image.....\u001b[34m2021-05-26 09:44:41,407 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed)\u001b[0m\n", + "\n", + "2021-05-26 12:45:00 Uploading - Uploading generated training model\n", + "2021-05-26 12:45:00 Completed - Training job completed\n", + "\u001b[34m2021-05-26 09:44:47,642 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed)\u001b[0m\n", + "\u001b[34m2021-05-26 09:44:47,653 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed)\u001b[0m\n", + "\u001b[34m2021-05-26 09:44:47,663 sagemaker-training-toolkit INFO Invoking user script\n", + "\u001b[0m\n", + "\u001b[34mTraining Env:\n", + "\u001b[0m\n", + "\u001b[34m{\n", + " \"additional_framework_parameters\": {},\n", + " \"channel_input_dirs\": {\n", + " \"validation\": \"/opt/ml/input/data/validation\",\n", + " \"train\": \"/opt/ml/input/data/train\"\n", + " },\n", + " \"current_host\": \"algo-1\",\n", + " \"framework_module\": null,\n", + " \"hosts\": [\n", + " \"algo-1\"\n", + " ],\n", + " \"hyperparameters\": {},\n", + " \"input_config_dir\": \"/opt/ml/input/config\",\n", + " \"input_data_config\": {\n", + " \"validation\": {\n", + " \"ContentType\": \"text/csv\",\n", + " \"TrainingInputMode\": \"File\",\n", + " \"S3DistributionType\": \"FullyReplicated\",\n", + " \"RecordWrapperType\": \"None\"\n", + " },\n", + " \"train\": {\n", + " \"ContentType\": \"text/csv\",\n", + " \"TrainingInputMode\": \"File\",\n", + " \"S3DistributionType\": \"FullyReplicated\",\n", + " \"RecordWrapperType\": \"None\"\n", + " }\n", + " },\n", + " \"input_dir\": \"/opt/ml/input\",\n", + " \"is_master\": true,\n", + " \"job_name\": \"Hermione-train-2021-05-26-12-41-29-505\",\n", + " \"log_level\": 20,\n", + " \"master_hostname\": \"algo-1\",\n", + " \"model_dir\": \"/opt/ml/model\",\n", + " \"module_dir\": \"/opt/ml/code\",\n", + " \"module_name\": \"train\",\n", + " \"network_interface_name\": \"eth0\",\n", + " \"num_cpus\": 2,\n", + " \"num_gpus\": 0,\n", + " \"output_data_dir\": \"/opt/ml/output/data\",\n", + " \"output_dir\": \"/opt/ml/output\",\n", + " \"output_intermediate_dir\": \"/opt/ml/output/intermediate\",\n", + " \"resource_config\": {\n", + " \"current_host\": \"algo-1\",\n", + " \"hosts\": [\n", + " \"algo-1\"\n", + " ],\n", + " \"network_interface_name\": \"eth0\"\n", + " },\n", + " \"user_entry_point\": \"train.py\"\u001b[0m\n", + "\u001b[34m}\n", + "\u001b[0m\n", + "\u001b[34mEnvironment variables:\n", + "\u001b[0m\n", + "\u001b[34mSM_HOSTS=[\"algo-1\"]\u001b[0m\n", + "\u001b[34mSM_NETWORK_INTERFACE_NAME=eth0\u001b[0m\n", + "\u001b[34mSM_HPS={}\u001b[0m\n", + 
"\u001b[34mSM_USER_ENTRY_POINT=train.py\u001b[0m\n", + "\u001b[34mSM_FRAMEWORK_PARAMS={}\u001b[0m\n", + "\u001b[34mSM_RESOURCE_CONFIG={\"current_host\":\"algo-1\",\"hosts\":[\"algo-1\"],\"network_interface_name\":\"eth0\"}\u001b[0m\n", + "\u001b[34mSM_INPUT_DATA_CONFIG={\"train\":{\"ContentType\":\"text/csv\",\"RecordWrapperType\":\"None\",\"S3DistributionType\":\"FullyReplicated\",\"TrainingInputMode\":\"File\"},\"validation\":{\"ContentType\":\"text/csv\",\"RecordWrapperType\":\"None\",\"S3DistributionType\":\"FullyReplicated\",\"TrainingInputMode\":\"File\"}}\u001b[0m\n", + "\u001b[34mSM_OUTPUT_DATA_DIR=/opt/ml/output/data\u001b[0m\n", + "\u001b[34mSM_CHANNELS=[\"train\",\"validation\"]\u001b[0m\n", + "\u001b[34mSM_CURRENT_HOST=algo-1\u001b[0m\n", + "\u001b[34mSM_MODULE_NAME=train\u001b[0m\n", + "\u001b[34mSM_LOG_LEVEL=20\u001b[0m\n", + "\u001b[34mSM_FRAMEWORK_MODULE=\u001b[0m\n", + "\u001b[34mSM_INPUT_DIR=/opt/ml/input\u001b[0m\n", + "\u001b[34mSM_INPUT_CONFIG_DIR=/opt/ml/input/config\u001b[0m\n", + "\u001b[34mSM_OUTPUT_DIR=/opt/ml/output\u001b[0m\n", + "\u001b[34mSM_NUM_CPUS=2\u001b[0m\n", + "\u001b[34mSM_NUM_GPUS=0\u001b[0m\n", + "\u001b[34mSM_MODEL_DIR=/opt/ml/model\u001b[0m\n", + "\u001b[34mSM_MODULE_DIR=/opt/ml/code\u001b[0m\n", + "\u001b[34mSM_TRAINING_ENV={\"additional_framework_parameters\":{},\"channel_input_dirs\":{\"train\":\"/opt/ml/input/data/train\",\"validation\":\"/opt/ml/input/data/validation\"},\"current_host\":\"algo-1\",\"framework_module\":null,\"hosts\":[\"algo-1\"],\"hyperparameters\":{},\"input_config_dir\":\"/opt/ml/input/config\",\"input_data_config\":{\"train\":{\"ContentType\":\"text/csv\",\"RecordWrapperType\":\"None\",\"S3DistributionType\":\"FullyReplicated\",\"TrainingInputMode\":\"File\"},\"validation\":{\"ContentType\":\"text/csv\",\"RecordWrapperType\":\"None\",\"S3DistributionType\":\"FullyReplicated\",\"TrainingInputMode\":\"File\"}},\"input_dir\":\"/opt/ml/input\",\"is_master\":true,\"job_name\":\"Hermione-train-2021-05-26-12-41-29-505\",\"log_level\":20,\"master_hostname\":\"algo-1\",\"model_dir\":\"/opt/ml/model\",\"module_dir\":\"/opt/ml/code\",\"module_name\":\"train\",\"network_interface_name\":\"eth0\",\"num_cpus\":2,\"num_gpus\":0,\"output_data_dir\":\"/opt/ml/output/data\",\"output_dir\":\"/opt/ml/output\",\"output_intermediate_dir\":\"/opt/ml/output/intermediate\",\"resource_config\":{\"current_host\":\"algo-1\",\"hosts\":[\"algo-1\"],\"network_interface_name\":\"eth0\"},\"user_entry_point\":\"train.py\"}\u001b[0m\n", + "\u001b[34mSM_USER_ARGS=[]\u001b[0m\n", + "\u001b[34mSM_OUTPUT_INTERMEDIATE_DIR=/opt/ml/output/intermediate\u001b[0m\n", + "\u001b[34mSM_CHANNEL_VALIDATION=/opt/ml/input/data/validation\u001b[0m\n", + "\u001b[34mSM_CHANNEL_TRAIN=/opt/ml/input/data/train\u001b[0m\n", + "\u001b[34mPYTHONPATH=/usr/local/bin:/opt/ml/code:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/lib/python38.zip:/usr/lib/python3.8:/usr/lib/python3.8/lib-dynload:/usr/local/lib/python3.8/dist-packages:/usr/lib/python3/dist-packages\n", + "\u001b[0m\n", + "\u001b[34mInvoking script with the following command:\n", + "\u001b[0m\n", + "\u001b[34m/usr/bin/python3 train.py\n", + "\n", + "\u001b[0m\n", + "\u001b[34m/usr/local/lib/python3.8/dist-packages/interpret_community/common/gpu_kmeans.py:30: UserWarning: cuML is required to use GPU explainers. Check https://rapids.ai/start.html for more information on how to install it.\n", + " warnings.warn(\u001b[0m\n", + "\u001b[34mcuML is required to use GPU explainers. 
Check https://rapids.ai/start.html for more information on how to install it.\u001b[0m\n", + "\u001b[34mINFO:root:Starting the training\u001b[0m\n", + "\u001b[34mINFO:root:Reading the inputs\u001b[0m\n", + "\u001b[34mINFO:root:Training the model\u001b[0m\n", + "\u001b[34mINFO:root:Saving\u001b[0m\n", + "\u001b[34mINFO:root:accuracy=0.7373737373737373; f1=0.6976744186046512; precision=0.6382978723404256; recall=0.7692307692307693;\u001b[0m\n", + "\u001b[34mINFO:root:Training complete.\u001b[0m\n", + "\u001b[34m2021-05-26 09:44:51,898 sagemaker-training-toolkit INFO Reporting training SUCCESS\u001b[0m\n", + "Training seconds: 85\n", + "Billable seconds: 36\n", + "Managed Spot Training savings: 57.6%\n", + "CPU times: user 450 ms, sys: 19.9 ms, total: 470 ms\n", + "Wall time: 3min 42s\n" + ] + } + ], + "source": [ + "%%time\n", + "# Train the model and validate\n", + "est.fit({'train':train_config, 'validation':val_config}, wait=True, logs=True)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "conda_python3", + "language": "python", + "name": "conda_python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/Sagemaker_Inference.ipynb b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/Sagemaker_Inference.ipynb new file mode 100644 index 0000000..aa21796 --- /dev/null +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/Sagemaker_Inference.ipynb @@ -0,0 +1,322 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "616d65aa", + "metadata": {}, + "source": [ + "# Sagemaker Inference" + ] + }, + { + "cell_type": "markdown", + "id": "aee7320a", + "metadata": {}, + "source": [ + "This script predicts new data with the uploaded image in ECR." + ] + }, + { + "cell_type": "markdown", + "id": "ea32612e", + "metadata": {}, + "source": [ + "## Import modules" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "3f188c9f", + "metadata": {}, + "outputs": [], + "source": [ + "import time\n", + "import boto3\n", + "import sagemaker\n", + "from sagemaker import get_execution_role" + ] + }, + { + "cell_type": "markdown", + "id": "430e1eb4", + "metadata": {}, + "source": [ + "## Setup" + ] + }, + { + "cell_type": "markdown", + "id": "ebe50488", + "metadata": {}, + "source": [ + "Modify according to your configurations." 
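+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0d1e2f30",
+ "metadata": {},
+ "source": [
+ "The `paths` cell below hardcodes the artifact of one specific training job. A hedged sketch of resolving the most recent `model.tar.gz` under the MODEL prefix instead (run it after the setup cells below have defined `bucket`):"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0d1e2f31",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sketch: pick the most recent model artifact instead of a hardcoded job name\n",
+ "# (assumes the setup cells below have already been run)\n",
+ "s3 = boto3.client(\"s3\")\n",
+ "objects = s3.list_objects_v2(Bucket=bucket, Prefix=\"PREPROCESSING/MODEL/\")[\"Contents\"]\n",
+ "artifacts = [o for o in objects if o[\"Key\"].endswith(\"model.tar.gz\")]\n",
+ "latest = max(artifacts, key=lambda o: o[\"LastModified\"])\n",
+ "print(f\"s3://{bucket}/{latest['Key']}\")"
+ ]
+ },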
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "8893b148",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Bucket name in S3\n",
+ "bucket = \"hermione-sagemaker\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "a6ba2451",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Set session\n",
+ "region_name=\"us-east-1\"\n",
+ "boto3.setup_default_session(region_name=region_name)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "797c5fa6",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Get user role\n",
+ "role = get_execution_role()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "d8148140",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Get AWS Account ID\n",
+ "account_number = boto3.client(\"sts\").get_caller_identity()[\"Account\"]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "1b1fba48",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Image previously uploaded to ECR\n",
+ "image_name = \"hermione-inference\"\n",
+ "image_uri = f\"{account_number}.dkr.ecr.{region_name}.amazonaws.com/{image_name}\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "f907e610",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Input and output paths to execute inference\n",
+ "paths = {\n",
+ " 'inference_processed': f\"s3://{bucket}/PREPROCESSING/INFERENCE_PROCESSED/inference.csv\",\n",
+ " 'model': f\"s3://{bucket}/PREPROCESSING/MODEL/Hermione-train-2021-05-26-12-41-29-505/output/model.tar.gz\",\n",
+ " 'output_path': f\"s3://{bucket}/PREPROCESSING/OUTPUT\"\n",
+ "}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "f5fdfdd8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Instance type to run the code\n",
+ "instance_type=\"ml.m5.large\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "55fe64d7",
+ "metadata": {},
+ "source": [
+ "## Inference"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "60b7dc56",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Receives the processed inference data in S3\n",
+ "input_path = paths['inference_processed']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "e3dc913c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Receives the model created during training in S3\n",
+ "model_path = paths['model']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "5b69f31c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Saves the predictions in S3\n",
+ "output_path = paths['output_path']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "29f7ce88",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Creates the model to access the ECR image\n",
+ "model = sagemaker.model.Model(\n",
+ " image_uri= image_uri,\n",
+ " model_data=model_path,\n",
+ " role=role)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "id": "aacdf22a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Creates a transformer object from the trained model\n",
+ "transformer = model.transformer(\n",
+ " instance_count=1,\n",
+ " instance_type=instance_type, \n",
+ " output_path=output_path,\n",
+ " accept = 'text/csv')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "id": "6452e276",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ ".........................\u001b[34m2021-05-26 12:57:00,312 [INFO ] main com.amazonaws.ml.mms.ModelServer - \u001b[0m\n",
+ 
"\u001b[34mMMS Home: /usr/local/lib/python3.8/dist-packages\u001b[0m\n", + "\u001b[34mCurrent directory: /\u001b[0m\n", + "\u001b[34mTemp directory: /tmp\u001b[0m\n", + "\u001b[34mNumber of GPUs: 0\u001b[0m\n", + "\u001b[34mNumber of CPUs: 2\u001b[0m\n", + "\u001b[34mMax heap size: 857 M\u001b[0m\n", + "\u001b[34mPython executable: /usr/bin/python3\u001b[0m\n", + "\u001b[34mConfig file: /etc/sagemaker-mms.properties\u001b[0m\n", + "\u001b[34mInference address: http://0.0.0.0:8080\u001b[0m\n", + "\u001b[34mManagement address: http://0.0.0.0:8080\u001b[0m\n", + "\u001b[34mModel Store: /.sagemaker/mms/models\u001b[0m\n", + "\u001b[34mInitial Models: ALL\u001b[0m\n", + "\u001b[34mLog dir: /logs\u001b[0m\n", + "\u001b[34mMetrics dir: /logs\u001b[0m\n", + "\u001b[34mNetty threads: 0\u001b[0m\n", + "\u001b[34mNetty client threads: 0\u001b[0m\n", + "\u001b[34mDefault workers per model: 2\u001b[0m\n", + "\u001b[34mBlacklist Regex: N/A\u001b[0m\n", + "\u001b[34mMaximum Response Size: 6553500\u001b[0m\n", + "\u001b[34mMaximum Request Size: 6553500\u001b[0m\n", + "\u001b[34mPreload model: false\u001b[0m\n", + "\u001b[34mPrefer direct buffer: false\u001b[0m\n", + "\u001b[34m2021-05-26 12:57:00,419 [WARN ] W-9000-model com.amazonaws.ml.mms.wlm.WorkerLifeCycle - attachIOStreams() threadName=W-9000-model\u001b[0m\n", + "\u001b[34m2021-05-26 12:57:00,506 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - model_service_worker started with args: --sock-type unix --sock-name /tmp/.mms.sock.9000 --handler serving.handler --model-path /.sagemaker/mms/models/model --model-name model --preload-model false --tmp-dir /tmp\u001b[0m\n", + "\u001b[34m2021-05-26 12:57:00,508 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Listening on port: /tmp/.mms.sock.9000\u001b[0m\n", + "\u001b[34m2021-05-26 12:57:00,509 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - [PID] 23\u001b[0m\n", + "\u001b[34m2021-05-26 12:57:00,509 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - MMS worker started.\u001b[0m\n", + "\u001b[34m2021-05-26 12:57:00,509 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Python runtime: 3.8.5\u001b[0m\n", + "\u001b[34m2021-05-26 12:57:00,512 [INFO ] main com.amazonaws.ml.mms.wlm.ModelManager - Model model loaded.\u001b[0m\n", + "\u001b[34m2021-05-26 12:57:00,517 [INFO ] main com.amazonaws.ml.mms.ModelServer - Initialize Inference server with: EpollServerSocketChannel.\u001b[0m\n", + "\u001b[34m2021-05-26 12:57:00,536 [INFO ] W-9000-model com.amazonaws.ml.mms.wlm.WorkerThread - Connecting to: /tmp/.mms.sock.9000\u001b[0m\n", + "\u001b[34m2021-05-26 12:57:00,536 [INFO ] W-9000-model com.amazonaws.ml.mms.wlm.WorkerThread - Connecting to: /tmp/.mms.sock.9000\u001b[0m\n", + "\u001b[34m2021-05-26 12:57:00,607 [INFO ] main com.amazonaws.ml.mms.ModelServer - Inference API bind to: http://0.0.0.0:8080\u001b[0m\n", + "\u001b[34m2021-05-26 12:57:00,613 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Connection accepted: /tmp/.mms.sock.9000.\u001b[0m\n", + "\u001b[34m2021-05-26 12:57:00,614 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Connection accepted: /tmp/.mms.sock.9000.\u001b[0m\n", + "\u001b[34mModel server started.\u001b[0m\n", + "\u001b[34m2021-05-26 12:57:00,636 [WARN ] pool-2-thread-1 com.amazonaws.ml.mms.metrics.MetricCollector - worker pid is not available yet.\u001b[0m\n", + "\u001b[34m2021-05-26 12:57:02,508 [WARN ] W-9000-model-stderr 
com.amazonaws.ml.mms.wlm.WorkerLifeCycle - /usr/local/lib/python3.8/dist-packages/interpret_community/common/gpu_kmeans.py:30: UserWarning: cuML is required to use GPU explainers. Check https://rapids.ai/start.html for more information on how to install it.\u001b[0m\n", + "\u001b[34m2021-05-26 12:57:02,510 [WARN ] W-9000-model-stderr com.amazonaws.ml.mms.wlm.WorkerLifeCycle - warnings.warn(\u001b[0m\n", + "\u001b[34m2021-05-26 12:57:02,510 [WARN ] W-9000-model-stderr com.amazonaws.ml.mms.wlm.WorkerLifeCycle - /usr/local/lib/python3.8/dist-packages/interpret_community/common/gpu_kmeans.py:30: UserWarning: cuML is required to use GPU explainers. Check https://rapids.ai/start.html for more information on how to install it.\u001b[0m\n", + "\u001b[34m2021-05-26 12:57:02,510 [WARN ] W-9000-model-stderr com.amazonaws.ml.mms.wlm.WorkerLifeCycle - warnings.warn(\u001b[0m\n", + "\u001b[34m2021-05-26 12:57:03,375 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - generated new fontManager\u001b[0m\n", + "\u001b[34m2021-05-26 12:57:03,393 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - generated new fontManager\u001b[0m\n", + "\u001b[34m2021-05-26 12:57:03,635 [WARN ] W-9000-model-stderr com.amazonaws.ml.mms.wlm.WorkerLifeCycle - cuML is required to use GPU explainers. Check https://rapids.ai/start.html for more information on how to install it.\u001b[0m\n", + "\u001b[34m2021-05-26 12:57:03,658 [WARN ] W-9000-model-stderr com.amazonaws.ml.mms.wlm.WorkerLifeCycle - cuML is required to use GPU explainers. Check https://rapids.ai/start.html for more information on how to install it.\u001b[0m\n", + "\u001b[34m2021-05-26 12:57:03,690 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Loading the model\u001b[0m\n", + "\u001b[34m2021-05-26 12:57:03,715 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Loading the model\u001b[0m\n", + "\u001b[34m2021-05-26 12:57:03,741 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Model model loaded io_fd=0242a9fffefeff83-00000009-00000002-e6c9db643cbfeb7b-a47635f7\u001b[0m\n", + "\u001b[34m2021-05-26 12:57:03,750 [INFO ] W-9000-model com.amazonaws.ml.mms.wlm.WorkerThread - Backend response time: 3046\u001b[0m\n", + "\u001b[34m2021-05-26 12:57:03,752 [WARN ] W-9000-model com.amazonaws.ml.mms.wlm.WorkerLifeCycle - attachIOStreams() threadName=W-model-1\u001b[0m\n", + "\u001b[34m2021-05-26 12:57:03,768 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Model model loaded io_fd=0242a9fffefeff83-00000009-00000001-f549db643cbfeb7b-e2a66100\u001b[0m\n", + "\u001b[34m2021-05-26 12:57:03,768 [INFO ] W-9000-model com.amazonaws.ml.mms.wlm.WorkerThread - Backend response time: 3065\u001b[0m\n", + "\u001b[34m2021-05-26 12:57:03,769 [WARN ] W-9000-model com.amazonaws.ml.mms.wlm.WorkerLifeCycle - attachIOStreams() threadName=W-model-2\u001b[0m\n", + "\u001b[34m2021-05-26 12:57:09,272 [INFO ] pool-1-thread-4 ACCESS_LOG - /169.254.255.130:59054 \"GET /ping HTTP/1.1\" 200 11\u001b[0m\n", + "\u001b[35m2021-05-26 12:57:09,272 [INFO ] pool-1-thread-4 ACCESS_LOG - /169.254.255.130:59054 \"GET /ping HTTP/1.1\" 200 11\u001b[0m\n", + "\u001b[34m2021-05-26 12:57:09,353 [INFO ] epollEventLoopGroup-3-2 ACCESS_LOG - /169.254.255.130:59058 \"GET /execution-parameters HTTP/1.1\" 404 2\u001b[0m\n", + "\u001b[34m2021-05-26 12:57:09,462 [INFO ] W-model-1-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Predicting...\u001b[0m\n", + "\u001b[34m2021-05-26 12:57:09,486 [INFO ] 
W-model-1-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Prediction Complete\u001b[0m\n",
+ "\u001b[34m2021-05-26 12:57:09,491 [INFO ] W-model-1-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Saving\u001b[0m\n",
+ "\u001b[34m2021-05-26 12:57:09,494 [INFO ] W-9000-model com.amazonaws.ml.mms.wlm.WorkerThread - Backend response time: 37\u001b[0m\n",
+ "\u001b[34m2021-05-26 12:57:09,494 [INFO ] W-9000-model ACCESS_LOG - /169.254.255.130:59068 \"POST /invocations HTTP/1.1\" 200 42\u001b[0m\n",
+ "\u001b[35m2021-05-26 12:57:09,353 [INFO ] epollEventLoopGroup-3-2 ACCESS_LOG - /169.254.255.130:59058 \"GET /execution-parameters HTTP/1.1\" 404 2\u001b[0m\n",
+ "\u001b[35m2021-05-26 12:57:09,462 [INFO ] W-model-1-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Predicting...\u001b[0m\n",
+ "\u001b[35m2021-05-26 12:57:09,486 [INFO ] W-model-1-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Prediction Complete\u001b[0m\n",
+ "\u001b[35m2021-05-26 12:57:09,491 [INFO ] W-model-1-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Saving\u001b[0m\n",
+ "\u001b[35m2021-05-26 12:57:09,494 [INFO ] W-9000-model com.amazonaws.ml.mms.wlm.WorkerThread - Backend response time: 37\u001b[0m\n",
+ "\u001b[35m2021-05-26 12:57:09,494 [INFO ] W-9000-model ACCESS_LOG - /169.254.255.130:59068 \"POST /invocations HTTP/1.1\" 200 42\u001b[0m\n",
+ "\u001b[32m2021-05-26T12:57:09.364:[sagemaker logs]: MaxConcurrentTransforms=1, MaxPayloadInMB=6, BatchStrategy=MULTI_RECORD\u001b[0m\n",
+ "\n",
+ "CPU times: user 547 ms, sys: 59 ms, total: 606 ms\n",
+ "Wall time: 4min 43s\n"
+ ]
+ }
+ ],
+ "source": [
+ "%%time\n",
+ "# Predicts the data\n",
+ "transformer.transform(data=input_path, data_type='S3Prefix', content_type='text/csv', split_type='Line')"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "conda_python3",
+ "language": "python",
+ "name": "conda_python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.13"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/Sagemaker_Processor.ipynb b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/Sagemaker_Processor.ipynb
new file mode 100644
index 0000000..ad85e0f
--- /dev/null
+++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/Sagemaker_Processor.ipynb
@@ -0,0 +1,396 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "b5264128",
+ "metadata": {},
+ "source": [
+ "# Sagemaker Processor"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5bd7a5cd",
+ "metadata": {},
+ "source": [
+ "This script generates the train, validation and inference files with the processor previously uploaded to ECR."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0488ed05",
+ "metadata": {},
+ "source": [
+ "## Import modules"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "e7b20785",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import boto3\n",
+ "import time\n",
+ "from datetime import datetime\n",
+ "from sagemaker import get_execution_role\n",
+ "from sagemaker.processing import Processor, ProcessingInput, ProcessingOutput"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7f3fd305",
+ "metadata": {},
+ "source": [
+ "## Setup"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6528a20b",
+ "metadata": {},
+ "source": [
+ "Modify according to your configurations."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "d5cdd5d1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Bucket name in S3\n",
+ "bucket = \"hermione-sagemaker\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "5ec68bf7",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Set session\n",
+ "region_name=\"us-east-1\"\n",
+ "boto3.setup_default_session(region_name=region_name)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "4d011a47",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Get user role\n",
+ "role = get_execution_role()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "25f76666",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Get AWS Account ID\n",
+ "account_number = boto3.client(\"sts\").get_caller_identity()[\"Account\"]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "fafb5f18",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Image previously uploaded to ECR\n",
+ "image_name = \"hermione-processor\"\n",
+ "image_uri = f\"{account_number}.dkr.ecr.{region_name}.amazonaws.com/{image_name}\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "2ef594d3",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Input and output paths to execute train and inference\n",
+ "paths = {\n",
+ " 'train_raw': f\"s3://{bucket}/TRAIN_RAW\",\n",
+ " 'expectations': f\"s3://{bucket}/PREPROCESSING/EXPECTATIONS\",\n",
+ " 'preprocessing': f\"s3://{bucket}/PREPROCESSING/PREPROCESSING\",\n",
+ " 'train_processed': f\"s3://{bucket}/PREPROCESSING/TRAIN_PROCESSED\",\n",
+ " 'val_processed': f\"s3://{bucket}/PREPROCESSING/VAL_PROCESSED\",\n",
+ " 'test_raw': f\"s3://{bucket}/TEST_RAW\",\n",
+ " 'inference_processed': f\"s3://{bucket}/PREPROCESSING/INFERENCE_PROCESSED\",\n",
+ " 'validations': f\"s3://{bucket}/PREPROCESSING/VALIDATIONS\"\n",
+ "}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "2b625b74",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Instance types to run the code\n",
+ "instance_type_train=\"ml.t3.medium\"\n",
+ "instance_type_inference=\"ml.t3.medium\""
+ ]
+ },
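+ {
+ "cell_type": "markdown",
+ "id": "7a8b9c10",
+ "metadata": {},
+ "source": [
+ "The processor expects the raw CSV to already be under `paths['train_raw']`. A hypothetical sketch of uploading a local file there first (the local path is an assumption):"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7a8b9c11",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sketch: upload a local raw file to the S3 input prefix\n",
+ "# (the local path below is an assumption)\n",
+ "from sagemaker.s3 import S3Uploader\n",
+ "S3Uploader.upload(\"../../data/raw/train.csv\", paths['train_raw'])"
+ ]
+ },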
+ {
+ "cell_type": "markdown",
+ "id": "6e8e92ba",
+ "metadata": {},
+ "source": [
+ "## Processor - Train"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "e1b41ed1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Receives raw data in S3\n",
+ "inputs=[\n",
+ " ProcessingInput(source=paths['train_raw'], \n",
+ " destination='/opt/ml/processing/input/raw_data', \n",
+ " input_name=\"raw_data\")\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "cd67446b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Returns the Great Expectations object, preprocessing object, \n",
+ "# processed training data and processed validation 
data, and saves them in S3\n", + "outputs = [\n", + " ProcessingOutput(\n", + " source=\"/opt/ml/processing/output/expectations\",\n", + " destination=paths['expectations'],\n", + " output_name=\"expectations\",\n", + " ),\n", + " ProcessingOutput(\n", + " source=\"/opt/ml/processing/output/preprocessing\",\n", + " destination=paths['preprocessing'],\n", + " output_name=\"preprocessing\",\n", + " ),\n", + " ProcessingOutput(\n", + " source=\"/opt/ml/processing/output/processed/train\",\n", + " destination=paths['train_processed'],\n", + " output_name=\"train_data\",\n", + " ),\n", + " ProcessingOutput(\n", + " source=\"/opt/ml/processing/output/processed/val\",\n", + " destination=paths['val_processed'],\n", + " output_name=\"val_data\",\n", + " )\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "902f8e4f", + "metadata": {}, + "outputs": [], + "source": [ + "# Creates the processor to access the ECR image\n", + "processor = Processor(image_uri=image_uri,\n", + " role=role,\n", + " instance_count=1,\n", + " instance_type=instance_type_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "fd8a28a1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Job Name: hermione-processor-2021-05-25-21-03-59-873\n", + "Inputs: [{'InputName': 'raw_data', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://hermione-sagemaker/TRAIN_RAW', 'LocalPath': '/opt/ml/processing/input/raw_data', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]\n", + "Outputs: [{'OutputName': 'expectations', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://hermione-sagemaker/PREPROCESSING/EXPECTATIONS', 'LocalPath': '/opt/ml/processing/output/expectations', 'S3UploadMode': 'EndOfJob'}}, {'OutputName': 'preprocessing', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://hermione-sagemaker/PREPROCESSING/PREPROCESSING', 'LocalPath': '/opt/ml/processing/output/preprocessing', 'S3UploadMode': 'EndOfJob'}}, {'OutputName': 'train_data', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://hermione-sagemaker/PREPROCESSING/TRAIN_PROCESSED', 'LocalPath': '/opt/ml/processing/output/processed/train', 'S3UploadMode': 'EndOfJob'}}, {'OutputName': 'val_data', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://hermione-sagemaker/PREPROCESSING/VAL_PROCESSED', 'LocalPath': '/opt/ml/processing/output/processed/val', 'S3UploadMode': 'EndOfJob'}}]\n", + "......................................................\n", + "\u001b[34mINFO:root:Starting the preprocessing\u001b[0m\n", + "\u001b[34mINFO:root:step_train: True\u001b[0m\n", + "\u001b[34mINFO:root:Reading the inputs\u001b[0m\n", + "\u001b[34mINFO:root:Reading file: /opt/ml/processing/input/raw_data/raw_train.csv\u001b[0m\n", + "\u001b[34mINFO:root:Data Quality\u001b[0m\n", + "\u001b[34mINFO:great_expectations.data_asset.data_asset:#01110 expectation(s) included in expectation_suite. Omitting 1 expectation(s) that failed when last run; set discard_failed_expectations=False to include them. result_format settings filtered.\u001b[0m\n", + "\u001b[34mINFO:root:Preprocessing\u001b[0m\n", + "\u001b[34mINFO:root:Cleaning data\u001b[0m\n", + "\u001b[34mINFO:root:One hot encoding\u001b[0m\n", + "\u001b[34mWARNING:py.warnings:/usr/local/lib/python3.8/dist-packages/category_encoders/utils.py:21: FutureWarning: is_categorical is deprecated and will be removed in a future version. 
Use is_categorical_dtype instead\n",
+ " elif pd.api.types.is_categorical(cols):\n",
+ "\u001b[0m\n",
+ "\u001b[34mINFO:root:Divide train and test\u001b[0m\n",
+ "\u001b[34mINFO:root:Normalizing\u001b[0m\n",
+ "\u001b[34mWARNING:py.warnings:/usr/local/lib/python3.8/dist-packages/pandas/core/indexing.py:1738: SettingWithCopyWarning: \u001b[0m\n",
+ "\u001b[34mA value is trying to be set on a copy of a slice from a DataFrame.\u001b[0m\n",
+ "\u001b[34mTry using .loc[row_indexer,col_indexer] = value instead\n",
+ "\u001b[0m\n",
+ "\u001b[34mSee the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " self._setitem_single_column(loc, value[:, i].tolist(), pi)\n",
+ "\u001b[0m\n",
+ "\u001b[34mINFO:root:Normalizing\u001b[0m\n",
+ "\u001b[34mINFO:root:shape train (393, 7) val (99, 7)\u001b[0m\n",
+ "\u001b[34mINFO:root:Saving\u001b[0m\n",
+ "CPU times: user 1.02 s, sys: 104 ms, total: 1.13 s\n",
+ "Wall time: 9min 14s\n"
+ ]
+ }
+ ],
+ "source": [
+ "%%time\n",
+ "# Runs the processor to access the ECR image and process the training data\n",
+ "processor.run(inputs=inputs,\n",
+ " outputs= outputs,\n",
+ " arguments=[\"--step\", \"train\"] \n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a0b0636e",
+ "metadata": {},
+ "source": [
+ "## Processor - Inference"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "id": "4e1df020",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Receives raw data in S3 plus the preprocessing and Great Expectations objects created during training\n",
+ "inputs=[\n",
+ " ProcessingInput(source=paths['test_raw'],\n",
+ " destination='/opt/ml/processing/input/raw_data', \n",
+ " input_name='raw_data'),\n",
+ " ProcessingInput(source=paths['preprocessing'], \n",
+ " destination='/opt/ml/processing/input/preprocessing', \n",
+ " input_name='preprocessing'),\n",
+ " ProcessingInput(source=paths['expectations'], \n",
+ " destination='/opt/ml/processing/input/expectations', \n",
+ " input_name='expectations')\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "id": "4fa3439a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Returns the processed inference data and validations, and saves them in S3\n",
+ "outputs = [\n",
+ " ProcessingOutput(\n",
+ " source=\"/opt/ml/processing/output/processed/inference\",\n",
+ " destination=paths['inference_processed'],\n",
+ " output_name=\"inference_data\",\n",
+ " ),\n",
+ " ProcessingOutput(\n",
+ " source=\"/opt/ml/processing/output/validations\",\n",
+ " destination=paths['validations'],\n",
+ " output_name=\"validations\",\n",
+ " )\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "id": "c399b969",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Creates the processor to access the ECR image\n",
+ "processor = Processor(image_uri=image_uri,\n",
+ " role=role,\n",
+ " instance_count=1,\n",
+ " instance_type=instance_type_inference)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "id": "8cb61e97",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "Job Name: hermione-processor-2021-05-25-21-13-13-987\n",
+ "Inputs: [{'InputName': 'raw_data', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://hermione-sagemaker/TEST_RAW', 'LocalPath': '/opt/ml/processing/input/raw_data', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, 
{'InputName': 'preprocessing', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://hermione-sagemaker/PREPROCESSING/PREPROCESSING', 'LocalPath': '/opt/ml/processing/input/preprocessing', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'expectations', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://hermione-sagemaker/PREPROCESSING/EXPECTATIONS', 'LocalPath': '/opt/ml/processing/input/expectations', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]\n", + "Outputs: [{'OutputName': 'inference_data', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://hermione-sagemaker/PREPROCESSING/INFERENCE_PROCESSED', 'LocalPath': '/opt/ml/processing/output/processed/inference', 'S3UploadMode': 'EndOfJob'}}, {'OutputName': 'validations', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://hermione-sagemaker/PREPROCESSING/VALIDATIONS', 'LocalPath': '/opt/ml/processing/output/validations', 'S3UploadMode': 'EndOfJob'}}]\n", + "............................................................\n", + "\u001b[34mINFO:root:Starting the preprocessing\u001b[0m\n", + "\u001b[34mINFO:root:step_train: False\u001b[0m\n", + "\u001b[34mINFO:root:Reading the inputs\u001b[0m\n", + "\u001b[34mINFO:root:Reading file: /opt/ml/processing/input/raw_data/raw_test.csv\u001b[0m\n", + "\u001b[34mINFO:root:Data Quality\u001b[0m\n", + "\u001b[34mINFO:root:Preprocessing\u001b[0m\n", + "\u001b[34mINFO:root:Cleaning data\u001b[0m\n", + "\u001b[34mINFO:root:One hot encoding\u001b[0m\n", + "\u001b[34mINFO:root:Normalizing\u001b[0m\n", + "\u001b[34mINFO:root:shape (222, 7)\u001b[0m\n", + "\u001b[34mINFO:root:Saving\u001b[0m\n", + "CPU times: user 1.19 s, sys: 38.4 ms, total: 1.23 s\n", + "Wall time: 10min 14s\n" + ] + } + ], + "source": [ + "%%time\n", + "# Runs the processor to access the ECR image and process the inference data\n", + "processor.run(inputs=inputs,\n", + " outputs= outputs,\n", + " arguments=[\"--step\", \"test\"] \n", + " )" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "conda_python3", + "language": "python", + "name": "conda_python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/Sagemaker_StepFunctions_Inference.ipynb b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/Sagemaker_StepFunctions_Inference.ipynb new file mode 100644 index 0000000..1c9af76 --- /dev/null +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/Sagemaker_StepFunctions_Inference.ipynb @@ -0,0 +1,737 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Build machine learning workflow to predict new data with Amazon SageMaker and AWS Step Functions" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This script creates a Step Function state machine to preprocess the inference data and predict with the images in ECR." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Import modules" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import uuid\n", + "import boto3\n", + "import sagemaker\n", + "from sagemaker.amazon.amazon_estimator import get_image_uri\n", + "from sagemaker.s3 import S3Uploader\n", + "from sagemaker import get_execution_role\n", + "from sagemaker.sklearn.processing import SKLearnProcessor\n", + "from sagemaker.processing import Processor, ProcessingInput, ProcessingOutput\n", + "import stepfunctions\n", + "from stepfunctions.steps import (\n", + " Chain,\n", + " ProcessingStep,\n", + " TransformStep\n", + ")\n", + "from stepfunctions.inputs import ExecutionInput\n", + "from stepfunctions.workflow import Workflow" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Modify according to your configurations." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# Bucket name in S3\n", + "bucket = \"hermione-sagemaker\"" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# Set session\n", + "region_name=\"us-east-1\"\n", + "boto3.setup_default_session(region_name=region_name)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# Get user role\n", + "role = get_execution_role()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# Role to create and execute step functions\n", + "# paste the AmazonSageMaker-StepFunctionsWorkflowExecutionRole ARN\n", + "workflow_execution_role = \"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# SageMaker expects unique names for each job, model and endpoint.\n", + "# Otherwise, the execution will fail. 
The ExecutionInput\n", + "# dynamically creates names for each execution.\n", + "execution_input = ExecutionInput(\n", + " schema={\n", + " \"PreprocessingJobName\": str,\n", + " \"TransformJobName\": str \n", + " }\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "# Get AWS Account ID\n", + "account_number = boto3.client(\"sts\").get_caller_identity()[\"Account\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "# Processor image name previously uploaded in ECR\n", + "image_name_processor = \"hermione-processor\"" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "# Inference image name previously uploaded in ECR\n", + "image_name_inference = \"hermione-inference\"" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "# Input and output paths used for training and inference\n", + "paths = {\n", + " 'expectations': f\"s3://{bucket}/PREPROCESSING/EXPECTATIONS\",\n", + " 'preprocessing': f\"s3://{bucket}/PREPROCESSING/PREPROCESSING\",\n", + " 'test_raw': f\"s3://{bucket}/TEST_RAW\",\n", + " 'inference_processed': f\"s3://{bucket}/PREPROCESSING/INFERENCE_PROCESSED\",\n", + " 'validations': f\"s3://{bucket}/PREPROCESSING/VALIDATIONS\",\n", + " 'model': f\"s3://{bucket}/PREPROCESSING/MODEL/Hermione-train-2021-05-26-12-41-29-505/output/model.tar.gz\",\n", + " 'output_path': f\"s3://{bucket}/PREPROCESSING/OUTPUT\"\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "# Instance types to run the code\n", + "instance_type_preprocessing=\"ml.t3.medium\"\n", + "instance_type_inference=\"ml.m5.large\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Preprocessing Step" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "# Processor image previously uploaded in ECR\n", + "image_uri_processor = f\"{account_number}.dkr.ecr.{region_name}.amazonaws.com/{image_name_processor}\"" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "# Creates the processor to access the ECR image\n", + "processor = Processor(image_uri=image_uri_processor,\n", + " role=role,\n", + " instance_count=1,\n", + " instance_type=instance_type_preprocessing)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "# Creates input and output objects for ProcessingStep\n", + "inputs=[\n", + " ProcessingInput(source=paths['test_raw'],\n", + " destination='/opt/ml/processing/input/raw_data', \n", + " input_name='raw_data'),\n", + " ProcessingInput(source=paths['preprocessing'], \n", + " destination='/opt/ml/processing/input/preprocessing', \n", + " input_name='preprocessing'),\n", + " ProcessingInput(source=paths['expectations'], \n", + " destination='/opt/ml/processing/input/expectations', \n", + " input_name='expectations')\n", + "]\n", + "outputs = [\n", + " ProcessingOutput(\n", + " source=\"/opt/ml/processing/output/processed/inference\",\n", + " destination=paths['inference_processed'],\n", + " output_name=\"inference_data\",\n", + " ),\n", + " ProcessingOutput(\n", + " source=\"/opt/ml/processing/output/validations\",\n", + " destination=paths['validations'],\n", + " output_name=\"validations\",\n", + " )\n", + "]" + ] +
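Inside the container, each ProcessingInput above is materialized as a local directory before the entrypoint runs, and anything written under a ProcessingOutput source is uploaded to its S3 destination when the job ends. A rough sketch of the container side of that contract follows; the actual preprocessor.py in this template may differ in detail, but the file and directory names are the ones that appear in the job logs:

import argparse
import pandas as pd

# The image is started with container_arguments, e.g. ["--step", "test"]
parser = argparse.ArgumentParser()
parser.add_argument("--step", type=str, default="train")
args = parser.parse_args()

# ProcessingInput destinations show up as local paths inside the container
raw = pd.read_csv("/opt/ml/processing/input/raw_data/raw_test.csv")

# Files written under a ProcessingOutput source are uploaded at end of job
raw.to_csv("/opt/ml/processing/output/processed/inference/inference.csv", index=False)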
}, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "# Creates the ProcessingStep\n", + "processing_step = ProcessingStep(\n", + " \"SageMaker Preprocessing step\",\n", + " processor=processor,\n", + " job_name=execution_input[\"PreprocessingJobName\"],\n", + " inputs=inputs,\n", + " outputs=outputs,\n", + " container_arguments=[\"--step\", \"test\"]\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Inference Step" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "# Inference image previous uploaded in ECR\n", + "image_uri_inference = f\"{account_number}.dkr.ecr.{region_name}.amazonaws.com/{image_name_inference}\"" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "# Creates input and output objects for TransformStep\n", + "input_path = paths['inference_processed']\n", + "model_path = paths['model']\n", + "output_path = paths['output_path']" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "# Creates the model to access the ECR image\n", + "model = sagemaker.model.Model(\n", + " image_uri = image_uri_inference,\n", + " model_data=model_path,\n", + " role=role)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "# Creates a transformer object from the trained model\n", + "transformer = model.transformer(\n", + " instance_count=1,\n", + " instance_type=instance_type_inference, \n", + " output_path=output_path,\n", + " accept = 'text/csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "# Creates the TransformStep\n", + "transform_step = TransformStep(\n", + " \"Inference Step\",\n", + " transformer=transformer,\n", + " job_name=execution_input[\"TransformJobName\"],\n", + " data=input_path,\n", + " content_type='text/csv',\n", + " wait_for_completion=True,\n", + " model_name=model.name\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create Workflow and Execute" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "# Creates Fail state to mark the workflow failed in case any of the steps fail.\n", + "failed_state_sagemaker_processing_failure = stepfunctions.steps.states.Fail(\n", + " \"ML Workflow failed\", cause=\"SageMakerProcessingJobFailed\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "# Adds the Error handling in the workflow\n", + "catch_state_processing = stepfunctions.steps.states.Catch(\n", + " error_equals=[\"States.TaskFailed\"],\n", + " next_step=failed_state_sagemaker_processing_failure,\n", + ")\n", + "\n", + "processing_step.add_catch(catch_state_processing)\n", + "transform_step.add_catch(catch_state_processing)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Creates workflow with Pre-Processing Job and Transform Job\n", + "workflow_graph = Chain([processing_step, transform_step])\n", + "branching_workflow = Workflow(\n", + " name=\"SFN_Hermione_Inference\",\n", + " definition=workflow_graph,\n", + " role=workflow_execution_role,\n", + ")\n", + "branching_workflow.create()" + ] + }, + { + "cell_type": "code", + "execution_count": 24, 
+ "metadata": {}, + "outputs": [], + "source": [ + "# Generates unique names for Pre-Processing Job and Training Job\n", + "# Each job requires a unique name\n", + "preprocessing_job_name = \"Hermione-Preprocessing-{}\".format(\n", + " uuid.uuid1().hex\n", + ") \n", + "inference_job_name = \"Hermione-Inference-{}\".format(\n", + " uuid.uuid1().hex\n", + ") " + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + " \n", + " \n", + "
\n", + "
    \n", + "
  • \n", + "
    \n", + " Success\n", + "
  • \n", + "
  • \n", + "
    \n", + " Failed\n", + "
  • \n", + "
  • \n", + "
    \n", + " Cancelled\n", + "
  • \n", + "
  • \n", + "
    \n", + " In Progress\n", + "
  • \n", + "
  • \n", + "
    \n", + " Caught Error\n", + "
  • \n", + "
\n", + "
\n", + "\n", + " \n", + " Inspect in AWS Step Functions \n", + "
\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Executes the workflow\n", + "execution = branching_workflow.execute(\n", + " inputs={\n", + " \"PreprocessingJobName\": preprocessing_job_name,\n", + " \"TransformJobName\": inference_job_name\n", + " }\n", + ")\n", + "execution_output = execution.get_output(wait=False)\n", + "execution.render_progress()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Results" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SurvivedAgePclass_1Pclass_2Pclass_3Sex_1Sex_2predict
01.00.0072880.00.01.01.00.01.0
10.00.3717010.01.00.00.01.00.0
20.00.7612470.01.00.00.01.00.0
30.00.3340040.00.01.00.01.00.0
40.00.5727571.00.00.00.01.00.0
...........................
2170.00.2083440.00.01.00.01.00.0
2180.00.2334760.00.01.00.01.00.0
2190.00.0198540.00.01.01.00.01.0
2201.00.2209101.00.00.01.00.01.0
2211.00.6481530.01.00.01.00.01.0
\n", + "

222 rows × 8 columns

\n", + "
" + ], + "text/plain": [ + " Survived Age Pclass_1 Pclass_2 Pclass_3 Sex_1 Sex_2 predict\n", + "0 1.0 0.007288 0.0 0.0 1.0 1.0 0.0 1.0\n", + "1 0.0 0.371701 0.0 1.0 0.0 0.0 1.0 0.0\n", + "2 0.0 0.761247 0.0 1.0 0.0 0.0 1.0 0.0\n", + "3 0.0 0.334004 0.0 0.0 1.0 0.0 1.0 0.0\n", + "4 0.0 0.572757 1.0 0.0 0.0 0.0 1.0 0.0\n", + ".. ... ... ... ... ... ... ... ...\n", + "217 0.0 0.208344 0.0 0.0 1.0 0.0 1.0 0.0\n", + "218 0.0 0.233476 0.0 0.0 1.0 0.0 1.0 0.0\n", + "219 0.0 0.019854 0.0 0.0 1.0 1.0 0.0 1.0\n", + "220 1.0 0.220910 1.0 0.0 0.0 1.0 0.0 1.0\n", + "221 1.0 0.648153 0.0 1.0 0.0 1.0 0.0 1.0\n", + "\n", + "[222 rows x 8 columns]" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "pd.read_csv('s3://hermione-sagemaker/PREPROCESSING/OUTPUT/inference.csv.out')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "conda_python3", + "language": "python", + "name": "conda_python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.13" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/Sagemaker_StepFunctions_Train.ipynb b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/Sagemaker_StepFunctions_Train.ipynb new file mode 100644 index 0000000..a4c655a --- /dev/null +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/Sagemaker_StepFunctions_Train.ipynb @@ -0,0 +1,540 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Build machine learning workflow to train a model with Amazon SageMaker and AWS Step Functions" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This script creates a Step Function state machine to preprocess the training data and train a model with the images in ECR." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Import modules" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import uuid\n", + "import boto3\n", + "import sagemaker\n", + "from sagemaker import get_execution_role\n", + "from sagemaker.processing import Processor, ProcessingInput, ProcessingOutput\n", + "import stepfunctions\n", + "from stepfunctions.inputs import ExecutionInput\n", + "from stepfunctions.workflow import Workflow\n", + "from stepfunctions.steps import (\n", + " TrainingStep, \n", + " Chain,\n", + " ProcessingStep,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Modify according to your configurations." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# Bucket name in S3\n", + "bucket = \"hermione-sagemaker\"" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# Set session\n", + "region_name=\"us-east-1\"\n", + "boto3.setup_default_session(region_name=region_name)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# Get user role\n", + "role = get_execution_role()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# Role to create and execute step functions\n", + "# paste the AmazonSageMaker-StepFunctionsWorkflowExecutionRole ARN\n", + "workflow_execution_role = \"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# SageMaker expects unique names for each job, model and endpoint.\n", + "# Otherwise, the execution will fail. The ExecutionInput\n", + "# dynamically creates names for each execution.\n", + "execution_input = ExecutionInput(\n", + " schema={\n", + " \"PreprocessingJobName\": str,\n", + " \"TrainingJobName\": str\n", + " }\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "# Get AWS Account ID\n", + "account_number = boto3.client(\"sts\").get_caller_identity()[\"Account\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "# Processor image name previously uploaded in ECR\n", + "image_name_processor = \"hermione-processor\"" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "# Training image name previously uploaded in ECR\n", + "image_name_train = \"hermione-train\"" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "# Input and output paths used in training\n", + "paths = {\n", + " 'train_raw': f\"s3://{bucket}/TRAIN_RAW\",\n", + " 'expectations': f\"s3://{bucket}/PREPROCESSING/EXPECTATIONS\",\n", + " 'preprocessing': f\"s3://{bucket}/PREPROCESSING/PREPROCESSING\",\n", + " 'train_processed': f\"s3://{bucket}/PREPROCESSING/TRAIN_PROCESSED\",\n", + " 'val_processed': f\"s3://{bucket}/PREPROCESSING/VAL_PROCESSED\",\n", + " 'model': f\"s3://{bucket}/PREPROCESSING/MODEL\"\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "# Instance types to run the code\n", + "instance_type_preprocessing=\"ml.t3.medium\"\n", + "instance_type_train=\"ml.m5.large\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Preprocessing Step" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "# Processor image previously uploaded in ECR\n", + "image_uri_processor = f\"{account_number}.dkr.ecr.{region_name}.amazonaws.com/{image_name_processor}\"" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "# Creates the processor to access the ECR image\n", + "processor = Processor(image_uri=image_uri_processor,\n", + " role=role,\n", + " instance_count=1,\n", + " instance_type=instance_type_preprocessing)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "# Creates input and output objects for 
ProcessingStep\n", + "inputs=[\n", + " ProcessingInput(source=paths['train_raw'], \n", + " destination='/opt/ml/processing/input/raw_data', \n", + " input_name=\"raw_data\")\n", + "]\n", + "outputs = [\n", + " ProcessingOutput(\n", + " source=\"/opt/ml/processing/output/expectations\",\n", + " destination=paths['expectations'],\n", + " output_name=\"expectations\",\n", + " ),\n", + " ProcessingOutput(\n", + " source=\"/opt/ml/processing/output/preprocessing\",\n", + " destination=paths['preprocessing'],\n", + " output_name=\"preprocessing\",\n", + " ),\n", + " ProcessingOutput(\n", + " source=\"/opt/ml/processing/output/processed/train\",\n", + " destination=paths['train_processed'],\n", + " output_name=\"train_data\",\n", + " ),\n", + " ProcessingOutput(\n", + " source=\"/opt/ml/processing/output/processed/val\",\n", + " destination=paths['val_processed'],\n", + " output_name=\"val_data\",\n", + " )\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "# Creates the ProcessingStep\n", + "processing_step = ProcessingStep(\n", + " \"Preprocessing step\",\n", + " processor=processor,\n", + " job_name=execution_input[\"PreprocessingJobName\"],\n", + " inputs=inputs,\n", + " outputs=outputs,\n", + " container_arguments=[\"--step\", \"train\"]\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## TrainingStep" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "# Training image previous uploaded in ECR\n", + "image_uri_train = f\"{account_number}.dkr.ecr.{region_name}.amazonaws.com/{image_name_train}\"" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "# Creates input and output objects for TrainingStep\n", + "train_config = sagemaker.inputs.TrainingInput(\n", + " paths['train_processed'],\n", + " content_type='text/csv',\n", + ")\n", + "val_config = sagemaker.inputs.TrainingInput(\n", + " paths['val_processed'],\n", + " content_type='text/csv'\n", + ")\n", + "output_path = paths['model']" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "# Creates the estimator to access the ECR image\n", + "est = sagemaker.estimator.Estimator(\n", + " image_uri_train,\n", + " role, \n", + " instance_count=1, \n", + " instance_type=instance_type_train,\n", + " volume_size = 30,\n", + " output_path = output_path,\n", + " base_job_name = \"Hermione-Train\",\n", + " use_spot_instances=True, # Usar instâncias SPOT\n", + " max_run = 24*60*60,\n", + " max_wait = 24*60*60 # timeout em segundos. 
Required if use_spot_instances == True\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "# Creates the TrainingStep\n", + "training_step = TrainingStep(\n", + " 'TrainStep',\n", + " estimator=est,\n", + " data={\n", + " 'train': train_config,\n", + " 'validation': val_config\n", + " }, \n", + " job_name=execution_input[\"TrainingJobName\"] \n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create Workflow and Execute" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "# Creates Fail state to mark the workflow failed in case any of the steps fail.\n", + "failed_state_sagemaker_processing_failure = stepfunctions.steps.states.Fail(\n", + " \"ML Workflow failed\", cause=\"SageMakerProcessingJobFailed\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "# Adds the Error handling in the workflow\n", + "catch_state_processing = stepfunctions.steps.states.Catch(\n", + " error_equals=[\"States.TaskFailed\"],\n", + " next_step=failed_state_sagemaker_processing_failure,\n", + ")\n", + "\n", + "processing_step.add_catch(catch_state_processing)\n", + "training_step.add_catch(catch_state_processing)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Creates workflow with Pre-Processing Job and Training Job\n", + "workflow_graph = Chain([processing_step, training_step])\n", + "branching_workflow = Workflow(\n", + " name=\"SFN_Hermione_Train\",\n", + " definition=workflow_graph,\n", + " role=workflow_execution_role,\n", + ")\n", + "branching_workflow.create()" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "# Generates unique names for Pre-Processing Job and Training Job\n", + "# Each job requires a unique name\n", + "preprocessing_job_name = \"Hermione-Preprocessing-{}\".format(\n", + " uuid.uuid1().hex\n", + ") \n", + "training_job_name = \"Hermione-Training-{}\".format(\n", + " uuid.uuid1().hex\n", + ") " + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + " \n", + " \n", + "
\n", + "
    \n", + "
  • \n", + "
    \n", + " Success\n", + "
  • \n", + "
  • \n", + "
    \n", + " Failed\n", + "
  • \n", + "
  • \n", + "
    \n", + " Cancelled\n", + "
  • \n", + "
  • \n", + "
    \n", + " In Progress\n", + "
  • \n", + "
  • \n", + "
    \n", + " Caught Error\n", + "
  • \n", + "
\n", + "
\n", + "\n", + " \n", + " Inspect in AWS Step Functions \n", + "
\n", + "\n", + "\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Executes the workflow\n", + "execution = branching_workflow.execute(\n", + " inputs={\n", + " \"PreprocessingJobName\": preprocessing_job_name,\n", + " \"TrainingJobName\": training_job_name\n", + " }\n", + ")\n", + "execution_output = execution.get_output(wait=False)\n", + "execution.render_progress()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "conda_python3", + "language": "python", + "name": "conda_python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.13" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/Sagemaker_Train.ipynb b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/Sagemaker_Train.ipynb new file mode 100644 index 0000000..b0a796f --- /dev/null +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/Sagemaker_Train.ipynb @@ -0,0 +1,393 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "577c4f6b", + "metadata": {}, + "source": [ + "# Sagemaker Train" + ] + }, + { + "cell_type": "markdown", + "id": "501ef5b6", + "metadata": {}, + "source": [ + "This script creates and trains the model with the uploaded image in ECR." + ] + }, + { + "cell_type": "markdown", + "id": "e66b3975", + "metadata": {}, + "source": [ + "## Import modules" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "d658fb44", + "metadata": {}, + "outputs": [], + "source": [ + "import time\n", + "import boto3\n", + "import sagemaker\n", + "from sagemaker import get_execution_role" + ] + }, + { + "cell_type": "markdown", + "id": "64036230", + "metadata": {}, + "source": [ + "## Setup" + ] + }, + { + "cell_type": "markdown", + "id": "28411012", + "metadata": {}, + "source": [ + "Modify according to your configurations." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "7e937373", + "metadata": {}, + "outputs": [], + "source": [ + "# Bucket name in S3\n", + "bucket = \"hermione-sagemaker\"" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "16450249", + "metadata": {}, + "outputs": [], + "source": [ + "# Set session\n", + "region_name=\"us-east-1\"\n", + "boto3.setup_default_session(region_name=region_name)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "2e144eb8", + "metadata": {}, + "outputs": [], + "source": [ + "# Get user role\n", + "role = get_execution_role()" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "50b4a590", + "metadata": {}, + "outputs": [], + "source": [ + "# Get AWS Account ID\n", + "account_number = boto3.client(\"sts\").get_caller_identity()[\"Account\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "8d56e6ca", + "metadata": {}, + "outputs": [], + "source": [ + "# Image previous uploaded in ECR\n", + "image_name = \"hermione-train\"\n", + "image_uri = f\"{account_number}.dkr.ecr.{region_name}.amazonaws.com/{image_name}\"" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "e710ea0a", + "metadata": {}, + "outputs": [], + "source": [ + "# Input and output paths to execute train\n", + "paths = {\n", + " 'train_processed': f\"s3://{bucket}/PREPROCESSING/TRAIN_PROCESSED\",\n", + " 'val_processed': f\"s3://{bucket}/PREPROCESSING/VAL_PROCESSED\",\n", + " 'model': f\"s3://{bucket}/PREPROCESSING/MODEL\"\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "f8a27026", + "metadata": {}, + "outputs": [], + "source": [ + "# instance to run the code\n", + "instance_type=\"ml.m5.large\"" + ] + }, + { + "cell_type": "markdown", + "id": "b6efb8ce", + "metadata": {}, + "source": [ + "## Train" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "ed9cb39b", + "metadata": {}, + "outputs": [], + "source": [ + "# Receives the processed train data in S3\n", + "train_config = sagemaker.inputs.TrainingInput(\n", + " paths['train_processed'],\n", + " content_type='text/csv',\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "34f144e0", + "metadata": {}, + "outputs": [], + "source": [ + "# Receives the processed validation data in S3\n", + "val_config = sagemaker.inputs.TrainingInput(\n", + " paths['val_processed'],\n", + " content_type='text/csv'\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "a0bbbf7d", + "metadata": {}, + "outputs": [], + "source": [ + "# Saves the model object in S3\n", + "output_path = paths['model']" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "299813d5", + "metadata": {}, + "outputs": [], + "source": [ + "# Metrics to visualize in the Monitor\n", + "metrics = [\n", + " {\n", + " \"Name\": \"accuracy\",\n", + " \"Regex\": \"accuracy=(.*?);\",\n", + " },\n", + " {\n", + " \"Name\": \"f1\",\n", + " \"Regex\": \"f1=(.*?);\",\n", + " },\n", + " {\n", + " \"Name\": \"precision\",\n", + " \"Regex\": \"precision=(.*?);\",\n", + " },\n", + " {\n", + " \"Name\": \"recall\",\n", + " \"Regex\": \"recall=(.*?);\",\n", + " },\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "4ad41d36", + "metadata": {}, + "outputs": [], + "source": [ + "# Creates the estimator to access the ECR image\n", + "est = sagemaker.estimator.Estimator(\n", + " image_uri,\n", + " role, \n", + " instance_count=1, \n", + " 
instance_type=instance_type,\n", + " volume_size = 30,\n", + " output_path = output_path,\n", + " base_job_name = \"Hermione-train\",\n", + " use_spot_instances=True,\n", + " max_run = 24*60*60,\n", + " max_wait = 24*60*60, # timeout in seconds. Required if use_spot_instances == True\n", + " metric_definitions=metrics\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "62c1894f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2021-05-26 12:41:29 Starting - Starting the training job...\n", + "2021-05-26 12:41:52 Starting - Launching requested ML instancesProfilerReport-1622032889: InProgress\n", + "......\n", + "2021-05-26 12:42:52 Starting - Preparing the instances for training......\n", + "2021-05-26 12:43:52 Downloading - Downloading input data\n", + "2021-05-26 12:43:52 Training - Downloading the training image.....\u001b[34m2021-05-26 09:44:41,407 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed)\u001b[0m\n", + "\n", + "2021-05-26 12:45:00 Uploading - Uploading generated training model\n", + "2021-05-26 12:45:00 Completed - Training job completed\n", + "\u001b[34m2021-05-26 09:44:47,642 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed)\u001b[0m\n", + "\u001b[34m2021-05-26 09:44:47,653 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed)\u001b[0m\n", + "\u001b[34m2021-05-26 09:44:47,663 sagemaker-training-toolkit INFO Invoking user script\n", + "\u001b[0m\n", + "\u001b[34mTraining Env:\n", + "\u001b[0m\n", + "\u001b[34m{\n", + " \"additional_framework_parameters\": {},\n", + " \"channel_input_dirs\": {\n", + " \"validation\": \"/opt/ml/input/data/validation\",\n", + " \"train\": \"/opt/ml/input/data/train\"\n", + " },\n", + " \"current_host\": \"algo-1\",\n", + " \"framework_module\": null,\n", + " \"hosts\": [\n", + " \"algo-1\"\n", + " ],\n", + " \"hyperparameters\": {},\n", + " \"input_config_dir\": \"/opt/ml/input/config\",\n", + " \"input_data_config\": {\n", + " \"validation\": {\n", + " \"ContentType\": \"text/csv\",\n", + " \"TrainingInputMode\": \"File\",\n", + " \"S3DistributionType\": \"FullyReplicated\",\n", + " \"RecordWrapperType\": \"None\"\n", + " },\n", + " \"train\": {\n", + " \"ContentType\": \"text/csv\",\n", + " \"TrainingInputMode\": \"File\",\n", + " \"S3DistributionType\": \"FullyReplicated\",\n", + " \"RecordWrapperType\": \"None\"\n", + " }\n", + " },\n", + " \"input_dir\": \"/opt/ml/input\",\n", + " \"is_master\": true,\n", + " \"job_name\": \"Hermione-train-2021-05-26-12-41-29-505\",\n", + " \"log_level\": 20,\n", + " \"master_hostname\": \"algo-1\",\n", + " \"model_dir\": \"/opt/ml/model\",\n", + " \"module_dir\": \"/opt/ml/code\",\n", + " \"module_name\": \"train\",\n", + " \"network_interface_name\": \"eth0\",\n", + " \"num_cpus\": 2,\n", + " \"num_gpus\": 0,\n", + " \"output_data_dir\": \"/opt/ml/output/data\",\n", + " \"output_dir\": \"/opt/ml/output\",\n", + " \"output_intermediate_dir\": \"/opt/ml/output/intermediate\",\n", + " \"resource_config\": {\n", + " \"current_host\": \"algo-1\",\n", + " \"hosts\": [\n", + " \"algo-1\"\n", + " ],\n", + " \"network_interface_name\": \"eth0\"\n", + " },\n", + " \"user_entry_point\": \"train.py\"\u001b[0m\n", + "\u001b[34m}\n", + "\u001b[0m\n", + "\u001b[34mEnvironment variables:\n", + "\u001b[0m\n", + "\u001b[34mSM_HOSTS=[\"algo-1\"]\u001b[0m\n", + "\u001b[34mSM_NETWORK_INTERFACE_NAME=eth0\u001b[0m\n", + "\u001b[34mSM_HPS={}\u001b[0m\n", + 
"\u001b[34mSM_USER_ENTRY_POINT=train.py\u001b[0m\n", + "\u001b[34mSM_FRAMEWORK_PARAMS={}\u001b[0m\n", + "\u001b[34mSM_RESOURCE_CONFIG={\"current_host\":\"algo-1\",\"hosts\":[\"algo-1\"],\"network_interface_name\":\"eth0\"}\u001b[0m\n", + "\u001b[34mSM_INPUT_DATA_CONFIG={\"train\":{\"ContentType\":\"text/csv\",\"RecordWrapperType\":\"None\",\"S3DistributionType\":\"FullyReplicated\",\"TrainingInputMode\":\"File\"},\"validation\":{\"ContentType\":\"text/csv\",\"RecordWrapperType\":\"None\",\"S3DistributionType\":\"FullyReplicated\",\"TrainingInputMode\":\"File\"}}\u001b[0m\n", + "\u001b[34mSM_OUTPUT_DATA_DIR=/opt/ml/output/data\u001b[0m\n", + "\u001b[34mSM_CHANNELS=[\"train\",\"validation\"]\u001b[0m\n", + "\u001b[34mSM_CURRENT_HOST=algo-1\u001b[0m\n", + "\u001b[34mSM_MODULE_NAME=train\u001b[0m\n", + "\u001b[34mSM_LOG_LEVEL=20\u001b[0m\n", + "\u001b[34mSM_FRAMEWORK_MODULE=\u001b[0m\n", + "\u001b[34mSM_INPUT_DIR=/opt/ml/input\u001b[0m\n", + "\u001b[34mSM_INPUT_CONFIG_DIR=/opt/ml/input/config\u001b[0m\n", + "\u001b[34mSM_OUTPUT_DIR=/opt/ml/output\u001b[0m\n", + "\u001b[34mSM_NUM_CPUS=2\u001b[0m\n", + "\u001b[34mSM_NUM_GPUS=0\u001b[0m\n", + "\u001b[34mSM_MODEL_DIR=/opt/ml/model\u001b[0m\n", + "\u001b[34mSM_MODULE_DIR=/opt/ml/code\u001b[0m\n", + "\u001b[34mSM_TRAINING_ENV={\"additional_framework_parameters\":{},\"channel_input_dirs\":{\"train\":\"/opt/ml/input/data/train\",\"validation\":\"/opt/ml/input/data/validation\"},\"current_host\":\"algo-1\",\"framework_module\":null,\"hosts\":[\"algo-1\"],\"hyperparameters\":{},\"input_config_dir\":\"/opt/ml/input/config\",\"input_data_config\":{\"train\":{\"ContentType\":\"text/csv\",\"RecordWrapperType\":\"None\",\"S3DistributionType\":\"FullyReplicated\",\"TrainingInputMode\":\"File\"},\"validation\":{\"ContentType\":\"text/csv\",\"RecordWrapperType\":\"None\",\"S3DistributionType\":\"FullyReplicated\",\"TrainingInputMode\":\"File\"}},\"input_dir\":\"/opt/ml/input\",\"is_master\":true,\"job_name\":\"Hermione-train-2021-05-26-12-41-29-505\",\"log_level\":20,\"master_hostname\":\"algo-1\",\"model_dir\":\"/opt/ml/model\",\"module_dir\":\"/opt/ml/code\",\"module_name\":\"train\",\"network_interface_name\":\"eth0\",\"num_cpus\":2,\"num_gpus\":0,\"output_data_dir\":\"/opt/ml/output/data\",\"output_dir\":\"/opt/ml/output\",\"output_intermediate_dir\":\"/opt/ml/output/intermediate\",\"resource_config\":{\"current_host\":\"algo-1\",\"hosts\":[\"algo-1\"],\"network_interface_name\":\"eth0\"},\"user_entry_point\":\"train.py\"}\u001b[0m\n", + "\u001b[34mSM_USER_ARGS=[]\u001b[0m\n", + "\u001b[34mSM_OUTPUT_INTERMEDIATE_DIR=/opt/ml/output/intermediate\u001b[0m\n", + "\u001b[34mSM_CHANNEL_VALIDATION=/opt/ml/input/data/validation\u001b[0m\n", + "\u001b[34mSM_CHANNEL_TRAIN=/opt/ml/input/data/train\u001b[0m\n", + "\u001b[34mPYTHONPATH=/usr/local/bin:/opt/ml/code:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/lib/python38.zip:/usr/lib/python3.8:/usr/lib/python3.8/lib-dynload:/usr/local/lib/python3.8/dist-packages:/usr/lib/python3/dist-packages\n", + "\u001b[0m\n", + "\u001b[34mInvoking script with the following command:\n", + "\u001b[0m\n", + "\u001b[34m/usr/bin/python3 train.py\n", + "\n", + "\u001b[0m\n", + "\u001b[34m/usr/local/lib/python3.8/dist-packages/interpret_community/common/gpu_kmeans.py:30: UserWarning: cuML is required to use GPU explainers. Check https://rapids.ai/start.html for more information on how to install it.\n", + " warnings.warn(\u001b[0m\n", + "\u001b[34mcuML is required to use GPU explainers. 
Check https://rapids.ai/start.html for more information on how to install it.\u001b[0m\n", + "\u001b[34mINFO:root:Starting the training\u001b[0m\n", + "\u001b[34mINFO:root:Reading the inputs\u001b[0m\n", + "\u001b[34mINFO:root:Training the model\u001b[0m\n", + "\u001b[34mINFO:root:Saving\u001b[0m\n", + "\u001b[34mINFO:root:accuracy=0.7373737373737373; f1=0.6976744186046512; precision=0.6382978723404256; recall=0.7692307692307693;\u001b[0m\n", + "\u001b[34mINFO:root:Training complete.\u001b[0m\n", + "\u001b[34m2021-05-26 09:44:51,898 sagemaker-training-toolkit INFO Reporting training SUCCESS\u001b[0m\n", + "Training seconds: 85\n", + "Billable seconds: 36\n", + "Managed Spot Training savings: 57.6%\n", + "CPU times: user 450 ms, sys: 19.9 ms, total: 470 ms\n", + "Wall time: 3min 42s\n" + ] + } + ], + "source": [ + "%%time\n", + "# Train the model and validate\n", + "est.fit({'train':train_config, 'validation':val_config}, wait=True, logs=True)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "conda_python3", + "language": "python", + "name": "conda_python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/preprocessing/.ipynb_checkpoints/dataquality-checkpoint.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/preprocessing/.ipynb_checkpoints/dataquality-checkpoint.py new file mode 100644 index 0000000..5ce7b61 --- /dev/null +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/preprocessing/.ipynb_checkpoints/dataquality-checkpoint.py @@ -0,0 +1,60 @@ +import pandas as pd +import great_expectations as ge + +class DataQuality: + """ + Class to perform data quality before training + """ + def __init__(self, continuous_cols=None, discrete_cat_cols=None): + """ + Constructor + + Parameters + ---------- + continuous_cols : array + Receives an array with the name of the continuous columns + discrete_cat_cols : array + Receives an array with the name of the dicrete/categorical columns + Returns + ------- + DataQuality + """ + self.continuous_cols = continuous_cols + self.discrete_cat_cols = discrete_cat_cols + + def perform(self, + df: pd.DataFrame): + """ + Perform data quality + + Parameters + ---------- + df : pd.Dataframe + Dataframe to be processed + + Returns + ------- + json + """ + df_ge = ge.dataset.PandasDataset(df) + cols = df_ge.columns + df_ge.expect_table_columns_to_match_ordered_list(cols) + for col in cols: + df_ge.expect_column_values_to_not_be_null(col) + cut_off = 2 + if self.continuous_cols != None: + for col in self.continuous_cols: + measures = df_ge[col].describe() + df_ge.expect_column_values_to_be_of_type(col, 'int64') + df_ge.expect_column_mean_to_be_between(col, measures['mean'] - cut_off * measures['std'], measures['mean'] + cut_off * measures['std']) + df_ge.expect_column_max_to_be_between(col, measures['max'] - cut_off * measures['std'], measures['max'] + cut_off * measures['std']) + df_ge.expect_column_min_to_be_between(col, measures['min'] - cut_off * measures['std'], measures['min'] + cut_off * measures['std']) + expected_partition = ge.dataset.util.continuous_partition_data(df_ge[col]) + df_ge.expect_column_bootstrapped_ks_test_p_value_to_be_greater_than(col, expected_partition) + if 
len(self.discrete_cat_cols) != None: + for col in self.discrete_cat_cols: + possible_cat = df_ge[col].unique() + df_ge.expect_column_values_to_be_in_set(col, possible_cat) + expected_partition = ge.dataset.util.categorical_partition_data(df_ge[col]) + df_ge.expect_column_chisquare_test_p_value_to_be_greater_than(col, expected_partition) + return df_ge \ No newline at end of file diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/preprocessing/.ipynb_checkpoints/normalization-checkpoint.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/preprocessing/.ipynb_checkpoints/normalization-checkpoint.py new file mode 100644 index 0000000..6d5822b --- /dev/null +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/preprocessing/.ipynb_checkpoints/normalization-checkpoint.py @@ -0,0 +1,159 @@ +import pandas as pd +import numpy as np +from sklearn.preprocessing import StandardScaler, normalize, MinMaxScaler +from scipy.stats import zscore + +class Normalizer: + + def __init__(self, norm_cols: dict): + """ + Constructor + + Parameters + ---------- + norm_cols : dict + Receives dict with the name of the normalization to be + performed and which are the columns + Ex: norm_cols = {'zscore': ['salary', 'price'], + 'min-max': ['heigth', 'age']} + + Returns + ------- + Normalization + """ + self.norm_cols = norm_cols + self.col_names = [name for norm in norm_cols for name in norm_cols[norm]] + self.norms = {'min-max': MinMaxScaler, + 'standard': StandardScaler} + self.fitted = False + + def statistics(self, df : pd.DataFrame): + """ + Calculates dataframe statistics + + Parameters + ---------- + df : dataframe to calculate the statistics for each column + + Returns + ------- + None + """ + zip_cols = lambda result: zip(result.index.values, result.values) + self.col_min = {col: value for col, value in zip_cols(df[self.col_names].min())} + self.col_max = {col: value for col, value in zip_cols(df[self.col_names].max())} + self.col_std = {col: value for col, value in zip_cols(df[self.col_names].std())} + self.col_mean = {col: value for col, value in zip_cols(df[self.col_names].mean())} + self.col_median = {col: value for col, value in zip_cols(df[self.col_names].median())} + + def __apply_func(self, X, normalization): + """ + Creates the normalization object + + Parameters + ---------- + X : array + Data to be normalized + normalization : Normalization + Normalization to be applied + + Returns + ------- + Normalization + """ + normalization.fit(X) + return normalization + + def fit(self, df: pd.DataFrame): + """ + Generates normalization object for each column + + Parameters + ---------- + df : pd.DataFrame + dataframe with columns to be normalized + + Returns + ------- + None + """ + self.statistics(df) + self.normalization = dict() + for norm in self.norm_cols: + if norm in ['zscore', 'log10']: + continue + for col in self.norm_cols[norm]: + self.normalization[col] = self.__apply_func(df[col].values.reshape(-1, 1), self.norms[norm]()) + self.fitted = True + + def transform(self, df: pd.DataFrame): + """ + Apply normalization to each column + + Parameters + ---------- + df : pd.DataFrame + dataframe with columns to be normalized + + Returns + ------- + pd.DataFrame + """ + if not self.fitted: + raise Exception("Not yet fitted.") + + for norm in self.norm_cols: + if norm == 'zscore': + for col in self.norm_cols[norm]: + df.loc[:,col] = (df[col].values - self.col_mean[col])/self.col_std[col] + elif norm == 'log10': + for col in self.norm_cols[norm]: + df.loc[:,col] = 
np.log10(df[col].values) + else: + for col in self.norm_cols[norm]: + df.loc[:,col] = self.normalization[col].transform(df[col].values.reshape(-1, 1)) + return df + + def inverse_transform(self, df: pd.DataFrame): + """ + Apply the denormalized to each column + + Parameters + ---------- + df : pd.DataFrame + dataframe with columns to be denormalized + + Returns + ------- + pd.DataFrame + """ + if not self.fitted: + raise Exception("Not yet trained.") + + for norm in self.norm_cols: + if norm == 'zscore': + for col in self.norm_cols[norm]: + df.loc[:,col] = df[col].apply(lambda z: self.col_std[col]*z + self.col_mean[col]) + elif norm == 'log10': + for col in self.norm_cols[norm]: + df.loc[:,col] = df[col].apply(lambda x: 10 ** x) + else: + for col in self.norm_cols[norm]: + df.loc[:,col] = self.normalization[col].inverse_transform(df[col].values.reshape(-1, 1)) + return df + + def fit_transform(self, df: pd.DataFrame): + """ + Creates object and apply it normalization + + Parameters + ---------- + df : pd.DataFrame + dataframe with columns to be normalized + + Returns + ------- + pd.DataFrame + """ + self.fit(df) + return self.transform(df) diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/preprocessing/.ipynb_checkpoints/preprocessing-checkpoint.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/preprocessing/.ipynb_checkpoints/preprocessing-checkpoint.py new file mode 100644 index 0000000..dea90fa --- /dev/null +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/preprocessing/.ipynb_checkpoints/preprocessing-checkpoint.py @@ -0,0 +1,141 @@ +import pandas as pd + +from ml.preprocessing.normalization import Normalizer +from sklearn.preprocessing import OneHotEncoder +from sklearn.model_selection import train_test_split +from category_encoders import * +import logging + +logging.getLogger().setLevel(logging.INFO) + +class Preprocessing: + """ + Class to perform data preprocessing before training + """ + + def __init__(self, norm_cols=None, oneHot_cols=None): + """ + Constructor + + Parameters + ---------- + norm_cols : dict + Receives dict with the name of the normalization to be + performed and which are the columns + Ex: norm_cols = {'zscore': ['salary', 'price'], + 'min-max': ['heigth', 'age']} + oneHot_cols : array + Receives an array with columns names to be categorized with One Hot Encoding + Returns + ------- + Preprocessing + """ + self.norm_cols = norm_cols + self.oneHot_cols = oneHot_cols + self.ohe = OneHotEncoder(handle_unknown='ignore') + + def clean_data(self, df: pd.DataFrame): + """ + Perform data cleansing. 
+ + Parameters + ---------- + df : pd.Dataframe + Dataframe to be processed + + Returns + ------- + pd.Dataframe + Cleaned Data Frame + """ + logging.info("Cleaning data") + df_copy = df.copy() + df_copy['Pclass'] = df_copy.Pclass.astype('object') + df_copy = df_copy.dropna() + return df_copy + + def categ_encoding_oneHot(self, df: pd.DataFrame, step_train = False): + """ + Perform encoding of the categorical variables using One Hot Encoding + + Parameters + ---------- + df : pd.Dataframe + Dataframe to be processed + step_train : bool + if True, the fit function is executed + + Returns + ------- + pd.Dataframe + Encoded Data Frame + """ + logging.info("One hot encoding") + df_copy = df.copy() + + if step_train: + self.ohe.fit(df_copy[self.oneHot_cols]) + + arr = self.ohe.transform(df_copy[self.oneHot_cols]) + df_copy = df_copy.join(arr).drop(self.oneHot_cols, axis=1) + return df_copy + + def normalize(self, df: pd.DataFrame, step_train = False): + """ + Apply normalization to the selected columns + + Parameters + ---------- + df : pd.DataFrame + dataframe with columns to be normalized + step_train : bool + if True, the Normalizer is created and applied, + otherwise it is only applied + + Returns + ------- + pd.DataFrame + Normalized dataframe + """ + logging.info("Normalizing") + if step_train: + self.norm = Normalizer(self.norm_cols) + df = self.norm.fit_transform(df) + else: + df = self.norm.transform(df.copy()) + return df + + def execute(self, df, step_train = False, val_size = 0.2): + """ + Apply all preprocessing steps on the Dataframe + + Parameters + ---------- + df : pd.DataFrame + dataframe with columns to be normalized + step_train : bool + if True, data is splited in train and val + step_train : val_size + Size of the validation dataset + + Returns + ------- + pd.DataFrame + - One Preprocessed dataframe, if step_train is False + - Two Preprocessed dataframes, if step_train is True + """ + df = self.clean_data(df) + df = self.categ_encoding_oneHot(df, step_train) + + if step_train: + logging.info("Divide train and test") + X_train, X_val = train_test_split(df, test_size=val_size, random_state=123) + X_train = self.normalize(X_train, step_train = True) + X_val = self.normalize(X_val, step_train = False) + logging.info(f"shape train {X_train.shape} val {X_val.shape}") + return X_train, X_val + else: + X = self.normalize(df, step_train = False) + logging.info(f"shape {X.shape}") + return X + diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/preprocessing/.ipynb_checkpoints/text_vectorizer-checkpoint.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/preprocessing/.ipynb_checkpoints/text_vectorizer-checkpoint.py new file mode 100644 index 0000000..674458e --- /dev/null +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/preprocessing/.ipynb_checkpoints/text_vectorizer-checkpoint.py @@ -0,0 +1,201 @@ +from sklearn.feature_extraction.text import CountVectorizer +from sklearn.feature_extraction.text import TfidfVectorizer +import numpy as np +import pandas as pd + +class TextVectorizer: + + def __init__(self, vectorizer_cols : dict, word2vec=None): + """ + Constructor + + Parameters + ---------- + vectorizer_cols : dict + Receives a dict with the name of the vectorizer to be + performed and which are the columns + Ex: vectorizer_cols = {'embedding_median': ['col'], + 'embedding_mean': ['col'], + 'tf_idf': ['col'], + 'bag_of_words' : [col]} + Returns + ------- + Normalization + """ + self.word2vec = word2vec + self.index_ini_fim = 
len(self.word2vec.index2word) if word2vec != None else 0 + self.vectorizer_cols = vectorizer_cols + self.vectorizer_vects = {'bag_of_words': self.bag_of_words, + 'tf_idf': self.tf_idf_vect} + self.fitted = False + + def fit(self, df: pd.DataFrame): + """ + Generates the vectorizer object for each column. The text must be preprocessed. + + Parameters + ---------- + df : pd.DataFrame + dataframe with columns to be vectorizer + + Returns + ------- + None + """ + self.vectorizers_fitted = dict() + for vectorizer in self.vectorizer_cols: + if vectorizer in ['index', 'embedding_median', 'embedding_mean']: + continue + for col in self.vectorizer_cols[vectorizer]: + self.vectorizers_fitted[vectorizer] = {} + self.vectorizers_fitted[vectorizer][col] = self.vectorizer_vects[vectorizer](df[col].values) + self.fitted = True + + def transform(self, df: pd.DataFrame): + """ + Apply the vectorizer object for each column. The text must be preprocessed. + + Parameters + ---------- + df : pd.DataFrame + dataframe with columns to be vectorizer + + Returns + ------- + pd.DataFrame + """ + if not self.fitted: + raise Exception("Not yet trained.") + + for vectorizer in self.vectorizer_cols: + if vectorizer == 'index': + for col in self.vectorizer_cols[vectorizer]: + df.loc[:, col+"_"+vectorizer] = df[col].apply(lambda x: self.embedding(x, 3)) + elif vectorizer == 'embedding_median': + for col in self.vectorizer_cols[vectorizer]: + df.loc[:, col+"_"+vectorizer] = df[col].apply(lambda x: self.embedding(x, 1)) + elif vectorizer == 'embedding_mean': + for col in self.vectorizer_cols[vectorizer]: + df.loc[:, col+"_"+vectorizer] = df[col].apply(lambda x: self.embedding(x, 2)) + elif (vectorizer == 'bag_of_words') | (vectorizer == 'tf_idf'): + for col in self.vectorizer_cols[vectorizer]: + values = self.vectorizers_fitted[vectorizer][col].transform(df[col]) + df.loc[:,col+"_"+vectorizer] = pd.Series(values.toarray().tolist()) + + return df + + def embedding(self, X, typ_transform=1): + """ + Apply the embedding in X. The text must be preprocessed. 
+ + Parameters + ---------- + X : pd.Series + row to be encoded + typ_transform : int + type of transformation + 1 - apply embedding median + 2 - apply embedding mean + 3 - apply index + + Returns + ------- + pd.DataFrame + """ + if X is None or type(X) == float: + return None + vector = [] + if typ_transform == 1: # median + vector = np.median([self.word2vec[x] for x in X.split() if x in self.word2vec], axis=0) + elif typ_transform == 2: # mean + vector = np.mean([self.word2vec[x] for x in X.split() if x in self.word2vec], axis=0)#[0] + elif typ_transform == 3: # indexing + idx = self.word2vec.index2word + set_idx = set(idx) + indexes = [idx.index(token) for token in X.split() if token in set_idx] + indexes = [self.index_ini_fim] + indexes + [self.index_ini_fim] + # Create vector + X_length = len(indexes) + vector = np.zeros(X_length, dtype=np.int64) + vector[:len(indexes)] = indexes + else: + vector = [] + return vector + + def bag_of_words(self, corpus): + """ + Generate the bag-of-words object + + Parameters + ---------- + corpus : str + text used to fit the bag-of-words object + Returns + ------- + model + """ + vectorizer = CountVectorizer() + model = vectorizer.fit(corpus) + return model + + def tf_idf_vect(self, corpus): + """ + Generate the TF-IDF object + + Parameters + ---------- + corpus : str + text used to fit the TF-IDF object + Returns + ------- + model + """ + vectorizer = TfidfVectorizer() + model = vectorizer.fit(corpus) + return model + + def inverse_transform(self, df: pd.DataFrame): + """ + Apply the inverse_transform of the vectorizer to each column + Options: index, bag_of_words and tf_idf + + Parameters + ---------- + df : pd.DataFrame + dataframe with columns to be unvectorized + + Returns + ------- + pd.DataFrame + """ + if not self.fitted: + raise Exception("Not yet trained.") + + for vectorizer in self.vectorizer_cols: + if vectorizer == 'index': + for col in self.vectorizer_cols[vectorizer]: + df.loc[:, col+"_remove_"+vectorizer] = df[col].apply(lambda x: self.unvectorize(x)) + elif (vectorizer == 'bag_of_words') | (vectorizer == 'tf_idf'): + for col in self.vectorizer_cols[vectorizer]: + values = self.vectorizers_fitted[vectorizer][col].inverse_transform(df[col]) + df.loc[:,col+"_remove_"+vectorizer] = pd.Series(values.toarray().tolist()) + + return df + + def unvectorize(self, vector): + """ + Recover the original tokens from an index vector + + Parameters + ---------- + vector : array + array with indexes + + Returns + ------- + array + """ + idx = self.word2vec.index2word + tokens = [idx[index] for index in vector if index != self.index_ini_fim] + X = " ".join(token for token in tokens) + return X \ No newline at end of file diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/preprocessing/dataquality.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/preprocessing/dataquality.py new file mode 100644 index 0000000..5ce7b61 --- /dev/null +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/preprocessing/dataquality.py @@ -0,0 +1,60 @@ +import pandas as pd +import great_expectations as ge + +class DataQuality: + """ + Class to perform data quality before training + """ + def __init__(self, continuous_cols=None, discrete_cat_cols=None): + """ + Constructor + + Parameters + ---------- + continuous_cols : array + Receives an array with the name of the continuous columns + discrete_cat_cols : array + Receives an array with the name of the discrete/categorical columns + Returns + ------- + DataQuality + """ + self.continuous_cols = continuous_cols + 
self.discrete_cat_cols = discrete_cat_cols + + def perform(self, + df: pd.DataFrame): + """ + Perform data quality + + Parameters + ---------- + df : pd.Dataframe + Dataframe to be processed + + Returns + ------- + json + """ + df_ge = ge.dataset.PandasDataset(df) + cols = df_ge.columns + df_ge.expect_table_columns_to_match_ordered_list(cols) + for col in cols: + df_ge.expect_column_values_to_not_be_null(col) + cut_off = 2 + if self.continuous_cols is not None: + for col in self.continuous_cols: + measures = df_ge[col].describe() + df_ge.expect_column_values_to_be_of_type(col, 'int64') + df_ge.expect_column_mean_to_be_between(col, measures['mean'] - cut_off * measures['std'], measures['mean'] + cut_off * measures['std']) + df_ge.expect_column_max_to_be_between(col, measures['max'] - cut_off * measures['std'], measures['max'] + cut_off * measures['std']) + df_ge.expect_column_min_to_be_between(col, measures['min'] - cut_off * measures['std'], measures['min'] + cut_off * measures['std']) + expected_partition = ge.dataset.util.continuous_partition_data(df_ge[col]) + df_ge.expect_column_bootstrapped_ks_test_p_value_to_be_greater_than(col, expected_partition) + if self.discrete_cat_cols is not None: + for col in self.discrete_cat_cols: + possible_cat = df_ge[col].unique() + df_ge.expect_column_values_to_be_in_set(col, possible_cat) + expected_partition = ge.dataset.util.categorical_partition_data(df_ge[col]) + df_ge.expect_column_chisquare_test_p_value_to_be_greater_than(col, expected_partition) + return df_ge \ No newline at end of file diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/preprocessing/normalization.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/preprocessing/normalization.py new file mode 100644 index 0000000..6d5822b --- /dev/null +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/preprocessing/normalization.py @@ -0,0 +1,159 @@ +import pandas as pd +import numpy as np +from sklearn.preprocessing import StandardScaler, normalize, MinMaxScaler +from scipy.stats import zscore + +class Normalizer: + + def __init__(self, norm_cols: dict): + """ + Constructor + + Parameters + ---------- + norm_cols : dict + Receives dict with the name of the normalization to be + performed and which are the columns + Ex: norm_cols = {'zscore': ['salary', 'price'], + 'min-max': ['height', 'age']} + + Returns + ------- + Normalizer + """ + self.norm_cols = norm_cols + self.col_names = [name for norm in norm_cols for name in norm_cols[norm]] + self.norms = {'min-max': MinMaxScaler, + 'standard': StandardScaler} + self.fitted = False + + def statistics(self, df : pd.DataFrame): + """ + Calculates dataframe statistics + + Parameters + ---------- + df : dataframe to calculate the statistics for each column + + Returns + ------- + None + """ + zip_cols = lambda result: zip(result.index.values, result.values) + self.col_min = {col: value for col, value in zip_cols(df[self.col_names].min())} + self.col_max = {col: value for col, value in zip_cols(df[self.col_names].max())} + self.col_std = {col: value for col, value in zip_cols(df[self.col_names].std())} + self.col_mean = {col: value for col, value in zip_cols(df[self.col_names].mean())} + self.col_median = {col: value for col, value in zip_cols(df[self.col_names].median())} + + def __apply_func(self, X, normalization): + """ + Creates the normalization object + + Parameters + ---------- + X : array + Data to be normalized + normalization : Normalization + Normalization to be applied + + Returns + ------- 
diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/preprocessing/normalization.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/preprocessing/normalization.py
new file mode 100644
index 0000000..6d5822b
--- /dev/null
+++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/preprocessing/normalization.py
@@ -0,0 +1,159 @@
+import pandas as pd
+import numpy as np
+from sklearn.preprocessing import StandardScaler, normalize, MinMaxScaler
+from scipy.stats import zscore
+
+class Normalizer:
+
+    def __init__(self, norm_cols: dict):
+        """
+        Constructor
+
+        Parameters
+        ----------
+        norm_cols : dict
+            Maps the name of the normalization to be performed to the
+            list of columns it should be applied to
+            Ex: norm_cols = {'zscore': ['salary', 'price'],
+                             'min-max': ['height', 'age']}
+
+        Returns
+        -------
+        Normalizer
+        """
+        self.norm_cols = norm_cols
+        self.col_names = [name for norm in norm_cols for name in norm_cols[norm]]
+        self.norms = {'min-max': MinMaxScaler,
+                      'standard': StandardScaler}
+        self.fitted = False
+
+    def statistics(self, df: pd.DataFrame):
+        """
+        Calculates dataframe statistics
+
+        Parameters
+        ----------
+        df : pd.DataFrame
+            dataframe whose per-column statistics are calculated
+
+        Returns
+        -------
+        None
+        """
+        zip_cols = lambda result: zip(result.index.values, result.values)
+        self.col_min = {col: value for col, value in zip_cols(df[self.col_names].min())}
+        self.col_max = {col: value for col, value in zip_cols(df[self.col_names].max())}
+        self.col_std = {col: value for col, value in zip_cols(df[self.col_names].std())}
+        self.col_mean = {col: value for col, value in zip_cols(df[self.col_names].mean())}
+        self.col_median = {col: value for col, value in zip_cols(df[self.col_names].median())}
+
+    def __apply_func(self, X, normalization):
+        """
+        Fits the normalization object
+
+        Parameters
+        ----------
+        X : array
+            Data to be normalized
+        normalization : Normalization
+            Normalization to be applied
+
+        Returns
+        -------
+        Normalization
+        """
+        normalization.fit(X)
+        return normalization
+
+    def fit(self, df: pd.DataFrame):
+        """
+        Generates a normalization object for each column
+
+        Parameters
+        ----------
+        df : pd.DataFrame
+            dataframe with columns to be normalized
+
+        Returns
+        -------
+        None
+        """
+        self.statistics(df)
+        self.normalization = dict()
+        for norm in self.norm_cols:
+            # zscore and log10 are computed from the stored statistics,
+            # so no scaler object is needed for them
+            if norm in ['zscore', 'log10']:
+                continue
+            for col in self.norm_cols[norm]:
+                self.normalization[col] = self.__apply_func(df[col].values.reshape(-1, 1), self.norms[norm]())
+        self.fitted = True
+
+    def transform(self, df: pd.DataFrame):
+        """
+        Apply normalization to each column
+
+        Parameters
+        ----------
+        df : pd.DataFrame
+            dataframe with columns to be normalized
+
+        Returns
+        -------
+        pd.DataFrame
+        """
+        if not self.fitted:
+            raise Exception("Not yet fitted.")
+
+        for norm in self.norm_cols:
+            if norm == 'zscore':
+                for col in self.norm_cols[norm]:
+                    df.loc[:, col] = (df[col].values - self.col_mean[col]) / self.col_std[col]
+            elif norm == 'log10':
+                for col in self.norm_cols[norm]:
+                    df.loc[:, col] = np.log10(df[col].values)
+            else:
+                for col in self.norm_cols[norm]:
+                    df.loc[:, col] = self.normalization[col].transform(df[col].values.reshape(-1, 1))
+        return df
+
+    def inverse_transform(self, df: pd.DataFrame):
+        """
+        Apply denormalization to each column
+
+        Parameters
+        ----------
+        df : pd.DataFrame
+            dataframe with columns to be denormalized
+
+        Returns
+        -------
+        pd.DataFrame
+        """
+        if not self.fitted:
+            raise Exception("Not yet fitted.")
+
+        for norm in self.norm_cols:
+            if norm == 'zscore':
+                for col in self.norm_cols[norm]:
+                    df.loc[:, col] = df[col].apply(lambda z: self.col_std[col] * z + self.col_mean[col])
+            elif norm == 'log10':
+                for col in self.norm_cols[norm]:
+                    df.loc[:, col] = df[col].apply(lambda x: 10 ** x)
+            else:
+                for col in self.norm_cols[norm]:
+                    df.loc[:, col] = self.normalization[col].inverse_transform(df[col].values.reshape(-1, 1))
+        return df
+
+    def fit_transform(self, df: pd.DataFrame):
+        """
+        Fits the normalizers and applies them to the dataframe
+
+        Parameters
+        ----------
+        df : pd.DataFrame
+            dataframe with columns to be normalized
+
+        Returns
+        -------
+        pd.DataFrame
+        """
+        self.fit(df)
+        return self.transform(df)
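A minimal sketch of the `Normalizer` above, assuming the template's `ml.preprocessing` package is importable; the dataframe and column names are illustrative:

```python
import pandas as pd
from ml.preprocessing.normalization import Normalizer

df = pd.DataFrame({"salary": [1000.0, 2000.0, 3000.0],
                   "age": [20.0, 30.0, 40.0]})

norm = Normalizer({"zscore": ["salary"], "min-max": ["age"]})
df_norm = norm.fit_transform(df)           # fits statistics/scalers, then transforms
df_back = norm.inverse_transform(df_norm)  # recovers the original scale
```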
diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/preprocessing/preprocessing.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/preprocessing/preprocessing.py
new file mode 100644
index 0000000..dea90fa
--- /dev/null
+++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/preprocessing/preprocessing.py
@@ -0,0 +1,141 @@
+import pandas as pd
+
+from ml.preprocessing.normalization import Normalizer
+from sklearn.preprocessing import OneHotEncoder
+from sklearn.model_selection import train_test_split
+from category_encoders import *
+import logging
+
+logging.getLogger().setLevel(logging.INFO)
+
+class Preprocessing:
+    """
+    Class to perform data preprocessing before training
+    """
+
+    def __init__(self, norm_cols=None, oneHot_cols=None):
+        """
+        Constructor
+
+        Parameters
+        ----------
+        norm_cols : dict
+            Maps the name of the normalization to be performed to the
+            list of columns it should be applied to
+            Ex: norm_cols = {'zscore': ['salary', 'price'],
+                             'min-max': ['height', 'age']}
+        oneHot_cols : array
+            Column names to be encoded with One Hot Encoding
+
+        Returns
+        -------
+        Preprocessing
+        """
+        self.norm_cols = norm_cols
+        self.oneHot_cols = oneHot_cols
+        self.ohe = OneHotEncoder(handle_unknown='ignore')
+
+    def clean_data(self, df: pd.DataFrame):
+        """
+        Perform data cleansing.
+
+        Parameters
+        ----------
+        df : pd.DataFrame
+            Dataframe to be processed
+
+        Returns
+        -------
+        pd.DataFrame
+            Cleaned Data Frame
+        """
+        logging.info("Cleaning data")
+        df_copy = df.copy()
+        df_copy['Pclass'] = df_copy.Pclass.astype('object')
+        df_copy = df_copy.dropna()
+        return df_copy
+
+    def categ_encoding_oneHot(self, df: pd.DataFrame, step_train=False):
+        """
+        Encode the categorical variables using One Hot Encoding
+
+        Parameters
+        ----------
+        df : pd.DataFrame
+            Dataframe to be processed
+        step_train : bool
+            if True, the fit function is executed
+
+        Returns
+        -------
+        pd.DataFrame
+            Encoded Data Frame
+        """
+        logging.info("One hot encoding")
+        df_copy = df.copy()
+
+        if step_train:
+            self.ohe.fit(df_copy[self.oneHot_cols])
+
+        # Densify the encoded matrix and label its columns so it can be
+        # joined back onto the dataframe
+        arr = self.ohe.transform(df_copy[self.oneHot_cols]).toarray()
+        encoded = pd.DataFrame(arr, index=df_copy.index,
+                               columns=self.ohe.get_feature_names(self.oneHot_cols))
+        df_copy = df_copy.join(encoded).drop(self.oneHot_cols, axis=1)
+        return df_copy
+
+    def normalize(self, df: pd.DataFrame, step_train=False):
+        """
+        Apply normalization to the selected columns
+
+        Parameters
+        ----------
+        df : pd.DataFrame
+            dataframe with columns to be normalized
+        step_train : bool
+            if True, the Normalizer is created and applied,
+            otherwise it is only applied
+
+        Returns
+        -------
+        pd.DataFrame
+            Normalized dataframe
+        """
+        logging.info("Normalizing")
+        if step_train:
+            self.norm = Normalizer(self.norm_cols)
+            df = self.norm.fit_transform(df)
+        else:
+            df = self.norm.transform(df.copy())
+        return df
+
+    def execute(self, df, step_train=False, val_size=0.2):
+        """
+        Apply all preprocessing steps to the Dataframe
+
+        Parameters
+        ----------
+        df : pd.DataFrame
+            dataframe with columns to be normalized
+        step_train : bool
+            if True, data is split into train and validation sets
+        val_size : float
+            Size of the validation dataset
+
+        Returns
+        -------
+        pd.DataFrame
+            - One preprocessed dataframe, if step_train is False
+            - Two preprocessed dataframes, if step_train is True
+        """
+        df = self.clean_data(df)
+        df = self.categ_encoding_oneHot(df, step_train)
+
+        if step_train:
+            logging.info("Divide train and test")
+            X_train, X_val = train_test_split(df, test_size=val_size, random_state=123)
+            X_train = self.normalize(X_train, step_train=True)
+            X_val = self.normalize(X_val, step_train=False)
+            logging.info(f"shape train {X_train.shape} val {X_val.shape}")
+            return X_train, X_val
+        else:
+            X = self.normalize(df, step_train=False)
+            logging.info(f"shape {X.shape}")
+            return X
+
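An illustrative sketch of the `Preprocessing` flow above, assuming Titanic-style data and the template's module layout; the path and column names are examples only:

```python
import pandas as pd
from ml.preprocessing.preprocessing import Preprocessing

df = pd.read_csv("../../data/raw/train.csv")  # illustrative path
p = Preprocessing(norm_cols={"min-max": ["Age"]}, oneHot_cols=["Pclass", "Sex"])

# Training flow: cleans, encodes, splits, and fits the normalizer
X_train, X_val = p.execute(df, step_train=True, val_size=0.2)

# Inference flow: reuses the fitted encoder and normalizer
X_new = p.execute(df, step_train=False)
```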
diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/preprocessing/text_vectorizer.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/preprocessing/text_vectorizer.py
new file mode 100644
index 0000000..674458e
--- /dev/null
+++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/preprocessing/text_vectorizer.py
@@ -0,0 +1,201 @@
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.feature_extraction.text import TfidfVectorizer
+import numpy as np
+import pandas as pd
+
+class TextVectorizer:
+
+    def __init__(self, vectorizer_cols: dict, word2vec=None):
+        """
+        Constructor
+
+        Parameters
+        ----------
+        vectorizer_cols : dict
+            Maps the name of the vectorization to be performed to the
+            list of columns it should be applied to
+            Ex: vectorizer_cols = {'embedding_median': ['col'],
+                                   'embedding_mean': ['col'],
+                                   'tf_idf': ['col'],
+                                   'bag_of_words': ['col']}
+        word2vec : optional
+            pre-trained word embedding model, required only for the
+            embedding_* and index vectorizations
+
+        Returns
+        -------
+        TextVectorizer
+        """
+        self.word2vec = word2vec
+        self.index_ini_fim = len(self.word2vec.index2word) if word2vec is not None else 0
+        self.vectorizer_cols = vectorizer_cols
+        self.vectorizer_vects = {'bag_of_words': self.bag_of_words,
+                                 'tf_idf': self.tf_idf_vect}
+        self.fitted = False
+
+    def fit(self, df: pd.DataFrame):
+        """
+        Generates the vectorizer object for each column. The text must be preprocessed.
+
+        Parameters
+        ----------
+        df : pd.DataFrame
+            dataframe with columns to be vectorized
+
+        Returns
+        -------
+        None
+        """
+        self.vectorizers_fitted = dict()
+        for vectorizer in self.vectorizer_cols:
+            if vectorizer in ['index', 'embedding_median', 'embedding_mean']:
+                continue
+            # Initialize outside the column loop so previously fitted
+            # columns are not overwritten
+            self.vectorizers_fitted[vectorizer] = {}
+            for col in self.vectorizer_cols[vectorizer]:
+                self.vectorizers_fitted[vectorizer][col] = self.vectorizer_vects[vectorizer](df[col].values)
+        self.fitted = True
+
+    def transform(self, df: pd.DataFrame):
+        """
+        Apply the vectorizer object to each column. The text must be preprocessed.
+
+        Parameters
+        ----------
+        df : pd.DataFrame
+            dataframe with columns to be vectorized
+
+        Returns
+        -------
+        pd.DataFrame
+        """
+        if not self.fitted:
+            raise Exception("Not yet fitted.")
+
+        for vectorizer in self.vectorizer_cols:
+            if vectorizer == 'index':
+                for col in self.vectorizer_cols[vectorizer]:
+                    df.loc[:, col+"_"+vectorizer] = df[col].apply(lambda x: self.embedding(x, 3))
+            elif vectorizer == 'embedding_median':
+                for col in self.vectorizer_cols[vectorizer]:
+                    df.loc[:, col+"_"+vectorizer] = df[col].apply(lambda x: self.embedding(x, 1))
+            elif vectorizer == 'embedding_mean':
+                for col in self.vectorizer_cols[vectorizer]:
+                    df.loc[:, col+"_"+vectorizer] = df[col].apply(lambda x: self.embedding(x, 2))
+            elif vectorizer in ('bag_of_words', 'tf_idf'):
+                for col in self.vectorizer_cols[vectorizer]:
+                    values = self.vectorizers_fitted[vectorizer][col].transform(df[col])
+                    df.loc[:, col+"_"+vectorizer] = pd.Series(values.toarray().tolist(), index=df.index)
+
+        return df
+    def embedding(self, X, typ_transform=1):
+        """
+        Apply the embedding to X. The text must be preprocessed.
+
+        Parameters
+        ----------
+        X : pd.Series
+            row to be encoded
+        typ_transform : int
+            type of transformation
+            1 - apply embedding median
+            2 - apply embedding mean
+            3 - apply index
+
+        Returns
+        -------
+        np.ndarray
+        """
+        if X is None or type(X) == float:
+            return None
+        vector = []
+        if typ_transform == 1:  # median
+            vector = np.median([self.word2vec[x] for x in X.split() if x in self.word2vec], axis=0)
+        elif typ_transform == 2:  # mean
+            vector = np.mean([self.word2vec[x] for x in X.split() if x in self.word2vec], axis=0)
+        elif typ_transform == 3:  # indexing
+            idx = self.word2vec.index2word
+            set_idx = set(idx)
+            indexes = [idx.index(token) for token in X.split() if token in set_idx]
+            indexes = [self.index_ini_fim] + indexes + [self.index_ini_fim]
+            vector = np.array(indexes, dtype=np.int64)
+        else:
+            vector = []
+        return vector
+
+    def bag_of_words(self, corpus):
+        """
+        Fit a bag-of-words (CountVectorizer) model
+
+        Parameters
+        ----------
+        corpus : array-like of str
+            texts used to fit the vectorizer
+
+        Returns
+        -------
+        model
+        """
+        vectorizer = CountVectorizer()
+        model = vectorizer.fit(corpus)
+        return model
+
+    def tf_idf_vect(self, corpus):
+        """
+        Fit a TF-IDF (TfidfVectorizer) model
+
+        Parameters
+        ----------
+        corpus : array-like of str
+            texts used to fit the vectorizer
+
+        Returns
+        -------
+        model
+        """
+        vectorizer = TfidfVectorizer()
+        model = vectorizer.fit(corpus)
+        return model
+
+    def inverse_transform(self, df: pd.DataFrame):
+        """
+        Apply the inverse_transform of the vectorizer to each column
+        Options: index, bag_of_words and tf_idf
+
+        Parameters
+        ----------
+        df : pd.DataFrame
+            dataframe with columns to be unvectorized
+
+        Returns
+        -------
+        pd.DataFrame
+        """
+        if not self.fitted:
+            raise Exception("Not yet fitted.")
+
+        for vectorizer in self.vectorizer_cols:
+            if vectorizer == 'index':
+                for col in self.vectorizer_cols[vectorizer]:
+                    df.loc[:, col+"_remove_"+vectorizer] = df[col].apply(lambda x: self.unvectorize(x))
+            elif vectorizer in ('bag_of_words', 'tf_idf'):
+                for col in self.vectorizer_cols[vectorizer]:
+                    # inverse_transform returns one array of tokens per row
+                    values = self.vectorizers_fitted[vectorizer][col].inverse_transform(df[col].tolist())
+                    df.loc[:, col+"_remove_"+vectorizer] = pd.Series([list(tokens) for tokens in values], index=df.index)
+
+        return df
+
+    def unvectorize(self, vector):
+        """
+        Map a vector of indexes back to its tokens
+
+        Parameters
+        ----------
+        vector : array
+            array with indexes
+
+        Returns
+        -------
+        str
+        """
+        idx = self.word2vec.index2word
+        tokens = [idx[index] for index in vector if index != self.index_ini_fim]
+        X = " ".join(tokens)
+        return X
\ No newline at end of file
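A hedged example of the `TextVectorizer` above, using only the fitted vectorizations (`tf_idf`, `bag_of_words`), which do not require a word2vec model; the column name is illustrative:

```python
import pandas as pd
from ml.preprocessing.text_vectorizer import TextVectorizer

df = pd.DataFrame({"text": ["first document", "second document here"]})

tv = TextVectorizer({"tf_idf": ["text"]})
tv.fit(df)
df_vect = tv.transform(df)  # adds a 'text_tf_idf' column with dense vectors
```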
diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/visualization/app-streamlit-titanict.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/visualization/app-streamlit-titanict.py
new file mode 100644
index 0000000..727aca8
--- /dev/null
+++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/visualization/app-streamlit-titanict.py
@@ -0,0 +1,84 @@
+import streamlit as st
+from vega_datasets import data
+import pandas as pd
+import altair as alt
+import sys
+from pandas_profiling import ProfileReport
+from streamlit_pandas_profiling import st_profile_report
+
+@st.cache
+def load_data():
+    dataframe = pd.read_csv("../../../data/raw/train.csv")
+    dataframe["Survived"] = dataframe["Survived"].replace([0, 1], ["Died", "Survived"])
+    return dataframe
+
+def instructions():
+
+    st.markdown(
+        """
+        Sample Streamlit page using the Titanic dataset.
+        Streamlit is handy for presenting results and generating a web page.
+
+
+        ### Questions?
+
+        Streamlit community -> https://discuss.streamlit.io
+        """)
+
+def dataset_analysis(df):
+    survived = ["All"]
+    survived.extend(df["Survived"].unique())
+
+    selected = st.selectbox("Survived:", survived)
+    if selected == "All":
+        st.write('## Dataset Titanic', df)
+    else:
+        st.write('## Dataset Titanic', df[df["Survived"] == selected])
+
+    if st.checkbox("Graphical Display", False):
+        st.subheader("Dataset Graphical Display")
+
+        st.altair_chart(alt.Chart(df).mark_circle().encode(
+            alt.X('Age', scale=alt.Scale(zero=False)),
+            alt.Y('Fare', scale=alt.Scale(zero=False, padding=1)),
+            color='Survived',
+            size='Pclass',
+            tooltip=['Age', 'Survived', 'Sex', 'Pclass'],
+        ).interactive(), use_container_width=True)
+    if st.checkbox("Show Summary", False):
+        st.write(df.describe())
+
+
+def profiling_analysis(df):
+    try:
+        pr = ProfileReport(df, explorative=True)
+        st.title("Pandas Profiling in Streamlit")
+        st.write(df)
+        st_profile_report(pr)
+    except Exception:
+        st.title("Error - the Pandas Profiling report could not be generated")
+
+
+def main():
+    st.title("Titanic Dataset")
+
+    df = load_data()
+
+    st.sidebar.title("What to do")
+    menu = ["Instructions", "DataSet Exploration - Profiling", "DataSet Exploration - General"]
+    app_mode = st.sidebar.selectbox("Select an option:",
+                                    menu)
+    if app_mode == menu[0]:
+        st.sidebar.success('Next "'+menu[1]+'".')
+        instructions()
+    elif app_mode == menu[1]:
+        st.sidebar.success('Next "'+menu[2]+'".')
+        profiling_analysis(df)
+    elif app_mode == menu[2]:
+        # Last menu option: nothing further to suggest
+        dataset_analysis(df)
+
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/visualization/visualization.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/visualization/visualization.py
new file mode 100644
index 0000000..a516e9d
--- /dev/null
+++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/visualization/visualization.py
@@ -0,0 +1,428 @@
+import pandas as pd
+import seaborn as sns
+import matplotlib.pyplot as plt
+import altair as alt
+import numpy as np
+from yellowbrick.target import FeatureCorrelation
+
+class Visualization:
+
+    @staticmethod
+    def general_analysis(df):
+        """
+        General exploratory analysis of the dataframe (placeholder)
+
+        Parameters
+        ----------
+        df : pd.DataFrame
+            dataframe to be analyzed
+
+        Returns
+        -------
+        None
+        """
+        pass
+
+    @staticmethod
+    def missing_analysis(df):
+        """
+        Plots the percentage of missing values in each column of the DataFrame
+
+        Parameters
+        ----------
+        df : pd.DataFrame
+            dataframe whose missing values will be analyzed
+
+        Returns
+        -------
+        None
+        """
+        df_isnull = (df.isnull().sum() / len(df)) * 100
+        df_isnull = df_isnull.drop(df_isnull[df_isnull == 0].index).sort_values(ascending=False)
+        missing_data = pd.DataFrame({'Percent Missing': df_isnull})
+        missing_data.plot.bar()
+
+    @staticmethod
+    def count_values(df, feature, title):
+        """
+        Plot the count of distinct values of a feature
+
+        Parameters
+        ----------
+        df : pd.DataFrame
+            dataframe with the values
+        feature : str
+            name of the feature to be counted
+        title : str
+            chart title
+
+        Returns
+        -------
+        None
+        """
+        g = sns.catplot(feature, data=df, aspect=4, kind="count")
+        g.set_xticklabels(rotation=90)
+        g = plt.title(title)
+
+    @staticmethod
+    def regression_analysis(y_true, y_pred, path=None):
+        """
+        Residual analysis of the true and predicted values of a regression model
+
+        Parameters
+        ----------
+        y_true : array
+            true values
+        y_pred : array
+            predicted values
+        path : str
+            path where the graphics will be saved
+
+        Returns
+        -------
+        None
+        """
+        residual = y_true - y_pred
+        print("Histogram")
+        Visualization.histogram(residual, "Residual")
+        print("Scatter")
+        Visualization.scatter(y_pred, residual, "pred", "residual", path=path)
+        print("Scatter")
+        Visualization.scatter(y_true, y_pred, "y_test", "pred", path=path)
+
+    @staticmethod
+    def histogram(values, title, fig_size=(4, 3), path=None):
+        """
+        Histogram plot of a set of values
+
+        Parameters
+        ----------
+        values : array
+            values
+        title : str
+            title
+        fig_size : tuple
+            figure size
+        path : str
+            path where the graphics will be saved
+
+        Returns
+        -------
+        None
+        """
+        plt.clf()
+        f, ax = plt.subplots(1, figsize=fig_size)
+        ax.hist(values, bins=60)
+        ax.set_title(title)
+        f.tight_layout()
+        if path is not None:
+            f.savefig(path+'/hist_'+title+'.png')
+
+
+    @staticmethod
+    def correlation_analysis(df, fig_size=(5, 4), path=None):
+        """
+        Correlation of the variables in the dataframe
+
+        Parameters
+        ----------
+        df : pd.DataFrame
+            dataframe
+        fig_size : tuple
+            figure size
+        path : str
+            path where the graphics will be saved
+
+        Returns
+        -------
+        None
+        """
+        plt.clf()
+        f, ax = plt.subplots(1, figsize=fig_size)
+        corr = round(df.corr(), 4)
+        sns.heatmap(corr,
+                    xticklabels=corr.columns.values,
+                    yticklabels=corr.columns.values, ax=ax)
+        f.tight_layout()
+        if path is not None:
+            f.savefig(path+'/correlation.png')
+
+    @staticmethod
+    def features_correlation(df, cols, target, fig_size=(6, 6), path=None):
+        """
+        Correlation of the variables in the dataframe with respect to the target
+
+        Parameters
+        ----------
+        df : pd.DataFrame
+            dataframe with the data to calculate the correlation
+        cols : array
+            columns to be correlated with the target
+        target : str
+            target name
+        fig_size : tuple
+            figure size
+        path : str
+            path where the graphics will be saved
+
+        Returns
+        -------
+        None
+        """
+        f, ax = plt.subplots(1, figsize=fig_size)
+        ax.set_xlabel("Feature Correlation")
+        visualizer = FeatureCorrelation(labels=list(cols))
+        visualizer.fit(df[cols], df[target])
+        f.tight_layout()
+        if path is not None:
+            f.savefig(path+'/features_correlation.png')
+
+    @staticmethod
+    def scatter(x, y, xlabel, ylabel, fig_size=(5, 4), groups=None, group_color=None, path=None):
+        """
+        Plot scatter
+
+        Parameters
+        ----------
+        x : array
+            list of x-axis values
+        y : array
+            list of y-axis values
+        xlabel : str
+            label x
+        ylabel : str
+            label y
+        fig_size : tuple
+            figure size
+        groups : array
+            group list
+        group_color : dict
+            group colors
+        path : str
+            path where the graphics will be saved
+
+        Returns
+        -------
+        None
+        """
+        f, ax = plt.subplots(1, figsize=fig_size)
+        sns.scatterplot(x, y, hue=groups, palette=group_color, legend="full", ax=ax)
+        ax.set_xlabel(xlabel)
+        ax.set_ylabel(ylabel)
+        f.tight_layout()
+        if path is not None:
+            f.savefig(path+'/scatter_'+xlabel+'_'+ylabel+'.png')
+    @staticmethod
+    def bar(x, y, xlabel, ylabel, fig_size=(5, 4), est=np.mean, groups=None, group_color=None, path=None):
+        """
+        Plot bar
+
+        Parameters
+        ----------
+        x : array
+            list of x-axis values
+        y : array
+            list of y-axis values
+        xlabel : str
+            label x
+        ylabel : str
+            label y
+        fig_size : tuple
+            figure size
+        est : callable
+            numpy function for aggregating the bars
+        groups : array
+            group list
+        group_color : dict
+            group colors
+        path : str
+            path where the graphics will be saved
+
+        Returns
+        -------
+        None
+        """
+        f, ax = plt.subplots(1, figsize=fig_size)
+        sns.barplot(x, y, ax=ax, hue=groups, estimator=est, color=group_color)
+        ax.set_xlabel(xlabel)
+        ax.set_ylabel(ylabel)
+        f.tight_layout()
+        if path is not None:
+            f.savefig(path+'/bar_'+xlabel+'_'+ylabel+'.png')
+
+    @staticmethod
+    def line(x, y, xlabel, ylabel, fig_size=(5, 4), est=np.mean, groups=None, group_color=None, path=None):
+        """
+        Plot line
+
+        Parameters
+        ----------
+        x : array
+            list of x-axis values
+        y : array
+            list of y-axis values
+        xlabel : str
+            label x
+        ylabel : str
+            label y
+        fig_size : tuple
+            figure size
+        est : callable
+            numpy function for aggregating the values
+        groups : array
+            group list
+        group_color : dict
+            group colors
+        path : str
+            path where the graphics will be saved
+
+        Returns
+        -------
+        None
+        """
+        f, ax = plt.subplots(1, figsize=fig_size)
+        sns.lineplot(x, y, hue=groups, estimator=est, color=group_color, ax=ax)
+        ax.set_xlabel(xlabel)
+        ax.set_ylabel(ylabel)
+        f.tight_layout()
+        if path is not None:
+            f.savefig(path+'/line_'+xlabel+'_'+ylabel+'.png')
+
+    @staticmethod
+    def box_plot(x, y, xlabel, ylabel, fig_size=(5, 4), path=None):
+        """
+        Plot box plot
+
+        Parameters
+        ----------
+        x : array
+            list of x-axis values
+        y : array
+            list of y-axis values
+        xlabel : str
+            label x
+        ylabel : str
+            label y
+        fig_size : tuple
+            figure size
+        path : str
+            path where the graphics will be saved
+
+        Returns
+        -------
+        None
+        """
+        f, ax = plt.subplots(1, figsize=fig_size)
+        sns.boxplot(x=x, y=y, ax=ax)
+        ax.set_xlabel(xlabel)
+        ax.set_ylabel(ylabel)
+        f.tight_layout()
+        if path is not None:
+            f.savefig(path+'/boxplot_'+xlabel+'_'+ylabel+'.png')
+    @staticmethod
+    def scatter_interactive(df, col_name_x, col_name_y, xlabel, ylabel, hover, fig_size=(400, 300), **kwargs):
+        """
+        Interactive scatter plot (Altair)
+
+        Parameters
+        ----------
+        df : pd.DataFrame
+            dataframe
+        col_name_x : str
+            column name for x
+        col_name_y : str
+            column name for y
+        xlabel : str
+            label x
+        ylabel : str
+            label y
+        hover : list
+            columns shown in the tooltip on mouse hover
+        fig_size : tuple
+            figure size
+        **kwargs
+            additional chart properties forwarded to the encoding.
+            For example, pass color="blue" to set a fixed color
+
+        Returns
+        -------
+        None
+        """
+        alt.Chart(df, width=fig_size[0], height=fig_size[1]).mark_circle().encode(
+            alt.X(col_name_x, title=xlabel),
+            alt.Y(col_name_y, title=ylabel),
+            tooltip=hover,
+            **kwargs
+        ).interactive().display()
+
+    @staticmethod
+    def bar_interactive(df, col_name_x, col_name_y, xlabel, ylabel, hover, fig_size=(400, 300), **kwargs):
+        """
+        Interactive bar plot (Altair)
+
+        Parameters
+        ----------
+        df : pd.DataFrame
+            dataframe
+        col_name_x : str
+            column name for x
+        col_name_y : str
+            column name for y
+        xlabel : str
+            label x
+        ylabel : str
+            label y
+        hover : list
+            columns shown in the tooltip on mouse hover
+        fig_size : tuple
+            figure size
+        **kwargs
+            additional chart properties forwarded to the encoding.
+            For example, pass color="blue" to set a fixed color
+
+        Returns
+        -------
+        None
+        """
+        alt.Chart(df, width=fig_size[0], height=fig_size[1]).mark_bar().encode(
+            alt.X(col_name_x, title=xlabel),
+            alt.Y(col_name_y, title=ylabel),
+            tooltip=hover,
+            **kwargs
+        ).interactive().display()
+
+    @staticmethod
+    def line_interactive(df, col_name_x, col_name_y, xlabel, ylabel, hover, fig_size=(400, 300), **kwargs):
+        """
+        Interactive line plot (Altair)
+
+        Parameters
+        ----------
+        df : pd.DataFrame
+            dataframe
+        col_name_x : str
+            column name for x
+        col_name_y : str
+            column name for y
+        xlabel : str
+            label x
+        ylabel : str
+            label y
+        hover : list
+            columns shown in the tooltip on mouse hover
+        fig_size : tuple
+            figure size
+        **kwargs
+            additional chart properties forwarded to the encoding.
+            For example, pass color="blue" to set a fixed color
+
+        Returns
+        -------
+        None
+        """
+        alt.Chart(df, width=fig_size[0], height=fig_size[1]).mark_line().encode(
+            alt.X(col_name_x, title=xlabel),
+            alt.Y(col_name_y, title=ylabel),
+            tooltip=hover,
+            **kwargs
+        ).interactive().display()
+
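A short sketch of how the static helpers in `Visualization` above can be called; the path and column names are illustrative:

```python
import pandas as pd
from ml.visualization.visualization import Visualization

df = pd.read_csv("../../data/raw/train.csv")  # illustrative path

Visualization.missing_analysis(df)                           # bar chart of % missing per column
Visualization.histogram(df["Age"].dropna(), "Age", path="../../output")  # saves hist_Age.png
Visualization.correlation_analysis(df, path="../../output")  # saves correlation.png
```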
diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/tests/.ipynb_checkpoints/README-checkpoint.md b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/tests/.ipynb_checkpoints/README-checkpoint.md
new file mode 100644
index 0000000..ae81f9e
--- /dev/null
+++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/tests/.ipynb_checkpoints/README-checkpoint.md
@@ -0,0 +1,41 @@
+# Hermione test files
+
+In this folder, you can develop unit tests for your Data Science project.
+
+Unit testing is a regular practice in software development but, unfortunately, not so common in Data Science projects. To ensure your code quality and that the project is running flawlessly at all times, it is extremely important that you write unit tests, especially if you are not working alone but in a Data Science team.
+
+The tests in the implemented example project check, for instance, that the project has its minimum directory structure, that your dataset is correctly imported, that the dataset has no missing values, and that the columns expected after preprocessing are indeed there.
+
+There are no "written in stone" rules for good testing in Data Science. You just have to figure out what tests are best for you.
+
+## How to run the tests
+
+When working locally, you should run your tests before pushing to a remote repository or sharing your code with others. To do that, **ensure that you are inside the `tests` folder**.
+
+```bash
+cd src/tests
+```
+
+Then, run the `pytest` command.
+
+```bash
+pytest
+```
+
+If you want a coverage report, run:
+
+```bash
+coverage run -m pytest
+coverage report -m
+```
+
+Both the `coverage` and `pytest` libraries are already in the `requirements.txt` file.
+
+## Include tests on CI/CD files
+
+If you are working with a remote repository, it is a good practice to add a CI/CD `.yml` file. For more information, visit
+
+- [CI/CD for Machine Learning](https://www.infoq.com/presentations/ci-cd-ml/)
+- [CI/CD for Machine Learning & AI](https://blog.paperspace.com/ci-cd-for-machine-learning-ai/)
+- [Accelerate MLOps: using CI/CD with machine learning models
+](https://algorithmia.com/blog/accelerate-mlops-using-ci-cd-with-machine-learning-models)
diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/tests/.ipynb_checkpoints/test_project-checkpoint.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/tests/.ipynb_checkpoints/test_project-checkpoint.py
new file mode 100644
index 0000000..2d6936f
--- /dev/null
+++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/tests/.ipynb_checkpoints/test_project-checkpoint.py
@@ -0,0 +1,54 @@
+import os
+import pytest
+import pandas as pd
+import sys
+sys.path.append('..')
+
+@pytest.fixture(scope='module')
+def read_data():
+    from ml.data_source.spreadsheet import Spreadsheet
+    yield Spreadsheet().get_data('../../data/raw/train.csv')
+
+@pytest.fixture(scope='module')
+def cleaned_data(read_data):
+    from ml.preprocessing.preprocessing import Preprocessing
+    p = Preprocessing()
+    yield p.clean_data(read_data)
+
+def test_tree():
+    """
+    Test if the project has a good minimum structure
+    """
+    assert os.path.exists(os.path.join('..', '..', 'data', 'raw'))
+    assert os.path.exists(os.path.join('..', '..', 'output'))
+    assert os.path.exists(os.path.join('..', '..', 'src', 'api'))
+    assert os.path.exists(os.path.join('..', '..', 'src', 'config'))
+    assert os.path.exists(os.path.join('..', '..', 'src', 'ml', 'data_source'))
+    assert os.path.exists(os.path.join('..', '..', 'src', 'ml', 'model'))
+    assert os.path.exists(os.path.join('..', '..', 'src', 'ml', 'notebooks'))
+    assert os.path.exists(os.path.join('..', '..', 'src', 'ml', 'preprocessing'))
+    assert os.path.exists(os.path.join('..', '..', 'src', 'tests'))
+
+def test_spreadsheet(read_data):
+    """
+    Test that the spreadsheet is imported correctly
+    """
+    assert read_data.shape[0] > 1
+
+
+def test_clean_data(cleaned_data):
+    """
+    Test that the df is cleaned correctly
+    """
+    assert cleaned_data.Pclass.dtype == 'object'
+    assert pd.isnull(cleaned_data.Age).sum() == 0
+
+def test_categ_encoding(cleaned_data):
+    """
+    Test that one-hot encoding produces the expected columns
+    """
+    from ml.preprocessing.preprocessing import Preprocessing
+    p = Preprocessing(oneHot_cols=['Pclass', 'Sex'])
+    df = p.categ_encoding_oneHot(cleaned_data, step_train=True)
+    names = ['Survived', 'Age', 'Pclass_1', 'Pclass_2', 'Pclass_3', 'Sex_female', 'Sex_male']
+    assert all(name in df.columns for name in names)
\ No newline at end of file
diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/tests/README.md b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/tests/README.md
new file mode 100644
index 0000000..ae81f9e
--- /dev/null
+++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/tests/README.md
@@ -0,0 +1,41 @@
+# Hermione test files
+
+In this folder, you can develop unit tests for your Data Science project.
+
+Unit testing is a regular practice in software development but, unfortunately, not so common in Data Science projects. To ensure your code quality and that the project is running flawlessly at all times, it is extremely important that you write unit tests, especially if you are not working alone but in a Data Science team.
+
+The tests in the implemented example project check, for instance, that the project has its minimum directory structure, that your dataset is correctly imported, that the dataset has no missing values, and that the columns expected after preprocessing are indeed there.
+
+There are no "written in stone" rules for good testing in Data Science. You just have to figure out what tests are best for you.
+
+## How to run the tests
+
+When working locally, you should run your tests before pushing to a remote repository or sharing your code with others. To do that, **ensure that you are inside the `tests` folder**.
+
+```bash
+cd src/tests
+```
+
+Then, run the `pytest` command.
+
+```bash
+pytest
+```
+
+If you want a coverage report, run:
+
+```bash
+coverage run -m pytest
+coverage report -m
+```
+
+Both the `coverage` and `pytest` libraries are already in the `requirements.txt` file.
+
+## Include tests on CI/CD files
+
+If you are working with a remote repository, it is a good practice to add a CI/CD `.yml` file. For more information, visit
+
+- [CI/CD for Machine Learning](https://www.infoq.com/presentations/ci-cd-ml/)
+- [CI/CD for Machine Learning & AI](https://blog.paperspace.com/ci-cd-for-machine-learning-ai/)
+- [Accelerate MLOps: using CI/CD with machine learning models
+](https://algorithmia.com/blog/accelerate-mlops-using-ci-cd-with-machine-learning-models)
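In the same spirit as the tests above, a small additional test sketch, hypothetical and with an illustrative CSV path, that guards against duplicated rows in the raw data:

```python
import pandas as pd

def test_no_duplicate_rows():
    # Illustrative path, mirroring the fixtures in the test module
    df = pd.read_csv('../../data/raw/train.csv')
    assert df.duplicated().sum() == 0
```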
diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/tests/test_project.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/tests/test_project.py
new file mode 100644
index 0000000..2d6936f
--- /dev/null
+++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/tests/test_project.py
@@ -0,0 +1,54 @@
+import os
+import pytest
+import pandas as pd
+import sys
+sys.path.append('..')
+
+@pytest.fixture(scope='module')
+def read_data():
+    from ml.data_source.spreadsheet import Spreadsheet
+    yield Spreadsheet().get_data('../../data/raw/train.csv')
+
+@pytest.fixture(scope='module')
+def cleaned_data(read_data):
+    from ml.preprocessing.preprocessing import Preprocessing
+    p = Preprocessing()
+    yield p.clean_data(read_data)
+
+def test_tree():
+    """
+    Test if the project has a good minimum structure
+    """
+    assert os.path.exists(os.path.join('..', '..', 'data', 'raw'))
+    assert os.path.exists(os.path.join('..', '..', 'output'))
+    assert os.path.exists(os.path.join('..', '..', 'src', 'api'))
+    assert os.path.exists(os.path.join('..', '..', 'src', 'config'))
+    assert os.path.exists(os.path.join('..', '..', 'src', 'ml', 'data_source'))
+    assert os.path.exists(os.path.join('..', '..', 'src', 'ml', 'model'))
+    assert os.path.exists(os.path.join('..', '..', 'src', 'ml', 'notebooks'))
+    assert os.path.exists(os.path.join('..', '..', 'src', 'ml', 'preprocessing'))
+    assert os.path.exists(os.path.join('..', '..', 'src', 'tests'))
+
+def test_spreadsheet(read_data):
+    """
+    Test that the spreadsheet is imported correctly
+    """
+    assert read_data.shape[0] > 1
+
+
+def test_clean_data(cleaned_data):
+    """
+    Test that the df is cleaned correctly
+    """
+    assert cleaned_data.Pclass.dtype == 'object'
+    assert pd.isnull(cleaned_data.Age).sum() == 0
+
+def test_categ_encoding(cleaned_data):
+    """
+    Test that one-hot encoding produces the expected columns
+    """
+    from ml.preprocessing.preprocessing import Preprocessing
+    p = Preprocessing(oneHot_cols=['Pclass', 'Sex'])
+    df = p.categ_encoding_oneHot(cleaned_data, step_train=True)
+    names = ['Survived', 'Age', 'Pclass_1', 'Pclass_2', 'Pclass_3', 'Sex_female', 'Sex_male']
+    assert all(name in df.columns for name in names)
\ No newline at end of file
diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/train/.ipynb_checkpoints/Dockerfile-checkpoint b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/train/.ipynb_checkpoints/Dockerfile-checkpoint
new file mode 100644
index 0000000..207b1f7
--- /dev/null
+++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/train/.ipynb_checkpoints/Dockerfile-checkpoint
@@ -0,0 +1,66 @@
+FROM ubuntu:latest
+# Set a docker label to advertise multi-model support on the container
+LABEL com.amazonaws.sagemaker.capabilities.multi-models=false
+# Set a docker label to enable the container to use the SAGEMAKER_BIND_TO_PORT environment variable if present
+LABEL com.amazonaws.sagemaker.capabilities.accept-bind-to-port=true
+
+# No question/dialog is asked during apt-get install
+ARG DEBIAN_FRONTEND=noninteractive
+
+# Setting the Timezone Environment Variable
+ENV TZ=America/Sao_Paulo
+
+# Install Ubuntu libraries
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential \
+    gcc \
+    python3.7 \
+    python3-dev \
+    python3-pip \
+    ca-certificates \
+    git \
+    curl \
+    nginx \
+    openjdk-8-jre-headless \
+    wget &&\
+    rm -rf /var/lib/apt/lists/*
+
+# Create folders for code
+RUN mkdir /opt/ml && \
+    mkdir /opt/ml/output && \
+    mkdir /opt/ml/code && \
+    mkdir /opt/ml/code/train && \
+    mkdir /opt/ml/code/src
+
+# Install requirements
+COPY requirements.txt /opt/ml/code/src/requirements.txt
+RUN pip3 install --no-cache -r /opt/ml/code/src/requirements.txt
+
+# Install the SageMaker Training Toolkit
+RUN pip3 install --no-cache \
+    boto3 \
+    sagemaker \
+    sagemaker-training
+
+# Copy code folders
+COPY src/config/ /opt/ml/code/src/config/
+COPY src/ml/ /opt/ml/code/src/ml/
+COPY src/util.py /opt/ml/code/src/util.py
+COPY train/train.py /opt/ml/code/train.py
+
+# Set the working directory
+WORKDIR /opt/ml/code
+
+# Environment variables
+ENV PYTHONDONTWRITEBYTECODE=1 \
+    PYTHONUNBUFFERED=1 \
+    PYTHONIOENCODING=UTF-8 \
+    LANG=C.UTF-8 \
+    LC_ALL=C.UTF-8
+
+# Setting PYTHONPATH to access the copied code
+ENV PYTHONPATH="/opt/ml/code:${PYTHONPATH}"
+
+# Make train.py executable and set it as the SageMaker entrypoint
+RUN chmod +x train.py
+ENV SAGEMAKER_PROGRAM train.py
diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/train/.ipynb_checkpoints/train-checkpoint.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/train/.ipynb_checkpoints/train-checkpoint.py
new file mode 100644
index 0000000..bc7b4cd
--- /dev/null
+++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/train/.ipynb_checkpoints/train-checkpoint.py
@@ -0,0 +1,84 @@
+import sys
+sys.path.append("src/")
+
+import os
+import traceback
+import pandas as pd
+import logging
+from sklearn.metrics import *
+from ml.model.trainer import TrainerSklearn
+from sklearn.ensemble import RandomForestClassifier
+from util import *
+
+logging.getLogger().setLevel('INFO')
+
+# Paths to access the datasets and save the model
+prefix = '/opt/ml/'
+
+training_path = os.environ["SM_CHANNEL_TRAIN"]
+val_path = os.environ["SM_CHANNEL_VALIDATION"]
+
+error_path = os.path.join(prefix, 'output')
+model_path = os.environ['SM_MODEL_DIR']
+
+def train():
+    """
+    Execute the training step
+    """
+    logging.info('Starting the training')
+    try:
+        logging.info('Reading the inputs')
+        # Take the set of train files and read them all into a single pandas dataframe
+        input_files = [os.path.join(training_path, file) for file in os.listdir(training_path)]
+        if len(input_files) == 0:
+            raise ValueError(('There are no files in {}.\n' +
+                              'This usually indicates that the channel ({}) was incorrectly specified,\n' +
+                              'the data specification in S3 was incorrectly specified or the role specified\n' +
+                              'does not have permission to access the data.').format(training_path, 'train'))
+        raw_data = [pd.read_csv(file) for file in input_files]
+        train = pd.concat(raw_data)
+
+        # Take the set of val files and read them all into a single pandas dataframe
+        input_files = [os.path.join(val_path, file) for file in os.listdir(val_path)]
+        if len(input_files) == 0:
+            raise ValueError(('There are no files in {}.\n' +
+                              'This usually indicates that the channel ({}) was incorrectly specified,\n' +
+                              'the data specification in S3 was incorrectly specified or the role specified\n' +
+                              'does not have permission to access the data.').format(val_path, 'validation'))
+        raw_data = [pd.read_csv(file) for file in input_files]
+        val = pd.concat(raw_data)
+
+        # Define the target and columns to be used in the train
+        target = "Survived"
+        columns = train.columns.drop(target)
+
+        logging.info("Training the model")
+        model = TrainerSklearn().train(train, val, target, classification=True,
+                                       algorithm=RandomForestClassifier,
+                                       columns=columns)
+
+        # Save the model and metrics
+        logging.info("Saving")
+        model.save_model(os.path.join(model_path, 'model.pkl'))
+        metrics = model.artifacts["metrics"]
+        logging.info(f"accuracy={metrics['accuracy']}; f1={metrics['f1']}; precision={metrics['precision']}; recall={metrics['recall']};")
+        pd.DataFrame(model.artifacts["metrics"].items(), columns=['Metric', 'Value']).to_csv(os.path.join(model_path, 'metrics.csv'), index=False)
+        logging.info('Training complete.')
+
+    except Exception as e:
+        # Write out an error file. This will be returned as the failureReason in the
+        # DescribeTrainingJob result.
+        trc = traceback.format_exc()
+        with open(os.path.join(error_path, 'failure'), 'w') as s:
+            s.write('Exception during training: ' + str(e) + '\n' + trc)
+        # Logging the exception also surfaces it in the training job logs.
+        logging.error('Exception during training: ' + str(e) + '\n' + trc)
+        # A non-zero exit code causes the training job to be marked as Failed.
+        sys.exit(255)
+
+if __name__ == '__main__':
+    train()
+
+    # A zero exit code causes the job to be marked as Succeeded.
+    sys.exit(0)
diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/train/Dockerfile b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/train/Dockerfile
new file mode 100644
index 0000000..207b1f7
--- /dev/null
+++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/train/Dockerfile
@@ -0,0 +1,66 @@
+FROM ubuntu:latest
+# Set a docker label to advertise multi-model support on the container
+LABEL com.amazonaws.sagemaker.capabilities.multi-models=false
+# Set a docker label to enable the container to use the SAGEMAKER_BIND_TO_PORT environment variable if present
+LABEL com.amazonaws.sagemaker.capabilities.accept-bind-to-port=true
+
+# No question/dialog is asked during apt-get install
+ARG DEBIAN_FRONTEND=noninteractive
+
+# Setting the Timezone Environment Variable
+ENV TZ=America/Sao_Paulo
+
+# Install Ubuntu libraries
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential \
+    gcc \
+    python3.7 \
+    python3-dev \
+    python3-pip \
+    ca-certificates \
+    git \
+    curl \
+    nginx \
+    openjdk-8-jre-headless \
+    wget &&\
+    rm -rf /var/lib/apt/lists/*
+
+# Create folders for code
+RUN mkdir /opt/ml && \
+    mkdir /opt/ml/output && \
+    mkdir /opt/ml/code && \
+    mkdir /opt/ml/code/train && \
+    mkdir /opt/ml/code/src
+
+# Install requirements
+COPY requirements.txt /opt/ml/code/src/requirements.txt
+RUN pip3 install --no-cache -r /opt/ml/code/src/requirements.txt
+
+# Install the SageMaker Training Toolkit
+RUN pip3 install --no-cache \
+    boto3 \
+    sagemaker \
+    sagemaker-training
+
+# Copy code folders
+COPY src/config/ /opt/ml/code/src/config/
+COPY src/ml/ /opt/ml/code/src/ml/
+COPY src/util.py /opt/ml/code/src/util.py
+COPY train/train.py /opt/ml/code/train.py
+
+# Set the working directory
+WORKDIR /opt/ml/code
+
+# Environment variables
+ENV PYTHONDONTWRITEBYTECODE=1 \
+    PYTHONUNBUFFERED=1 \
+    PYTHONIOENCODING=UTF-8 \
+    LANG=C.UTF-8 \
+    LC_ALL=C.UTF-8
+
+# Setting PYTHONPATH to access the copied code
+ENV PYTHONPATH="/opt/ml/code:${PYTHONPATH}"
+
+# Make train.py executable and set it as the SageMaker entrypoint
+RUN chmod +x train.py
+ENV SAGEMAKER_PROGRAM train.py
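The training script below reads its input and output locations from environment variables that SageMaker injects into the container (`SM_CHANNEL_TRAIN`, `SM_CHANNEL_VALIDATION`, `SM_MODEL_DIR`). A minimal sketch of a local dry run, assuming the channel directories already hold CSV files; the paths are illustrative:

```python
import os
import subprocess

# Illustrative local paths standing in for the SageMaker-managed channels
os.environ["SM_CHANNEL_TRAIN"] = "/tmp/channels/train"
os.environ["SM_CHANNEL_VALIDATION"] = "/tmp/channels/validation"
os.environ["SM_MODEL_DIR"] = "/tmp/model"

# Invoke the entrypoint roughly the way the training toolkit would
subprocess.run(["python3", "train.py"], check=True)
```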
diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/train/train.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/train/train.py
new file mode 100644
index 0000000..bc7b4cd
--- /dev/null
+++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/train/train.py
@@ -0,0 +1,84 @@
+import sys
+sys.path.append("src/")
+
+import os
+import traceback
+import pandas as pd
+import logging
+from sklearn.metrics import *
+from ml.model.trainer import TrainerSklearn
+from sklearn.ensemble import RandomForestClassifier
+from util import *
+
+logging.getLogger().setLevel('INFO')
+
+# Paths to access the datasets and save the model
+prefix = '/opt/ml/'
+
+training_path = os.environ["SM_CHANNEL_TRAIN"]
+val_path = os.environ["SM_CHANNEL_VALIDATION"]
+
+error_path = os.path.join(prefix, 'output')
+model_path = os.environ['SM_MODEL_DIR']
+
+def train():
+    """
+    Execute the training step
+    """
+    logging.info('Starting the training')
+    try:
+        logging.info('Reading the inputs')
+        # Take the set of train files and read them all into a single pandas dataframe
+        input_files = [os.path.join(training_path, file) for file in os.listdir(training_path)]
+        if len(input_files) == 0:
+            raise ValueError(('There are no files in {}.\n' +
+                              'This usually indicates that the channel ({}) was incorrectly specified,\n' +
+                              'the data specification in S3 was incorrectly specified or the role specified\n' +
+                              'does not have permission to access the data.').format(training_path, 'train'))
+        raw_data = [pd.read_csv(file) for file in input_files]
+        train = pd.concat(raw_data)
+
+        # Take the set of val files and read them all into a single pandas dataframe
+        input_files = [os.path.join(val_path, file) for file in os.listdir(val_path)]
+        if len(input_files) == 0:
+            raise ValueError(('There are no files in {}.\n' +
+                              'This usually indicates that the channel ({}) was incorrectly specified,\n' +
+                              'the data specification in S3 was incorrectly specified or the role specified\n' +
+                              'does not have permission to access the data.').format(val_path, 'validation'))
+        raw_data = [pd.read_csv(file) for file in input_files]
+        val = pd.concat(raw_data)
+
+        # Define the target and columns to be used in the train
+        target = "Survived"
+        columns = train.columns.drop(target)
+
+        logging.info("Training the model")
+        model = TrainerSklearn().train(train, val, target, classification=True,
+                                       algorithm=RandomForestClassifier,
+                                       columns=columns)
+
+        # Save the model and metrics
+        logging.info("Saving")
+        model.save_model(os.path.join(model_path, 'model.pkl'))
+        metrics = model.artifacts["metrics"]
+        logging.info(f"accuracy={metrics['accuracy']}; f1={metrics['f1']}; precision={metrics['precision']}; recall={metrics['recall']};")
+        pd.DataFrame(model.artifacts["metrics"].items(), columns=['Metric', 'Value']).to_csv(os.path.join(model_path, 'metrics.csv'), index=False)
+        logging.info('Training complete.')
+
+    except Exception as e:
+        # Write out an error file. This will be returned as the failureReason in the
+        # DescribeTrainingJob result.
+        trc = traceback.format_exc()
+        with open(os.path.join(error_path, 'failure'), 'w') as s:
+            s.write('Exception during training: ' + str(e) + '\n' + trc)
+        # Logging the exception also surfaces it in the training job logs.
+        logging.error('Exception during training: ' + str(e) + '\n' + trc)
+        # A non-zero exit code causes the training job to be marked as Failed.
+        sys.exit(255)
+
+if __name__ == '__main__':
+    train()
+
+    # A zero exit code causes the job to be marked as Succeeded.
+ sys.exit(0) From 1b121aabb13c18b0a19fd0426cc12864a3543eb3 Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Wed, 26 May 2021 19:41:19 +0000 Subject: [PATCH 02/10] Git ignore --- .../.ipynb_checkpoints/.tpl-checkpoint.gitignore | 8 ++++++++ .../__IMPLEMENTED_SAGEMAKER__/.tpl.gitignore | 8 ++++++++ 2 files changed, 16 insertions(+) create mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/.ipynb_checkpoints/.tpl-checkpoint.gitignore create mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/.tpl.gitignore diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/.ipynb_checkpoints/.tpl-checkpoint.gitignore b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/.ipynb_checkpoints/.tpl-checkpoint.gitignore new file mode 100644 index 0000000..95ac7a3 --- /dev/null +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/.ipynb_checkpoints/.tpl-checkpoint.gitignore @@ -0,0 +1,8 @@ +.ipynb_checkpoints +mlruns/ +__pycache__/ +.vscode/ +catboost_info/ +.metaflow +data/ +*_env/ \ No newline at end of file diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/.tpl.gitignore b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/.tpl.gitignore new file mode 100644 index 0000000..95ac7a3 --- /dev/null +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/.tpl.gitignore @@ -0,0 +1,8 @@ +.ipynb_checkpoints +mlruns/ +__pycache__/ +.vscode/ +catboost_info/ +.metaflow +data/ +*_env/ \ No newline at end of file From de58bcd6ca67331ba5227bfc48e5fe9503d3df09 Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Wed, 26 May 2021 20:31:21 +0000 Subject: [PATCH 03/10] Git ignore and readme instructions --- .gitignore | 5 +- .../.tpl-checkpoint.gitignore | 8 - .../README.tpl-checkpoint.md | 241 ------------------ .../build_and_push-checkpoint.sh | 52 ---- .../requirements-checkpoint.txt | 28 -- .../__IMPLEMENTED_SAGEMAKER__/README.tpl.md | 3 + .../requirements.txt | 4 + 7 files changed, 11 insertions(+), 330 deletions(-) delete mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/.ipynb_checkpoints/.tpl-checkpoint.gitignore delete mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/.ipynb_checkpoints/README.tpl-checkpoint.md delete mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/.ipynb_checkpoints/build_and_push-checkpoint.sh delete mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/.ipynb_checkpoints/requirements-checkpoint.txt diff --git a/.gitignore b/.gitignore index 8314989..a1549ea 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,6 @@ __pycache__/ *egg-info -.vscode/ \ No newline at end of file +.vscode/ +.ipynb_checkpoints +*/.ipynb_checkpoints/* +hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/.ipynb_checkpoints/* diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/.ipynb_checkpoints/.tpl-checkpoint.gitignore b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/.ipynb_checkpoints/.tpl-checkpoint.gitignore deleted file mode 100644 index 95ac7a3..0000000 --- a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/.ipynb_checkpoints/.tpl-checkpoint.gitignore +++ /dev/null @@ -1,8 +0,0 @@ -.ipynb_checkpoints -mlruns/ -__pycache__/ -.vscode/ -catboost_info/ -.metaflow -data/ -*_env/ \ No newline at end of file diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/.ipynb_checkpoints/README.tpl-checkpoint.md b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/.ipynb_checkpoints/README.tpl-checkpoint.md deleted file mode 100644 index a14d02f..0000000 --- 
a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/.ipynb_checkpoints/README.tpl-checkpoint.md +++ /dev/null @@ -1,241 +0,0 @@ -# Hermione Sagemaker - -This notebook explains how to execute the Titanic project example - - -## Sagemaker - -Our code is divided in three steps: Processor, Train and Inference. In the Processor step, we preprocessed the training, validation and inference data. The Train step receives the preprocessed training and validation data, and uses them to train and validate a new model. The Inference step receives the inference data and model, and generates the prediction for the data. - -### Permitions - -If you are running this code on a SageMaker notebook instance, do the following to provide IAM permissions to the notebook: - -1. Open the Amazon [SageMaker console](https://console.aws.amazon.com/sagemaker/). -2. Select Notebook instances and choose the name of your notebook instance. -3. Under Permissions and encryption select the role ARN to view the role on the IAM console. -4. Under the Permissions tab, choose Attach policies and search for AmazonS3FullAccess. -5. Select the check box next to AmazonS3FullAccess. -6. Search for AmazonSageMakerFullAccess and AWSStepFunctionsFullAccess and select their check boxes. -7. Choose Attach policy. You will then be redirected to the details page for the role. -8. Copy and save the IAM role ARN for later use. - -Next, we will create a new policy to attach. - -12. Click Attach policies again and then Create policy.\n", -13. Enter the following in the JSON tab: - -```json -{ - "Version": "2012-10-17", - "Statement": [ - { - "Sid": "VisualEditor0", - "Effect": "Allow", - "Action": [ - "s3:PutObject", - "s3:GetObject", - "logs:CreateLogStream", - "codebuild:DeleteProject", - "codebuild:StartBuild", - "s3:DeleteObject", - "codebuild:CreateProject", - "codebuild:BatchGetBuilds" - ], - "Resource": [ - "arn:aws:s3:::sagemaker-*/*", - "arn:aws:codebuild:*:*:project/sagemaker-studio*", - "arn:aws:logs:*:*:log-group:/aws/codebuild/sagemaker-studio*" - ] - }, - { - "Sid": "VisualEditor1", - "Effect": "Allow", - "Action": [ - "logs:GetLogEvents", - "s3:CreateBucket", - "logs:PutLogEvents" - ], - "Resource": [ - "arn:aws:logs:*:*:log-group:/aws/codebuild/sagemaker-studio*:log-stream:*", - "arn:aws:s3:::sagemaker*" - ] - }, - { - "Sid": "VisualEditor2", - "Effect": "Allow", - "Action": [ - "iam:GetRole", - "ecr:CreateRepository", - "iam:ListRoles", - "ecr:GetAuthorizationToken", - "ecr:UploadLayerPart", - "ecr:ListImages", - "logs:CreateLogGroup", - "ecr:PutImage", - "iam:PassRole", - "sagemaker:*", - "ecr:BatchGetImage", - "ecr:CompleteLayerUpload", - "ecr:DescribeImages", - "ecr:DescribeRepositories", - "ecr:InitiateLayerUpload", - "ecr:BatchCheckLayerAvailability" - ], - "Resource": "*" - } - ] -} -``` - -14. Choose Next:Tags and add a tag, if you want to. -15. Choose Next:Review and add a name such as AmazonSageMaker-ExecutionPolicy. -16. Choose Create Policy. -17. Select Roles and search for your role. -18. Under the Permissions tab, click Attach policies. -19. Search for your newly created policy and select the check box next to it. -20. Choose Attach policy. - -### Docker images - -First, we need to create an image and upload it in ECR for each one of the steps. 
To do that, execute the following commands in the terminal: - -```bash -cd Sagemaker/project-name -bash build_and_push.sh processor hermione-processor -bash build_and_push.sh train hermione-train -bash build_and_push.sh inference hermione-inference -``` - -The bash command will access the Dockerfile in the folder, create the image and save it in ECR with the specified name - -### Notebooks - -To test the images in ECR, execute the following notebooks: - -- project-name/src/ml/notebooks/Sagemaker_Processor.ipynb -- project-name/src/ml/notebooks/Sagemaker_Train.ipynb -- project-name/src/ml/notebooks/Sagemaker_Inference.ipynb - -## Stepfunctions - -We also create two Step Function state machines to execute the whole process. The first machine processes the training data and creates the model. And the second one processes the inference data and generates its prediction. - -### Permitions - -The Step Functions workflow requires an IAM role to interact with other services in AWS environment. To do that, follow these [AWS steps](https://github.com/aws/amazon-sagemaker-examples/blob/master/step-functions-data-science-sdk/step_functions_mlworkflow_processing/step_functions_mlworkflow_scikit_learn_data_processing_and_model_evaluation.ipynb): - - -1. Go to the [IAM console](https://console.aws.amazon.com/iam/). -2. Select Roles and then Create role. -3. Under Choose the service that will use this role select Step Functions. -4. Choose Next until you can enter a Role name. -5. Enter a name such as AmazonSageMaker-StepFunctionsWorkflowExecutionRole and then select Create role. -6. Search and click on the IAM Role you just created. -7. Click Attach policies and then select CloudWatchEventsFullAccess. -9. Click on Attach Policy - - -Next, create and attach another new policy to the role you created: - -9. Click Attach policies again and then Create policy. -10. 
Enter the following in the JSON tab: - - -```json -{ - "Version": "2012-10-17", - "Statement": [ - { - "Sid": "VisualEditor0", - "Effect": "Allow", - "Action": [ - "events:PutTargets", - "events:DescribeRule", - "events:PutRule" - ], - "Resource": [ - "arn:aws:events:*:*:rule/StepFunctionsGetEventsForSageMakerTrainingJobsRule", - "arn:aws:events:*:*:rule/StepFunctionsGetEventsForSageMakerTransformJobsRule", - "arn:aws:events:*:*:rule/StepFunctionsGetEventsForSageMakerTuningJobsRule", - "arn:aws:events:*:*:rule/StepFunctionsGetEventsForECSTaskRule", - "arn:aws:events:*:*:rule/StepFunctionsGetEventsForBatchJobsRule", - "arn:aws:events:*:*:rule/StepFunctionsGetEventsForStepFunctionsExecutionRule", - "arn:aws:events:*:*:rule/StepFunctionsGetEventsForSageMakerProcessingJobsRule" - ] - }, - { - "Sid": "VisualEditor1", - "Effect": "Allow", - "Action": "iam:PassRole", - "Resource": "NOTEBOOK_ROLE_ARN", - "Condition": { - "StringEquals": { - "iam:PassedToService": "sagemaker.amazonaws.com" - } - } - }, - { - "Sid": "VisualEditor2", - "Effect": "Allow", - "Action": [ - "batch:DescribeJobs", - "batch:SubmitJob", - "batch:TerminateJob", - "dynamodb:DeleteItem", - "dynamodb:GetItem", - "dynamodb:PutItem", - "dynamodb:UpdateItem", - "ecs:DescribeTasks", - "ecs:RunTask", - "ecs:StopTask", - "glue:BatchStopJobRun", - "glue:GetJobRun", - "glue:GetJobRuns", - "glue:StartJobRun", - "lambda:InvokeFunction", - "sagemaker:CreateEndpoint", - "sagemaker:CreateEndpointConfig", - "sagemaker:CreateHyperParameterTuningJob", - "sagemaker:CreateModel", - "sagemaker:CreateProcessingJob", - "sagemaker:CreateTrainingJob", - "sagemaker:CreateTransformJob", - "sagemaker:DeleteEndpoint", - "sagemaker:DeleteEndpointConfig", - "sagemaker:DescribeHyperParameterTuningJob", - "sagemaker:DescribeProcessingJob", - "sagemaker:DescribeTrainingJob", - "sagemaker:DescribeTransformJob", - "sagemaker:ListProcessingJobs", - "sagemaker:ListTags", - "sagemaker:StopHyperParameterTuningJob", - "sagemaker:StopProcessingJob", - "sagemaker:StopTrainingJob", - "sagemaker:StopTransformJob", - "sagemaker:UpdateEndpoint", - "sns:Publish", - "sqs:SendMessage" - ], - "Resource": "*" - } - ] -} -``` - -11. Replace NOTEBOOK_ROLE_ARN with the ARN for your notebook that you used in the previous step in the above Sagemaker Permitions. -12. Choose Review policy and give the policy a name such as AmazonSageMaker-StepFunctionsWorkflowExecutionPolicy. -13. Choose Create policy. -14. Select Roles and search for your AmazonSageMaker-StepFunctionsWorkflowExecutionRole role. -15. Click Attach policies. -16. Search for your newly created AmazonSageMaker-StepFunctionsWorkflowExecutionPolicy policy and select the check box next to it. -17. Choose Attach policy. -18. Copy the AmazonSageMaker-StepFunctionsWorkflowExecutionRole Role ARN at the top of the Summary. You will use it in the next step. 
- - -### Notebooks - -To create and test the Step Functions state machines, execute the following notebooks: - -- project-name/src/ml/notebooks/Sagemaker_StepFunctions_Train.ipynb -- project-name/src/ml/notebooks/Sagemaker_StepFunctions_Inference.ipynb \ No newline at end of file diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/.ipynb_checkpoints/build_and_push-checkpoint.sh b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/.ipynb_checkpoints/build_and_push-checkpoint.sh deleted file mode 100644 index b1ea715..0000000 --- a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/.ipynb_checkpoints/build_and_push-checkpoint.sh +++ /dev/null @@ -1,52 +0,0 @@ -#!/usr/bin/env bash - -# This script shows how to build the Docker image and push it to ECR to be ready for use -# by SageMaker. - -# The argument to this script is the image name. This will be used as the image on the local -# machine and combined with the account and region to form the repository name for ECR. -mode=$1 -image=$2 - - -if [ "$image" == "" ] -then - echo "Usage: $0 " - exit 1 -fi - - -# Get the account number associated with the current IAM credentials -account=$(aws sts get-caller-identity --query Account --output text) - -if [ $? -ne 0 ] -then - exit 255 -fi - - -# Get the region defined in the current configuration (default to us-east-1 if none defined) -region=$(aws configure get region) -region=${region:-us-east-1} - - -fullname="${account}.dkr.ecr.${region}.amazonaws.com/${image}:latest" - -# If the repository doesn't exist in ECR, create it. - -aws ecr describe-repositories --repository-names "${image}" > /dev/null 2>&1 - -if [ $? -ne 0 ] -then - aws ecr create-repository --repository-name "${image}" > /dev/null -fi - -# Get the login command from ECR and execute it directly -aws ecr get-login-password --region "${region}" | docker login --username AWS --password-stdin "${account}".dkr.ecr."${region}".amazonaws.com - -# Build the docker image locally with the image name and then push it to ECR -# with the full name. -docker build -f ${mode}/Dockerfile -t ${image} . 
-docker tag "${image}" "${fullname}"
-
-docker push "${fullname}"
\ No newline at end of file
diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/.ipynb_checkpoints/requirements-checkpoint.txt b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/.ipynb_checkpoints/requirements-checkpoint.txt
deleted file mode 100644
index a9d480f..0000000
--- a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/.ipynb_checkpoints/requirements-checkpoint.txt
+++ /dev/null
@@ -1,28 +0,0 @@
-category-encoders
-coverage
-datetime
-Flask
-gunicorn
-hermione-ml
-matplotlib
-mlflow
-mlxtend
-numpy
-pandas
-plotly
-pytest
-seaborn
-scikit-learn
-scipy
-statsmodels
-tqdm
-yellowbrick
-vega_datasets
-altair
-pandas_profiling
-streamlit_pandas_profiling
-interpret-community
-lime
-lightgbm
-great_expectations
-stepfunctions
diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/README.tpl.md b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/README.tpl.md
index a14d02f..395fa3d 100644
--- a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/README.tpl.md
+++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/README.tpl.md
@@ -102,6 +102,9 @@ First, we need to create an image and upload it to ECR for each one of the steps

```bash
cd Sagemaker/project-name
+source project-name_env/bin/activate
+pip install -r requirements.txt
+python -m ipykernel install --user --name project-name_env --display-name "project-name"
bash build_and_push.sh processor hermione-processor
bash build_and_push.sh train hermione-train
bash build_and_push.sh inference hermione-inference
diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/requirements.txt b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/requirements.txt
index a9d480f..3b158af 100644
--- a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/requirements.txt
+++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/requirements.txt
@@ -26,3 +26,7 @@ lime
lightgbm
great_expectations
stepfunctions
+sagemaker-inference
+ipykernel
+boto3
+sagemaker

From 03419ee5d6588899e1b22cd1938d97ff369e584c Mon Sep 17 00:00:00 2001
From: karenstemartins
Date: Thu, 27 May 2021 19:30:45 +0000
Subject: [PATCH 04/10] Data and documentation modified

---
 README.md                                     |  24 +-
 .../__IMPLEMENTED_SAGEMAKER__/README.tpl.md   |   2 +-
 .../data/raw/raw_test.csv                     | 269 ++++
 .../data/raw/raw_train.csv                    | 624 ++++++++++
 .../data/raw/train.csv                        | 892 ------------------
 .../processor/preprocessor.py                 |   3 +-
 .../__IMPLEMENTED_SAGEMAKER__/src/api/app.py  |  41 -
 .../src/api/myrequests.py                     |  17 -
 .../__IMPLEMENTED_SAGEMAKER__/src/api/wsgi.py |   4 -
 .../ml/notebooks/Sagemaker_Processor.ipynb    |  59 +-
 .../src/tests/test_project.py                 |  86 +-
 11 files changed, 1021 insertions(+), 1000 deletions(-)
 create mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/data/raw/raw_test.csv
 create mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/data/raw/raw_train.csv
 delete mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/data/raw/train.csv
 delete mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/api/app.py
 delete mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/api/myrequests.py
 delete mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/api/wsgi.py

diff --git a/README.md b/README.md
index bd6b2d7..6722308 100644
--- a/README.md
+++ b/README.md
@@ -74,13 +74,21 @@ After installing Hermione:
hermione new project_hermione
```

-1. Hit Enter if you want to start with an example code
+2. Hit Enter if you want to start with the example code

```
Do you want to start with an implemented example (recommended) [y/n]? [y]:
```

-3. Hermione already creates a virtual environment for the project. For Windows users, activate it with
+3. If you choose an implemented example, select the Sagemaker or Local version
+
+```
+Do you want to start with:
+    (1) Sagemaker
+    (2) Local version
+```
+
+4. Hermione already creates a virtual environment for the project. For Windows users, activate it with

```cmd
<project_name>_env\Scripts\activate
```

@@ -93,13 +101,13 @@ source <project_name>_env/bin/activate
```

-4. After activating, you should install some libraries. There are a few suggestions in “requirements.txt” file:
+5. After activating, you should install some libraries. There are a few suggestions in the “requirements.txt” file:

```
pip install -r requirements.txt
```

-1. Now we will train some models from the example, using MLflow ❤. To do so, inside *src* directory, just type: _hermione train_. The “hermione train” command will search for a `train.py` file and execute it. In the example, models and metrics are already controlled via MLflow.
+6. Now, if you selected the Local version, we will train some models from the example, using MLflow ❤. To do so, inside the *src* directory, just type: _hermione train_. The “hermione train” command will search for a `train.py` file and execute it. In the example, models and metrics are already controlled via MLflow (see the sketch after the class list below).

![](https://cdn-images-1.medium.com/max/800/1*MmVcmAYspxWdzbd5r00W5g.png)

@@ -118,16 +126,19 @@ mlflow ui

![](https://cdn-images-1.medium.com/max/800/1*c_rDEqERZR6r8JVI3TMTcQ.png)

-8. To make batch predictions using your `predict.py` file, type `hermione predict`. The default implemented version will print some predictions for you in the terminal.
+7. To make batch predictions using your `predict.py` file, type `hermione predict`. The default implemented version will print some predictions for you in the terminal.

```
hermione predict
```

-9. In the Titanic example, we also provide a step by step notebook. To view it, just type jupyter notebook inside directory `/src/notebooks/`.
+8. In the Titanic example, we also provide a step-by-step notebook. To view it, just type `jupyter notebook` inside the directory `/src/notebooks/`.

![](https://cdn-images-1.medium.com/max/800/1*U3ToR5jDjQJihT9EnxeDdg.png)
+
+9. If you selected the Sagemaker version, click [here](hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/README.tpl.md) to see the tutorial.
+

Do you want to create your **project from scratch**? Then click [here](tutorial_base.md) to see the tutorial.

@@ -186,6 +197,7 @@ Here we describe briefly what each class is doing:

- **Preprocessing** - concentrates all preprocessing steps that must be performed on the data before the model is trained.
- **Normalization** - applies normalization and denormalization to the reported columns. This class contains the following normalization algorithms already implemented: StandardScaler and MinMaxScaler.
- **TextVectorizer** - transforms text into vectors. Implemented methods: Bag of words, TF_IDF, Embedding: mean, median and indexing.
+- **DataQuality** - concentrates all data validation steps that must be performed on the data to ensure its quality (available in the Sagemaker version).
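As mentioned in the train step above, the example's `train.py` logs models and metrics through MLflow. For reference, here is a minimal, self-contained sketch of that logging pattern; the dataset, parameter, and metric names are illustrative and are not hermione's actual code:

```python
import mlflow
import mlflow.sklearn
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Illustrative data; the hermione example trains on the Titanic dataset instead
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

with mlflow.start_run():
    model = LogisticRegression(max_iter=200)
    model.fit(X_train, y_train)

    # Everything logged here shows up in the `mlflow ui` screens above
    mlflow.log_param("max_iter", 200)
    mlflow.log_metric("accuracy", accuracy_score(y_test, model.predict(X_test)))
    mlflow.sklearn.log_model(model, "model")
```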
### Visualization diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/README.tpl.md b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/README.tpl.md index 395fa3d..60aa98d 100644 --- a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/README.tpl.md +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/README.tpl.md @@ -22,7 +22,7 @@ If you are running this code on a SageMaker notebook instance, do the following Next, we will create a new policy to attach. -12. Click Attach policies again and then Create policy.\n", +12. Click Attach policies again and then Create policy. 13. Enter the following in the JSON tab: ```json diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/data/raw/raw_test.csv b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/data/raw/raw_test.csv new file mode 100644 index 0000000..f0dbfb0 --- /dev/null +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/data/raw/raw_test.csv @@ -0,0 +1,269 @@ +Survived,Pclass,Sex,Age +1,3,female,1.0 +0,3,male, +0,1,male,30.0 +0,1,male,61.0 +0,3,male,27.0 +0,2,male,46.0 +1,2,female,40.0 +0,2,female,27.0 +0,3,female,18.0 +0,3,female,29.0 +0,3,female,9.0 +0,3,female,28.0 +1,1,female,52.0 +0,3,male,24.5 +0,1,female,50.0 +1,3,female,0.75 +1,1,female,58.0 +0,1,male, +0,3,female,45.0 +0,2,male,29.0 +0,3,female, +1,2,female,8.0 +0,2,male,39.0 +0,3,male,4.0 +0,3,male,2.0 +0,3,male,25.0 +0,3,male,22.0 +1,1,female,17.0 +0,3,male,19.0 +0,3,male, +0,3,female,2.0 +1,2,female,40.0 +0,2,male,34.0 +1,3,male,26.0 +0,2,male,19.0 +0,3,male,11.0 +0,3,male,42.0 +0,3,male,51.0 +1,2,female,24.0 +1,1,male,40.0 +1,1,female,14.0 +0,3,male, +0,3,female, +1,3,female,63.0 +0,3,male,16.0 +0,2,male,25.0 +1,1,female,39.0 +0,1,male,42.0 +0,3,male,20.0 +1,2,female,24.0 +1,3,female, +0,3,female,6.0 +0,3,male,20.5 +0,3,male,35.0 +0,2,male,24.0 +0,3,male, +0,3,male,16.0 +0,3,male,18.0 +0,1,male,29.0 +0,3,male,14.0 +1,1,female,33.0 +1,1,female,18.0 +1,1,male,11.0 +1,1,female, +0,3,male,24.0 +0,3,male,34.0 +0,2,male,48.0 +0,2,male,50.0 +0,1,male, +1,2,female, +1,1,female,49.0 +0,3,male,50.0 +1,1,male, +0,1,male,65.0 +0,3,male,21.0 +0,3,male,28.0 +0,3,male,41.0 +0,3,male,21.0 +1,1,female, +1,1,female, +0,3,female,26.0 +0,3,male,28.5 +0,3,male,9.0 +1,3,male, +0,3,male,24.0 +0,2,male,33.0 +1,3,female,1.0 +0,3,male,33.0 +1,1,male, +0,3,male,25.0 +1,3,female,18.0 +1,2,male, +0,2,male,54.0 +1,3,male,3.0 +0,1,male,37.0 +0,3,male,19.0 +1,1,female,24.0 +1,1,female,35.0 +0,3,male,33.0 +1,2,female,24.0 +0,3,male,1.0 +0,1,male,58.0 +0,1,male,45.0 +1,3,female,15.0 +0,3,male, +0,1,male,31.0 +0,3,male,26.0 +0,3,male,28.5 +0,3,male,35.0 +1,2,female,36.0 +1,2,male,0.83 +1,1,male,31.0 +1,1,female,31.0 +0,3,male,32.0 +0,3,male,26.0 +0,2,male,44.0 +0,1,male,60.0 +0,2,male,54.0 +0,3,male,18.0 +0,1,male,19.0 +0,3,male,19.0 +0,3,male,43.0 +1,1,male,42.0 +0,3,male, +0,2,male,27.0 +0,2,male,21.0 +0,3,female,43.0 +0,1,male, +1,3,female, +0,3,male,20.0 +1,1,female,44.0 +0,3,male,29.0 +0,3,male,25.0 +1,2,female,30.0 +1,1,female,48.0 +0,3,male,18.0 +1,1,female,50.0 +0,3,female,30.0 +0,3,male, +1,2,female,41.0 +0,3,male,65.0 +1,1,male,32.0 +0,3,male,17.0 +1,2,male,0.83 +1,3,female,5.0 +0,3,male,28.0 +0,3,male,39.0 +1,3,male,32.0 +1,3,male,27.0 +1,1,female,24.0 +1,1,male,49.0 +0,3,male,11.0 +0,3,female,23.0 +0,3,male,16.0 +1,1,female,22.0 +0,3,male, +1,1,female,58.0 +1,3,female,5.0 +1,2,female,42.0 +1,1,female,49.0 +1,2,female,40.0 +0,1,male,38.0 +1,1,male,25.0 +0,1,female,25.0 +0,1,male, +1,1,female,22.0 +1,2,female,54.0 +0,3,male,20.0 +1,3,female, +0,3,male,40.0 
+0,3,male,23.0 +0,1,male, +0,3,female, +0,2,male,28.0 +0,3,male, +1,3,male,45.0 +1,2,male,1.0 +0,3,male, +0,2,male,27.0 +1,1,female,16.0 +0,3,male,31.0 +0,3,male,45.5 +0,3,male,21.0 +1,1,male,23.0 +0,2,male,52.0 +1,2,female,3.0 +0,2,male,16.0 +1,1,female,30.0 +0,3,male,21.0 +1,3,female,16.0 +1,2,male,19.0 +0,3,male, +1,2,female,25.0 +1,2,male,32.0 +0,3,female,30.5 +0,1,male,21.0 +1,3,male,25.0 +1,3,female,35.0 +1,2,female,17.0 +1,3,male, +1,1,female,16.0 +1,3,female, +1,1,male,42.0 +0,1,male,62.0 +0,1,male,40.0 +0,3,male,19.0 +0,3,male, +1,2,female,25.0 +0,1,male, +0,3,male,42.0 +0,3,male,29.0 +1,2,female,19.0 +0,3,female,3.0 +0,3,male,35.0 +0,3,male,30.5 +0,3,male, +0,2,male,34.0 +0,3,male, +1,3,male, +0,3,male,25.0 +0,3,male, +1,1,male,60.0 +0,3,male,30.0 +1,1,male,50.0 +0,3,female,9.0 +0,3,male,25.0 +0,2,male,27.0 +0,3,male,40.5 +0,2,male,30.0 +1,2,female,28.0 +1,3,female,30.0 +0,3,male,74.0 +0,3,female,25.0 +0,3,male,34.0 +0,2,male,31.0 +1,3,male,27.0 +1,1,female,35.0 +0,3,male,7.0 +1,1,female, +0,2,male,23.0 +0,3,male,30.0 +0,3,male,2.0 +1,3,female,24.0 +0,3,male, +0,3,male, +0,2,male,59.0 +0,2,male,51.0 +0,1,male,22.0 +1,1,male,34.0 +1,3,female,33.0 +0,3,male,24.0 +0,3,female,47.0 +0,1,male,47.0 +0,2,male,36.0 +0,3,male, +1,2,female,14.0 +0,3,female,41.0 +0,1,male, +1,3,female, +0,1,male, +0,3,male,33.0 +1,3,female,31.0 +0,3,male,17.0 +0,3,male,19.0 +0,3,female,2.0 +1,2,female,18.0 +0,3,male, +1,1,female,52.0 diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/data/raw/raw_train.csv b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/data/raw/raw_train.csv new file mode 100644 index 0000000..45c2b4a --- /dev/null +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/data/raw/raw_train.csv @@ -0,0 +1,624 @@ +Survived,Pclass,Sex,Age +1,2,female,34.0 +1,2,female,31.0 +1,1,male,36.0 +1,3,male,29.0 +0,2,male,18.0 +1,1,female,63.0 +0,3,male, +1,2,female,28.0 +1,2,female,50.0 +1,3,female, +0,3,male,20.0 +0,3,male,22.0 +1,2,female,48.0 +0,3,female,40.0 +0,2,male,42.0 +1,1,female, +0,2,male, +1,3,female,0.75 +0,3,male, +0,1,male,54.0 +0,3,male, +1,1,female,19.0 +0,3,male,28.0 +1,3,female, +0,2,male,25.0 +0,3,female,39.0 +0,1,male,28.0 +1,3,male,3.0 +0,3,male,17.0 +0,3,male, +0,3,male,22.0 +1,2,male,1.0 +1,1,female,24.0 +0,3,male,49.0 +1,2,male,34.0 +1,2,female,45.0 +1,2,female,36.0 +1,3,male,19.0 +0,3,male, +0,3,male,24.0 +1,3,male,20.0 +0,3,male,44.0 +0,2,male,25.0 +0,2,male,21.0 +0,2,male,43.0 +1,3,male,24.0 +0,3,male,18.0 +0,2,male,70.0 +0,3,male,22.0 +1,3,female,4.0 +0,3,female, +0,3,female,18.0 +1,2,female,13.0 +1,3,female,19.0 +1,3,female,15.0 +0,2,male, +0,2,male,47.0 +0,1,male,55.0 +0,3,male, +0,2,male,32.5 +0,3,male,20.0 +1,2,female,28.0 +0,3,male,26.0 +0,3,male,22.0 +1,2,female,29.0 +0,3,male,28.0 +1,2,female,32.0 +1,2,female,17.0 +0,2,male,29.0 +0,1,male,51.0 +0,1,male,45.5 +0,3,male,25.0 +1,3,male,6.0 +1,3,male, +0,3,male, +0,2,male,18.0 +0,1,male,18.0 +0,3,male, +0,3,male,21.0 +1,1,female,56.0 +0,3,male, +0,3,male,59.0 +0,3,male, +1,1,male,36.0 +0,3,male,36.0 +0,1,male,44.0 +0,1,male,38.0 +0,3,female,31.0 +0,3,female,25.0 +0,1,male,31.0 +0,3,male,42.0 +1,3,female,22.0 +1,3,female,24.0 +0,2,male,34.0 +1,3,female,22.0 +0,2,male,60.0 +0,2,male,36.0 +0,3,male, +1,1,male,28.0 +0,3,male, +0,3,male,21.0 +0,3,male,34.0 +0,3,female, +0,3,male, +0,3,female,45.0 +1,1,female,58.0 +0,3,male,4.0 +1,3,male,20.0 +0,1,male, +1,2,female,27.0 +0,2,male,66.0 +0,1,male,47.0 +1,2,female,30.0 +1,3,male,32.0 +0,1,male,24.0 +1,2,female,55.0 +1,1,female,38.0 +0,3,male, +0,3,male,22.0 
+0,1,male,56.0 +0,3,male,28.0 +0,3,male,24.0 +0,3,female,32.0 +0,3,male,26.0 +1,1,male,52.0 +1,2,female,28.0 +1,1,female,48.0 +1,1,male,27.0 +0,3,male,40.5 +0,3,male,70.5 +0,2,male,37.0 +0,3,male,33.0 +1,2,female,4.0 +0,3,male,28.0 +0,3,female,31.0 +1,1,male,28.0 +0,3,female,45.0 +0,1,male,40.0 +0,3,male, +0,3,male,22.0 +0,2,male,21.0 +1,3,male,0.42 +0,3,male, +1,2,female,19.0 +0,3,female,30.0 +0,3,male,21.0 +1,1,female,30.0 +0,1,male,58.0 +0,1,male,61.0 +0,3,male,44.0 +0,3,male,17.0 +0,3,female,18.0 +0,3,female,8.0 +0,2,male,28.0 +0,3,male,61.0 +0,2,female,24.0 +0,3,female,2.0 +0,3,male,22.0 +1,3,female, +0,1,male,36.0 +1,2,female,24.0 +1,3,male,22.0 +0,3,male, +1,1,female,18.0 +1,1,female,35.0 +0,3,female, +0,1,male,65.0 +0,3,female, +0,3,male,16.0 +0,3,male,33.0 +1,1,female, +0,2,male,30.0 +0,1,male, +0,3,male, +0,3,male,19.0 +0,3,female,11.0 +0,3,male,16.0 +1,1,female,43.0 +0,2,male,36.5 +0,3,male,20.0 +0,3,male,40.0 +0,3,male, +0,3,male,28.0 +0,3,female,16.0 +0,3,female,17.0 +0,2,male,24.0 +1,3,female, +1,3,female,15.0 +1,3,female, +0,3,female, +0,3,female,39.0 +0,3,female,21.0 +1,1,male,35.0 +0,3,female,9.0 +0,3,female,18.0 +0,3,male,22.0 +1,3,male, +0,3,male,19.0 +0,3,male,7.0 +0,3,male,36.0 +0,3,male, +0,3,male,30.0 +1,1,female,53.0 +0,3,male,25.0 +1,3,male,32.0 +0,3,male,29.0 +0,2,male,26.0 +0,2,male,36.0 +0,2,male,39.0 +0,1,male, +1,2,female,36.0 +1,1,female,22.0 +0,1,male,46.0 +1,1,male,49.0 +1,1,male, +1,1,female,29.0 +0,3,male, +0,1,female,2.0 +0,3,male,15.0 +1,3,female, +0,1,male,64.0 +0,3,male,1.0 +1,3,male,18.0 +1,3,male,12.0 +0,3,male,18.0 +0,2,male,24.0 +0,3,male, +0,3,female, +0,1,male,47.0 +1,3,female,2.0 +0,3,male, +0,3,male,39.0 +0,3,male, +0,2,male,35.0 +1,1,female,36.0 +0,3,male, +1,3,female, +1,3,male, +0,2,male, +0,2,male,23.0 +0,1,male, +1,2,female,4.0 +1,1,female, +1,3,female,4.0 +0,2,male,23.0 +0,3,male,20.0 +1,2,female,35.0 +1,1,female,39.0 +0,3,male,24.0 +0,3,female,20.0 +0,2,male,19.0 +1,1,female,39.0 +0,3,female, +1,1,male,27.0 +0,3,female,22.0 +0,3,male,48.0 +0,1,male,49.0 +0,1,male,19.0 +0,3,male, +1,3,female,27.0 +1,1,male,37.0 +0,3,male,36.0 +1,3,male,21.0 +0,2,male,32.0 +0,3,female, +1,3,female, +0,3,male,26.0 +0,3,male, +0,3,male,17.0 +0,2,male,30.0 +1,3,female,5.0 +1,3,male,44.0 +1,2,male,42.0 +0,3,male,24.0 +0,2,male,57.0 +1,1,female,60.0 +0,3,male,24.0 +0,3,female, +1,3,female,22.0 +0,3,male, +1,1,female,24.0 +0,3,male,16.0 +0,3,male,21.0 +0,3,male,37.0 +1,3,female, +0,2,male,25.0 +1,1,female,47.0 +1,3,female, +0,1,male,54.0 +1,1,female,18.0 +0,3,male,28.0 +0,2,male,23.0 +1,2,male,8.0 +0,3,male, +0,3,male,35.0 +1,1,female,38.0 +1,2,female,50.0 +1,1,male,4.0 +0,1,male,45.0 +0,3,male,21.0 +1,3,male,31.0 +0,3,male, +1,3,male,9.0 +0,3,male, +0,3,male,23.0 +1,1,male,17.0 +0,3,male,44.0 +1,3,male,39.0 +1,1,female,17.0 +0,3,male,20.0 +1,1,female,39.0 +0,3,male,19.0 +0,2,male,31.0 +1,3,male,30.0 +0,2,male,18.0 +0,3,male, +0,2,male, +1,3,male, +0,2,male, +0,3,male,29.0 +1,3,female,38.0 +1,3,male,29.0 +0,2,male,29.0 +1,2,female,29.0 +1,1,female,38.0 +0,3,male,36.0 +1,3,female, +0,3,male, +1,1,female, +0,3,female,21.0 +1,1,female,45.0 +0,1,male,36.0 +0,3,male,23.0 +0,1,male,28.0 +0,3,male, +1,3,female, +1,2,female,29.0 +0,3,male,32.0 +0,3,male,21.0 +0,3,female,21.0 +0,3,male,30.0 +1,2,male,62.0 +1,1,female,33.0 +0,1,male, +0,2,male, +1,1,male, +0,3,male,34.0 +0,3,female,28.0 +0,1,male,33.0 +0,2,male,42.0 +1,2,female,34.0 +1,1,female,32.0 +0,2,female,44.0 +0,3,male,21.0 +0,3,male, +0,3,male,19.0 +0,3,male, +0,3,male,55.5 +0,2,male,19.0 +0,3,male,47.0 
+1,2,female,7.0 +0,3,male,43.0 +1,2,female,24.0 +1,3,female, +0,3,male,38.0 +1,1,female,35.0 +1,1,female,41.0 +0,3,male, +1,1,male,35.0 +1,3,female,19.0 +0,3,male, +0,2,male,34.0 +1,1,male,48.0 +1,2,female,33.0 +0,3,male, +0,2,male,16.0 +0,1,male, +0,1,male,37.0 +0,3,male,29.0 +1,3,male,16.0 +0,3,male,22.0 +0,3,male, +0,3,male,32.0 +0,3,male,8.0 +1,1,female,21.0 +1,3,male,32.0 +0,3,male,28.0 +1,1,female,15.0 +1,1,female,19.0 +0,2,male,25.0 +0,3,male,41.0 +1,2,male,31.0 +0,2,male,35.0 +0,3,male, +0,3,female,18.0 +0,3,female,24.0 +1,1,female,30.0 +1,3,female,4.0 +1,3,female,18.0 +1,1,female,36.0 +0,3,female,29.0 +1,3,male,26.0 +1,1,female,23.0 +0,3,male,32.0 +0,3,male,26.0 +1,1,female,44.0 +1,3,female,23.0 +1,1,male,36.0 +0,3,male,20.0 +0,3,male,22.0 +1,1,female,36.0 +1,3,female, +0,2,female,26.0 +1,2,female,42.0 +0,3,female, +1,1,female,24.0 +0,3,male, +1,3,male,1.0 +1,1,male, +1,3,male,20.0 +0,3,male,20.0 +0,3,male,28.0 +1,2,male,3.0 +1,1,male,45.0 +0,3,male,25.0 +0,3,female,9.0 +0,2,male,39.0 +0,3,male, +0,1,male,46.0 +0,3,male, +1,2,female,18.0 +0,3,male,16.0 +0,3,male,47.0 +0,3,female,37.0 +0,3,male,14.0 +0,1,male, +0,3,male,31.0 +0,1,male, +0,2,male,23.0 +0,3,male,33.0 +0,3,male,39.0 +0,2,male,28.0 +1,1,female,40.0 +0,3,male,42.0 +1,1,female,31.0 +0,3,male,33.0 +0,3,male,25.0 +1,3,female,31.0 +0,2,male, +0,3,male,19.0 +0,3,male,38.0 +1,2,female,45.0 +0,3,female,48.0 +1,1,male,27.0 +1,1,female,30.0 +0,3,male, +0,3,male,4.0 +0,3,male, +1,2,female,28.0 +0,3,male, +0,2,male,32.0 +1,1,male,48.0 +0,3,male, +0,3,male, +0,1,male,39.0 +1,3,female,18.0 +1,3,male, +0,3,male,10.0 +0,3,male,32.0 +1,3,female,27.0 +0,3,male, +0,2,female,57.0 +1,2,female,5.0 +1,2,female,33.0 +0,3,male,18.0 +0,1,male,27.0 +0,3,male,9.0 +1,1,female,54.0 +0,3,male,51.0 +1,1,male,51.0 +1,1,female,40.0 +0,3,male, +1,1,female,33.0 +1,3,female,22.0 +1,1,female,23.0 +1,1,male,56.0 +1,2,female,22.0 +0,3,male,27.0 +0,3,male,22.0 +1,1,female,35.0 +0,2,male,18.0 +0,3,female,10.0 +0,3,male,17.0 +1,3,male, +1,3,male,4.0 +0,3,male,23.5 +1,3,female,26.0 +1,1,female,62.0 +0,3,male, +0,3,male,16.0 +1,3,female, +0,3,male,40.0 +1,1,male,48.0 +0,3,male,2.0 +0,1,male,71.0 +1,2,male,3.0 +1,1,female, +0,3,male,26.0 +0,3,male,26.0 +1,1,female,22.0 +0,1,male,64.0 +0,3,female, +1,3,female, +0,1,male,56.0 +0,3,female,14.0 +0,3,male, +0,2,male,36.0 +0,1,male,52.0 +0,3,male,51.0 +1,2,male,0.67 +1,2,female,50.0 +1,1,female,35.0 +0,3,male, +1,3,male,9.0 +1,2,female,24.0 +1,2,female,23.0 +0,3,male,40.0 +0,1,male,24.0 +0,3,male,19.0 +0,1,male,47.0 +0,2,male,28.0 +1,2,female,21.0 +0,3,male, +0,1,male,70.0 +0,3,male,21.0 +1,1,female,54.0 +0,3,male,38.0 +1,2,female,2.0 +1,3,female,13.0 +1,1,male,0.92 +0,1,male,50.0 +1,1,female,30.0 +0,2,male,52.0 +0,2,female,38.0 +1,3,female, +0,1,male,62.0 +0,3,male,45.0 +1,3,female, +0,3,male,30.0 +1,3,female,29.0 +0,1,male,29.0 +1,1,male,36.0 +1,2,female,6.0 +0,3,male, +0,3,male,18.0 +1,1,male,26.0 +0,2,male,54.0 +0,3,male,35.0 +0,3,male, +1,1,female,51.0 +0,3,male,34.5 +1,3,male,25.0 +1,1,male,25.0 +1,3,male,27.0 +1,2,female,27.0 +0,3,male, +1,3,female,24.0 +0,3,male, +0,2,male,30.0 +1,2,female,34.0 +0,3,male, +0,3,female,22.0 +0,3,female, +0,3,male,22.0 +1,1,female,26.0 +0,3,male, +1,1,male,80.0 +0,2,male,30.0 +1,3,female, +0,3,female, +1,1,female,42.0 +0,2,male,34.0 +0,2,male,23.0 +0,3,male, +1,1,female,16.0 +1,3,female,36.0 +1,2,female, +1,1,female,19.0 +1,3,female,14.0 +1,2,male,2.0 +1,3,female,26.0 +0,3,male, +1,3,female,27.0 +0,2,male,31.0 +1,3,female,17.0 +1,3,female,16.0 +1,2,female,22.0 +0,3,male, 
+0,3,male,30.0 +0,3,male, +0,1,male,45.0 +0,3,female, +0,3,female,14.5 +1,1,male,38.0 +0,1,male,50.0 +0,3,male,26.0 +1,3,female, +0,3,female,41.0 +0,3,female,20.0 +0,1,male,71.0 +1,1,male,35.0 +0,3,male, +1,3,male,32.0 +1,2,female,32.5 +1,3,female,21.0 +0,3,male,36.0 +1,2,male, +1,1,female,21.0 +1,2,female,34.0 +1,2,female,30.0 +0,3,male,32.0 +0,3,male,30.0 +1,3,male,29.0 diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/data/raw/train.csv b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/data/raw/train.csv deleted file mode 100644 index b0ee013..0000000 --- a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/data/raw/train.csv +++ /dev/null @@ -1,892 +0,0 @@ -PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked -1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S -2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38,1,0,PC 17599,71.2833,C85,C -3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S -4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S -5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,,S -6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q -7,0,1,"McCarthy, Mr. Timothy J",male,54,0,0,17463,51.8625,E46,S -8,0,3,"Palsson, Master. Gosta Leonard",male,2,3,1,349909,21.075,,S -9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27,0,2,347742,11.1333,,S -10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14,1,0,237736,30.0708,,C -11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4,1,1,PP 9549,16.7,G6,S -12,1,1,"Bonnell, Miss. Elizabeth",female,58,0,0,113783,26.55,C103,S -13,0,3,"Saundercock, Mr. William Henry",male,20,0,0,A/5. 2151,8.05,,S -14,0,3,"Andersson, Mr. Anders Johan",male,39,1,5,347082,31.275,,S -15,0,3,"Vestrom, Miss. Hulda Amanda Adolfina",female,14,0,0,350406,7.8542,,S -16,1,2,"Hewlett, Mrs. (Mary D Kingcome) ",female,55,0,0,248706,16,,S -17,0,3,"Rice, Master. Eugene",male,2,4,1,382652,29.125,,Q -18,1,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13,,S -19,0,3,"Vander Planke, Mrs. Julius (Emelia Maria Vandemoortele)",female,31,1,0,345763,18,,S -20,1,3,"Masselmani, Mrs. Fatima",female,,0,0,2649,7.225,,C -21,0,2,"Fynney, Mr. Joseph J",male,35,0,0,239865,26,,S -22,1,2,"Beesley, Mr. Lawrence",male,34,0,0,248698,13,D56,S -23,1,3,"McGowan, Miss. Anna ""Annie""",female,15,0,0,330923,8.0292,,Q -24,1,1,"Sloper, Mr. William Thompson",male,28,0,0,113788,35.5,A6,S -25,0,3,"Palsson, Miss. Torborg Danira",female,8,3,1,349909,21.075,,S -26,1,3,"Asplund, Mrs. Carl Oscar (Selma Augusta Emilia Johansson)",female,38,1,5,347077,31.3875,,S -27,0,3,"Emir, Mr. Farred Chehab",male,,0,0,2631,7.225,,C -28,0,1,"Fortune, Mr. Charles Alexander",male,19,3,2,19950,263,C23 C25 C27,S -29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,,0,0,330959,7.8792,,Q -30,0,3,"Todoroff, Mr. Lalio",male,,0,0,349216,7.8958,,S -31,0,1,"Uruchurtu, Don. Manuel E",male,40,0,0,PC 17601,27.7208,,C -32,1,1,"Spencer, Mrs. William Augustus (Marie Eugenie)",female,,1,0,PC 17569,146.5208,B78,C -33,1,3,"Glynn, Miss. Mary Agatha",female,,0,0,335677,7.75,,Q -34,0,2,"Wheadon, Mr. Edward H",male,66,0,0,C.A. 24579,10.5,,S -35,0,1,"Meyer, Mr. Edgar Joseph",male,28,1,0,PC 17604,82.1708,,C -36,0,1,"Holverson, Mr. Alexander Oskar",male,42,1,0,113789,52,,S -37,1,3,"Mamee, Mr. Hanna",male,,0,0,2677,7.2292,,C -38,0,3,"Cann, Mr. Ernest Charles",male,21,0,0,A./5. 2152,8.05,,S -39,0,3,"Vander Planke, Miss. Augusta Maria",female,18,2,0,345764,18,,S -40,1,3,"Nicola-Yarred, Miss. 
Jamila",female,14,1,0,2651,11.2417,,C -41,0,3,"Ahlin, Mrs. Johan (Johanna Persdotter Larsson)",female,40,1,0,7546,9.475,,S -42,0,2,"Turpin, Mrs. William John Robert (Dorothy Ann Wonnacott)",female,27,1,0,11668,21,,S -43,0,3,"Kraeff, Mr. Theodor",male,,0,0,349253,7.8958,,C -44,1,2,"Laroche, Miss. Simonne Marie Anne Andree",female,3,1,2,SC/Paris 2123,41.5792,,C -45,1,3,"Devaney, Miss. Margaret Delia",female,19,0,0,330958,7.8792,,Q -46,0,3,"Rogers, Mr. William John",male,,0,0,S.C./A.4. 23567,8.05,,S -47,0,3,"Lennon, Mr. Denis",male,,1,0,370371,15.5,,Q -48,1,3,"O'Driscoll, Miss. Bridget",female,,0,0,14311,7.75,,Q -49,0,3,"Samaan, Mr. Youssef",male,,2,0,2662,21.6792,,C -50,0,3,"Arnold-Franchi, Mrs. Josef (Josefine Franchi)",female,18,1,0,349237,17.8,,S -51,0,3,"Panula, Master. Juha Niilo",male,7,4,1,3101295,39.6875,,S -52,0,3,"Nosworthy, Mr. Richard Cater",male,21,0,0,A/4. 39886,7.8,,S -53,1,1,"Harper, Mrs. Henry Sleeper (Myna Haxtun)",female,49,1,0,PC 17572,76.7292,D33,C -54,1,2,"Faunthorpe, Mrs. Lizzie (Elizabeth Anne Wilkinson)",female,29,1,0,2926,26,,S -55,0,1,"Ostby, Mr. Engelhart Cornelius",male,65,0,1,113509,61.9792,B30,C -56,1,1,"Woolner, Mr. Hugh",male,,0,0,19947,35.5,C52,S -57,1,2,"Rugg, Miss. Emily",female,21,0,0,C.A. 31026,10.5,,S -58,0,3,"Novel, Mr. Mansouer",male,28.5,0,0,2697,7.2292,,C -59,1,2,"West, Miss. Constance Mirium",female,5,1,2,C.A. 34651,27.75,,S -60,0,3,"Goodwin, Master. William Frederick",male,11,5,2,CA 2144,46.9,,S -61,0,3,"Sirayanian, Mr. Orsen",male,22,0,0,2669,7.2292,,C -62,1,1,"Icard, Miss. Amelie",female,38,0,0,113572,80,B28, -63,0,1,"Harris, Mr. Henry Birkhardt",male,45,1,0,36973,83.475,C83,S -64,0,3,"Skoog, Master. Harald",male,4,3,2,347088,27.9,,S -65,0,1,"Stewart, Mr. Albert A",male,,0,0,PC 17605,27.7208,,C -66,1,3,"Moubarek, Master. Gerios",male,,1,1,2661,15.2458,,C -67,1,2,"Nye, Mrs. (Elizabeth Ramell)",female,29,0,0,C.A. 29395,10.5,F33,S -68,0,3,"Crease, Mr. Ernest James",male,19,0,0,S.P. 3464,8.1583,,S -69,1,3,"Andersson, Miss. Erna Alexandra",female,17,4,2,3101281,7.925,,S -70,0,3,"Kink, Mr. Vincenz",male,26,2,0,315151,8.6625,,S -71,0,2,"Jenkin, Mr. Stephen Curnow",male,32,0,0,C.A. 33111,10.5,,S -72,0,3,"Goodwin, Miss. Lillian Amy",female,16,5,2,CA 2144,46.9,,S -73,0,2,"Hood, Mr. Ambrose Jr",male,21,0,0,S.O.C. 14879,73.5,,S -74,0,3,"Chronopoulos, Mr. Apostolos",male,26,1,0,2680,14.4542,,C -75,1,3,"Bing, Mr. Lee",male,32,0,0,1601,56.4958,,S -76,0,3,"Moen, Mr. Sigurd Hansen",male,25,0,0,348123,7.65,F G73,S -77,0,3,"Staneff, Mr. Ivan",male,,0,0,349208,7.8958,,S -78,0,3,"Moutal, Mr. Rahamin Haim",male,,0,0,374746,8.05,,S -79,1,2,"Caldwell, Master. Alden Gates",male,0.83,0,2,248738,29,,S -80,1,3,"Dowdell, Miss. Elizabeth",female,30,0,0,364516,12.475,,S -81,0,3,"Waelens, Mr. Achille",male,22,0,0,345767,9,,S -82,1,3,"Sheerlinck, Mr. Jan Baptist",male,29,0,0,345779,9.5,,S -83,1,3,"McDermott, Miss. Brigdet Delia",female,,0,0,330932,7.7875,,Q -84,0,1,"Carrau, Mr. Francisco M",male,28,0,0,113059,47.1,,S -85,1,2,"Ilett, Miss. Bertha",female,17,0,0,SO/C 14885,10.5,,S -86,1,3,"Backstrom, Mrs. Karl Alfred (Maria Mathilda Gustafsson)",female,33,3,0,3101278,15.85,,S -87,0,3,"Ford, Mr. William Neal",male,16,1,3,W./C. 6608,34.375,,S -88,0,3,"Slocovski, Mr. Selman Francis",male,,0,0,SOTON/OQ 392086,8.05,,S -89,1,1,"Fortune, Miss. Mabel Helen",female,23,3,2,19950,263,C23 C25 C27,S -90,0,3,"Celotti, Mr. Francesco",male,24,0,0,343275,8.05,,S -91,0,3,"Christmann, Mr. Emil",male,29,0,0,343276,8.05,,S -92,0,3,"Andreasson, Mr. 
Paul Edvin",male,20,0,0,347466,7.8542,,S -93,0,1,"Chaffee, Mr. Herbert Fuller",male,46,1,0,W.E.P. 5734,61.175,E31,S -94,0,3,"Dean, Mr. Bertram Frank",male,26,1,2,C.A. 2315,20.575,,S -95,0,3,"Coxon, Mr. Daniel",male,59,0,0,364500,7.25,,S -96,0,3,"Shorney, Mr. Charles Joseph",male,,0,0,374910,8.05,,S -97,0,1,"Goldschmidt, Mr. George B",male,71,0,0,PC 17754,34.6542,A5,C -98,1,1,"Greenfield, Mr. William Bertram",male,23,0,1,PC 17759,63.3583,D10 D12,C -99,1,2,"Doling, Mrs. John T (Ada Julia Bone)",female,34,0,1,231919,23,,S -100,0,2,"Kantor, Mr. Sinai",male,34,1,0,244367,26,,S -101,0,3,"Petranec, Miss. Matilda",female,28,0,0,349245,7.8958,,S -102,0,3,"Petroff, Mr. Pastcho (""Pentcho"")",male,,0,0,349215,7.8958,,S -103,0,1,"White, Mr. Richard Frasar",male,21,0,1,35281,77.2875,D26,S -104,0,3,"Johansson, Mr. Gustaf Joel",male,33,0,0,7540,8.6542,,S -105,0,3,"Gustafsson, Mr. Anders Vilhelm",male,37,2,0,3101276,7.925,,S -106,0,3,"Mionoff, Mr. Stoytcho",male,28,0,0,349207,7.8958,,S -107,1,3,"Salkjelsvik, Miss. Anna Kristine",female,21,0,0,343120,7.65,,S -108,1,3,"Moss, Mr. Albert Johan",male,,0,0,312991,7.775,,S -109,0,3,"Rekic, Mr. Tido",male,38,0,0,349249,7.8958,,S -110,1,3,"Moran, Miss. Bertha",female,,1,0,371110,24.15,,Q -111,0,1,"Porter, Mr. Walter Chamberlain",male,47,0,0,110465,52,C110,S -112,0,3,"Zabour, Miss. Hileni",female,14.5,1,0,2665,14.4542,,C -113,0,3,"Barton, Mr. David John",male,22,0,0,324669,8.05,,S -114,0,3,"Jussila, Miss. Katriina",female,20,1,0,4136,9.825,,S -115,0,3,"Attalah, Miss. Malake",female,17,0,0,2627,14.4583,,C -116,0,3,"Pekoniemi, Mr. Edvard",male,21,0,0,STON/O 2. 3101294,7.925,,S -117,0,3,"Connors, Mr. Patrick",male,70.5,0,0,370369,7.75,,Q -118,0,2,"Turpin, Mr. William John Robert",male,29,1,0,11668,21,,S -119,0,1,"Baxter, Mr. Quigg Edmond",male,24,0,1,PC 17558,247.5208,B58 B60,C -120,0,3,"Andersson, Miss. Ellis Anna Maria",female,2,4,2,347082,31.275,,S -121,0,2,"Hickman, Mr. Stanley George",male,21,2,0,S.O.C. 14879,73.5,,S -122,0,3,"Moore, Mr. Leonard Charles",male,,0,0,A4. 54510,8.05,,S -123,0,2,"Nasser, Mr. Nicholas",male,32.5,1,0,237736,30.0708,,C -124,1,2,"Webber, Miss. Susan",female,32.5,0,0,27267,13,E101,S -125,0,1,"White, Mr. Percival Wayland",male,54,0,1,35281,77.2875,D26,S -126,1,3,"Nicola-Yarred, Master. Elias",male,12,1,0,2651,11.2417,,C -127,0,3,"McMahon, Mr. Martin",male,,0,0,370372,7.75,,Q -128,1,3,"Madsen, Mr. Fridtjof Arne",male,24,0,0,C 17369,7.1417,,S -129,1,3,"Peter, Miss. Anna",female,,1,1,2668,22.3583,F E69,C -130,0,3,"Ekstrom, Mr. Johan",male,45,0,0,347061,6.975,,S -131,0,3,"Drazenoic, Mr. Jozef",male,33,0,0,349241,7.8958,,C -132,0,3,"Coelho, Mr. Domingos Fernandeo",male,20,0,0,SOTON/O.Q. 3101307,7.05,,S -133,0,3,"Robins, Mrs. Alexander A (Grace Charity Laury)",female,47,1,0,A/5. 3337,14.5,,S -134,1,2,"Weisz, Mrs. Leopold (Mathilde Francoise Pede)",female,29,1,0,228414,26,,S -135,0,2,"Sobey, Mr. Samuel James Hayden",male,25,0,0,C.A. 29178,13,,S -136,0,2,"Richard, Mr. Emile",male,23,0,0,SC/PARIS 2133,15.0458,,C -137,1,1,"Newsom, Miss. Helen Monypeny",female,19,0,2,11752,26.2833,D47,S -138,0,1,"Futrelle, Mr. Jacques Heath",male,37,1,0,113803,53.1,C123,S -139,0,3,"Osen, Mr. Olaf Elon",male,16,0,0,7534,9.2167,,S -140,0,1,"Giglio, Mr. Victor",male,24,0,0,PC 17593,79.2,B86,C -141,0,3,"Boulos, Mrs. Joseph (Sultana)",female,,0,2,2678,15.2458,,C -142,1,3,"Nysten, Miss. Anna Sofia",female,22,0,0,347081,7.75,,S -143,1,3,"Hakkarainen, Mrs. Pekka Pietari (Elin Matilda Dolck)",female,24,1,0,STON/O2. 3101279,15.85,,S -144,0,3,"Burke, Mr. 
Jeremiah",male,19,0,0,365222,6.75,,Q -145,0,2,"Andrew, Mr. Edgardo Samuel",male,18,0,0,231945,11.5,,S -146,0,2,"Nicholls, Mr. Joseph Charles",male,19,1,1,C.A. 33112,36.75,,S -147,1,3,"Andersson, Mr. August Edvard (""Wennerstrom"")",male,27,0,0,350043,7.7958,,S -148,0,3,"Ford, Miss. Robina Maggie ""Ruby""",female,9,2,2,W./C. 6608,34.375,,S -149,0,2,"Navratil, Mr. Michel (""Louis M Hoffman"")",male,36.5,0,2,230080,26,F2,S -150,0,2,"Byles, Rev. Thomas Roussel Davids",male,42,0,0,244310,13,,S -151,0,2,"Bateman, Rev. Robert James",male,51,0,0,S.O.P. 1166,12.525,,S -152,1,1,"Pears, Mrs. Thomas (Edith Wearne)",female,22,1,0,113776,66.6,C2,S -153,0,3,"Meo, Mr. Alfonzo",male,55.5,0,0,A.5. 11206,8.05,,S -154,0,3,"van Billiard, Mr. Austin Blyler",male,40.5,0,2,A/5. 851,14.5,,S -155,0,3,"Olsen, Mr. Ole Martin",male,,0,0,Fa 265302,7.3125,,S -156,0,1,"Williams, Mr. Charles Duane",male,51,0,1,PC 17597,61.3792,,C -157,1,3,"Gilnagh, Miss. Katherine ""Katie""",female,16,0,0,35851,7.7333,,Q -158,0,3,"Corn, Mr. Harry",male,30,0,0,SOTON/OQ 392090,8.05,,S -159,0,3,"Smiljanic, Mr. Mile",male,,0,0,315037,8.6625,,S -160,0,3,"Sage, Master. Thomas Henry",male,,8,2,CA. 2343,69.55,,S -161,0,3,"Cribb, Mr. John Hatfield",male,44,0,1,371362,16.1,,S -162,1,2,"Watt, Mrs. James (Elizabeth ""Bessie"" Inglis Milne)",female,40,0,0,C.A. 33595,15.75,,S -163,0,3,"Bengtsson, Mr. John Viktor",male,26,0,0,347068,7.775,,S -164,0,3,"Calic, Mr. Jovo",male,17,0,0,315093,8.6625,,S -165,0,3,"Panula, Master. Eino Viljami",male,1,4,1,3101295,39.6875,,S -166,1,3,"Goldsmith, Master. Frank John William ""Frankie""",male,9,0,2,363291,20.525,,S -167,1,1,"Chibnall, Mrs. (Edith Martha Bowerman)",female,,0,1,113505,55,E33,S -168,0,3,"Skoog, Mrs. William (Anna Bernhardina Karlsson)",female,45,1,4,347088,27.9,,S -169,0,1,"Baumann, Mr. John D",male,,0,0,PC 17318,25.925,,S -170,0,3,"Ling, Mr. Lee",male,28,0,0,1601,56.4958,,S -171,0,1,"Van der hoef, Mr. Wyckoff",male,61,0,0,111240,33.5,B19,S -172,0,3,"Rice, Master. Arthur",male,4,4,1,382652,29.125,,Q -173,1,3,"Johnson, Miss. Eleanor Ileen",female,1,1,1,347742,11.1333,,S -174,0,3,"Sivola, Mr. Antti Wilhelm",male,21,0,0,STON/O 2. 3101280,7.925,,S -175,0,1,"Smith, Mr. James Clinch",male,56,0,0,17764,30.6958,A7,C -176,0,3,"Klasen, Mr. Klas Albin",male,18,1,1,350404,7.8542,,S -177,0,3,"Lefebre, Master. Henry Forbes",male,,3,1,4133,25.4667,,S -178,0,1,"Isham, Miss. Ann Elizabeth",female,50,0,0,PC 17595,28.7125,C49,C -179,0,2,"Hale, Mr. Reginald",male,30,0,0,250653,13,,S -180,0,3,"Leonard, Mr. Lionel",male,36,0,0,LINE,0,,S -181,0,3,"Sage, Miss. Constance Gladys",female,,8,2,CA. 2343,69.55,,S -182,0,2,"Pernot, Mr. Rene",male,,0,0,SC/PARIS 2131,15.05,,C -183,0,3,"Asplund, Master. Clarence Gustaf Hugo",male,9,4,2,347077,31.3875,,S -184,1,2,"Becker, Master. Richard F",male,1,2,1,230136,39,F4,S -185,1,3,"Kink-Heilmann, Miss. Luise Gretchen",female,4,0,2,315153,22.025,,S -186,0,1,"Rood, Mr. Hugh Roscoe",male,,0,0,113767,50,A32,S -187,1,3,"O'Brien, Mrs. Thomas (Johanna ""Hannah"" Godfrey)",female,,1,0,370365,15.5,,Q -188,1,1,"Romaine, Mr. Charles Hallace (""Mr C Rolmane"")",male,45,0,0,111428,26.55,,S -189,0,3,"Bourke, Mr. John",male,40,1,1,364849,15.5,,Q -190,0,3,"Turcin, Mr. Stjepan",male,36,0,0,349247,7.8958,,S -191,1,2,"Pinsky, Mrs. (Rosa)",female,32,0,0,234604,13,,S -192,0,2,"Carbines, Mr. William",male,19,0,0,28424,13,,S -193,1,3,"Andersen-Jensen, Miss. Carla Christine Nielsine",female,19,1,0,350046,7.8542,,S -194,1,2,"Navratil, Master. Michel M",male,3,1,1,230080,26,F2,S -195,1,1,"Brown, Mrs. 
James Joseph (Margaret Tobin)",female,44,0,0,PC 17610,27.7208,B4,C -196,1,1,"Lurette, Miss. Elise",female,58,0,0,PC 17569,146.5208,B80,C -197,0,3,"Mernagh, Mr. Robert",male,,0,0,368703,7.75,,Q -198,0,3,"Olsen, Mr. Karl Siegwart Andreas",male,42,0,1,4579,8.4042,,S -199,1,3,"Madigan, Miss. Margaret ""Maggie""",female,,0,0,370370,7.75,,Q -200,0,2,"Yrois, Miss. Henriette (""Mrs Harbeck"")",female,24,0,0,248747,13,,S -201,0,3,"Vande Walle, Mr. Nestor Cyriel",male,28,0,0,345770,9.5,,S -202,0,3,"Sage, Mr. Frederick",male,,8,2,CA. 2343,69.55,,S -203,0,3,"Johanson, Mr. Jakob Alfred",male,34,0,0,3101264,6.4958,,S -204,0,3,"Youseff, Mr. Gerious",male,45.5,0,0,2628,7.225,,C -205,1,3,"Cohen, Mr. Gurshon ""Gus""",male,18,0,0,A/5 3540,8.05,,S -206,0,3,"Strom, Miss. Telma Matilda",female,2,0,1,347054,10.4625,G6,S -207,0,3,"Backstrom, Mr. Karl Alfred",male,32,1,0,3101278,15.85,,S -208,1,3,"Albimona, Mr. Nassef Cassem",male,26,0,0,2699,18.7875,,C -209,1,3,"Carr, Miss. Helen ""Ellen""",female,16,0,0,367231,7.75,,Q -210,1,1,"Blank, Mr. Henry",male,40,0,0,112277,31,A31,C -211,0,3,"Ali, Mr. Ahmed",male,24,0,0,SOTON/O.Q. 3101311,7.05,,S -212,1,2,"Cameron, Miss. Clear Annie",female,35,0,0,F.C.C. 13528,21,,S -213,0,3,"Perkin, Mr. John Henry",male,22,0,0,A/5 21174,7.25,,S -214,0,2,"Givard, Mr. Hans Kristensen",male,30,0,0,250646,13,,S -215,0,3,"Kiernan, Mr. Philip",male,,1,0,367229,7.75,,Q -216,1,1,"Newell, Miss. Madeleine",female,31,1,0,35273,113.275,D36,C -217,1,3,"Honkanen, Miss. Eliina",female,27,0,0,STON/O2. 3101283,7.925,,S -218,0,2,"Jacobsohn, Mr. Sidney Samuel",male,42,1,0,243847,27,,S -219,1,1,"Bazzani, Miss. Albina",female,32,0,0,11813,76.2917,D15,C -220,0,2,"Harris, Mr. Walter",male,30,0,0,W/C 14208,10.5,,S -221,1,3,"Sunderland, Mr. Victor Francis",male,16,0,0,SOTON/OQ 392089,8.05,,S -222,0,2,"Bracken, Mr. James H",male,27,0,0,220367,13,,S -223,0,3,"Green, Mr. George Henry",male,51,0,0,21440,8.05,,S -224,0,3,"Nenkoff, Mr. Christo",male,,0,0,349234,7.8958,,S -225,1,1,"Hoyt, Mr. Frederick Maxfield",male,38,1,0,19943,90,C93,S -226,0,3,"Berglund, Mr. Karl Ivar Sven",male,22,0,0,PP 4348,9.35,,S -227,1,2,"Mellors, Mr. William John",male,19,0,0,SW/PP 751,10.5,,S -228,0,3,"Lovell, Mr. John Hall (""Henry"")",male,20.5,0,0,A/5 21173,7.25,,S -229,0,2,"Fahlstrom, Mr. Arne Jonas",male,18,0,0,236171,13,,S -230,0,3,"Lefebre, Miss. Mathilde",female,,3,1,4133,25.4667,,S -231,1,1,"Harris, Mrs. Henry Birkhardt (Irene Wallach)",female,35,1,0,36973,83.475,C83,S -232,0,3,"Larsson, Mr. Bengt Edvin",male,29,0,0,347067,7.775,,S -233,0,2,"Sjostedt, Mr. Ernst Adolf",male,59,0,0,237442,13.5,,S -234,1,3,"Asplund, Miss. Lillian Gertrud",female,5,4,2,347077,31.3875,,S -235,0,2,"Leyson, Mr. Robert William Norman",male,24,0,0,C.A. 29566,10.5,,S -236,0,3,"Harknett, Miss. Alice Phoebe",female,,0,0,W./C. 6609,7.55,,S -237,0,2,"Hold, Mr. Stephen",male,44,1,0,26707,26,,S -238,1,2,"Collyer, Miss. Marjorie ""Lottie""",female,8,0,2,C.A. 31921,26.25,,S -239,0,2,"Pengelly, Mr. Frederick William",male,19,0,0,28665,10.5,,S -240,0,2,"Hunt, Mr. George Henry",male,33,0,0,SCO/W 1585,12.275,,S -241,0,3,"Zabour, Miss. Thamine",female,,1,0,2665,14.4542,,C -242,1,3,"Murphy, Miss. Katherine ""Kate""",female,,1,0,367230,15.5,,Q -243,0,2,"Coleridge, Mr. Reginald Charles",male,29,0,0,W./C. 14263,10.5,,S -244,0,3,"Maenpaa, Mr. Matti Alexanteri",male,22,0,0,STON/O 2. 3101275,7.125,,S -245,0,3,"Attalah, Mr. Sleiman",male,30,0,0,2694,7.225,,C -246,0,1,"Minahan, Dr. William Edward",male,44,2,0,19928,90,C78,Q -247,0,3,"Lindahl, Miss. 
Agda Thorilda Viktoria",female,25,0,0,347071,7.775,,S -248,1,2,"Hamalainen, Mrs. William (Anna)",female,24,0,2,250649,14.5,,S -249,1,1,"Beckwith, Mr. Richard Leonard",male,37,1,1,11751,52.5542,D35,S -250,0,2,"Carter, Rev. Ernest Courtenay",male,54,1,0,244252,26,,S -251,0,3,"Reed, Mr. James George",male,,0,0,362316,7.25,,S -252,0,3,"Strom, Mrs. Wilhelm (Elna Matilda Persson)",female,29,1,1,347054,10.4625,G6,S -253,0,1,"Stead, Mr. William Thomas",male,62,0,0,113514,26.55,C87,S -254,0,3,"Lobb, Mr. William Arthur",male,30,1,0,A/5. 3336,16.1,,S -255,0,3,"Rosblom, Mrs. Viktor (Helena Wilhelmina)",female,41,0,2,370129,20.2125,,S -256,1,3,"Touma, Mrs. Darwis (Hanne Youssef Razi)",female,29,0,2,2650,15.2458,,C -257,1,1,"Thorne, Mrs. Gertrude Maybelle",female,,0,0,PC 17585,79.2,,C -258,1,1,"Cherry, Miss. Gladys",female,30,0,0,110152,86.5,B77,S -259,1,1,"Ward, Miss. Anna",female,35,0,0,PC 17755,512.3292,,C -260,1,2,"Parrish, Mrs. (Lutie Davis)",female,50,0,1,230433,26,,S -261,0,3,"Smith, Mr. Thomas",male,,0,0,384461,7.75,,Q -262,1,3,"Asplund, Master. Edvin Rojj Felix",male,3,4,2,347077,31.3875,,S -263,0,1,"Taussig, Mr. Emil",male,52,1,1,110413,79.65,E67,S -264,0,1,"Harrison, Mr. William",male,40,0,0,112059,0,B94,S -265,0,3,"Henry, Miss. Delia",female,,0,0,382649,7.75,,Q -266,0,2,"Reeves, Mr. David",male,36,0,0,C.A. 17248,10.5,,S -267,0,3,"Panula, Mr. Ernesti Arvid",male,16,4,1,3101295,39.6875,,S -268,1,3,"Persson, Mr. Ernst Ulrik",male,25,1,0,347083,7.775,,S -269,1,1,"Graham, Mrs. William Thompson (Edith Junkins)",female,58,0,1,PC 17582,153.4625,C125,S -270,1,1,"Bissette, Miss. Amelia",female,35,0,0,PC 17760,135.6333,C99,S -271,0,1,"Cairns, Mr. Alexander",male,,0,0,113798,31,,S -272,1,3,"Tornquist, Mr. William Henry",male,25,0,0,LINE,0,,S -273,1,2,"Mellinger, Mrs. (Elizabeth Anne Maidment)",female,41,0,1,250644,19.5,,S -274,0,1,"Natsch, Mr. Charles H",male,37,0,1,PC 17596,29.7,C118,C -275,1,3,"Healy, Miss. Hanora ""Nora""",female,,0,0,370375,7.75,,Q -276,1,1,"Andrews, Miss. Kornelia Theodosia",female,63,1,0,13502,77.9583,D7,S -277,0,3,"Lindblom, Miss. Augusta Charlotta",female,45,0,0,347073,7.75,,S -278,0,2,"Parkes, Mr. Francis ""Frank""",male,,0,0,239853,0,,S -279,0,3,"Rice, Master. Eric",male,7,4,1,382652,29.125,,Q -280,1,3,"Abbott, Mrs. Stanton (Rosa Hunt)",female,35,1,1,C.A. 2673,20.25,,S -281,0,3,"Duane, Mr. Frank",male,65,0,0,336439,7.75,,Q -282,0,3,"Olsson, Mr. Nils Johan Goransson",male,28,0,0,347464,7.8542,,S -283,0,3,"de Pelsmaeker, Mr. Alfons",male,16,0,0,345778,9.5,,S -284,1,3,"Dorking, Mr. Edward Arthur",male,19,0,0,A/5. 10482,8.05,,S -285,0,1,"Smith, Mr. Richard William",male,,0,0,113056,26,A19,S -286,0,3,"Stankovic, Mr. Ivan",male,33,0,0,349239,8.6625,,C -287,1,3,"de Mulder, Mr. Theodore",male,30,0,0,345774,9.5,,S -288,0,3,"Naidenoff, Mr. Penko",male,22,0,0,349206,7.8958,,S -289,1,2,"Hosono, Mr. Masabumi",male,42,0,0,237798,13,,S -290,1,3,"Connolly, Miss. Kate",female,22,0,0,370373,7.75,,Q -291,1,1,"Barber, Miss. Ellen ""Nellie""",female,26,0,0,19877,78.85,,S -292,1,1,"Bishop, Mrs. Dickinson H (Helen Walton)",female,19,1,0,11967,91.0792,B49,C -293,0,2,"Levy, Mr. Rene Jacques",male,36,0,0,SC/Paris 2163,12.875,D,C -294,0,3,"Haas, Miss. Aloisia",female,24,0,0,349236,8.85,,S -295,0,3,"Mineff, Mr. Ivan",male,24,0,0,349233,7.8958,,S -296,0,1,"Lewy, Mr. Ervin G",male,,0,0,PC 17612,27.7208,,C -297,0,3,"Hanna, Mr. Mansour",male,23.5,0,0,2693,7.2292,,C -298,0,1,"Allison, Miss. Helen Loraine",female,2,1,2,113781,151.55,C22 C26,S -299,1,1,"Saalfeld, Mr. 
Adolphe",male,,0,0,19988,30.5,C106,S -300,1,1,"Baxter, Mrs. James (Helene DeLaudeniere Chaput)",female,50,0,1,PC 17558,247.5208,B58 B60,C -301,1,3,"Kelly, Miss. Anna Katherine ""Annie Kate""",female,,0,0,9234,7.75,,Q -302,1,3,"McCoy, Mr. Bernard",male,,2,0,367226,23.25,,Q -303,0,3,"Johnson, Mr. William Cahoone Jr",male,19,0,0,LINE,0,,S -304,1,2,"Keane, Miss. Nora A",female,,0,0,226593,12.35,E101,Q -305,0,3,"Williams, Mr. Howard Hugh ""Harry""",male,,0,0,A/5 2466,8.05,,S -306,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.55,C22 C26,S -307,1,1,"Fleming, Miss. Margaret",female,,0,0,17421,110.8833,,C -308,1,1,"Penasco y Castellana, Mrs. Victor de Satode (Maria Josefa Perez de Soto y Vallejo)",female,17,1,0,PC 17758,108.9,C65,C -309,0,2,"Abelson, Mr. Samuel",male,30,1,0,P/PP 3381,24,,C -310,1,1,"Francatelli, Miss. Laura Mabel",female,30,0,0,PC 17485,56.9292,E36,C -311,1,1,"Hays, Miss. Margaret Bechstein",female,24,0,0,11767,83.1583,C54,C -312,1,1,"Ryerson, Miss. Emily Borie",female,18,2,2,PC 17608,262.375,B57 B59 B63 B66,C -313,0,2,"Lahtinen, Mrs. William (Anna Sylfven)",female,26,1,1,250651,26,,S -314,0,3,"Hendekovic, Mr. Ignjac",male,28,0,0,349243,7.8958,,S -315,0,2,"Hart, Mr. Benjamin",male,43,1,1,F.C.C. 13529,26.25,,S -316,1,3,"Nilsson, Miss. Helmina Josefina",female,26,0,0,347470,7.8542,,S -317,1,2,"Kantor, Mrs. Sinai (Miriam Sternin)",female,24,1,0,244367,26,,S -318,0,2,"Moraweck, Dr. Ernest",male,54,0,0,29011,14,,S -319,1,1,"Wick, Miss. Mary Natalie",female,31,0,2,36928,164.8667,C7,S -320,1,1,"Spedden, Mrs. Frederic Oakley (Margaretta Corning Stone)",female,40,1,1,16966,134.5,E34,C -321,0,3,"Dennis, Mr. Samuel",male,22,0,0,A/5 21172,7.25,,S -322,0,3,"Danoff, Mr. Yoto",male,27,0,0,349219,7.8958,,S -323,1,2,"Slayter, Miss. Hilda Mary",female,30,0,0,234818,12.35,,Q -324,1,2,"Caldwell, Mrs. Albert Francis (Sylvia Mae Harbaugh)",female,22,1,1,248738,29,,S -325,0,3,"Sage, Mr. George John Jr",male,,8,2,CA. 2343,69.55,,S -326,1,1,"Young, Miss. Marie Grice",female,36,0,0,PC 17760,135.6333,C32,C -327,0,3,"Nysveen, Mr. Johan Hansen",male,61,0,0,345364,6.2375,,S -328,1,2,"Ball, Mrs. (Ada E Hall)",female,36,0,0,28551,13,D,S -329,1,3,"Goldsmith, Mrs. Frank John (Emily Alice Brown)",female,31,1,1,363291,20.525,,S -330,1,1,"Hippach, Miss. Jean Gertrude",female,16,0,1,111361,57.9792,B18,C -331,1,3,"McCoy, Miss. Agnes",female,,2,0,367226,23.25,,Q -332,0,1,"Partner, Mr. Austen",male,45.5,0,0,113043,28.5,C124,S -333,0,1,"Graham, Mr. George Edward",male,38,0,1,PC 17582,153.4625,C91,S -334,0,3,"Vander Planke, Mr. Leo Edmondus",male,16,2,0,345764,18,,S -335,1,1,"Frauenthal, Mrs. Henry William (Clara Heinsheimer)",female,,1,0,PC 17611,133.65,,S -336,0,3,"Denkoff, Mr. Mitto",male,,0,0,349225,7.8958,,S -337,0,1,"Pears, Mr. Thomas Clinton",male,29,1,0,113776,66.6,C2,S -338,1,1,"Burns, Miss. Elizabeth Margaret",female,41,0,0,16966,134.5,E40,C -339,1,3,"Dahl, Mr. Karl Edwart",male,45,0,0,7598,8.05,,S -340,0,1,"Blackwell, Mr. Stephen Weart",male,45,0,0,113784,35.5,T,S -341,1,2,"Navratil, Master. Edmond Roger",male,2,1,1,230080,26,F2,S -342,1,1,"Fortune, Miss. Alice Elizabeth",female,24,3,2,19950,263,C23 C25 C27,S -343,0,2,"Collander, Mr. Erik Gustaf",male,28,0,0,248740,13,,S -344,0,2,"Sedgwick, Mr. Charles Frederick Waddington",male,25,0,0,244361,13,,S -345,0,2,"Fox, Mr. Stanley Hubert",male,36,0,0,229236,13,,S -346,1,2,"Brown, Miss. Amelia ""Mildred""",female,24,0,0,248733,13,F33,S -347,1,2,"Smith, Miss. Marion Elsie",female,40,0,0,31418,13,,S -348,1,3,"Davison, Mrs. 
Thomas Henry (Mary E Finck)",female,,1,0,386525,16.1,,S -349,1,3,"Coutts, Master. William Loch ""William""",male,3,1,1,C.A. 37671,15.9,,S -350,0,3,"Dimic, Mr. Jovan",male,42,0,0,315088,8.6625,,S -351,0,3,"Odahl, Mr. Nils Martin",male,23,0,0,7267,9.225,,S -352,0,1,"Williams-Lambert, Mr. Fletcher Fellows",male,,0,0,113510,35,C128,S -353,0,3,"Elias, Mr. Tannous",male,15,1,1,2695,7.2292,,C -354,0,3,"Arnold-Franchi, Mr. Josef",male,25,1,0,349237,17.8,,S -355,0,3,"Yousif, Mr. Wazli",male,,0,0,2647,7.225,,C -356,0,3,"Vanden Steen, Mr. Leo Peter",male,28,0,0,345783,9.5,,S -357,1,1,"Bowerman, Miss. Elsie Edith",female,22,0,1,113505,55,E33,S -358,0,2,"Funk, Miss. Annie Clemmer",female,38,0,0,237671,13,,S -359,1,3,"McGovern, Miss. Mary",female,,0,0,330931,7.8792,,Q -360,1,3,"Mockler, Miss. Helen Mary ""Ellie""",female,,0,0,330980,7.8792,,Q -361,0,3,"Skoog, Mr. Wilhelm",male,40,1,4,347088,27.9,,S -362,0,2,"del Carlo, Mr. Sebastiano",male,29,1,0,SC/PARIS 2167,27.7208,,C -363,0,3,"Barbara, Mrs. (Catherine David)",female,45,0,1,2691,14.4542,,C -364,0,3,"Asim, Mr. Adola",male,35,0,0,SOTON/O.Q. 3101310,7.05,,S -365,0,3,"O'Brien, Mr. Thomas",male,,1,0,370365,15.5,,Q -366,0,3,"Adahl, Mr. Mauritz Nils Martin",male,30,0,0,C 7076,7.25,,S -367,1,1,"Warren, Mrs. Frank Manley (Anna Sophia Atkinson)",female,60,1,0,110813,75.25,D37,C -368,1,3,"Moussa, Mrs. (Mantoura Boulos)",female,,0,0,2626,7.2292,,C -369,1,3,"Jermyn, Miss. Annie",female,,0,0,14313,7.75,,Q -370,1,1,"Aubart, Mme. Leontine Pauline",female,24,0,0,PC 17477,69.3,B35,C -371,1,1,"Harder, Mr. George Achilles",male,25,1,0,11765,55.4417,E50,C -372,0,3,"Wiklund, Mr. Jakob Alfred",male,18,1,0,3101267,6.4958,,S -373,0,3,"Beavan, Mr. William Thomas",male,19,0,0,323951,8.05,,S -374,0,1,"Ringhini, Mr. Sante",male,22,0,0,PC 17760,135.6333,,C -375,0,3,"Palsson, Miss. Stina Viola",female,3,3,1,349909,21.075,,S -376,1,1,"Meyer, Mrs. Edgar Joseph (Leila Saks)",female,,1,0,PC 17604,82.1708,,C -377,1,3,"Landergren, Miss. Aurora Adelia",female,22,0,0,C 7077,7.25,,S -378,0,1,"Widener, Mr. Harry Elkins",male,27,0,2,113503,211.5,C82,C -379,0,3,"Betros, Mr. Tannous",male,20,0,0,2648,4.0125,,C -380,0,3,"Gustafsson, Mr. Karl Gideon",male,19,0,0,347069,7.775,,S -381,1,1,"Bidois, Miss. Rosalie",female,42,0,0,PC 17757,227.525,,C -382,1,3,"Nakid, Miss. Maria (""Mary"")",female,1,0,2,2653,15.7417,,C -383,0,3,"Tikkanen, Mr. Juho",male,32,0,0,STON/O 2. 3101293,7.925,,S -384,1,1,"Holverson, Mrs. Alexander Oskar (Mary Aline Towner)",female,35,1,0,113789,52,,S -385,0,3,"Plotcharsky, Mr. Vasil",male,,0,0,349227,7.8958,,S -386,0,2,"Davies, Mr. Charles Henry",male,18,0,0,S.O.C. 14879,73.5,,S -387,0,3,"Goodwin, Master. Sidney Leonard",male,1,5,2,CA 2144,46.9,,S -388,1,2,"Buss, Miss. Kate",female,36,0,0,27849,13,,S -389,0,3,"Sadlier, Mr. Matthew",male,,0,0,367655,7.7292,,Q -390,1,2,"Lehmann, Miss. Bertha",female,17,0,0,SC 1748,12,,C -391,1,1,"Carter, Mr. William Ernest",male,36,1,2,113760,120,B96 B98,S -392,1,3,"Jansson, Mr. Carl Olof",male,21,0,0,350034,7.7958,,S -393,0,3,"Gustafsson, Mr. Johan Birger",male,28,2,0,3101277,7.925,,S -394,1,1,"Newell, Miss. Marjorie",female,23,1,0,35273,113.275,D36,C -395,1,3,"Sandstrom, Mrs. Hjalmar (Agnes Charlotta Bengtsson)",female,24,0,2,PP 9549,16.7,G6,S -396,0,3,"Johansson, Mr. Erik",male,22,0,0,350052,7.7958,,S -397,0,3,"Olsson, Miss. Elina",female,31,0,0,350407,7.8542,,S -398,0,2,"McKane, Mr. Peter David",male,46,0,0,28403,26,,S -399,0,2,"Pain, Dr. Alfred",male,23,0,0,244278,10.5,,S -400,1,2,"Trout, Mrs. 
William H (Jessie L)",female,28,0,0,240929,12.65,,S -401,1,3,"Niskanen, Mr. Juha",male,39,0,0,STON/O 2. 3101289,7.925,,S -402,0,3,"Adams, Mr. John",male,26,0,0,341826,8.05,,S -403,0,3,"Jussila, Miss. Mari Aina",female,21,1,0,4137,9.825,,S -404,0,3,"Hakkarainen, Mr. Pekka Pietari",male,28,1,0,STON/O2. 3101279,15.85,,S -405,0,3,"Oreskovic, Miss. Marija",female,20,0,0,315096,8.6625,,S -406,0,2,"Gale, Mr. Shadrach",male,34,1,0,28664,21,,S -407,0,3,"Widegren, Mr. Carl/Charles Peter",male,51,0,0,347064,7.75,,S -408,1,2,"Richards, Master. William Rowe",male,3,1,1,29106,18.75,,S -409,0,3,"Birkeland, Mr. Hans Martin Monsen",male,21,0,0,312992,7.775,,S -410,0,3,"Lefebre, Miss. Ida",female,,3,1,4133,25.4667,,S -411,0,3,"Sdycoff, Mr. Todor",male,,0,0,349222,7.8958,,S -412,0,3,"Hart, Mr. Henry",male,,0,0,394140,6.8583,,Q -413,1,1,"Minahan, Miss. Daisy E",female,33,1,0,19928,90,C78,Q -414,0,2,"Cunningham, Mr. Alfred Fleming",male,,0,0,239853,0,,S -415,1,3,"Sundman, Mr. Johan Julian",male,44,0,0,STON/O 2. 3101269,7.925,,S -416,0,3,"Meek, Mrs. Thomas (Annie Louise Rowley)",female,,0,0,343095,8.05,,S -417,1,2,"Drew, Mrs. James Vivian (Lulu Thorne Christian)",female,34,1,1,28220,32.5,,S -418,1,2,"Silven, Miss. Lyyli Karoliina",female,18,0,2,250652,13,,S -419,0,2,"Matthews, Mr. William John",male,30,0,0,28228,13,,S -420,0,3,"Van Impe, Miss. Catharina",female,10,0,2,345773,24.15,,S -421,0,3,"Gheorgheff, Mr. Stanio",male,,0,0,349254,7.8958,,C -422,0,3,"Charters, Mr. David",male,21,0,0,A/5. 13032,7.7333,,Q -423,0,3,"Zimmerman, Mr. Leo",male,29,0,0,315082,7.875,,S -424,0,3,"Danbom, Mrs. Ernst Gilbert (Anna Sigrid Maria Brogren)",female,28,1,1,347080,14.4,,S -425,0,3,"Rosblom, Mr. Viktor Richard",male,18,1,1,370129,20.2125,,S -426,0,3,"Wiseman, Mr. Phillippe",male,,0,0,A/4. 34244,7.25,,S -427,1,2,"Clarke, Mrs. Charles V (Ada Maria Winfield)",female,28,1,0,2003,26,,S -428,1,2,"Phillips, Miss. Kate Florence (""Mrs Kate Louise Phillips Marshall"")",female,19,0,0,250655,26,,S -429,0,3,"Flynn, Mr. James",male,,0,0,364851,7.75,,Q -430,1,3,"Pickard, Mr. Berk (Berk Trembisky)",male,32,0,0,SOTON/O.Q. 392078,8.05,E10,S -431,1,1,"Bjornstrom-Steffansson, Mr. Mauritz Hakan",male,28,0,0,110564,26.55,C52,S -432,1,3,"Thorneycroft, Mrs. Percival (Florence Kate White)",female,,1,0,376564,16.1,,S -433,1,2,"Louch, Mrs. Charles Alexander (Alice Adelaide Slow)",female,42,1,0,SC/AH 3085,26,,S -434,0,3,"Kallio, Mr. Nikolai Erland",male,17,0,0,STON/O 2. 3101274,7.125,,S -435,0,1,"Silvey, Mr. William Baird",male,50,1,0,13507,55.9,E44,S -436,1,1,"Carter, Miss. Lucile Polk",female,14,1,2,113760,120,B96 B98,S -437,0,3,"Ford, Miss. Doolina Margaret ""Daisy""",female,21,2,2,W./C. 6608,34.375,,S -438,1,2,"Richards, Mrs. Sidney (Emily Hocking)",female,24,2,3,29106,18.75,,S -439,0,1,"Fortune, Mr. Mark",male,64,1,4,19950,263,C23 C25 C27,S -440,0,2,"Kvillner, Mr. Johan Henrik Johannesson",male,31,0,0,C.A. 18723,10.5,,S -441,1,2,"Hart, Mrs. Benjamin (Esther Ada Bloomfield)",female,45,1,1,F.C.C. 13529,26.25,,S -442,0,3,"Hampe, Mr. Leon",male,20,0,0,345769,9.5,,S -443,0,3,"Petterson, Mr. Johan Emil",male,25,1,0,347076,7.775,,S -444,1,2,"Reynaldo, Ms. Encarnacion",female,28,0,0,230434,13,,S -445,1,3,"Johannesen-Bratthammer, Mr. Bernt",male,,0,0,65306,8.1125,,S -446,1,1,"Dodge, Master. Washington",male,4,0,2,33638,81.8583,A34,S -447,1,2,"Mellinger, Miss. Madeleine Violet",female,13,0,1,250644,19.5,,S -448,1,1,"Seward, Mr. Frederic Kimber",male,34,0,0,113794,26.55,,S -449,1,3,"Baclini, Miss. 
Marie Catherine",female,5,2,1,2666,19.2583,,C -450,1,1,"Peuchen, Major. Arthur Godfrey",male,52,0,0,113786,30.5,C104,S -451,0,2,"West, Mr. Edwy Arthur",male,36,1,2,C.A. 34651,27.75,,S -452,0,3,"Hagland, Mr. Ingvald Olai Olsen",male,,1,0,65303,19.9667,,S -453,0,1,"Foreman, Mr. Benjamin Laventall",male,30,0,0,113051,27.75,C111,C -454,1,1,"Goldenberg, Mr. Samuel L",male,49,1,0,17453,89.1042,C92,C -455,0,3,"Peduzzi, Mr. Joseph",male,,0,0,A/5 2817,8.05,,S -456,1,3,"Jalsevac, Mr. Ivan",male,29,0,0,349240,7.8958,,C -457,0,1,"Millet, Mr. Francis Davis",male,65,0,0,13509,26.55,E38,S -458,1,1,"Kenyon, Mrs. Frederick R (Marion)",female,,1,0,17464,51.8625,D21,S -459,1,2,"Toomey, Miss. Ellen",female,50,0,0,F.C.C. 13531,10.5,,S -460,0,3,"O'Connor, Mr. Maurice",male,,0,0,371060,7.75,,Q -461,1,1,"Anderson, Mr. Harry",male,48,0,0,19952,26.55,E12,S -462,0,3,"Morley, Mr. William",male,34,0,0,364506,8.05,,S -463,0,1,"Gee, Mr. Arthur H",male,47,0,0,111320,38.5,E63,S -464,0,2,"Milling, Mr. Jacob Christian",male,48,0,0,234360,13,,S -465,0,3,"Maisner, Mr. Simon",male,,0,0,A/S 2816,8.05,,S -466,0,3,"Goncalves, Mr. Manuel Estanslas",male,38,0,0,SOTON/O.Q. 3101306,7.05,,S -467,0,2,"Campbell, Mr. William",male,,0,0,239853,0,,S -468,0,1,"Smart, Mr. John Montgomery",male,56,0,0,113792,26.55,,S -469,0,3,"Scanlan, Mr. James",male,,0,0,36209,7.725,,Q -470,1,3,"Baclini, Miss. Helene Barbara",female,0.75,2,1,2666,19.2583,,C -471,0,3,"Keefe, Mr. Arthur",male,,0,0,323592,7.25,,S -472,0,3,"Cacic, Mr. Luka",male,38,0,0,315089,8.6625,,S -473,1,2,"West, Mrs. Edwy Arthur (Ada Mary Worth)",female,33,1,2,C.A. 34651,27.75,,S -474,1,2,"Jerwan, Mrs. Amin S (Marie Marthe Thuillard)",female,23,0,0,SC/AH Basle 541,13.7917,D,C -475,0,3,"Strandberg, Miss. Ida Sofia",female,22,0,0,7553,9.8375,,S -476,0,1,"Clifford, Mr. George Quincy",male,,0,0,110465,52,A14,S -477,0,2,"Renouf, Mr. Peter Henry",male,34,1,0,31027,21,,S -478,0,3,"Braund, Mr. Lewis Richard",male,29,1,0,3460,7.0458,,S -479,0,3,"Karlsson, Mr. Nils August",male,22,0,0,350060,7.5208,,S -480,1,3,"Hirvonen, Miss. Hildur E",female,2,0,1,3101298,12.2875,,S -481,0,3,"Goodwin, Master. Harold Victor",male,9,5,2,CA 2144,46.9,,S -482,0,2,"Frost, Mr. Anthony Wood ""Archie""",male,,0,0,239854,0,,S -483,0,3,"Rouse, Mr. Richard Henry",male,50,0,0,A/5 3594,8.05,,S -484,1,3,"Turkula, Mrs. (Hedwig)",female,63,0,0,4134,9.5875,,S -485,1,1,"Bishop, Mr. Dickinson H",male,25,1,0,11967,91.0792,B49,C -486,0,3,"Lefebre, Miss. Jeannie",female,,3,1,4133,25.4667,,S -487,1,1,"Hoyt, Mrs. Frederick Maxfield (Jane Anne Forby)",female,35,1,0,19943,90,C93,S -488,0,1,"Kent, Mr. Edward Austin",male,58,0,0,11771,29.7,B37,C -489,0,3,"Somerton, Mr. Francis William",male,30,0,0,A.5. 18509,8.05,,S -490,1,3,"Coutts, Master. Eden Leslie ""Neville""",male,9,1,1,C.A. 37671,15.9,,S -491,0,3,"Hagland, Mr. Konrad Mathias Reiersen",male,,1,0,65304,19.9667,,S -492,0,3,"Windelov, Mr. Einar",male,21,0,0,SOTON/OQ 3101317,7.25,,S -493,0,1,"Molson, Mr. Harry Markland",male,55,0,0,113787,30.5,C30,S -494,0,1,"Artagaveytia, Mr. Ramon",male,71,0,0,PC 17609,49.5042,,C -495,0,3,"Stanley, Mr. Edward Roland",male,21,0,0,A/4 45380,8.05,,S -496,0,3,"Yousseff, Mr. Gerious",male,,0,0,2627,14.4583,,C -497,1,1,"Eustis, Miss. Elizabeth Mussey",female,54,1,0,36947,78.2667,D20,C -498,0,3,"Shellard, Mr. Frederick William",male,,0,0,C.A. 6212,15.1,,S -499,0,1,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25,1,2,113781,151.55,C22 C26,S -500,0,3,"Svensson, Mr. Olof",male,24,0,0,350035,7.7958,,S -501,0,3,"Calic, Mr. 
Petar",male,17,0,0,315086,8.6625,,S -502,0,3,"Canavan, Miss. Mary",female,21,0,0,364846,7.75,,Q -503,0,3,"O'Sullivan, Miss. Bridget Mary",female,,0,0,330909,7.6292,,Q -504,0,3,"Laitinen, Miss. Kristina Sofia",female,37,0,0,4135,9.5875,,S -505,1,1,"Maioni, Miss. Roberta",female,16,0,0,110152,86.5,B79,S -506,0,1,"Penasco y Castellana, Mr. Victor de Satode",male,18,1,0,PC 17758,108.9,C65,C -507,1,2,"Quick, Mrs. Frederick Charles (Jane Richards)",female,33,0,2,26360,26,,S -508,1,1,"Bradley, Mr. George (""George Arthur Brayton"")",male,,0,0,111427,26.55,,S -509,0,3,"Olsen, Mr. Henry Margido",male,28,0,0,C 4001,22.525,,S -510,1,3,"Lang, Mr. Fang",male,26,0,0,1601,56.4958,,S -511,1,3,"Daly, Mr. Eugene Patrick",male,29,0,0,382651,7.75,,Q -512,0,3,"Webber, Mr. James",male,,0,0,SOTON/OQ 3101316,8.05,,S -513,1,1,"McGough, Mr. James Robert",male,36,0,0,PC 17473,26.2875,E25,S -514,1,1,"Rothschild, Mrs. Martin (Elizabeth L. Barrett)",female,54,1,0,PC 17603,59.4,,C -515,0,3,"Coleff, Mr. Satio",male,24,0,0,349209,7.4958,,S -516,0,1,"Walker, Mr. William Anderson",male,47,0,0,36967,34.0208,D46,S -517,1,2,"Lemore, Mrs. (Amelia Milley)",female,34,0,0,C.A. 34260,10.5,F33,S -518,0,3,"Ryan, Mr. Patrick",male,,0,0,371110,24.15,,Q -519,1,2,"Angle, Mrs. William A (Florence ""Mary"" Agnes Hughes)",female,36,1,0,226875,26,,S -520,0,3,"Pavlovic, Mr. Stefo",male,32,0,0,349242,7.8958,,S -521,1,1,"Perreault, Miss. Anne",female,30,0,0,12749,93.5,B73,S -522,0,3,"Vovk, Mr. Janko",male,22,0,0,349252,7.8958,,S -523,0,3,"Lahoud, Mr. Sarkis",male,,0,0,2624,7.225,,C -524,1,1,"Hippach, Mrs. Louis Albert (Ida Sophia Fischer)",female,44,0,1,111361,57.9792,B18,C -525,0,3,"Kassem, Mr. Fared",male,,0,0,2700,7.2292,,C -526,0,3,"Farrell, Mr. James",male,40.5,0,0,367232,7.75,,Q -527,1,2,"Ridsdale, Miss. Lucy",female,50,0,0,W./C. 14258,10.5,,S -528,0,1,"Farthing, Mr. John",male,,0,0,PC 17483,221.7792,C95,S -529,0,3,"Salonen, Mr. Johan Werner",male,39,0,0,3101296,7.925,,S -530,0,2,"Hocking, Mr. Richard George",male,23,2,1,29104,11.5,,S -531,1,2,"Quick, Miss. Phyllis May",female,2,1,1,26360,26,,S -532,0,3,"Toufik, Mr. Nakli",male,,0,0,2641,7.2292,,C -533,0,3,"Elias, Mr. Joseph Jr",male,17,1,1,2690,7.2292,,C -534,1,3,"Peter, Mrs. Catherine (Catherine Rizk)",female,,0,2,2668,22.3583,,C -535,0,3,"Cacic, Miss. Marija",female,30,0,0,315084,8.6625,,S -536,1,2,"Hart, Miss. Eva Miriam",female,7,0,2,F.C.C. 13529,26.25,,S -537,0,1,"Butt, Major. Archibald Willingham",male,45,0,0,113050,26.55,B38,S -538,1,1,"LeRoy, Miss. Bertha",female,30,0,0,PC 17761,106.425,,C -539,0,3,"Risien, Mr. Samuel Beard",male,,0,0,364498,14.5,,S -540,1,1,"Frolicher, Miss. Hedwig Margaritha",female,22,0,2,13568,49.5,B39,C -541,1,1,"Crosby, Miss. Harriet R",female,36,0,2,WE/P 5735,71,B22,S -542,0,3,"Andersson, Miss. Ingeborg Constanzia",female,9,4,2,347082,31.275,,S -543,0,3,"Andersson, Miss. Sigrid Elisabeth",female,11,4,2,347082,31.275,,S -544,1,2,"Beane, Mr. Edward",male,32,1,0,2908,26,,S -545,0,1,"Douglas, Mr. Walter Donald",male,50,1,0,PC 17761,106.425,C86,C -546,0,1,"Nicholson, Mr. Arthur Ernest",male,64,0,0,693,26,,S -547,1,2,"Beane, Mrs. Edward (Ethel Clarke)",female,19,1,0,2908,26,,S -548,1,2,"Padro y Manent, Mr. Julian",male,,0,0,SC/PARIS 2146,13.8625,,C -549,0,3,"Goldsmith, Mr. Frank John",male,33,1,1,363291,20.525,,S -550,1,2,"Davies, Master. John Morgan Jr",male,8,1,1,C.A. 33112,36.75,,S -551,1,1,"Thayer, Mr. John Borland Jr",male,17,0,2,17421,110.8833,C70,C -552,0,2,"Sharp, Mr. Percival James R",male,27,0,0,244358,26,,S -553,0,3,"O'Brien, Mr. 
Timothy",male,,0,0,330979,7.8292,,Q -554,1,3,"Leeni, Mr. Fahim (""Philip Zenni"")",male,22,0,0,2620,7.225,,C -555,1,3,"Ohman, Miss. Velin",female,22,0,0,347085,7.775,,S -556,0,1,"Wright, Mr. George",male,62,0,0,113807,26.55,,S -557,1,1,"Duff Gordon, Lady. (Lucille Christiana Sutherland) (""Mrs Morgan"")",female,48,1,0,11755,39.6,A16,C -558,0,1,"Robbins, Mr. Victor",male,,0,0,PC 17757,227.525,,C -559,1,1,"Taussig, Mrs. Emil (Tillie Mandelbaum)",female,39,1,1,110413,79.65,E67,S -560,1,3,"de Messemaeker, Mrs. Guillaume Joseph (Emma)",female,36,1,0,345572,17.4,,S -561,0,3,"Morrow, Mr. Thomas Rowan",male,,0,0,372622,7.75,,Q -562,0,3,"Sivic, Mr. Husein",male,40,0,0,349251,7.8958,,S -563,0,2,"Norman, Mr. Robert Douglas",male,28,0,0,218629,13.5,,S -564,0,3,"Simmons, Mr. John",male,,0,0,SOTON/OQ 392082,8.05,,S -565,0,3,"Meanwell, Miss. (Marion Ogden)",female,,0,0,SOTON/O.Q. 392087,8.05,,S -566,0,3,"Davies, Mr. Alfred J",male,24,2,0,A/4 48871,24.15,,S -567,0,3,"Stoytcheff, Mr. Ilia",male,19,0,0,349205,7.8958,,S -568,0,3,"Palsson, Mrs. Nils (Alma Cornelia Berglund)",female,29,0,4,349909,21.075,,S -569,0,3,"Doharr, Mr. Tannous",male,,0,0,2686,7.2292,,C -570,1,3,"Jonsson, Mr. Carl",male,32,0,0,350417,7.8542,,S -571,1,2,"Harris, Mr. George",male,62,0,0,S.W./PP 752,10.5,,S -572,1,1,"Appleton, Mrs. Edward Dale (Charlotte Lamson)",female,53,2,0,11769,51.4792,C101,S -573,1,1,"Flynn, Mr. John Irwin (""Irving"")",male,36,0,0,PC 17474,26.3875,E25,S -574,1,3,"Kelly, Miss. Mary",female,,0,0,14312,7.75,,Q -575,0,3,"Rush, Mr. Alfred George John",male,16,0,0,A/4. 20589,8.05,,S -576,0,3,"Patchett, Mr. George",male,19,0,0,358585,14.5,,S -577,1,2,"Garside, Miss. Ethel",female,34,0,0,243880,13,,S -578,1,1,"Silvey, Mrs. William Baird (Alice Munger)",female,39,1,0,13507,55.9,E44,S -579,0,3,"Caram, Mrs. Joseph (Maria Elias)",female,,1,0,2689,14.4583,,C -580,1,3,"Jussila, Mr. Eiriik",male,32,0,0,STON/O 2. 3101286,7.925,,S -581,1,2,"Christy, Miss. Julie Rachel",female,25,1,1,237789,30,,S -582,1,1,"Thayer, Mrs. John Borland (Marian Longstreth Morris)",female,39,1,1,17421,110.8833,C68,C -583,0,2,"Downton, Mr. William James",male,54,0,0,28403,26,,S -584,0,1,"Ross, Mr. John Hugo",male,36,0,0,13049,40.125,A10,C -585,0,3,"Paulner, Mr. Uscher",male,,0,0,3411,8.7125,,C -586,1,1,"Taussig, Miss. Ruth",female,18,0,2,110413,79.65,E68,S -587,0,2,"Jarvis, Mr. John Denzil",male,47,0,0,237565,15,,S -588,1,1,"Frolicher-Stehli, Mr. Maxmillian",male,60,1,1,13567,79.2,B41,C -589,0,3,"Gilinski, Mr. Eliezer",male,22,0,0,14973,8.05,,S -590,0,3,"Murdlin, Mr. Joseph",male,,0,0,A./5. 3235,8.05,,S -591,0,3,"Rintamaki, Mr. Matti",male,35,0,0,STON/O 2. 3101273,7.125,,S -592,1,1,"Stephenson, Mrs. Walter Bertram (Martha Eustis)",female,52,1,0,36947,78.2667,D20,C -593,0,3,"Elsbury, Mr. William James",male,47,0,0,A/5 3902,7.25,,S -594,0,3,"Bourke, Miss. Mary",female,,0,2,364848,7.75,,Q -595,0,2,"Chapman, Mr. John Henry",male,37,1,0,SC/AH 29037,26,,S -596,0,3,"Van Impe, Mr. Jean Baptiste",male,36,1,1,345773,24.15,,S -597,1,2,"Leitch, Miss. Jessie Wills",female,,0,0,248727,33,,S -598,0,3,"Johnson, Mr. Alfred",male,49,0,0,LINE,0,,S -599,0,3,"Boulos, Mr. Hanna",male,,0,0,2664,7.225,,C -600,1,1,"Duff Gordon, Sir. Cosmo Edmund (""Mr Morgan"")",male,49,1,0,PC 17485,56.9292,A20,C -601,1,2,"Jacobsohn, Mrs. Sidney Samuel (Amy Frances Christy)",female,24,2,1,243847,27,,S -602,0,3,"Slabenoff, Mr. Petco",male,,0,0,349214,7.8958,,S -603,0,1,"Harrington, Mr. Charles H",male,,0,0,113796,42.4,,S -604,0,3,"Torber, Mr. 
Ernst William",male,44,0,0,364511,8.05,,S -605,1,1,"Homer, Mr. Harry (""Mr E Haven"")",male,35,0,0,111426,26.55,,C -606,0,3,"Lindell, Mr. Edvard Bengtsson",male,36,1,0,349910,15.55,,S -607,0,3,"Karaic, Mr. Milan",male,30,0,0,349246,7.8958,,S -608,1,1,"Daniel, Mr. Robert Williams",male,27,0,0,113804,30.5,,S -609,1,2,"Laroche, Mrs. Joseph (Juliette Marie Louise Lafargue)",female,22,1,2,SC/Paris 2123,41.5792,,C -610,1,1,"Shutes, Miss. Elizabeth W",female,40,0,0,PC 17582,153.4625,C125,S -611,0,3,"Andersson, Mrs. Anders Johan (Alfrida Konstantia Brogren)",female,39,1,5,347082,31.275,,S -612,0,3,"Jardin, Mr. Jose Neto",male,,0,0,SOTON/O.Q. 3101305,7.05,,S -613,1,3,"Murphy, Miss. Margaret Jane",female,,1,0,367230,15.5,,Q -614,0,3,"Horgan, Mr. John",male,,0,0,370377,7.75,,Q -615,0,3,"Brocklebank, Mr. William Alfred",male,35,0,0,364512,8.05,,S -616,1,2,"Herman, Miss. Alice",female,24,1,2,220845,65,,S -617,0,3,"Danbom, Mr. Ernst Gilbert",male,34,1,1,347080,14.4,,S -618,0,3,"Lobb, Mrs. William Arthur (Cordelia K Stanlick)",female,26,1,0,A/5. 3336,16.1,,S -619,1,2,"Becker, Miss. Marion Louise",female,4,2,1,230136,39,F4,S -620,0,2,"Gavey, Mr. Lawrence",male,26,0,0,31028,10.5,,S -621,0,3,"Yasbeck, Mr. Antoni",male,27,1,0,2659,14.4542,,C -622,1,1,"Kimball, Mr. Edwin Nelson Jr",male,42,1,0,11753,52.5542,D19,S -623,1,3,"Nakid, Mr. Sahid",male,20,1,1,2653,15.7417,,C -624,0,3,"Hansen, Mr. Henry Damsgaard",male,21,0,0,350029,7.8542,,S -625,0,3,"Bowen, Mr. David John ""Dai""",male,21,0,0,54636,16.1,,S -626,0,1,"Sutton, Mr. Frederick",male,61,0,0,36963,32.3208,D50,S -627,0,2,"Kirkland, Rev. Charles Leonard",male,57,0,0,219533,12.35,,Q -628,1,1,"Longley, Miss. Gretchen Fiske",female,21,0,0,13502,77.9583,D9,S -629,0,3,"Bostandyeff, Mr. Guentcho",male,26,0,0,349224,7.8958,,S -630,0,3,"O'Connell, Mr. Patrick D",male,,0,0,334912,7.7333,,Q -631,1,1,"Barkworth, Mr. Algernon Henry Wilson",male,80,0,0,27042,30,A23,S -632,0,3,"Lundahl, Mr. Johan Svensson",male,51,0,0,347743,7.0542,,S -633,1,1,"Stahelin-Maeglin, Dr. Max",male,32,0,0,13214,30.5,B50,C -634,0,1,"Parr, Mr. William Henry Marsh",male,,0,0,112052,0,,S -635,0,3,"Skoog, Miss. Mabel",female,9,3,2,347088,27.9,,S -636,1,2,"Davis, Miss. Mary",female,28,0,0,237668,13,,S -637,0,3,"Leinonen, Mr. Antti Gustaf",male,32,0,0,STON/O 2. 3101292,7.925,,S -638,0,2,"Collyer, Mr. Harvey",male,31,1,1,C.A. 31921,26.25,,S -639,0,3,"Panula, Mrs. Juha (Maria Emilia Ojala)",female,41,0,5,3101295,39.6875,,S -640,0,3,"Thorneycroft, Mr. Percival",male,,1,0,376564,16.1,,S -641,0,3,"Jensen, Mr. Hans Peder",male,20,0,0,350050,7.8542,,S -642,1,1,"Sagesser, Mlle. Emma",female,24,0,0,PC 17477,69.3,B35,C -643,0,3,"Skoog, Miss. Margit Elizabeth",female,2,3,2,347088,27.9,,S -644,1,3,"Foo, Mr. Choong",male,,0,0,1601,56.4958,,S -645,1,3,"Baclini, Miss. Eugenie",female,0.75,2,1,2666,19.2583,,C -646,1,1,"Harper, Mr. Henry Sleeper",male,48,1,0,PC 17572,76.7292,D33,C -647,0,3,"Cor, Mr. Liudevit",male,19,0,0,349231,7.8958,,S -648,1,1,"Simonius-Blumer, Col. Oberst Alfons",male,56,0,0,13213,35.5,A26,C -649,0,3,"Willey, Mr. Edward",male,,0,0,S.O./P.P. 751,7.55,,S -650,1,3,"Stanley, Miss. Amy Zillah Elsie",female,23,0,0,CA. 2314,7.55,,S -651,0,3,"Mitkoff, Mr. Mito",male,,0,0,349221,7.8958,,S -652,1,2,"Doling, Miss. Elsie",female,18,0,1,231919,23,,S -653,0,3,"Kalvik, Mr. Johannes Halvorsen",male,21,0,0,8475,8.4333,,S -654,1,3,"O'Leary, Miss. Hanora ""Norah""",female,,0,0,330919,7.8292,,Q -655,0,3,"Hegarty, Miss. Hanora ""Nora""",female,18,0,0,365226,6.75,,Q -656,0,2,"Hickman, Mr. 
Leonard Mark",male,24,2,0,S.O.C. 14879,73.5,,S -657,0,3,"Radeff, Mr. Alexander",male,,0,0,349223,7.8958,,S -658,0,3,"Bourke, Mrs. John (Catherine)",female,32,1,1,364849,15.5,,Q -659,0,2,"Eitemiller, Mr. George Floyd",male,23,0,0,29751,13,,S -660,0,1,"Newell, Mr. Arthur Webster",male,58,0,2,35273,113.275,D48,C -661,1,1,"Frauenthal, Dr. Henry William",male,50,2,0,PC 17611,133.65,,S -662,0,3,"Badt, Mr. Mohamed",male,40,0,0,2623,7.225,,C -663,0,1,"Colley, Mr. Edward Pomeroy",male,47,0,0,5727,25.5875,E58,S -664,0,3,"Coleff, Mr. Peju",male,36,0,0,349210,7.4958,,S -665,1,3,"Lindqvist, Mr. Eino William",male,20,1,0,STON/O 2. 3101285,7.925,,S -666,0,2,"Hickman, Mr. Lewis",male,32,2,0,S.O.C. 14879,73.5,,S -667,0,2,"Butler, Mr. Reginald Fenton",male,25,0,0,234686,13,,S -668,0,3,"Rommetvedt, Mr. Knud Paust",male,,0,0,312993,7.775,,S -669,0,3,"Cook, Mr. Jacob",male,43,0,0,A/5 3536,8.05,,S -670,1,1,"Taylor, Mrs. Elmer Zebley (Juliet Cummins Wright)",female,,1,0,19996,52,C126,S -671,1,2,"Brown, Mrs. Thomas William Solomon (Elizabeth Catherine Ford)",female,40,1,1,29750,39,,S -672,0,1,"Davidson, Mr. Thornton",male,31,1,0,F.C. 12750,52,B71,S -673,0,2,"Mitchell, Mr. Henry Michael",male,70,0,0,C.A. 24580,10.5,,S -674,1,2,"Wilhelms, Mr. Charles",male,31,0,0,244270,13,,S -675,0,2,"Watson, Mr. Ennis Hastings",male,,0,0,239856,0,,S -676,0,3,"Edvardsson, Mr. Gustaf Hjalmar",male,18,0,0,349912,7.775,,S -677,0,3,"Sawyer, Mr. Frederick Charles",male,24.5,0,0,342826,8.05,,S -678,1,3,"Turja, Miss. Anna Sofia",female,18,0,0,4138,9.8417,,S -679,0,3,"Goodwin, Mrs. Frederick (Augusta Tyler)",female,43,1,6,CA 2144,46.9,,S -680,1,1,"Cardeza, Mr. Thomas Drake Martinez",male,36,0,1,PC 17755,512.3292,B51 B53 B55,C -681,0,3,"Peters, Miss. Katie",female,,0,0,330935,8.1375,,Q -682,1,1,"Hassab, Mr. Hammad",male,27,0,0,PC 17572,76.7292,D49,C -683,0,3,"Olsvigen, Mr. Thor Anderson",male,20,0,0,6563,9.225,,S -684,0,3,"Goodwin, Mr. Charles Edward",male,14,5,2,CA 2144,46.9,,S -685,0,2,"Brown, Mr. Thomas William Solomon",male,60,1,1,29750,39,,S -686,0,2,"Laroche, Mr. Joseph Philippe Lemercier",male,25,1,2,SC/Paris 2123,41.5792,,C -687,0,3,"Panula, Mr. Jaako Arnold",male,14,4,1,3101295,39.6875,,S -688,0,3,"Dakic, Mr. Branko",male,19,0,0,349228,10.1708,,S -689,0,3,"Fischer, Mr. Eberhard Thelander",male,18,0,0,350036,7.7958,,S -690,1,1,"Madill, Miss. Georgette Alexandra",female,15,0,1,24160,211.3375,B5,S -691,1,1,"Dick, Mr. Albert Adrian",male,31,1,0,17474,57,B20,S -692,1,3,"Karun, Miss. Manca",female,4,0,1,349256,13.4167,,C -693,1,3,"Lam, Mr. Ali",male,,0,0,1601,56.4958,,S -694,0,3,"Saad, Mr. Khalil",male,25,0,0,2672,7.225,,C -695,0,1,"Weir, Col. John",male,60,0,0,113800,26.55,,S -696,0,2,"Chapman, Mr. Charles Henry",male,52,0,0,248731,13.5,,S -697,0,3,"Kelly, Mr. James",male,44,0,0,363592,8.05,,S -698,1,3,"Mullens, Miss. Katherine ""Katie""",female,,0,0,35852,7.7333,,Q -699,0,1,"Thayer, Mr. John Borland",male,49,1,1,17421,110.8833,C68,C -700,0,3,"Humblen, Mr. Adolf Mathias Nicolai Olsen",male,42,0,0,348121,7.65,F G63,S -701,1,1,"Astor, Mrs. John Jacob (Madeleine Talmadge Force)",female,18,1,0,PC 17757,227.525,C62 C64,C -702,1,1,"Silverthorne, Mr. Spencer Victor",male,35,0,0,PC 17475,26.2875,E24,S -703,0,3,"Barbara, Miss. Saiide",female,18,0,1,2691,14.4542,,C -704,0,3,"Gallagher, Mr. Martin",male,25,0,0,36864,7.7417,,Q -705,0,3,"Hansen, Mr. Henrik Juul",male,26,1,0,350025,7.8542,,S -706,0,2,"Morley, Mr. Henry Samuel (""Mr Henry Marshall"")",male,39,0,0,250655,26,,S -707,1,2,"Kelly, Mrs. 
Florence ""Fannie""",female,45,0,0,223596,13.5,,S -708,1,1,"Calderhead, Mr. Edward Pennington",male,42,0,0,PC 17476,26.2875,E24,S -709,1,1,"Cleaver, Miss. Alice",female,22,0,0,113781,151.55,,S -710,1,3,"Moubarek, Master. Halim Gonios (""William George"")",male,,1,1,2661,15.2458,,C -711,1,1,"Mayne, Mlle. Berthe Antonine (""Mrs de Villiers"")",female,24,0,0,PC 17482,49.5042,C90,C -712,0,1,"Klaber, Mr. Herman",male,,0,0,113028,26.55,C124,S -713,1,1,"Taylor, Mr. Elmer Zebley",male,48,1,0,19996,52,C126,S -714,0,3,"Larsson, Mr. August Viktor",male,29,0,0,7545,9.4833,,S -715,0,2,"Greenberg, Mr. Samuel",male,52,0,0,250647,13,,S -716,0,3,"Soholt, Mr. Peter Andreas Lauritz Andersen",male,19,0,0,348124,7.65,F G73,S -717,1,1,"Endres, Miss. Caroline Louise",female,38,0,0,PC 17757,227.525,C45,C -718,1,2,"Troutt, Miss. Edwina Celia ""Winnie""",female,27,0,0,34218,10.5,E101,S -719,0,3,"McEvoy, Mr. Michael",male,,0,0,36568,15.5,,Q -720,0,3,"Johnson, Mr. Malkolm Joackim",male,33,0,0,347062,7.775,,S -721,1,2,"Harper, Miss. Annie Jessie ""Nina""",female,6,0,1,248727,33,,S -722,0,3,"Jensen, Mr. Svend Lauritz",male,17,1,0,350048,7.0542,,S -723,0,2,"Gillespie, Mr. William Henry",male,34,0,0,12233,13,,S -724,0,2,"Hodges, Mr. Henry Price",male,50,0,0,250643,13,,S -725,1,1,"Chambers, Mr. Norman Campbell",male,27,1,0,113806,53.1,E8,S -726,0,3,"Oreskovic, Mr. Luka",male,20,0,0,315094,8.6625,,S -727,1,2,"Renouf, Mrs. Peter Henry (Lillian Jefferys)",female,30,3,0,31027,21,,S -728,1,3,"Mannion, Miss. Margareth",female,,0,0,36866,7.7375,,Q -729,0,2,"Bryhl, Mr. Kurt Arnold Gottfrid",male,25,1,0,236853,26,,S -730,0,3,"Ilmakangas, Miss. Pieta Sofia",female,25,1,0,STON/O2. 3101271,7.925,,S -731,1,1,"Allen, Miss. Elisabeth Walton",female,29,0,0,24160,211.3375,B5,S -732,0,3,"Hassan, Mr. Houssein G N",male,11,0,0,2699,18.7875,,C -733,0,2,"Knight, Mr. Robert J",male,,0,0,239855,0,,S -734,0,2,"Berriman, Mr. William John",male,23,0,0,28425,13,,S -735,0,2,"Troupiansky, Mr. Moses Aaron",male,23,0,0,233639,13,,S -736,0,3,"Williams, Mr. Leslie",male,28.5,0,0,54636,16.1,,S -737,0,3,"Ford, Mrs. Edward (Margaret Ann Watson)",female,48,1,3,W./C. 6608,34.375,,S -738,1,1,"Lesurer, Mr. Gustave J",male,35,0,0,PC 17755,512.3292,B101,C -739,0,3,"Ivanoff, Mr. Kanio",male,,0,0,349201,7.8958,,S -740,0,3,"Nankoff, Mr. Minko",male,,0,0,349218,7.8958,,S -741,1,1,"Hawksford, Mr. Walter James",male,,0,0,16988,30,D45,S -742,0,1,"Cavendish, Mr. Tyrell William",male,36,1,0,19877,78.85,C46,S -743,1,1,"Ryerson, Miss. Susan Parker ""Suzette""",female,21,2,2,PC 17608,262.375,B57 B59 B63 B66,C -744,0,3,"McNamee, Mr. Neal",male,24,1,0,376566,16.1,,S -745,1,3,"Stranden, Mr. Juho",male,31,0,0,STON/O 2. 3101288,7.925,,S -746,0,1,"Crosby, Capt. Edward Gifford",male,70,1,1,WE/P 5735,71,B22,S -747,0,3,"Abbott, Mr. Rossmore Edward",male,16,1,1,C.A. 2673,20.25,,S -748,1,2,"Sinkkonen, Miss. Anna",female,30,0,0,250648,13,,S -749,0,1,"Marvin, Mr. Daniel Warner",male,19,1,0,113773,53.1,D30,S -750,0,3,"Connaghton, Mr. Michael",male,31,0,0,335097,7.75,,Q -751,1,2,"Wells, Miss. Joan",female,4,1,1,29103,23,,S -752,1,3,"Moor, Master. Meier",male,6,0,1,392096,12.475,E121,S -753,0,3,"Vande Velde, Mr. Johannes Joseph",male,33,0,0,345780,9.5,,S -754,0,3,"Jonkoff, Mr. Lalio",male,23,0,0,349204,7.8958,,S -755,1,2,"Herman, Mrs. Samuel (Jane Laver)",female,48,1,2,220845,65,,S -756,1,2,"Hamalainen, Master. Viljo",male,0.67,1,1,250649,14.5,,S -757,0,3,"Carlsson, Mr. August Sigfrid",male,28,0,0,350042,7.7958,,S -758,0,2,"Bailey, Mr. 
Percy Andrew",male,18,0,0,29108,11.5,,S -759,0,3,"Theobald, Mr. Thomas Leonard",male,34,0,0,363294,8.05,,S -760,1,1,"Rothes, the Countess. of (Lucy Noel Martha Dyer-Edwards)",female,33,0,0,110152,86.5,B77,S -761,0,3,"Garfirth, Mr. John",male,,0,0,358585,14.5,,S -762,0,3,"Nirva, Mr. Iisakki Antino Aijo",male,41,0,0,SOTON/O2 3101272,7.125,,S -763,1,3,"Barah, Mr. Hanna Assi",male,20,0,0,2663,7.2292,,C -764,1,1,"Carter, Mrs. William Ernest (Lucile Polk)",female,36,1,2,113760,120,B96 B98,S -765,0,3,"Eklund, Mr. Hans Linus",male,16,0,0,347074,7.775,,S -766,1,1,"Hogeboom, Mrs. John C (Anna Andrews)",female,51,1,0,13502,77.9583,D11,S -767,0,1,"Brewe, Dr. Arthur Jackson",male,,0,0,112379,39.6,,C -768,0,3,"Mangan, Miss. Mary",female,30.5,0,0,364850,7.75,,Q -769,0,3,"Moran, Mr. Daniel J",male,,1,0,371110,24.15,,Q -770,0,3,"Gronnestad, Mr. Daniel Danielsen",male,32,0,0,8471,8.3625,,S -771,0,3,"Lievens, Mr. Rene Aime",male,24,0,0,345781,9.5,,S -772,0,3,"Jensen, Mr. Niels Peder",male,48,0,0,350047,7.8542,,S -773,0,2,"Mack, Mrs. (Mary)",female,57,0,0,S.O./P.P. 3,10.5,E77,S -774,0,3,"Elias, Mr. Dibo",male,,0,0,2674,7.225,,C -775,1,2,"Hocking, Mrs. Elizabeth (Eliza Needs)",female,54,1,3,29105,23,,S -776,0,3,"Myhrman, Mr. Pehr Fabian Oliver Malkolm",male,18,0,0,347078,7.75,,S -777,0,3,"Tobin, Mr. Roger",male,,0,0,383121,7.75,F38,Q -778,1,3,"Emanuel, Miss. Virginia Ethel",female,5,0,0,364516,12.475,,S -779,0,3,"Kilgannon, Mr. Thomas J",male,,0,0,36865,7.7375,,Q -780,1,1,"Robert, Mrs. Edward Scott (Elisabeth Walton McMillan)",female,43,0,1,24160,211.3375,B3,S -781,1,3,"Ayoub, Miss. Banoura",female,13,0,0,2687,7.2292,,C -782,1,1,"Dick, Mrs. Albert Adrian (Vera Gillespie)",female,17,1,0,17474,57,B20,S -783,0,1,"Long, Mr. Milton Clyde",male,29,0,0,113501,30,D6,S -784,0,3,"Johnston, Mr. Andrew G",male,,1,2,W./C. 6607,23.45,,S -785,0,3,"Ali, Mr. William",male,25,0,0,SOTON/O.Q. 3101312,7.05,,S -786,0,3,"Harmer, Mr. Abraham (David Lishin)",male,25,0,0,374887,7.25,,S -787,1,3,"Sjoblom, Miss. Anna Sofia",female,18,0,0,3101265,7.4958,,S -788,0,3,"Rice, Master. George Hugh",male,8,4,1,382652,29.125,,Q -789,1,3,"Dean, Master. Bertram Vere",male,1,1,2,C.A. 2315,20.575,,S -790,0,1,"Guggenheim, Mr. Benjamin",male,46,0,0,PC 17593,79.2,B82 B84,C -791,0,3,"Keane, Mr. Andrew ""Andy""",male,,0,0,12460,7.75,,Q -792,0,2,"Gaskell, Mr. Alfred",male,16,0,0,239865,26,,S -793,0,3,"Sage, Miss. Stella Anna",female,,8,2,CA. 2343,69.55,,S -794,0,1,"Hoyt, Mr. William Fisher",male,,0,0,PC 17600,30.6958,,C -795,0,3,"Dantcheff, Mr. Ristiu",male,25,0,0,349203,7.8958,,S -796,0,2,"Otter, Mr. Richard",male,39,0,0,28213,13,,S -797,1,1,"Leader, Dr. Alice (Farnham)",female,49,0,0,17465,25.9292,D17,S -798,1,3,"Osman, Mrs. Mara",female,31,0,0,349244,8.6833,,S -799,0,3,"Ibrahim Shawah, Mr. Yousseff",male,30,0,0,2685,7.2292,,C -800,0,3,"Van Impe, Mrs. Jean Baptiste (Rosalie Paula Govaert)",female,30,1,1,345773,24.15,,S -801,0,2,"Ponesell, Mr. Martin",male,34,0,0,250647,13,,S -802,1,2,"Collyer, Mrs. Harvey (Charlotte Annie Tate)",female,31,1,1,C.A. 31921,26.25,,S -803,1,1,"Carter, Master. William Thornton II",male,11,1,2,113760,120,B96 B98,S -804,1,3,"Thomas, Master. Assad Alexander",male,0.42,0,1,2625,8.5167,,C -805,1,3,"Hedman, Mr. Oskar Arvid",male,27,0,0,347089,6.975,,S -806,0,3,"Johansson, Mr. Karl Johan",male,31,0,0,347063,7.775,,S -807,0,1,"Andrews, Mr. Thomas Jr",male,39,0,0,112050,0,A36,S -808,0,3,"Pettersson, Miss. Ellen Natalia",female,18,0,0,347087,7.775,,S -809,0,2,"Meyer, Mr. August",male,39,0,0,248723,13,,S -810,1,1,"Chambers, Mrs. 
Norman Campbell (Bertha Griggs)",female,33,1,0,113806,53.1,E8,S -811,0,3,"Alexander, Mr. William",male,26,0,0,3474,7.8875,,S -812,0,3,"Lester, Mr. James",male,39,0,0,A/4 48871,24.15,,S -813,0,2,"Slemen, Mr. Richard James",male,35,0,0,28206,10.5,,S -814,0,3,"Andersson, Miss. Ebba Iris Alfrida",female,6,4,2,347082,31.275,,S -815,0,3,"Tomlin, Mr. Ernest Portage",male,30.5,0,0,364499,8.05,,S -816,0,1,"Fry, Mr. Richard",male,,0,0,112058,0,B102,S -817,0,3,"Heininen, Miss. Wendla Maria",female,23,0,0,STON/O2. 3101290,7.925,,S -818,0,2,"Mallet, Mr. Albert",male,31,1,1,S.C./PARIS 2079,37.0042,,C -819,0,3,"Holm, Mr. John Fredrik Alexander",male,43,0,0,C 7075,6.45,,S -820,0,3,"Skoog, Master. Karl Thorsten",male,10,3,2,347088,27.9,,S -821,1,1,"Hays, Mrs. Charles Melville (Clara Jennings Gregg)",female,52,1,1,12749,93.5,B69,S -822,1,3,"Lulic, Mr. Nikola",male,27,0,0,315098,8.6625,,S -823,0,1,"Reuchlin, Jonkheer. John George",male,38,0,0,19972,0,,S -824,1,3,"Moor, Mrs. (Beila)",female,27,0,1,392096,12.475,E121,S -825,0,3,"Panula, Master. Urho Abraham",male,2,4,1,3101295,39.6875,,S -826,0,3,"Flynn, Mr. John",male,,0,0,368323,6.95,,Q -827,0,3,"Lam, Mr. Len",male,,0,0,1601,56.4958,,S -828,1,2,"Mallet, Master. Andre",male,1,0,2,S.C./PARIS 2079,37.0042,,C -829,1,3,"McCormack, Mr. Thomas Joseph",male,,0,0,367228,7.75,,Q -830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62,0,0,113572,80,B28, -831,1,3,"Yasbeck, Mrs. Antoni (Selini Alexander)",female,15,1,0,2659,14.4542,,C -832,1,2,"Richards, Master. George Sibley",male,0.83,1,1,29106,18.75,,S -833,0,3,"Saad, Mr. Amin",male,,0,0,2671,7.2292,,C -834,0,3,"Augustsson, Mr. Albert",male,23,0,0,347468,7.8542,,S -835,0,3,"Allum, Mr. Owen George",male,18,0,0,2223,8.3,,S -836,1,1,"Compton, Miss. Sara Rebecca",female,39,1,1,PC 17756,83.1583,E49,C -837,0,3,"Pasic, Mr. Jakob",male,21,0,0,315097,8.6625,,S -838,0,3,"Sirota, Mr. Maurice",male,,0,0,392092,8.05,,S -839,1,3,"Chip, Mr. Chang",male,32,0,0,1601,56.4958,,S -840,1,1,"Marechal, Mr. Pierre",male,,0,0,11774,29.7,C47,C -841,0,3,"Alhomaki, Mr. Ilmari Rudolf",male,20,0,0,SOTON/O2 3101287,7.925,,S -842,0,2,"Mudd, Mr. Thomas Charles",male,16,0,0,S.O./P.P. 3,10.5,,S -843,1,1,"Serepeca, Miss. Augusta",female,30,0,0,113798,31,,C -844,0,3,"Lemberopolous, Mr. Peter L",male,34.5,0,0,2683,6.4375,,C -845,0,3,"Culumovic, Mr. Jeso",male,17,0,0,315090,8.6625,,S -846,0,3,"Abbing, Mr. Anthony",male,42,0,0,C.A. 5547,7.55,,S -847,0,3,"Sage, Mr. Douglas Bullen",male,,8,2,CA. 2343,69.55,,S -848,0,3,"Markoff, Mr. Marin",male,35,0,0,349213,7.8958,,C -849,0,2,"Harper, Rev. John",male,28,0,1,248727,33,,S -850,1,1,"Goldenberg, Mrs. Samuel L (Edwiga Grabowska)",female,,1,0,17453,89.1042,C92,C -851,0,3,"Andersson, Master. Sigvard Harald Elias",male,4,4,2,347082,31.275,,S -852,0,3,"Svensson, Mr. Johan",male,74,0,0,347060,7.775,,S -853,0,3,"Boulos, Miss. Nourelain",female,9,1,1,2678,15.2458,,C -854,1,1,"Lines, Miss. Mary Conover",female,16,0,1,PC 17592,39.4,D28,S -855,0,2,"Carter, Mrs. Ernest Courtenay (Lilian Hughes)",female,44,1,0,244252,26,,S -856,1,3,"Aks, Mrs. Sam (Leah Rosen)",female,18,0,1,392091,9.35,,S -857,1,1,"Wick, Mrs. George Dennick (Mary Hitchcock)",female,45,1,1,36928,164.8667,,S -858,1,1,"Daly, Mr. Peter Denis ",male,51,0,0,113055,26.55,E17,S -859,1,3,"Baclini, Mrs. Solomon (Latifa Qurban)",female,24,0,3,2666,19.2583,,C -860,0,3,"Razi, Mr. Raihed",male,,0,0,2629,7.2292,,C -861,0,3,"Hansen, Mr. Claus Peter",male,41,2,0,350026,14.1083,,S -862,0,2,"Giles, Mr. Frederick Edward",male,21,1,0,28134,11.5,,S -863,1,1,"Swift, Mrs. 
Frederick Joel (Margaret Welles Barron)",female,48,0,0,17466,25.9292,D17,S -864,0,3,"Sage, Miss. Dorothy Edith ""Dolly""",female,,8,2,CA. 2343,69.55,,S -865,0,2,"Gill, Mr. John William",male,24,0,0,233866,13,,S -866,1,2,"Bystrom, Mrs. (Karolina)",female,42,0,0,236852,13,,S -867,1,2,"Duran y More, Miss. Asuncion",female,27,1,0,SC/PARIS 2149,13.8583,,C -868,0,1,"Roebling, Mr. Washington Augustus II",male,31,0,0,PC 17590,50.4958,A24,S -869,0,3,"van Melkebeke, Mr. Philemon",male,,0,0,345777,9.5,,S -870,1,3,"Johnson, Master. Harold Theodor",male,4,1,1,347742,11.1333,,S -871,0,3,"Balkic, Mr. Cerin",male,26,0,0,349248,7.8958,,S -872,1,1,"Beckwith, Mrs. Richard Leonard (Sallie Monypeny)",female,47,1,1,11751,52.5542,D35,S -873,0,1,"Carlsson, Mr. Frans Olof",male,33,0,0,695,5,B51 B53 B55,S -874,0,3,"Vander Cruyssen, Mr. Victor",male,47,0,0,345765,9,,S -875,1,2,"Abelson, Mrs. Samuel (Hannah Wizosky)",female,28,1,0,P/PP 3381,24,,C -876,1,3,"Najib, Miss. Adele Kiamie ""Jane""",female,15,0,0,2667,7.225,,C -877,0,3,"Gustafsson, Mr. Alfred Ossian",male,20,0,0,7534,9.8458,,S -878,0,3,"Petroff, Mr. Nedelio",male,19,0,0,349212,7.8958,,S -879,0,3,"Laleff, Mr. Kristo",male,,0,0,349217,7.8958,,S -880,1,1,"Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)",female,56,0,1,11767,83.1583,C50,C -881,1,2,"Shelley, Mrs. William (Imanita Parrish Hall)",female,25,0,1,230433,26,,S -882,0,3,"Markun, Mr. Johann",male,33,0,0,349257,7.8958,,S -883,0,3,"Dahlberg, Miss. Gerda Ulrika",female,22,0,0,7552,10.5167,,S -884,0,2,"Banfield, Mr. Frederick James",male,28,0,0,C.A./SOTON 34068,10.5,,S -885,0,3,"Sutehall, Mr. Henry Jr",male,25,0,0,SOTON/OQ 392076,7.05,,S -886,0,3,"Rice, Mrs. William (Margaret Norton)",female,39,0,5,382652,29.125,,Q -887,0,2,"Montvila, Rev. Juozas",male,27,0,0,211536,13,,S -888,1,1,"Graham, Miss. Margaret Edith",female,19,0,0,112053,30,B42,S -889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S -890,1,1,"Behr, Mr. Karl Howell",male,26,0,0,111369,30,C148,C -891,0,3,"Dooley, Mr. 
Patrick",male,32,0,0,370376,7.75,,Q \ No newline at end of file diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/processor/preprocessor.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/processor/preprocessor.py index 1920dbd..279d334 100644 --- a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/processor/preprocessor.py +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/processor/preprocessor.py @@ -1,5 +1,6 @@ from ml.preprocessing.preprocessing import Preprocessing from ml.preprocessing.dataquality import DataQuality +from ml.data_source.spreadsheet import Spreadsheet import great_expectations as ge from datetime import date import pandas as pd @@ -28,7 +29,7 @@ logging.info('Reading the inputs') file = glob.glob("/opt/ml/processing/input/raw_data/*.csv")[0] logging.info(f'Reading file: {file}') - df = pd.read_csv(file) + df = Spreadsheet().get_data(file) logging.info("Data Quality") diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/api/app.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/api/app.py deleted file mode 100644 index 7441f7c..0000000 --- a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/api/app.py +++ /dev/null @@ -1,41 +0,0 @@ -from flask import Flask, request, redirect, url_for, flash, jsonify -import numpy as np -import pandas as pd -from joblib import load -import json -import logging - -logging.getLogger().setLevel(logging.INFO) - -app = Flask(__name__) - -def predict_new(X, probs=True): - model = load('model/titanic_model_rf.pkl') - p = model.get_preprocessing() - - X = p.clean_data(X) - X = p.categ_encoding(X) - - columns = model.get_columns() - for col in columns: - if col not in X.columns: - X[col] = 0 - if probs: - return model.predict_proba(X)[:,1] - else: - return model.predict(X) - -@app.route('/invocations', methods=['POST']) -def predict(): - data = pd.read_json(request.json) - predictions = np.array2string(predict_new(data, probs=True)) - return jsonify(predictions) - -@app.route('/health', methods=['GET']) -def health_check(): - resp = jsonify(success=True) - return resp - - -if __name__ == "__main__": - app.run(host='0.0.0.0') \ No newline at end of file diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/api/myrequests.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/api/myrequests.py deleted file mode 100644 index 46fda29..0000000 --- a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/api/myrequests.py +++ /dev/null @@ -1,17 +0,0 @@ -import requests -import json - -url = 'http://localhost:5000/invocations' - -data = { - 'Pclass':[3,3,3], - 'Sex': ['male', 'female', 'male'], - 'Age':[4, 22, 28] - } -j_data = json.dumps(data) - -headers = {'Content-Type': 'application/json'} -print("Sending request for model...") -print(f"Data: {j_data}") -r = requests.post(url, json=j_data, headers=headers) -print(f"Response: {r.text}") \ No newline at end of file diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/api/wsgi.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/api/wsgi.py deleted file mode 100644 index 9e83905..0000000 --- a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/api/wsgi.py +++ /dev/null @@ -1,4 +0,0 @@ -from app import app - -if __name__ == "__main__": - app.run(use_reloader=True, debug=True) diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/Sagemaker_Processor.ipynb b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/Sagemaker_Processor.ipynb index 
ad85e0f..ebd6aae 100644 --- a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/Sagemaker_Processor.ipynb +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/Sagemaker_Processor.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "b5264128", + "id": "d1dd5820", "metadata": {}, "source": [ "# Sagemaker Processor" @@ -10,7 +10,7 @@ }, { "cell_type": "markdown", - "id": "5bd7a5cd", + "id": "fbaaa9e6", "metadata": {}, "source": [ "This script generates the train, val and inference files with the processor previous uploaded in ECR." @@ -18,7 +18,7 @@ }, { "cell_type": "markdown", - "id": "0488ed05", + "id": "864a2e0f", "metadata": {}, "source": [ "## Import modules" @@ -27,7 +27,7 @@ { "cell_type": "code", "execution_count": 1, - "id": "e7b20785", + "id": "fa0d1522", "metadata": {}, "outputs": [], "source": [ @@ -40,7 +40,7 @@ }, { "cell_type": "markdown", - "id": "7f3fd305", + "id": "43887859", "metadata": {}, "source": [ "## Setup" @@ -48,7 +48,7 @@ }, { "cell_type": "markdown", - "id": "6528a20b", + "id": "4422ac46", "metadata": {}, "source": [ "Modify according to your configurations." @@ -57,7 +57,7 @@ { "cell_type": "code", "execution_count": 2, - "id": "d5cdd5d1", + "id": "4d423fcf", "metadata": {}, "outputs": [], "source": [ @@ -68,7 +68,7 @@ { "cell_type": "code", "execution_count": 3, - "id": "5ec68bf7", + "id": "b503dba8", "metadata": {}, "outputs": [], "source": [ @@ -80,7 +80,7 @@ { "cell_type": "code", "execution_count": 4, - "id": "4d011a47", + "id": "c00d86d1", "metadata": {}, "outputs": [], "source": [ @@ -91,7 +91,7 @@ { "cell_type": "code", "execution_count": 5, - "id": "25f76666", + "id": "667c8bb6", "metadata": {}, "outputs": [], "source": [ @@ -102,7 +102,7 @@ { "cell_type": "code", "execution_count": 6, - "id": "fafb5f18", + "id": "3b02cf9e", "metadata": {}, "outputs": [], "source": [ @@ -114,7 +114,7 @@ { "cell_type": "code", "execution_count": 7, - "id": "2ef594d3", + "id": "32c8ab3d", "metadata": {}, "outputs": [], "source": [ @@ -131,10 +131,23 @@ "}" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "eac4ac37", + "metadata": {}, + "outputs": [], + "source": [ + "# upload train and test data in S3\n", + "s3 = boto3.resource('s3') \n", + "s3.Bucket(bucket).upload_file('../../../data/raw/raw_train.csv', 'TRAIN_RAW/raw_train.csv')\n", + "s3.Bucket(bucket).upload_file('../../../data/raw/raw_test.csv', 'TEST_RAW/raw_test.csv')" + ] + }, { "cell_type": "code", "execution_count": 8, - "id": "2b625b74", + "id": "1b175317", "metadata": {}, "outputs": [], "source": [ @@ -145,7 +158,7 @@ }, { "cell_type": "markdown", - "id": "6e8e92ba", + "id": "a9bcf199", "metadata": {}, "source": [ "## Processor - Train" @@ -154,7 +167,7 @@ { "cell_type": "code", "execution_count": 9, - "id": "e1b41ed1", + "id": "becf4d16", "metadata": {}, "outputs": [], "source": [ @@ -169,7 +182,7 @@ { "cell_type": "code", "execution_count": 10, - "id": "cd67446b", + "id": "2ccaf4a1", "metadata": {}, "outputs": [], "source": [ @@ -202,7 +215,7 @@ { "cell_type": "code", "execution_count": 11, - "id": "902f8e4f", + "id": "e0287211", "metadata": {}, "outputs": [], "source": [ @@ -216,7 +229,7 @@ { "cell_type": "code", "execution_count": 12, - "id": "fd8a28a1", + "id": "854dc0d7", "metadata": {}, "outputs": [ { @@ -268,7 +281,7 @@ }, { "cell_type": "markdown", - "id": "a0b0636e", + "id": "0f54bf21", "metadata": {}, "source": [ "## Processor - Inference" @@ -277,7 +290,7 @@ { "cell_type": "code", "execution_count": 13, - "id": "4e1df020", 
+ "id": "bb2a86dc", "metadata": {}, "outputs": [], "source": [ @@ -298,7 +311,7 @@ { "cell_type": "code", "execution_count": 14, - "id": "4fa3439a", + "id": "c3e8dd48", "metadata": {}, "outputs": [], "source": [ @@ -320,7 +333,7 @@ { "cell_type": "code", "execution_count": 15, - "id": "c399b969", + "id": "62de176e", "metadata": {}, "outputs": [], "source": [ @@ -334,7 +347,7 @@ { "cell_type": "code", "execution_count": 16, - "id": "8cb61e97", + "id": "e9255f5a", "metadata": {}, "outputs": [ { diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/tests/test_project.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/tests/test_project.py index 2d6936f..87a62d9 100644 --- a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/tests/test_project.py +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/tests/test_project.py @@ -5,15 +5,26 @@ sys.path.append('..') @pytest.fixture(scope='module') -def read_data(): +def read_data_train(): from ml.data_source.spreadsheet import Spreadsheet - yield Spreadsheet().get_data('../../data/raw/train.csv') + yield Spreadsheet().get_data('../../data/raw/raw_train.csv') @pytest.fixture(scope='module') -def cleaned_data(read_data): +def read_data_test(): + from ml.data_source.spreadsheet import Spreadsheet + yield Spreadsheet().get_data('../../data/raw/raw_test.csv') + +@pytest.fixture(scope='module') +def cleaned_data_train(read_data_train): from ml.preprocessing.preprocessing import Preprocessing p = Preprocessing() - yield p.clean_data(read_data) + yield p.clean_data(read_data_train) + +@pytest.fixture(scope='module') +def cleaned_data_test(read_data_test): + from ml.preprocessing.preprocessing import Preprocessing + p = Preprocessing() + yield p.clean_data(read_data_test) def test_tree(): """ @@ -29,26 +40,71 @@ def test_tree(): assert os.path.exists(os.path.join('..','..', 'src', 'ml', 'preprocessing')) assert os.path.exists(os.path.join('..','..', 'src', 'tests')) -def test_spreadsheet(read_data): +def test_spreadsheet(read_data_train): """ Test that spreadsheet is importing correctly """ - assert read_data.shape[0] > 1 + assert read_data_train.shape[0] > 1 -def test_clean_data(cleaned_data): +def test_clean_data(cleaned_data_train): """ Test that the df is cleaned correctly """ - assert cleaned_data.Pclass.dtype == 'object' - assert pd.isnull(cleaned_data.Age).sum() == 0 + assert cleaned_data_train.Pclass.dtype == 'object' + assert pd.isnull(cleaned_data_train.Age).sum() == 0 + +def all_columns(df, names): + """ + Test if df has all columns + """ + array = [name in df.columns for name in names] + return sum(array) == len(array) + +def values_between(df, col, min_value, max_value): + """ + Test if column has values between min and max + """ + array = [value >= min_value and max_value <= 1 for value in df[col]] + return sum(array) == len(array) -def test_categ_encoding(cleaned_data): +def test_categ_encoding(cleaned_data_train,cleaned_data_test): """ - Test if column PClass is + Test if column PClass is encoding """ from ml.preprocessing.preprocessing import Preprocessing - p = Preprocessing() - df = p.categ_encoding(cleaned_data) - names = ['Survived', 'Age', 'Pclass_1', 'Pclass_2', 'Pclass_3', 'Sex_female', 'Sex_male'] - assert [name in df.columns for name in names] \ No newline at end of file + names = ['Survived', 'Age', 'Pclass_1', 'Pclass_2', 'Pclass_3', 'Sex_1', 'Sex_2'] + p = Preprocessing(oneHot_cols=['Pclass','Sex']) + df_train = p.categ_encoding_oneHot(cleaned_data_train, step_train=True) + assert 
all_columns(df_train,names) + df_test = p.categ_encoding_oneHot(cleaned_data_test, step_train=False) + assert all_columns(df_test,names) + +def test_normalize(cleaned_data_train,cleaned_data_test): + """ + Test if column Age is normalized + """ + from ml.preprocessing.preprocessing import Preprocessing + p = Preprocessing(norm_cols={'min-max': ['Age']}) + df_train = p.normalize(cleaned_data_train, step_train=True) + assert values_between(df_train,'Age',0,1) + df_test = p.normalize(cleaned_data_test, step_train=False) + assert values_between(df_test,'Age',0,1) + +def test_execute_train(read_data_train,read_data_test): + """ + Test that execute runs the full preprocessing pipeline correctly + """ + from ml.preprocessing.preprocessing import Preprocessing + names = ['Survived', 'Age', 'Pclass_1', 'Pclass_2', 'Pclass_3', 'Sex_1', 'Sex_2'] + norm_cols={'min-max': ['Age']} + oneHot_cols=['Pclass','Sex'] + p = Preprocessing(norm_cols,oneHot_cols) + X_train, X_val = p.execute(read_data_train, step_train=True) + assert all_columns(X_train,names) + assert values_between(X_train,'Age',0,1) + assert all_columns(X_val,names) + assert values_between(X_val,'Age',0,1) + X_test = p.execute(read_data_test, step_train=False) + assert all_columns(X_test,names) + assert values_between(X_test,'Age',0,1) \ No newline at end of file From fc563b1b9cd692a583f1875eeac0aafcddcd1512 Mon Sep 17 00:00:00 2001 From: karenstemartins Date: Fri, 28 May 2021 13:15:00 +0000 Subject: [PATCH 05/10] Data quality and test modifications --- README.md | 2 +- .../processor/preprocessor.py | 12 +++++++----- .../src/ml/preprocessing/dataquality.py | 5 +++-- hermione/tests/test_hermione.py | 1 + 4 files changed, 12 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 6722308..eb7249f 100644 --- a/README.md +++ b/README.md @@ -197,7 +197,7 @@ Here we describe briefly what each class is doing: - **Preprocessing** - concentrates all preprocessing steps that must be performed on the data before the model is trained. - **Normalization** - applies normalization and denormalization to the specified columns. This class contains the following normalization algorithms already implemented: StandardScaler and MinMaxScaler. - **TextVectorizer** - transforms text into vectors. Implemented methods: Bag of words, TF_IDF, Embedding: mean, median, and indexing. -- **DataQuality** - concentrates all data validation steps that must be performed on the data to ensure its quality (Available in Sagemaker version). +- **DataQuality** - concentrates all data validation steps that must be performed on the data to ensure its quality.
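For illustration, a minimal sketch of how the reworked `DataQuality` flow above fits together. The class, methods, and great_expectations calls come from the diffs in this patch; the CSV paths and DataFrames are placeholder assumptions, and `ge` is the great_expectations package:

```python
import great_expectations as ge
import pandas as pd

from ml.preprocessing.dataquality import DataQuality

# Train step: profile the raw data and persist an expectation suite.
# The target column is excluded so expectations are not built on labels.
df_train = pd.read_csv('data/raw/raw_train.csv')  # placeholder path
dq = DataQuality(discrete_cat_cols=['Sex', 'Pclass'])
df_ge = dq.perform(df_train, target='Survived')
df_ge.save_expectation_suite('expectations/expectations.json')

# Inference step: validate incoming data against the saved suite.
df_new = pd.read_csv('data/raw/raw_test.csv')  # placeholder path
result = ge.dataset.PandasDataset(df_new).validate(
    expectation_suite='expectations/expectations.json',
    only_return_failures=False)
print(result.success)  # overall pass/fail flag of the validation report
```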
### Visualization diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/processor/preprocessor.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/processor/preprocessor.py index 279d334..c78c24b 100644 --- a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/processor/preprocessor.py +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/processor/preprocessor.py @@ -30,17 +30,19 @@ file = glob.glob("/opt/ml/processing/input/raw_data/*.csv")[0] logging.info(f'Reading file: {file}') df = Spreadsheet().get_data(file) - - + logging.info("Data Quality") # If True, it creates the DataQuality object, otherwise it loads an existing one if step_train: - dq = DataQuality(discrete_cat_cols=['Sex', 'Pclass','Survived']) - df_ge = dq.perform(df) + dq = DataQuality(discrete_cat_cols=['Sex', 'Pclass']) + df_ge = dq.perform(df, target='Survived') df_ge.save_expectation_suite('/opt/ml/processing/output/expectations/expectations.json') else: date = date.today().strftime('%Y%m%d') - df_ge = ge.dataset.PandasDataset(df) + df_without_target = df.copy() + if 'Survived' in df_without_target.columns: + df_without_target.drop(columns=['Survived'], inplace=True) + df_ge = ge.dataset.PandasDataset(df_without_target) ge_val = df_ge.validate(expectation_suite='/opt/ml/processing/input/expectations/expectations.json', only_return_failures=False) with open(f'/opt/ml/processing/output/validations/{date}.json', 'w') as f: json.dump(ge_val.to_json_dict(), f) diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/preprocessing/dataquality.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/preprocessing/dataquality.py index 5ce7b61..68d8ad2 100644 --- a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/preprocessing/dataquality.py +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/preprocessing/dataquality.py @@ -22,8 +22,7 @@ def __init__(self, continuous_cols=None, discrete_cat_cols=None): self.continuous_cols = continuous_cols self.discrete_cat_cols = discrete_cat_cols - def perform(self, - df: pd.DataFrame): + def perform(self, df: pd.DataFrame, target=None, cut_off=2): """ Perform data quality @@ -36,6 +35,8 @@ def perform(self, ------- json """ + if target is not None: + df = df.drop(columns=[target]) df_ge = ge.dataset.PandasDataset(df) cols = df_ge.columns df_ge.expect_table_columns_to_match_ordered_list(cols) diff --git a/hermione/tests/test_hermione.py b/hermione/tests/test_hermione.py index 5cb2214..3842a19 100644 --- a/hermione/tests/test_hermione.py +++ b/hermione/tests/test_hermione.py @@ -19,4 +19,5 @@ def test_info(): def test_implementation_script_folders(): assert os.path.exists(os.path.join(os.getcwd(), 'hermione', 'module_templates', '__IMPLEMENTED_BASE__')) + assert os.path.exists(os.path.join(os.getcwd(), 'hermione', 'module_templates', '__IMPLEMENTED_SAGEMAKER__')) assert os.path.exists(os.path.join(os.getcwd(), 'hermione', 'module_templates', '__NOT_IMPLEMENTED_BASE__')) From c5c8322e5d44d60dcd15b5b28a3c986ad6ea9f0f Mon Sep 17 00:00:00 2001 From: karenstemartins Date: Fri, 28 May 2021 13:39:48 +0000 Subject: [PATCH 06/10] Remove checkpoints --- .../.ipynb_checkpoints/Dockerfile-checkpoint | 59 ------------- .../.ipynb_checkpoints/handler-checkpoint.py | 65 -------------- .../.ipynb_checkpoints/main-checkpoint.py | 12 --- .../.ipynb_checkpoints/Dockerfile-checkpoint | 60 ------------- .../preprocessor-checkpoint.py | 68 --------------- .../.ipynb_checkpoints/Dockerfile-checkpoint | 66 ---------------
.../.ipynb_checkpoints/train-checkpoint.py | 84 ------------------- 7 files changed, 414 deletions(-) delete mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/inference/.ipynb_checkpoints/Dockerfile-checkpoint delete mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/inference/.ipynb_checkpoints/handler-checkpoint.py delete mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/inference/.ipynb_checkpoints/main-checkpoint.py delete mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/processor/.ipynb_checkpoints/Dockerfile-checkpoint delete mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/processor/.ipynb_checkpoints/preprocessor-checkpoint.py delete mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/train/.ipynb_checkpoints/Dockerfile-checkpoint delete mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/train/.ipynb_checkpoints/train-checkpoint.py diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/inference/.ipynb_checkpoints/Dockerfile-checkpoint b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/inference/.ipynb_checkpoints/Dockerfile-checkpoint deleted file mode 100644 index b9524cc..0000000 --- a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/inference/.ipynb_checkpoints/Dockerfile-checkpoint +++ /dev/null @@ -1,59 +0,0 @@ -FROM ubuntu:latest -# Set a docker label to advertise multi-model support on the container -LABEL com.amazonaws.sagemaker.capabilities.multi-models=false -# Set a docker label to enable container to use SAGEMAKER_BIND_TO_PORT environment variable if present -LABEL com.amazonaws.sagemaker.capabilities.accept-bind-to-port=true - -# Install some handful libraries like curl, wget, git, build-essential, zlib -RUN apt-get update && apt-get install -y --no-install-recommends \ - build-essential \ - gcc \ - python3.7 \ - python3-dev \ - python3-pip \ - ca-certificates \ - git \ - curl \ - openjdk-8-jre-headless\ - wget &&\ - rm -rf /var/lib/apt/lists/* - -# install the SageMaker Inference Toolkit -RUN pip3 install --no-cache \ - multi-model-server \ - sagemaker-inference \ - retrying - -# Change working directory -WORKDIR / - -# Install requirements -COPY requirements.txt /opt/ml/code/src/requirements.txt -RUN pip3 install --no-cache -r /opt/ml/code/src/requirements.txt - -# set some environment variables -ENV PYTHONDONTWRITEBYTECODE=1 \ - PYTHONUNBUFFERED=1 \ - PYTHONIOENCODING=UTF-8 \ - LANG=C.UTF-8 \ - LC_ALL=C.UTF-8 - -# copy folders for code -COPY src/config/ /opt/ml/code/config/ -COPY src/ml/ /opt/ml/code/ml/ -COPY src/util.py /opt/ml/code/util.py - -# Copy entrypoint script to the image and make it executable -COPY inference/main.py /opt/ml/code/main.py -COPY inference/handler.py /opt/ml/code/serving/handler.py - -# install sagemaker training -RUN pip3 install --no-cache --upgrade \ - boto3 \ - sagemaker - -# Setting PYTHONPATH to access the copied code -ENV PYTHONPATH="/opt/ml/code:${PATH}" - -# Add a Python script and configure Docker to run it -ENTRYPOINT ["python3", "/opt/ml/code/main.py"] diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/inference/.ipynb_checkpoints/handler-checkpoint.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/inference/.ipynb_checkpoints/handler-checkpoint.py deleted file mode 100644 index b6bdc50..0000000 --- a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/inference/.ipynb_checkpoints/handler-checkpoint.py +++ /dev/null @@ -1,65 +0,0 @@ -import sys -sys.path.append("..") - -import os -import logging 
-import pandas as pd -from joblib import load -from six import StringIO - -from ml.model.wrapper import Wrapper -from sagemaker_inference.default_inference_handler import DefaultInferenceHandler -from sagemaker_inference.default_handler_service import DefaultHandlerService -from sagemaker_inference import content_types, errors, transformer, encoder, decoder - -logging.getLogger().setLevel('INFO') - -# Path to access the model -MODEL_DIR = '/opt/ml/model' - -def _csv_to_pandas(string_like): # type: (str) -> pd.DataFrame - """Convert a CSV object to a pandas DataFrame. - Args: - string_like (str): CSV string. - - Returns: - (pd.DataFrame): pandas DataFrame - """ - stream = StringIO(string_like) - res = pd.read_csv(stream) - return res - -class HandlerService(DefaultHandlerService, DefaultInferenceHandler): - """ - Execute the inference step in the virtual environment - - """ - def __init__(self): - op = transformer.Transformer(default_inference_handler=self) - super(HandlerService, self).__init__(transformer=op) - - # Loads the model from the disk - def default_model_fn(self, model_dir): - logging.info('Loading the model') - return load(os.path.join(MODEL_DIR, "model.pkl")) - - # Parse and check the format of the input data - def default_input_fn(self, input_data, content_type): - global colunas - if content_type != "text/csv": - raise Exception("Invalid content-type: %s" % content_type) - return _csv_to_pandas(input_data) - - # Run our model and do the prediction - def default_predict_fn(self, df, model): - logging.info('Predicting...') - resultados = model.predict(df,included_input=True) - logging.info('Prediction Complete') - return resultados.reset_index(drop=True).T.reset_index().T - - # Gets the prediction output and format it to be returned to the user - def default_output_fn(self, prediction, accept): - logging.info('Saving') - if accept != "text/csv": - raise Exception("Invalid accept: %s" % accept) - return encoder.encode(prediction, accept) \ No newline at end of file diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/inference/.ipynb_checkpoints/main-checkpoint.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/inference/.ipynb_checkpoints/main-checkpoint.py deleted file mode 100644 index 9ff9b2a..0000000 --- a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/inference/.ipynb_checkpoints/main-checkpoint.py +++ /dev/null @@ -1,12 +0,0 @@ -import argparse -import sys -import os -import logging -from sagemaker_inference import model_server - -logging.getLogger().setLevel(logging.INFO) - - -if __name__ == "__main__": - - model_server.start_model_server(handler_service="serving.handler") \ No newline at end of file diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/processor/.ipynb_checkpoints/Dockerfile-checkpoint b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/processor/.ipynb_checkpoints/Dockerfile-checkpoint deleted file mode 100644 index 38fa906..0000000 --- a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/processor/.ipynb_checkpoints/Dockerfile-checkpoint +++ /dev/null @@ -1,60 +0,0 @@ -FROM ubuntu:latest -# Set a docker label to advertise multi-model support on the container -LABEL com.amazonaws.sagemaker.capabilities.multi-models=false -# Set a docker label to enable container to use SAGEMAKER_BIND_TO_PORT environment variable if present -LABEL com.amazonaws.sagemaker.capabilities.accept-bind-to-port=true - -# No question/dialog is asked during apt-get install -ARG DEBIAN_FRONTEND=noninteractive - -# Setting the Timezone 
Environment Variable -ENV TZ=America/Sao_Paulo - -# install ubuntu libraries -RUN apt-get update && apt-get install -y --no-install-recommends \ - build-essential \ - gcc \ - python3.7 \ - python3-dev \ - python3-pip \ - ca-certificates \ - git \ - curl \ - nginx \ - openjdk-8-jre-headless\ - wget &&\ - rm -rf /var/lib/apt/lists/* - -# Create folders for code -RUN mkdir /opt/ml && \ - mkdir /opt/ml/processing && \ - mkdir /opt/ml/processing/input && \ - mkdir /opt/ml/processing/input/raw_data && \ - mkdir /opt/ml/processing/input/preprocessing && \ - mkdir /opt/ml/processing/input/expectations && \ - mkdir /opt/ml/processing/output && \ - mkdir /opt/ml/processing/output/processed && \ - mkdir /opt/ml/processing/output/processed/train && \ - mkdir /opt/ml/processing/output/processed/val && \ - mkdir /opt/ml/processing/output/processed/inference && \ - mkdir /opt/ml/processing/output/expectations && \ - mkdir /opt/ml/processing/output/validations - -# Install requirements -COPY requirements.txt /opt/ml/code/src/requirements.txt -RUN pip3 install --no-cache -r /opt/ml/code/src/requirements.txt - -# Copy entrypoint script to the image and make it executable -COPY src/config/ /opt/ml/code/src/config/ -COPY src/ml/ /opt/ml/processing/ml/ -COPY src/util.py /opt/ml/processing/util.py -COPY processor/preprocessor.py /opt/ml/processing/preprocessor.py - -# Change working directory -WORKDIR /opt/ml/processing - -# Setting PYTHONPATH to access the copied code -ENV PYTHONPATH="/opt/ml/processing:${PATH}" - -# Add a Python script and configure Docker to run it -ENTRYPOINT ["python3", "preprocessor.py"] diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/processor/.ipynb_checkpoints/preprocessor-checkpoint.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/processor/.ipynb_checkpoints/preprocessor-checkpoint.py deleted file mode 100644 index 1920dbd..0000000 --- a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/processor/.ipynb_checkpoints/preprocessor-checkpoint.py +++ /dev/null @@ -1,68 +0,0 @@ -from ml.preprocessing.preprocessing import Preprocessing -from ml.preprocessing.dataquality import DataQuality -import great_expectations as ge -from datetime import date -import pandas as pd -import argparse -import logging -import glob -import json -from joblib import dump, load - -logging.getLogger().setLevel('INFO') - -if __name__=='__main__': - """ - Execute the processor step in the virtual environment - - """ - logging.info('Starting the preprocessing') - - # Read the step argument (train or test) - parser = argparse.ArgumentParser() - parser.add_argument('--step', type=str, default='train') - args = parser.parse_args() - step_train = True if args.step == "train" else False - logging.info(f'step_train: {step_train}') - - logging.info('Reading the inputs') - file = glob.glob("/opt/ml/processing/input/raw_data/*.csv")[0] - logging.info(f'Reading file: {file}') - df = pd.read_csv(file) - - - logging.info("Data Quality") - # If True, it creates the DataQuality object, otherwise it loads an existing one - if step_train: - dq = DataQuality(discrete_cat_cols=['Sex', 'Pclass','Survived']) - df_ge = dq.perform(df) - df_ge.save_expectation_suite('/opt/ml/processing/output/expectations/expectations.json') - else: - date = date.today().strftime('%Y%m%d') - df_ge = ge.dataset.PandasDataset(df) - ge_val = df_ge.validate(expectation_suite='/opt/ml/processing/input/expectations/expectations.json', only_return_failures=False) - with open(f'/opt/ml/processing/output/validations/{date}.json', 'w') as 
f: - json.dump(ge_val.to_json_dict(), f) - - logging.info("Preprocessing") - # If True, it creates the Preprocessing object, otherwise it loads an existing one - if step_train: - norm_cols = {'min-max': ['Age']} - oneHot_cols = ['Pclass','Sex'] - p = Preprocessing(norm_cols, oneHot_cols) - train, test_train = p.execute(df, step_train = True, val_size = 0.2) - else: - p = load("/opt/ml/processing/input/preprocessing/preprocessing.pkl") - test = p.execute(df, step_train = False) - - logging.info("Saving") - # If True, it saves the Preprocessing to be used later in the inference step - if step_train: - dump(p, '/opt/ml/processing/output/preprocessing/preprocessing.pkl') - - # If True, it saves the train and val files, otherwise it saves only the inference file - if step_train: - train.to_csv('/opt/ml/processing/output/processed/train/train.csv', index=False) - test_train.to_csv('/opt/ml/processing/output/processed/val/val.csv', index=False) - else: - test.to_csv('/opt/ml/processing/output/processed/inference/inference.csv', index=False) \ No newline at end of file diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/train/.ipynb_checkpoints/Dockerfile-checkpoint b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/train/.ipynb_checkpoints/Dockerfile-checkpoint deleted file mode 100644 index 207b1f7..0000000 --- a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/train/.ipynb_checkpoints/Dockerfile-checkpoint +++ /dev/null @@ -1,66 +0,0 @@ -FROM ubuntu:latest -# Set a docker label to advertise multi-model support on the container -LABEL com.amazonaws.sagemaker.capabilities.multi-models=false -# Set a docker label to enable container to use SAGEMAKER_BIND_TO_PORT environment variable if present -LABEL com.amazonaws.sagemaker.capabilities.accept-bind-to-port=true - -# No question/dialog is asked during apt-get install -ARG DEBIAN_FRONTEND=noninteractive - -# Setting the Timezone Environment Variable -ENV TZ=America/Sao_Paulo - -# install ubuntu libraries -RUN apt-get update && apt-get install -y --no-install-recommends \ - build-essential \ - gcc \ - python3.7 \ - python3-dev \ - python3-pip \ - ca-certificates \ - git \ - curl \ - nginx \ - openjdk-8-jre-headless\ - wget &&\ - rm -rf /var/lib/apt/lists/* - -# Create folders for code -RUN mkdir /opt/ml && \ - mkdir /opt/ml/output && \ - mkdir /opt/ml/code && \ - mkdir /opt/ml/code/train && \ - mkdir /opt/ml/code/src - -# Install requirements -COPY requirements.txt /opt/ml/code/src/requirements.txt -RUN pip3 install --no-cache -r /opt/ml/code/src/requirements.txt - -# Install the SageMaker Training Toolkit -RUN pip3 install --no-cache \ - boto3 \ - sagemaker \ - sagemaker-training - -# copy folders for code -COPY src/config/ /opt/ml/code/src/config/ -COPY src/ml/ /opt/ml/code/src/ml/ -COPY src/util.py /opt/ml/code/src/util.py -COPY train/train.py /opt/ml/code/train.py - -# Copy entrypoint script to the image and make it executable -WORKDIR /opt/ml/code - -# Environment variables -ENV PYTHONDONTWRITEBYTECODE=1 \ - PYTHONUNBUFFERED=1 \ - PYTHONIOENCODING=UTF-8 \ - LANG=C.UTF-8 \ - LC_ALL=C.UTF-8 - -# Setting PYTHONPATH to access the copied code -ENV PYTHONPATH="/opt/ml/code:${PATH}" - -# Add a Python script and configure Docker to run it -RUN chmod +x train.py -ENV SAGEMAKER_PROGRAM train.py diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/train/.ipynb_checkpoints/train-checkpoint.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/train/.ipynb_checkpoints/train-checkpoint.py deleted file mode 100644 index 
bc7b4cd..0000000 --- a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/train/.ipynb_checkpoints/train-checkpoint.py +++ /dev/null @@ -1,84 +0,0 @@ -import sys -sys.path.append("src/") - -import os -import traceback -import pandas as pd -import logging -from sklearn.metrics import * -from ml.model.trainer import TrainerSklearn -from sklearn.ensemble import RandomForestClassifier -from util import * - -logging.getLogger().setLevel('INFO') - -# Paths to access the datasets and salve the model -prefix = '/opt/ml/' - -training_path = os.environ["SM_CHANNEL_TRAIN"] -val_path = os.environ["SM_CHANNEL_VALIDATION"] - -error_path = os.path.join(prefix, 'output') -model_path = os.environ['SM_MODEL_DIR'] - -def train(): - """ - Execute the train step in the virtual environment - - """ - logging.info('Starting the training') - try: - logging.info('Reading the inputs') - # Take the set of train files and read them all into a single pandas dataframe - input_files = [ os.path.join(training_path, file) for file in os.listdir(training_path) ] - if len(input_files) == 0: - raise ValueError(('There are no files in {}.\n' + - 'This usually indicates that the channel ({}) was incorrectly specified,\n' + - 'the data specification in S3 was incorrectly specified or the role specified\n' + - 'does not have permission to access the data.').format(training_path, channel_name)) - raw_data = [ pd.read_csv(file) for file in input_files ] - train = pd.concat(raw_data) - - # Take the set of val files and read them all into a single pandas dataframe - input_files = [ os.path.join(val_path, file) for file in os.listdir(val_path) ] - if len(input_files) == 0: - raise ValueError(('There are no files in {}.\n' + - 'This usually indicates that the channel ({}) was incorrectly specified,\n' + - 'the data specification in S3 was incorrectly specified or the role specified\n' + - 'does not have permission to access the data.').format(val_path, channel_name)) - raw_data = [ pd.read_csv(file) for file in input_files ] - val = pd.concat(raw_data) - - # Define the target and columns to be used in the train - target = "Survived" - columns = train.columns.drop(target) - - logging.info("Training the model") - model = TrainerSklearn().train(train, val, target, classification=True, - algorithm=RandomForestClassifier, - columns=columns) - - # Salve the model and metrics - logging.info("Saving") - model.save_model(os.path.join(model_path, 'model.pkl')) - metrics = model.artifacts["metrics"] - logging.info(f"accuracy={metrics['accuracy']}; f1={metrics['f1']}; precision={metrics['precision']}; recall={metrics['recall']};") - pd.DataFrame(model.artifacts["metrics"].items(), columns=['Metric', 'Value']).to_csv(os.path.join(model_path, 'metrics.csv'), index=False) - logging.info('Training complete.') - - except Exception as e: - # Write out an error file. This will be returned as the failureReason in the - # DescribeTrainingJob result. - trc = traceback.format_exc() - with open(os.path.join(error_path, 'failure'), 'w') as s: - s.write('Exception during training: ' + str(e) + '\n' + trc) - # Printing this causes the exception to be in the training job logs, as well. - logging.info('Exception during training: ' + str(e) + '\n' + trc, file=sys.stderr) - # A non-zero exit code causes the training job to be marked as Failed. - sys.exit(255) - -if __name__ == '__main__': - train() - - # A zero exit code causes the job to be marked a Succeeded. 
- sys.exit(0) From 73929b80c08e8873bc07ebb8155a25819a18c31a Mon Sep 17 00:00:00 2001 From: karenstemartins Date: Fri, 28 May 2021 13:47:21 +0000 Subject: [PATCH 07/10] Remove checkpoints --- hermione/.ipynb_checkpoints/cli-checkpoint.py | 136 ------------------ .../__IMPLEMENTED_SAGEMAKER__-checkpoint.json | 7 - 2 files changed, 143 deletions(-) delete mode 100644 hermione/.ipynb_checkpoints/cli-checkpoint.py delete mode 100644 hermione/module_templates/.ipynb_checkpoints/__IMPLEMENTED_SAGEMAKER__-checkpoint.json diff --git a/hermione/.ipynb_checkpoints/cli-checkpoint.py b/hermione/.ipynb_checkpoints/cli-checkpoint.py deleted file mode 100644 index 73b67ef..0000000 --- a/hermione/.ipynb_checkpoints/cli-checkpoint.py +++ /dev/null @@ -1,136 +0,0 @@ -import click -import os -import re -import sys -from .writer import * -from .module_writer import modules_autocomplete, write_module -from .__init__ import __version__ as version - -LOCAL_PATH = os.getcwd() - -# Correct LOCAL_PATH in case of empty spaces #21 - -logo = r""" - _ _ -| |__ ___ _ __ _ __ ___ (_) ___ _ __ ___ -| '_ \ / _ \ '__| '_ ` _ \| |/ _ \| '_ \ / _ \ -| | | | __/ | | | | | | | | (_) | | | | __/ -|_| |_|\___|_| |_| |_| |_|_|\___/|_| |_|\___| -v{} -""".format(version) - - -@click.group() -def cli(): - pass - -@cli.command() -def info(): - """ - Checks that hermione is correctly installed - """ - click.echo(logo) - -@cli.command() -@click.argument('project_name') -@click.option('-imp', '--implemented', 'implemented', prompt='Do you want to start with an implemented example (recommended) [y/n]?', - default='y', show_default=True) -def new(project_name, implemented): - """ - Create a new hermione project - """ - if implemented in ['yes', 'ye', 'y', 'Yes', 'YES', 'Y']: - is_imp = True - else: - is_imp = False - - click.echo(f"Creating project {project_name}") - - - custom_inputs = { - 'project_name':project_name, - "project_start_date": datetime.today().strftime("%B %d, %Y") - } - os.makedirs(os.path.join(LOCAL_PATH, project_name)) - if is_imp: - option = click.prompt('Do you want to start with: \n\t(1) Sagemaker \n\t(2) Local version \n', type=int, default=2) - implemented_version_type(project_name,custom_inputs,option) - else: - write_module(os.path.join(LOCAL_PATH, project_name), '__NOT_IMPLEMENTED_BASE__', True, custom_inputs) - - print(f'Creating virtual environment {project_name}_env') - os.chdir(project_name) - env_name = f"{project_name}_env" - os.system(f"python -m venv {env_name}") - - # Create git repo - os.system('git init') - print("A git repository was created. You should add your files and make your first commit.\n") - -def implemented_version_type(project_name,custom_inputs,option): - """ - Create a new hermione project - """ - if option == 1: - write_module(os.path.join(LOCAL_PATH, project_name), '__IMPLEMENTED_SAGEMAKER__', True, custom_inputs) - else: - write_module(os.path.join(LOCAL_PATH, project_name), '__IMPLEMENTED_BASE__', True, custom_inputs) - -@cli.command() -def train(): - """ - Execute the script in train.py. One should be at src directory - """ - if not os.path.exists('./train.py'): - click.echo("You gotta have an src/train.py file") - else: - os.system('python ./train.py') - print("\nModel trained. For MLFlow logging control, type:\nmlflow ui\nand visit http://localhost:5000/") - - -@cli.command() -def predict(): - """ - Execute the script in predict.py to make batch predictions. 
- One should be at src directory - """ - if not os.path.exists('./predict.py'): - click.echo("You gotta have an src/predict.py file") - else: - print("Making predictions: ") - os.system('python ./predict.py') - - -@click.argument('image_name') -@click.option('-t', '--tag', 'tag', default='latest', show_default=True) -@cli.command() -def build(image_name, tag): - """ - Build a docker image with given image_name. Only run if you have docker installed. - One should be at the root directory. - """ - if not os.path.exists('src/Dockerfile'): - click.echo("You gotta have an src/Dockerfile file. You must be at the project's root folder.") - else: - os.system(f'docker build -f src/Dockerfile -t {image_name}:{tag} .') - - -@click.argument('image_name') -@click.option('-t', '--tag', 'tag', default='latest', show_default=True) -@cli.command() -def run(image_name, tag): - """ - Run a container with given image_name. - Only run if you have docker installed. - """ - if not os.path.exists('src/Dockerfile'): - click.echo("You gotta have an src/Dockerfile file. You must be at the project's root folder.") - else: - os.system(f'docker run --rm -p 5000:5000 {image_name}:{tag}') - - -@click.argument("module_name", type = click.STRING, autocompletion=modules_autocomplete) -@cli.command() -@click.option('-y','--autoconfirm', is_flag=True) -def add_module(module_name, autoconfirm): - write_module(LOCAL_PATH, module_name, autoconfirm) \ No newline at end of file diff --git a/hermione/module_templates/.ipynb_checkpoints/__IMPLEMENTED_SAGEMAKER__-checkpoint.json b/hermione/module_templates/.ipynb_checkpoints/__IMPLEMENTED_SAGEMAKER__-checkpoint.json deleted file mode 100644 index aa8798f..0000000 --- a/hermione/module_templates/.ipynb_checkpoints/__IMPLEMENTED_SAGEMAKER__-checkpoint.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "info": "Base files with implemented example", - "input_info": [ - ["project_name", "My Project", "Enter your project name"], - ["project_start_date", "01/01/21", "Enter the date your project started"] - ] -} \ No newline at end of file From b3c38083d480685b8f3fbd7657c7662e2dbba01a Mon Sep 17 00:00:00 2001 From: karenstemartins Date: Fri, 28 May 2021 14:13:10 +0000 Subject: [PATCH 08/10] Remove checkpoints --- .../.ipynb_checkpoints/config-checkpoint.json | 7 - .../.ipynb_checkpoints/cluster-checkpoint.py | 166 ---- .../feature_selection-checkpoint.py | 387 --------- .../.ipynb_checkpoints/pca-checkpoint.py | 149 ---- .../.ipynb_checkpoints/vif-checkpoint.py | 48 -- .../.ipynb_checkpoints/metrics-checkpoint.py | 212 ----- .../.ipynb_checkpoints/trainer-checkpoint.py | 104 --- .../.ipynb_checkpoints/wrapper-checkpoint.py | 252 ------ .../Sagemaker_Inference-checkpoint.ipynb | 322 -------- .../Sagemaker_Processor-checkpoint.ipynb | 396 ---------- ...r_StepFunctions_Inference-checkpoint.ipynb | 737 ------------------ ...maker_StepFunctions_Train-checkpoint.ipynb | 540 ------------- .../Sagemaker_Train-checkpoint.ipynb | 393 ---------- .../dataquality-checkpoint.py | 60 -- .../normalization-checkpoint.py | 159 ---- .../preprocessing-checkpoint.py | 141 ---- .../text_vectorizer-checkpoint.py | 201 ----- .../.ipynb_checkpoints/README-checkpoint.md | 41 - .../test_project-checkpoint.py | 54 -- 19 files changed, 4369 deletions(-) delete mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/config/.ipynb_checkpoints/config-checkpoint.json delete mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/analysis/.ipynb_checkpoints/cluster-checkpoint.py delete mode 100644 
hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/analysis/.ipynb_checkpoints/feature_selection-checkpoint.py delete mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/analysis/.ipynb_checkpoints/pca-checkpoint.py delete mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/analysis/.ipynb_checkpoints/vif-checkpoint.py delete mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/model/.ipynb_checkpoints/metrics-checkpoint.py delete mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/model/.ipynb_checkpoints/trainer-checkpoint.py delete mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/model/.ipynb_checkpoints/wrapper-checkpoint.py delete mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/.ipynb_checkpoints/Sagemaker_Inference-checkpoint.ipynb delete mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/.ipynb_checkpoints/Sagemaker_Processor-checkpoint.ipynb delete mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/.ipynb_checkpoints/Sagemaker_StepFunctions_Inference-checkpoint.ipynb delete mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/.ipynb_checkpoints/Sagemaker_StepFunctions_Train-checkpoint.ipynb delete mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/.ipynb_checkpoints/Sagemaker_Train-checkpoint.ipynb delete mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/preprocessing/.ipynb_checkpoints/dataquality-checkpoint.py delete mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/preprocessing/.ipynb_checkpoints/normalization-checkpoint.py delete mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/preprocessing/.ipynb_checkpoints/preprocessing-checkpoint.py delete mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/preprocessing/.ipynb_checkpoints/text_vectorizer-checkpoint.py delete mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/tests/.ipynb_checkpoints/README-checkpoint.md delete mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/tests/.ipynb_checkpoints/test_project-checkpoint.py diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/config/.ipynb_checkpoints/config-checkpoint.json b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/config/.ipynb_checkpoints/config-checkpoint.json deleted file mode 100644 index c34a7bc..0000000 --- a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/config/.ipynb_checkpoints/config-checkpoint.json +++ /dev/null @@ -1,7 +0,0 @@ -{ -"project_name": "hermione-sagemaker", - "env_path": "hermione-sagemaker/hermione-sagemaker_env", - "files_path": "../data/raw/", - "key": "<<<>>>", - "user": "<<<>>>" - } \ No newline at end of file diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/analysis/.ipynb_checkpoints/cluster-checkpoint.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/analysis/.ipynb_checkpoints/cluster-checkpoint.py deleted file mode 100644 index 5e5f7a6..0000000 --- a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/analysis/.ipynb_checkpoints/cluster-checkpoint.py +++ /dev/null @@ -1,166 +0,0 @@ -from sklearn.mixture import GaussianMixture -from sklearn.cluster import KMeans -from sklearn import metrics -import numpy as np -import pandas as pd -import matplotlib.pyplot as plt - -class Cluster: - - @classmethod - 
def analyzeK(cls, X, k_min = 2, k_max = 20):
-        """
-        Plot the result of the methods (elbow, silhouette and calinski_harabasz) to find the best k
-
-        Parameters
-        ----------
-        X : array
-            values that will be used to find the best k
-        k_min : int
-            minimum value of the range for k
-        k_max : int
-            maximum value of the range for k
-
-        Returns
-        -------
-        None
-        """
-
-        if X is None:
-            raise Exception("Error: X is None.")
-        if k_min is None or k_max is None:
-            raise Exception("Error: Range is None.")
-        if k_min < 2:
-            raise Exception("Error: k_min < 2")
-
-        wss = []
-        s_gmm = []
-        s_kmeans = []
-        ch_gmm = []
-        ch_kmeans = []
-
-        K = range(k_min, k_max)
-
-        for k in K:
-            kmeans = KMeans(n_clusters=k)
-            kmeans.fit(X)
-            gmm = GaussianMixture(n_components=k, covariance_type='full')
-            gmm.fit(X)
-
-            labels_kmeans = kmeans.predict(X)
-            labels_gmm = gmm.predict(X)
-
-            s_kmeans.append(metrics.silhouette_score(X, labels_kmeans, metric='euclidean'))
-            s_gmm.append(metrics.silhouette_score(X, labels_gmm, metric='euclidean'))
-
-            ch_kmeans.append(metrics.calinski_harabasz_score(X, labels_kmeans))
-            ch_gmm.append(metrics.calinski_harabasz_score(X, labels_gmm))
-
-            wss.append(kmeans.inertia_)
-
-        cls._elbow(K, wss)
-        cls._silhouette_coefficient(K, s_kmeans, s_gmm)
-        cls._calinski_harabasz(K, ch_kmeans, ch_gmm)
-
-    @classmethod
-    def _elbow(cls, K, wss):
-        """
-        Plot the result of the elbow method
-
-        Parameters
-        ----------
-        K : array
-            candidate values of k
-        wss : array
-            total within-cluster sum of squares (WSS) for each k; it measures
-            cluster compactness and we want it to be as small as possible
-
-        Returns
-        -------
-        None
-        """
-        plt.plot(K, wss, 'bx-')
-        plt.xlabel('k')
-        plt.ylabel('WSS')
-        plt.title('The Elbow Method showing the optimal k')
-        plt.show()
-
-    @classmethod
-    def _silhouette_coefficient(cls, K, s_kmeans, s_gmm):
-        """
-        Plot the result of the silhouette method for kmeans and Gaussian Mixture Models
-
-        Parameters
-        ----------
-        K : array
-            candidate values of k
-        s_kmeans : array
-            silhouette values for kmeans
-        s_gmm : array
-            silhouette values for Gaussian Mixture Models
-
-        Returns
-        -------
-        None
-        """
-        plt.plot(K, s_kmeans, 'xr-')  # plot the two curves on the same axes
-        plt.plot(K, s_gmm, 'ob-')
-        plt.legend(["kmeans", "gmm"])
-        plt.xlabel('k')
-        plt.ylabel('Mean Silhouette Coefficient')
-        plt.title('Mean Silhouette Coefficient for each k')
-        plt.show()
-
-    @classmethod
-    def _calinski_harabasz(cls, K, ch_kmeans, ch_gmm):
-        """
-        Plot the result of the calinski_harabasz method for kmeans and Gaussian Mixture Models
-
-        Parameters
-        ----------
-        K : array
-            candidate values of k
-        ch_kmeans : array
-            calinski_harabasz scores for kmeans
-        ch_gmm : array
-            calinski_harabasz scores for Gaussian Mixture Models
-
-        Returns
-        -------
-        None
-        """
-        plt.plot(K, ch_kmeans, 'xr-')  # plot the two curves on the same axes
-        plt.plot(K, ch_gmm, 'ob-')
-        plt.legend(["kmeans", "gmm"])
-        plt.xlabel('k')
-        plt.ylabel('Calinski and Harabasz score')
-        plt.title('Calinski and Harabasz score for each k')
-        plt.show()
-
-    @classmethod
-    def plot_cluster(cls, df_res_algorithm, algorithm_name = "K-means"):
-        """
-        Plot the clusters
-
-        Parameters
-        ----------
-        df_res_algorithm : pd.DataFrame
-            Dataframe must have the following columns (x, y, cluster)
-        algorithm_name : str
-            algorithm name
-
-        Returns
-        -------
-        None
-        """
-        # check how many clusters there are
-        n_clusters = df_res_algorithm.cluster.max()+1
-        plots = []
-        for cluster in range(n_clusters):
-            p = plt.scatter(df_res_algorithm[df_res_algorithm['cluster'] == cluster].x,
-                            df_res_algorithm[df_res_algorithm['cluster'] == cluster].y)
-            plots.append(p)
-        plt.legend(tuple(plots),
-                   tuple(["Cluster {}".format(c) for c in range(n_clusters)]),
-                   loc=2, fontsize=8, bbox_to_anchor=(1.05, 1))
-        plt.xlabel("X")
-        plt.ylabel("Y")
-        plt.title("Clusters created by "+algorithm_name)
-        plt.show()
\ No newline at end of file
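A minimal usage sketch for the Cluster helper above. The synthetic blob data and the k range are illustrative; the import path assumes src/ is on sys.path, as the training entrypoint sets up.

from sklearn.datasets import make_blobs
from ml.analysis.cluster import Cluster

X, _ = make_blobs(n_samples=300, centers=4, random_state=42)  # toy data
Cluster.analyzeK(X, k_min=2, k_max=10)  # plots elbow, silhouette and Calinski-Harabasz curves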
diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/analysis/.ipynb_checkpoints/feature_selection-checkpoint.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/analysis/.ipynb_checkpoints/feature_selection-checkpoint.py
deleted file mode 100644
index 4b3a7bf..0000000
--- a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/analysis/.ipynb_checkpoints/feature_selection-checkpoint.py
+++ /dev/null
@@ -1,387 +0,0 @@
-from sklearn.feature_selection import VarianceThreshold
-from sklearn.feature_selection import SelectKBest
-from sklearn.feature_selection import SelectPercentile
-from sklearn.feature_selection import RFE
-from sklearn.feature_selection import SelectFromModel
-from sklearn.feature_selection import SequentialFeatureSelector
-from mlxtend.feature_selection import ExhaustiveFeatureSelector
-from abc import ABC, abstractmethod
-import numpy as np
-import pandas as pd
-
-class SelectAlgorithm(ABC):
-    """
-    Abstract class for feature selection algorithms
-    """
-    def transform(self, df: pd.DataFrame):
-        """
-        Select features based on fit
-
-        Parameters
-        ----------
-        df : pd.DataFrame
-            dataframe with features to be selected
-
-        Returns
-        -------
-        pd.DataFrame
-            dataframe with selected features only
-        """
-        return df[df.columns[self.selected_columns]]
-
-    def get_support(self):
-        """
-        Get a mask, or integer index, of the features selected
-
-        Returns
-        -------
-        np.array
-        """
-        return self.selected_columns
-
-    @abstractmethod
-    def fit(self) -> None:
-        """
-        Abstract method that is implemented in classes that inherit it
-        """
-        pass
-
-class SelectCoefficients(SelectAlgorithm):
-    """
-    Class to select features based on model coefficients
-    """
-    def __init__(self, model, num_feat = None):
-        """
-        Constructor
-
-        Parameters
-        ----------
-        model :
-            should be an instance of a classification or regression model class
-            from scikit-learn that exposes a coef_ attribute (e.g., linear models)
-        num_feat : int
-            number of features to be selected
-
-        Returns
-        -------
-        SelectCoefficients
-        """
-        self.model = model
-        self.num_feat = num_feat
-
-    def fit(self, X: pd.DataFrame, y = None):
-        """
-        Identify the features to be selected.
-
-        Parameters
-        ----------
-        X : pd.DataFrame
-            features to be selected
-
-        y : pd.DataFrame
-            target values
-
-        Returns
-        -------
-        None
-        """
-        self.num_feat = int(X.shape[1]/2) if self.num_feat is None else self.num_feat
-        trained_model = self.model.fit(X,y)
-        self.selected_columns = np.argsort(np.abs(trained_model.coef_.ravel()))[-self.num_feat:]
-
-class SelectCorrelation(SelectAlgorithm):
-    """
-    Class to select features based on correlation between features
-    """
-    def __init__(self, threshold = 1.0):
-        """
-        Constructor
-
-        Parameters
-        ----------
-        threshold : float
-            correlation threshold
-
-        Returns
-        -------
-        SelectCorrelation
-        """
-        self.threshold = threshold
-    def fit(self, X: pd.DataFrame, y = None):
-        """
-        Identify the features to be selected.
-
-        Parameters
-        ----------
-        X : pd.DataFrame
-            features to be selected
-
-        y : pd.DataFrame
-            target values
-
-        Returns
-        -------
-        None
-        """
-        corr = X.corr()
-        self.selected_columns = np.full((corr.shape[0],), True, dtype=bool)
-        [self.check_correlation(corr.iloc[i,j],j) for i in range(corr.shape[0]) for j in range(i+1, corr.shape[0])]
-
-    def check_correlation(self,corr,j):
-        """
-        Auxiliary method to check if the correlation between two features is above the threshold
-
-        Parameters
-        ----------
-        corr : float
-            correlation between two attributes
-
-        j : int
-            index of the column to be removed in case corr >= self.threshold
-
-        Returns
-        -------
-        None
-        """
-        if np.abs(corr) >= self.threshold and self.selected_columns[j]:
-            self.selected_columns[j] = False
-
-class MyExhaustiveFeatureSelector(ExhaustiveFeatureSelector):
-    """
-    Class that inherits from ExhaustiveFeatureSelector (from mlxtend) and implements
-    the get_support method for compatibility
-    """
-    def get_support(self):
-        return list(self.best_idx_)
-
-class SelectEnsemble(SelectAlgorithm):
-    """
-    Class to select features based on an ensemble of methods
-    """
-    def __init__(self, dic_selection: dict, num_feat = None):
-        """
-        Constructor
-
-        Parameters
-        ----------
-        dic_selection : dict
-            dict with the name of the algorithm as keys and dicts of parameters as values
-            Ex: dic_selection = { 'variance': {'threshold' : 0.3},
-                                  'recursive': {'estimator' : LinearSVC(), 'n_features_to_select' : 2}}
-        num_feat : int
-            number of features to be selected
-
-        Returns
-        -------
-        SelectEnsemble
-        """
-        self.dic_selection = dic_selection
-        self.num_feat = num_feat
-
-    def fit(self, X: pd.DataFrame, y = None):
-        """
-        Identify the features to be selected.
-
-        Parameters
-        ----------
-        X : pd.DataFrame
-            features to be selected
-
-        y : pd.DataFrame
-            target values
-
-        Returns
-        -------
-        None
-        """
-        self.num_feat = int(X.shape[1]/2) if self.num_feat is None else self.num_feat
-        self.column_dic = {}
-        for i,column in enumerate(X.columns):
-            self.column_dic[column] = i
-        self.column_count = [0 for column in X.columns]
-        selections = [FeatureSelector(selector,**self.dic_selection[selector]) for selector in self.dic_selection]
-        [selection.fit(X,y) for selection in selections]
-        [self.increment_count(column) for selection in selections for column in selection.selected_columns]
-        self.selected_columns = np.argsort(self.column_count)[-self.num_feat:]
-
-    def increment_count(self,column):
-        """
-        Auxiliary method to increment the count of a column
-
-        Parameters
-        ----------
-        column : int
-            column whose count will be incremented
-
-        Returns
-        -------
-        None
-        """
-        self.column_count[self.column_dic[column]]+=1
-
-class FeatureSelector:
-
-    def __init__(self, selector, **kwargs):
-        """
-        Constructor
-
-        Parameters
-        ----------
-        selector : str
-            name of the algorithm to be applied
-        **kwargs :
-            optional and positional arguments of the chosen algorithm (selector)
-
-        Returns
-        -------
-        FeatureSelector
-
-        Examples
-        --------
-        variance thresholding:      f = FeatureSelector('variance', threshold=0.3) #Instantiating
-                                    f.fit(X[,y]) #fitting (y is optional for variance thresholding)
-                                    X = f.transform(X) #transforming
-
-        filter-based, k best (MAD): f = FeatureSelector('univariate_kbest', score_func=FeatureSelector.mean_abs_diff, k=2) #Instantiating
-                                    #score_func can be any function f: R^n -> R^n (n = number of columns)
-                                    f.fit(X,y) #fitting
-                                    X = f.transform(X) #transforming
-
-        wrapper, recursive:         f = FeatureSelector('recursive', estimator = LinearSVC(),
n_features_to_select=2) #Instantiating
-                                    #estimator should be an instance of a classification or regression model class from scikit-learn
-                                    #one can use a custom class but it must be compatible with the scikit-learn architecture
-                                    f.fit(X,y) #fitting
-                                    X = f.transform(X) #transforming
-
-        wrapper, sequential:        f = FeatureSelector('sequential', estimator = LinearSVC(), direction='forward') #Instantiating
-                                    #estimator should be an instance of a classification or regression model class from scikit-learn
-                                    #one can use a custom class but it must be compatible with the scikit-learn architecture
-                                    f.fit(X,y) #fitting
-                                    X = f.transform(X) #transforming
-
-        to better understand the optional arguments of each algorithm see: https://scikit-learn.org/stable/modules/feature_selection.html
-        """
-        self.selector = selector
-        self.selectors = {'variance': VarianceThreshold,
-                          'univariate_kbest': SelectKBest,
-                          'univariate_percentile': SelectPercentile,
-                          'recursive': RFE,
-                          'model':SelectFromModel,
-                          'sequential':SequentialFeatureSelector,
-                          'exaustive':MyExhaustiveFeatureSelector,
-                          'correlation':SelectCorrelation,
-                          'coefficients':SelectCoefficients,
-                          'ensemble':SelectEnsemble}
-        self.kwargs = kwargs
-        self.fitted = False
-
-    def fit(self, X: pd.DataFrame, y = None):
-        """
-        Identify the features to be selected.
-
-        Parameters
-        ----------
-        X : pd.DataFrame
-            features to be selected
-
-        y : pd.DataFrame
-            target values
-
-        Returns
-        -------
-        None
-        """
-        self.columns = X.columns
-        self.selection = self.selectors[self.selector](**self.kwargs)
-        self.selection.fit(X,y)
-        self.selected_columns = self.columns[self.selection.get_support()]
-        self.fitted = True
-
-    def transform(self, df: pd.DataFrame):
-        """
-        Select features based on fit
-
-        Parameters
-        ----------
-        df : pd.DataFrame
-            dataframe with features to be selected
-
-        Returns
-        -------
-        pd.DataFrame
-            dataframe with selected features only
-        """
-        if not self.fitted:
-            raise Exception("Not yet trained.")
-
-        #return self.selection.transform(df)
-        return df[self.selected_columns]
-
-    def inverse_transform(self, df: pd.DataFrame):
-        """
-        Apply the inverse_transform of the vectorizer to each column
-        Options: index, bag_of_words and tf_idf
-        Not implemented for feature selection; kept for pipeline compatibility.
-
-        Parameters
-        ----------
-        df : pd.DataFrame
-            dataframe with columns to be unvectorized
-
-        Returns
-        -------
-        pd.DataFrame
-        """
-        pass
-
-        #return df
-
-    @staticmethod
-    def mean_abs_diff(X, y=None):
-        """
-        Method to compute the mean absolute difference (MAD) of all attributes of X
-
-        Parameters
-        ----------
-        X : pd.DataFrame
-            dataframe
-        y : any type
-            not necessary, used only for compatibility
-
-        Returns
-        -------
-        np.ndarray
-        """
-        return np.sum(np.abs(X - np.mean(X, axis = 0)), axis = 0)/X.shape[0]
-
-    @staticmethod
-    def variance(X, y=None):
-        """
-        Method to compute the mean variance of all attributes of X
-
-        Parameters
-        ----------
-        X : pd.DataFrame
-            dataframe
-        y : any type
-            not necessary, used only for compatibility
-
-        Returns
-        -------
-        np.ndarray
-        """
-        return np.sum((X - np.mean(X, axis = 0))**2, axis = 0)/X.shape[0]
-
-    @staticmethod
-    def disp_ratio(X, y=None):
-        """
-        Method to compute the dispersion ratio of all attributes of X
-
-        Parameters
-        ----------
-        X : pd.DataFrame
-            dataframe
-        y : any type
-            not necessary, used only for compatibility
-
-        Returns
-        -------
-        np.ndarray
-        """
-        return np.mean(X, axis = 0)/np.power(np.prod(X, axis = 0),1/X.shape[0])
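A minimal usage sketch for FeatureSelector; the toy dataframe and the correlation threshold are illustrative.

import pandas as pd
from ml.analysis.feature_selection import FeatureSelector

df = pd.DataFrame({'Age':    [22, 38, 26, 35],
                   'Fare':   [7.25, 71.28, 7.92, 53.10],
                   'Pclass': [3, 1, 3, 1]})
y = pd.Series([0, 1, 1, 1])  # e.g. 'Survived'

f = FeatureSelector('correlation', threshold=0.9)  # drop one of each highly correlated pair
f.fit(df, y)
df_selected = f.transform(df)  # keeps only the surviving columns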
diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/analysis/.ipynb_checkpoints/pca-checkpoint.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/analysis/.ipynb_checkpoints/pca-checkpoint.py
deleted file mode 100644
index 2596a64..0000000
--- a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/analysis/.ipynb_checkpoints/pca-checkpoint.py
+++ /dev/null
@@ -1,149 +0,0 @@
-import pandas as pd
-from sklearn.decomposition import PCA as PCA_sklearn
-from sklearn import metrics
-
-class PCA:
-
-    def __init__(self, columns, prefix="prefix", k=2):
-        """
-        Constructor
-
-        Parameters
-        ----------
-        columns : list
-            Columns for dimensionality reduction
-        prefix : str
-            column prefix
-        k : int
-            Number of dimensions
-
-        Returns
-        -------
-        PCA
-        """
-        self.columns = columns
-        self.prefix = prefix
-        self.k = k
-        self.pca = None
-
-    def __find_k(self, df, threshold):
-        """
-        Find the number of dimensions k to reduce to
-
-        Parameters
-        ----------
-        df : pd.Dataframe
-            dataframe to be reduced
-
-        Returns
-        -------
-        int
-        """
-        self.pca = PCA_sklearn(n_components=len(self.columns))
-        self.pca.fit(df[ self.columns ].values)
-        for i in range(len(self.columns)-1):
-            if self.pca.explained_variance_ratio_[i]+self.pca.explained_variance_ratio_[i+1] < threshold:
-                if i == 0:
-                    raise Exception("Not reduced: explained variance is too low")
-                return i+1
-
-    def __check(self, df: pd.DataFrame):
-        """
-        Check that the dataframe contains all the columns
-
-        Parameters
-        ----------
-        df : pd.Dataframe
-            dataframe to be reduced
-
-        Returns
-        -------
-        bool
-        """
-        if not all(col in list(df.columns) for col in self.columns):
-            raise Exception('Missing columns')
-        return True
-
-    def transform(self, df: pd.DataFrame):
-        """
-        Transform the data
-
-        Parameters
-        ----------
-        df : pd.Dataframe
-            dataframe to be reduced
-
-        Returns
-        -------
-        None
-        """
-        self.__check(df)
-        if self.pca is None:
-            raise Exception("Error - object not fitted")
-        reduced = self.pca.transform(df[self.columns].values)
-        for col in range(self.k):
-            df[self.prefix+"_"+str(col)] = [line[col] for line in reduced]
-        df.drop(self.columns, axis=1, inplace=True)
-
-    def fit(self, df : pd.DataFrame, threshold=0.4):
-        """
-        Compute the PCA object
-
-        Parameters
-        ----------
-        df : pd.Dataframe
-            dataframe to be reduced
-
-        Returns
-        -------
-        None
-        """
-        self.__check(df)
-        if self.k is None:
-            self.k = self.__find_k(df,threshold)
-        self.pca = PCA_sklearn(n_components=self.k)
-        self.pca.fit(df[ self.columns ].values)
-
-    def fit_transform (self, df : pd.DataFrame, threshold=0.4):
-        """
-        Fit to data, then transform it.
-
-        Parameters
-        ----------
-        df : pd.Dataframe
-            dataframe to be reduced
-
-        Returns
-        -------
-        None
-        """
-        self.__check(df)
-        if self.k is None:
-            self.k = self.__find_k(df,threshold)
-        self.pca = PCA_sklearn(n_components=self.k)
-        self.pca.fit(df[ self.columns ].values)
-        self.transform(df)
-        self.report()
-
-    def report(self):
-        """
-        Prints the explained variance of each component
-
-        Parameters
-        ----------
-        None
-
-        Returns
-        -------
-        None
-        """
-        for col in range(self.k):
-            print("Explained variance ({col}): {ratio}".
-                  format(col = self.prefix+"_"+str(col),
-                         ratio = str(self.pca.explained_variance_ratio_[col])))
\ No newline at end of file
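A minimal usage sketch for the PCA wrapper above; the toy frame is illustrative. Note that transform mutates the dataframe in place.

import pandas as pd
from ml.analysis.pca import PCA

df = pd.DataFrame({'Age':  [22.0, 38.0, 26.0, 35.0],
                   'Fare': [7.25, 71.28, 7.92, 53.10]})
p = PCA(columns=['Age', 'Fare'], prefix='pca', k=1)
p.fit_transform(df)           # adds 'pca_0' and drops 'Age'/'Fare' in place
print(df.columns.tolist())    # -> ['pca_0']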
diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/analysis/.ipynb_checkpoints/vif-checkpoint.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/analysis/.ipynb_checkpoints/vif-checkpoint.py
deleted file mode 100644
index 79535f8..0000000
--- a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/analysis/.ipynb_checkpoints/vif-checkpoint.py
+++ /dev/null
@@ -1,48 +0,0 @@
-import pandas as pd
-from statsmodels.stats.outliers_influence import variance_inflation_factor
-
-class VIF:
-
-    @classmethod
-    def analyze(cls, df: pd.DataFrame, thresh=5.0, verbose=True):
-        """
-        Multicollinearity analysis
-
-        Parameters
-        ----------
-        df : pd.DataFrame
-            dataframe with the numeric features to be analyzed
-        thresh : float
-            VIF cutoff value above which a variable is dropped
-        verbose : bool
-            if True, prints the candidate variables for removal
-
-        Returns
-        -------
-        pd.DataFrame
-        """
-        variables = list(range(df.shape[1]))
-        dropped = True
-        while dropped:
-            dropped = False
-            vif = [variance_inflation_factor(df.iloc[:, variables].values, ix)
-                   for ix in range(df.iloc[:, variables].shape[1])]
-
-            maxloc = vif.index(max(vif))
-            if max(vif) > thresh:
-                m = max(vif)
-                index_max = [i for i, j in enumerate(vif) if j == m]
-                if verbose:
-                    cols_possibles_remove = [str(df.iloc[:, variables].columns[i]) for i in index_max]
-                    print("Columns that can be removed -> " + ", ".join(cols_possibles_remove))
-                    print("------")
-                    print('dropping \'' + str(df.iloc[:, variables].columns[maxloc]) +
-                          '\' at index: ' + str(maxloc))
-                    print("_____________________________________________________________")
-                del variables[maxloc]
-                dropped = True
-
-        print('Remaining variables:')
-        print(df.columns[variables])
-        return df.iloc[:, variables]
\ No newline at end of file
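A minimal usage sketch for the VIF analysis above; the toy frame (with a deliberately near-collinear column) is illustrative, and it assumes statsmodels is installed as the template's requirements expect.

import pandas as pd
from ml.analysis.vif import VIF

df = pd.DataFrame({'Age':    [22, 38, 26, 35, 54],
                   'Fare':   [7.2, 71.3, 7.9, 53.1, 51.9],
                   'Fare2x': [14.5, 142.0, 16.0, 106.0, 104.0]})  # near-collinear with Fare
df_reduced = VIF.analyze(df, thresh=5.0)  # iteratively drops the highest-VIF column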
diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/model/.ipynb_checkpoints/metrics-checkpoint.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/model/.ipynb_checkpoints/metrics-checkpoint.py
deleted file mode 100644
index 34cd079..0000000
--- a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/model/.ipynb_checkpoints/metrics-checkpoint.py
+++ /dev/null
@@ -1,212 +0,0 @@
-from sklearn.metrics import *
-import numpy as np
-from sklearn.metrics import make_scorer
-from sklearn.model_selection import cross_validate
-
-class Metrics:
-
-    @classmethod
-    def smape(cls, A, F):
-        """
-        Calculates the SMAPE value between the actual and the predicted values
-
-        Parameters
-        ----------
-        A : array
-            Target values
-        F : array
-            Predicted values
-
-        Returns
-        -------
-        float : SMAPE value
-        """
-        return 100/len(A) * np.sum(np.abs(F - A) / (np.abs(A) + np.abs(F)))
-
-    @classmethod
-    def __custom_score(cls, y_true, y_pred):
-        """
-        Creates a custom metric
-
-        Parameters
-        ----------
-        y_true : array
-            Target values
-        y_pred : array
-            Predicted values
-
-        Returns
-        -------
-        sklearn.metrics
-        """
-        #return sklearn.metrics.fbeta_score(y_true, y_pred, 2)
-        pass
-
-    @classmethod
-    def customized(cls, y_true, y_pred):
-        """
-        Creates a custom metric
-
-        Parameters
-        ----------
-        y_true : array
-            Target values
-        y_pred : array
-            Predicted values
-
-        Returns
-        -------
-        float
-        """
-        custom_metric = make_scorer(cls.__custom_score, greater_is_better=True)
-        return custom_metric
-
-    @classmethod
-    def mape(cls, y_true, y_pred):
-        """
-        Calculates the MAPE value between the actual and the predicted values
-
-        Parameters
-        ----------
-        y_true : array
-            Target values
-        y_pred : array
-            Predicted values
-
-        Returns
-        -------
-        float : MAPE value
-        """
-        y_true, y_pred = np.array(y_true), np.array(y_pred)
-        # the +1 shift avoids division by zero when y_true contains zeros
-        return np.mean(np.abs(((y_true+1) - (y_pred+1)) / (y_true+1))) * 100
-
-    @classmethod
-    def regression(cls, y_true, y_pred):
-        """
-        Calculates some metrics for regression problems
-
-        Parameters
-        ----------
-        y_true : array
-            Target values
-        y_pred : array
-            Predicted values
-
-        Returns
-        -------
-        dict : metrics results
-        """
-        results = {'mean_absolute_error': round(mean_absolute_error(y_true, y_pred), 7),
-                   'root_mean_squared_error': round(np.sqrt(mean_squared_error(y_true, y_pred)), 7),
-                   'r2': round(r2_score(y_true, y_pred), 7),
-                   'smape': round(cls.smape(y_true, y_pred), 7),
-                   'mape': round(cls.mape(y_true, y_pred), 7)
-                   }
-        return results
-
-    @classmethod
-    def crossvalidation(cls, model, X, y, classification: bool, cv=5, agg=np.mean):
-        if classification:
-            if len(set(y)) > 2:
-                metrics = ['accuracy','f1_weighted', 'recall_weighted','precision_weighted']
-            else:
-                metrics = ['accuracy','f1', 'recall','precision', 'roc_auc']
-        else:
-            # valid scikit-learn scorer names for regression (losses are negated)
-            metrics = ['neg_mean_absolute_error', 'r2', 'neg_root_mean_squared_error']
-        res_metrics = cross_validate(model, X, y, cv=cv, return_train_score=False, scoring=metrics)
-        results = {metric.replace("test_", ""): round(agg(res_metrics[metric]),7) for metric in res_metrics}
-        return results
-
-    @classmethod
-    def __multiclass_classification(cls, y_true, y_pred):
-        """
-        Calculates some metrics for multiclass classification problems
-
-        Parameters
-        ----------
-        y_true : array
-            Target values
-        y_pred : array
-            Predicted values
-
-        Returns
-        -------
-        dict : metrics results
-        """
-        results = {'accuracy': accuracy_score(y_true, y_pred),
-                   'f1': f1_score(y_true, y_pred, average='weighted'),
-                   'precision': precision_score(y_true, y_pred, average='weighted'),
-                   'recall': recall_score(y_true, y_pred, average='weighted'),
-                   }
-        return results
-
-    @classmethod
-    def __binary_classification(cls, y_true, y_pred, y_probs):
-        """
-        Calculates some metrics for binary classification problems
-
-        Parameters
-        ----------
-        y_true : array
-            Target values
-        y_pred : array
-            Predicted values
-        y_probs : array
-            Probabilities of the positive class
-
-        Returns
-        -------
-        dict : metrics results
-        """
-        results = {'accuracy': accuracy_score(y_true, y_pred),
-                   'f1': f1_score(y_true, y_pred),
-                   'precision': precision_score(y_true, y_pred),
-                   'recall': recall_score(y_true, y_pred),
-                   'roc_auc': roc_auc_score(y_true, y_probs)
-                   }
-        return results
-
-    @classmethod
-    def classification(cls, y_true, y_pred, y_probs):
-        """
-        Checks which classification method will be applied: binary or multiclass
-
-        Parameters
-        ----------
-        y_true : array
-            Target values
-        y_pred : array
-            Predicted values
-        y_probs : array
-            Probabilities values
-
-        Returns
-        -------
-        dict : metrics results
-        """
-        if len(set(y_true)) > 2:
-            results = cls.__multiclass_classification(y_true, y_pred)
-        else:
-            results = cls.__binary_classification(y_true, y_pred, y_probs)
-        return results
-
-    @classmethod
-    def clusterization(cls, X, labels):
-        """
-        Calculates some metrics on clustering quality
-
-        Parameters
-        ----------
-        X : array[array], shape (n_rows, n_columns)
-            Matrix with the values that were used in the clustering
-        labels : array, shape (n_rows, 1)
-            Vector with the labels assigned by the clustering method (e.g. KMeans)
-
-        Returns
-        -------
-        dict : metrics results
-        """
-        results = {'silhouette': silhouette_score(X, labels, metric='euclidean'),
-                   'calinski_harabasz': calinski_harabasz_score(X, labels)
-                   }
-        return results
\ No newline at end of file
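A minimal usage sketch for the Metrics helpers above; the arrays are illustrative.

from ml.model.metrics import Metrics

y_true  = [0, 1, 1, 0, 1]
y_pred  = [0, 1, 0, 0, 1]
y_probs = [0.2, 0.9, 0.4, 0.1, 0.8]

# Binary case: accuracy, f1, precision, recall and roc_auc
print(Metrics.classification(y_true, y_pred, y_probs))
# -> approximately {'accuracy': 0.8, 'f1': 0.8, 'precision': 1.0, 'recall': 0.667, 'roc_auc': 1.0}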
diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/model/.ipynb_checkpoints/wrapper-checkpoint.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/model/.ipynb_checkpoints/wrapper-checkpoint.py
deleted file mode 100644
index 8f812cf..0000000
--- a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/model/.ipynb_checkpoints/wrapper-checkpoint.py
+++ /dev/null
@@ -1,252 +0,0 @@
-from joblib import dump, load
-from datetime import date
-import mlflow.pyfunc
-from mlflow import pyfunc
-from interpret.ext.blackbox import TabularExplainer, MimicExplainer
-from interpret.ext.glassbox import *
-import pandas as pd
-
-from util import load_yaml, load_json
-
-
-class Wrapper(mlflow.pyfunc.PythonModel):
-    def __init__(self, model=None, metrics=None, columns=None):
-        """
-        Constructor
-
-        Parameters
-        ----------
-        model : object
-            If it's just a model: enter all parameters;
-            if it is more than one model: do not enter
parameters and use
-            the add method to add each of the models
-        metrics : dict
-            Dictionary with the metrics of the result of the model
-        columns : list
-            list with columns names
-
-        Returns
-        -------
-        WrapperModel
-        """
-        self.artifacts = dict()
-        self.artifacts["model"] = model
-        self.artifacts["metrics"] = metrics
-        self.artifacts["columns"] = columns
-        self.artifacts["creation_date"] = date.today()
-
-    def predict(self, model_input, included_input=False):
-        """
-        Method that returns the result of the prediction on a dataset
-
-        Parameters
-        ----------
-        model_input : pd.DataFrame
-            Data to be predicted
-
-        Returns
-        -------
-        list
-        """
-        df_processed = model_input.copy()
-        model = self.artifacts["model"]
-        columns = self.artifacts["columns"]
-        result = model.predict(df_processed[columns])
-        if included_input:
-            model_input['predict'] = result
-            result = model_input
-        return result
-
-    def predict_proba(self, model_input, binary=False):
-        """
-        Method that returns the probabilities of the prediction on a dataset
-
-        Parameters
-        ----------
-        model_input : pd.DataFrame
-            data to be predicted
-        binary : bool
-            if True, returns only the probability of the positive class
-
-        Returns
-        -------
-        list
-        """
-        df_processed = model_input.copy()
-        model = self.artifacts["model"]
-        columns = self.artifacts["columns"]
-        if binary:
-            return model.predict_proba(df_processed[columns])[:, 1]
-        else:
-            return model.predict_proba(df_processed[columns])
-
-    def save_model(self, path):
-        """
-        Saves the model object to a specific path
-
-        Parameters
-        ----------
-        path : str
-            path where the model object will be saved
-
-        Returns
-        -------
-        None
-        """
-        dump(self, path)
-
-    @staticmethod
-    def load_model(path):
-        """
-        Loads the model object from a specific path
-
-        Parameters
-        ----------
-        path : str
-            path from where the model object will be loaded.
-
-        Returns
-        -------
-        Wrapper
-        """
-        model = load(path)
-        return model
-
-    def save(self, path):
-        """
-        Save the model as a Wrapper class
-
-        Parameters
-        ----------
-        path : str
-            path where the model object will be saved.
-
-        Returns
-        -------
-        None
-        """
-        path_artifacts = path + "_artifacts.pkl"
-        dump(self.artifacts, path_artifacts)
-        content = load_json("config/arquivos.json")
-        conda_env = load_yaml(content["path_yaml"])
-        mlflow.pyfunc.save_model(
-            path=path,
-            python_model=self,
-            artifacts={"model": path_artifacts},
-            conda_env=conda_env,
-        )
-
-    def get_metrics(self):
-        """
-        Return the metrics
-
-        Parameters
-        ----------
-        self : object Wrapper
-
-        Returns
-        -------
-        dict
-        """
-        return self.artifacts["metrics"]
-
-    def get_columns(self):
-        """
-        Return the columns
-
-        Parameters
-        ----------
-        self : object Wrapper
-
-        Returns
-        -------
-        list
-        """
-        return self.artifacts["columns"]
-
-    def get_model(self):
-        """
-        Return the model
-
-        Parameters
-        ----------
-        self : object Wrapper
-
-        Returns
-        -------
-        object
-        """
-        return self.artifacts["model"]
-
-    def train_interpret(self, X, model="tabular"):
-        """
-        Train an interpretability (explainer) model
-
-        Parameters
-        ----------
-        self : object Wrapper
-        X : pd.DataFrame
-            Data that were used in the training, for interpretation
-        model : string, optional
-            Explainer to use: one of [tabular, mimic_LGBME,
-            mimic_Linear, mimic_SGDE, mimic_Dec_Tree]
-
-        Returns
-        -------
-        None
-        """
-        mimic_models = {
-            "mimic_LGBME": LGBMExplainableModel,
-            "mimic_Linear": LinearExplainableModel,
-            "mimic_SGDE": SGDExplainableModel,
-            "mimic_Dec_Tree": DecisionTreeExplainableModel,
-        }
-        if model == "tabular":
-            explainer = TabularExplainer(
-                self.artifacts["model"], X, features=self.artifacts["columns"]
-            )
-        else:
-            explainer = MimicExplainer(
-                self.artifacts["model"],
-                X,
-                mimic_models[model],
-                augment_data=True,
-                max_num_of_augmentations=10,
-                features=self.artifacts["columns"],
-            )
-        self.artifacts["explainer"] = explainer
-
-    def local_interpret(self, X, n_feat=3, norm=True):
-        """
-        Return local explanations for each row of the data
-
-        Parameters
-        ----------
-        self : object Wrapper
-        X : array[array], shape (n_rows, n_columns)
-            Matrix with the data for which to return explanations
-        n_feat : int, optional
-            Number of features to return
-        norm : bool, optional
-            if True, min-max normalize the feature importances
-
-        Returns
-        -------
-        pd.DataFrame
-        """
-        local_explanation = self.artifacts["explainer"].explain_local(X)
-        n_obs = X.shape[0]
-        predictions = self.artifacts["model"].predict(X)
-        local_values = local_explanation.get_ranked_local_values()
-        local_values = [local_values[predictions[i]][i] for i in range(n_obs)]
-        local_names = local_explanation.get_ranked_local_names()
-        local_names = [local_names[predictions[i]][i] for i in range(n_obs)]
-        if norm:
-            local_values = [
-                [(i - min(l)) / (max(l) - min(l)) for i in l] for l in local_values
-            ]
-        result = [
-            (local_names[i][:n_feat] + local_values[i][:n_feat]) for i in range(n_obs)
-        ]
-        column_names = [
-            f"Importance_{item}_{str(i)}"
-            for item in ["Name", "Value"]
-            for i in range(n_feat)
-        ]
-        return pd.DataFrame(result, columns=column_names)
\ No newline at end of file
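A minimal end-to-end sketch for the Wrapper above; the toy data and the /tmp path are illustrative, and it assumes the template's requirements (including mlflow and the interpret packages imported by the module) are installed.

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from ml.model.wrapper import Wrapper

X = pd.DataFrame({'Age': [22, 38, 26, 35], 'Pclass': [3, 1, 3, 1]})
y = [0, 1, 1, 1]

clf = RandomForestClassifier(random_state=0).fit(X, y)
model = Wrapper(clf, metrics={'accuracy': 1.0}, columns=list(X.columns))
model.save_model('/tmp/model.pkl')             # joblib dump of the whole wrapper
loaded = Wrapper.load_model('/tmp/model.pkl')
print(loaded.predict(X))                       # predictions on the wrapped columns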
"source": [ - "# Sagemaker Inference" - ] - }, - { - "cell_type": "markdown", - "id": "aee7320a", - "metadata": {}, - "source": [ - "This script predicts new data with the uploaded image in ECR." - ] - }, - { - "cell_type": "markdown", - "id": "ea32612e", - "metadata": {}, - "source": [ - "## Import modules" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "3f188c9f", - "metadata": {}, - "outputs": [], - "source": [ - "import time\n", - "import boto3\n", - "import sagemaker\n", - "from sagemaker import get_execution_role" - ] - }, - { - "cell_type": "markdown", - "id": "430e1eb4", - "metadata": {}, - "source": [ - "## Setup" - ] - }, - { - "cell_type": "markdown", - "id": "ebe50488", - "metadata": {}, - "source": [ - "Modify according to your configurations." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "8893b148", - "metadata": {}, - "outputs": [], - "source": [ - "# Bucket name in S3\n", - "bucket = \"hermione-sagemaker\"" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "a6ba2451", - "metadata": {}, - "outputs": [], - "source": [ - "# Set session\n", - "region_name=\"us-east-1\"\n", - "boto3.setup_default_session(region_name=region_name)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "797c5fa6", - "metadata": {}, - "outputs": [], - "source": [ - "# Get user role\n", - "role = get_execution_role()" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "d8148140", - "metadata": {}, - "outputs": [], - "source": [ - "# Get AWS Account ID\n", - "account_number = boto3.client(\"sts\").get_caller_identity()[\"Account\"]" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "1b1fba48", - "metadata": {}, - "outputs": [], - "source": [ - "# Image previous uploaded in ECR\n", - "image_name = \"hermione-inference\"\n", - "image_uri = f\"{account_number}.dkr.ecr.{region_name}.amazonaws.com/{image_name}\"" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "f907e610", - "metadata": {}, - "outputs": [], - "source": [ - "# Input and output paths to execute inference\n", - "paths = {\n", - " 'inference_processed': f\"s3://{bucket}/PREPROCESSING/INFERENCE_PROCESSED/inference.csv\",\n", - " 'model': f\"s3://{bucket}/PREPROCESSING/MODEL/Hermione-train-2021-05-26-12-41-29-505/output/model.tar.gz\",\n", - " 'output_path': f\"s3://{bucket}/PREPROCESSING/OUTPUT\"\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "f5fdfdd8", - "metadata": {}, - "outputs": [], - "source": [ - "# instance to run the code\n", - "instance_type=\"ml.m5.large\"" - ] - }, - { - "cell_type": "markdown", - "id": "55fe64d7", - "metadata": {}, - "source": [ - "## Inference" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "60b7dc56", - "metadata": {}, - "outputs": [], - "source": [ - "# Receives the processed inference data in S3\n", - "input_path = paths['inference_processed']" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "e3dc913c", - "metadata": {}, - "outputs": [], - "source": [ - "# Receives the model created during the training in S3\n", - "model_path = paths['model']" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "5b69f31c", - "metadata": {}, - "outputs": [], - "source": [ - "# Saves the prediction in S3\n", - "output_path = paths['output_path']" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "29f7ce88", - "metadata": {}, - "outputs": [], - "source": [ - "# Creates the model to access the 
ECR image\n", - "model = sagemaker.model.Model(\n", - " image_uri= image_uri,\n", - " model_data=model_path,\n", - " role=role)" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "aacdf22a", - "metadata": {}, - "outputs": [], - "source": [ - "# Creates a transformer object from the trained model\n", - "transformer = model.transformer(\n", - " instance_count=1,\n", - " instance_type=instance_type, \n", - " output_path=output_path,\n", - " accept = 'text/csv')" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "6452e276", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - ".........................\u001b[34m2021-05-26 12:57:00,312 [INFO ] main com.amazonaws.ml.mms.ModelServer - \u001b[0m\n", - "\u001b[34mMMS Home: /usr/local/lib/python3.8/dist-packages\u001b[0m\n", - "\u001b[34mCurrent directory: /\u001b[0m\n", - "\u001b[34mTemp directory: /tmp\u001b[0m\n", - "\u001b[34mNumber of GPUs: 0\u001b[0m\n", - "\u001b[34mNumber of CPUs: 2\u001b[0m\n", - "\u001b[34mMax heap size: 857 M\u001b[0m\n", - "\u001b[34mPython executable: /usr/bin/python3\u001b[0m\n", - "\u001b[34mConfig file: /etc/sagemaker-mms.properties\u001b[0m\n", - "\u001b[34mInference address: http://0.0.0.0:8080\u001b[0m\n", - "\u001b[34mManagement address: http://0.0.0.0:8080\u001b[0m\n", - "\u001b[34mModel Store: /.sagemaker/mms/models\u001b[0m\n", - "\u001b[34mInitial Models: ALL\u001b[0m\n", - "\u001b[34mLog dir: /logs\u001b[0m\n", - "\u001b[34mMetrics dir: /logs\u001b[0m\n", - "\u001b[34mNetty threads: 0\u001b[0m\n", - "\u001b[34mNetty client threads: 0\u001b[0m\n", - "\u001b[34mDefault workers per model: 2\u001b[0m\n", - "\u001b[34mBlacklist Regex: N/A\u001b[0m\n", - "\u001b[34mMaximum Response Size: 6553500\u001b[0m\n", - "\u001b[34mMaximum Request Size: 6553500\u001b[0m\n", - "\u001b[34mPreload model: false\u001b[0m\n", - "\u001b[34mPrefer direct buffer: false\u001b[0m\n", - "\u001b[34m2021-05-26 12:57:00,419 [WARN ] W-9000-model com.amazonaws.ml.mms.wlm.WorkerLifeCycle - attachIOStreams() threadName=W-9000-model\u001b[0m\n", - "\u001b[34m2021-05-26 12:57:00,506 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - model_service_worker started with args: --sock-type unix --sock-name /tmp/.mms.sock.9000 --handler serving.handler --model-path /.sagemaker/mms/models/model --model-name model --preload-model false --tmp-dir /tmp\u001b[0m\n", - "\u001b[34m2021-05-26 12:57:00,508 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Listening on port: /tmp/.mms.sock.9000\u001b[0m\n", - "\u001b[34m2021-05-26 12:57:00,509 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - [PID] 23\u001b[0m\n", - "\u001b[34m2021-05-26 12:57:00,509 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - MMS worker started.\u001b[0m\n", - "\u001b[34m2021-05-26 12:57:00,509 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Python runtime: 3.8.5\u001b[0m\n", - "\u001b[34m2021-05-26 12:57:00,512 [INFO ] main com.amazonaws.ml.mms.wlm.ModelManager - Model model loaded.\u001b[0m\n", - "\u001b[34m2021-05-26 12:57:00,517 [INFO ] main com.amazonaws.ml.mms.ModelServer - Initialize Inference server with: EpollServerSocketChannel.\u001b[0m\n", - "\u001b[34m2021-05-26 12:57:00,536 [INFO ] W-9000-model com.amazonaws.ml.mms.wlm.WorkerThread - Connecting to: /tmp/.mms.sock.9000\u001b[0m\n", - "\u001b[34m2021-05-26 12:57:00,536 [INFO ] W-9000-model com.amazonaws.ml.mms.wlm.WorkerThread - Connecting 
to: /tmp/.mms.sock.9000\u001b[0m\n", - "\u001b[34m2021-05-26 12:57:00,607 [INFO ] main com.amazonaws.ml.mms.ModelServer - Inference API bind to: http://0.0.0.0:8080\u001b[0m\n", - "\u001b[34m2021-05-26 12:57:00,613 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Connection accepted: /tmp/.mms.sock.9000.\u001b[0m\n", - "\u001b[34m2021-05-26 12:57:00,614 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Connection accepted: /tmp/.mms.sock.9000.\u001b[0m\n", - "\u001b[34mModel server started.\u001b[0m\n", - "\u001b[34m2021-05-26 12:57:00,636 [WARN ] pool-2-thread-1 com.amazonaws.ml.mms.metrics.MetricCollector - worker pid is not available yet.\u001b[0m\n", - "\u001b[34m2021-05-26 12:57:02,508 [WARN ] W-9000-model-stderr com.amazonaws.ml.mms.wlm.WorkerLifeCycle - /usr/local/lib/python3.8/dist-packages/interpret_community/common/gpu_kmeans.py:30: UserWarning: cuML is required to use GPU explainers. Check https://rapids.ai/start.html for more information on how to install it.\u001b[0m\n", - "\u001b[34m2021-05-26 12:57:02,510 [WARN ] W-9000-model-stderr com.amazonaws.ml.mms.wlm.WorkerLifeCycle - warnings.warn(\u001b[0m\n", - "\u001b[34m2021-05-26 12:57:02,510 [WARN ] W-9000-model-stderr com.amazonaws.ml.mms.wlm.WorkerLifeCycle - /usr/local/lib/python3.8/dist-packages/interpret_community/common/gpu_kmeans.py:30: UserWarning: cuML is required to use GPU explainers. Check https://rapids.ai/start.html for more information on how to install it.\u001b[0m\n", - "\u001b[34m2021-05-26 12:57:02,510 [WARN ] W-9000-model-stderr com.amazonaws.ml.mms.wlm.WorkerLifeCycle - warnings.warn(\u001b[0m\n", - "\u001b[34m2021-05-26 12:57:03,375 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - generated new fontManager\u001b[0m\n", - "\u001b[34m2021-05-26 12:57:03,393 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - generated new fontManager\u001b[0m\n", - "\u001b[34m2021-05-26 12:57:03,635 [WARN ] W-9000-model-stderr com.amazonaws.ml.mms.wlm.WorkerLifeCycle - cuML is required to use GPU explainers. Check https://rapids.ai/start.html for more information on how to install it.\u001b[0m\n", - "\u001b[34m2021-05-26 12:57:03,658 [WARN ] W-9000-model-stderr com.amazonaws.ml.mms.wlm.WorkerLifeCycle - cuML is required to use GPU explainers. 
Check https://rapids.ai/start.html for more information on how to install it.\u001b[0m\n", - "\u001b[34m2021-05-26 12:57:03,690 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Loading the model\u001b[0m\n", - "\u001b[34m2021-05-26 12:57:03,715 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Loading the model\u001b[0m\n", - "\u001b[34m2021-05-26 12:57:03,741 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Model model loaded io_fd=0242a9fffefeff83-00000009-00000002-e6c9db643cbfeb7b-a47635f7\u001b[0m\n", - "\u001b[34m2021-05-26 12:57:03,750 [INFO ] W-9000-model com.amazonaws.ml.mms.wlm.WorkerThread - Backend response time: 3046\u001b[0m\n", - "\u001b[34m2021-05-26 12:57:03,752 [WARN ] W-9000-model com.amazonaws.ml.mms.wlm.WorkerLifeCycle - attachIOStreams() threadName=W-model-1\u001b[0m\n", - "\u001b[34m2021-05-26 12:57:03,768 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Model model loaded io_fd=0242a9fffefeff83-00000009-00000001-f549db643cbfeb7b-e2a66100\u001b[0m\n", - "\u001b[34m2021-05-26 12:57:03,768 [INFO ] W-9000-model com.amazonaws.ml.mms.wlm.WorkerThread - Backend response time: 3065\u001b[0m\n", - "\u001b[34m2021-05-26 12:57:03,769 [WARN ] W-9000-model com.amazonaws.ml.mms.wlm.WorkerLifeCycle - attachIOStreams() threadName=W-model-2\u001b[0m\n", - "\u001b[34m2021-05-26 12:57:09,272 [INFO ] pool-1-thread-4 ACCESS_LOG - /169.254.255.130:59054 \"GET /ping HTTP/1.1\" 200 11\u001b[0m\n", - "\u001b[35m2021-05-26 12:57:09,272 [INFO ] pool-1-thread-4 ACCESS_LOG - /169.254.255.130:59054 \"GET /ping HTTP/1.1\" 200 11\u001b[0m\n", - "\u001b[34m2021-05-26 12:57:09,353 [INFO ] epollEventLoopGroup-3-2 ACCESS_LOG - /169.254.255.130:59058 \"GET /execution-parameters HTTP/1.1\" 404 2\u001b[0m\n", - "\u001b[34m2021-05-26 12:57:09,462 [INFO ] W-model-1-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Predicting...\u001b[0m\n", - "\u001b[34m2021-05-26 12:57:09,486 [INFO ] W-model-1-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Prediction Complete\u001b[0m\n", - "\u001b[34m2021-05-26 12:57:09,491 [INFO ] W-model-1-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Saving\u001b[0m\n", - "\u001b[34m2021-05-26 12:57:09,494 [INFO ] W-9000-model com.amazonaws.ml.mms.wlm.WorkerThread - Backend response time: 37\u001b[0m\n", - "\u001b[34m2021-05-26 12:57:09,494 [INFO ] W-9000-model ACCESS_LOG - /169.254.255.130:59068 \"POST /invocations HTTP/1.1\" 200 42\u001b[0m\n", - "\u001b[35m2021-05-26 12:57:09,353 [INFO ] epollEventLoopGroup-3-2 ACCESS_LOG - /169.254.255.130:59058 \"GET /execution-parameters HTTP/1.1\" 404 2\u001b[0m\n", - "\u001b[35m2021-05-26 12:57:09,462 [INFO ] W-model-1-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Predicting...\u001b[0m\n", - "\u001b[35m2021-05-26 12:57:09,486 [INFO ] W-model-1-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Prediction Complete\u001b[0m\n", - "\u001b[35m2021-05-26 12:57:09,491 [INFO ] W-model-1-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Saving\u001b[0m\n", - "\u001b[35m2021-05-26 12:57:09,494 [INFO ] W-9000-model com.amazonaws.ml.mms.wlm.WorkerThread - Backend response time: 37\u001b[0m\n", - "\u001b[35m2021-05-26 12:57:09,494 [INFO ] W-9000-model ACCESS_LOG - /169.254.255.130:59068 \"POST /invocations HTTP/1.1\" 200 42\u001b[0m\n", - "\u001b[32m2021-05-26T12:57:09.364:[sagemaker logs]: MaxConcurrentTransforms=1, MaxPayloadInMB=6, BatchStrategy=MULTI_RECORD\u001b[0m\n", - "\n", - "CPU times: user 547 ms, sys: 59 ms, total: 606 ms\n", - "Wall time: 4min 
43s\n" - ] - } - ], - "source": [ - "%%time\n", - "# Predicts the data\n", - "transformer.transform(data=input_path, data_type='S3Prefix', content_type='text/csv', split_type='Line')" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "conda_python3", - "language": "python", - "name": "conda_python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.13" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/.ipynb_checkpoints/Sagemaker_Processor-checkpoint.ipynb b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/.ipynb_checkpoints/Sagemaker_Processor-checkpoint.ipynb deleted file mode 100644 index ad85e0f..0000000 --- a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/.ipynb_checkpoints/Sagemaker_Processor-checkpoint.ipynb +++ /dev/null @@ -1,396 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "b5264128", - "metadata": {}, - "source": [ - "# Sagemaker Processor" - ] - }, - { - "cell_type": "markdown", - "id": "5bd7a5cd", - "metadata": {}, - "source": [ - "This script generates the train, val and inference files with the processor previous uploaded in ECR." - ] - }, - { - "cell_type": "markdown", - "id": "0488ed05", - "metadata": {}, - "source": [ - "## Import modules" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "e7b20785", - "metadata": {}, - "outputs": [], - "source": [ - "import boto3\n", - "import time\n", - "from datetime import datetime\n", - "from sagemaker import get_execution_role\n", - "from sagemaker.processing import Processor, ProcessingInput, ProcessingOutput" - ] - }, - { - "cell_type": "markdown", - "id": "7f3fd305", - "metadata": {}, - "source": [ - "## Setup" - ] - }, - { - "cell_type": "markdown", - "id": "6528a20b", - "metadata": {}, - "source": [ - "Modify according to your configurations." 
- ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "d5cdd5d1", - "metadata": {}, - "outputs": [], - "source": [ - "# Bucket name in S3\n", - "bucket = \"hermione-sagemaker\"" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "5ec68bf7", - "metadata": {}, - "outputs": [], - "source": [ - "# Set session\n", - "region_name=\"us-east-1\"\n", - "boto3.setup_default_session(region_name=region_name)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "4d011a47", - "metadata": {}, - "outputs": [], - "source": [ - "# Get user role\n", - "role = get_execution_role()" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "25f76666", - "metadata": {}, - "outputs": [], - "source": [ - "# Get AWS Account ID\n", - "account_number = boto3.client(\"sts\").get_caller_identity()[\"Account\"]" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "fafb5f18", - "metadata": {}, - "outputs": [], - "source": [ - "# Image previous uploaded in ECR\n", - "image_name = \"hermione-processor\"\n", - "image_uri = f\"{account_number}.dkr.ecr.{region_name}.amazonaws.com/{image_name}\"" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "2ef594d3", - "metadata": {}, - "outputs": [], - "source": [ - "# Input and output paths to execute train and inference\n", - "paths = {\n", - " 'train_raw': f\"s3://{bucket}/TRAIN_RAW\",\n", - " 'expectations': f\"s3://{bucket}/PREPROCESSING/EXPECTATIONS\",\n", - " 'preprocessing': f\"s3://{bucket}/PREPROCESSING/PREPROCESSING\",\n", - " 'train_processed': f\"s3://{bucket}/PREPROCESSING/TRAIN_PROCESSED\",\n", - " 'val_processed': f\"s3://{bucket}/PREPROCESSING/VAL_PROCESSED\",\n", - " 'test_raw': f\"s3://{bucket}/TEST_RAW\",\n", - " 'inference_processed': f\"s3://{bucket}/PREPROCESSING/INFERENCE_PROCESSED\",\n", - " 'validations': f\"s3://{bucket}/PREPROCESSING/VALIDATIONS\"\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "2b625b74", - "metadata": {}, - "outputs": [], - "source": [ - "# instance to run the code\n", - "instance_type_train=\"ml.t3.medium\"\n", - "instance_type_inference=\"ml.t3.medium\"" - ] - }, - { - "cell_type": "markdown", - "id": "6e8e92ba", - "metadata": {}, - "source": [ - "## Processor - Train" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "e1b41ed1", - "metadata": {}, - "outputs": [], - "source": [ - "# Receives a raw data in S3\n", - "inputs=[\n", - " ProcessingInput(source=paths['train_raw'], \n", - " destination='/opt/ml/processing/input/raw_data', \n", - " input_name=\"raw_data\")\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "cd67446b", - "metadata": {}, - "outputs": [], - "source": [ - "# Returns the great expectation object, preprocessing object, \n", - "# processed training data and processed validation data, and saves them in S3\n", - "outputs = [\n", - " ProcessingOutput(\n", - " source=\"/opt/ml/processing/output/expectations\",\n", - " destination=paths['expectations'],\n", - " output_name=\"expectations\",\n", - " ),\n", - " ProcessingOutput(\n", - " source=\"/opt/ml/processing/output/preprocessing\",\n", - " destination=paths['preprocessing'],\n", - " output_name=\"preprocessing\",\n", - " ),\n", - " ProcessingOutput(\n", - " source=\"/opt/ml/processing/output/processed/train\",\n", - " destination=paths['train_processed'],\n", - " output_name=\"train_data\",\n", - " ),\n", - " ProcessingOutput(\n", - " source=\"/opt/ml/processing/output/processed/val\",\n", - " 
destination=paths['val_processed'],\n", - " output_name=\"val_data\",\n", - " )\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "902f8e4f", - "metadata": {}, - "outputs": [], - "source": [ - "# Creates the processor to access the ECR image\n", - "processor = Processor(image_uri=image_uri,\n", - " role=role,\n", - " instance_count=1,\n", - " instance_type=instance_type_train)" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "fd8a28a1", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Job Name: hermione-processor-2021-05-25-21-03-59-873\n", - "Inputs: [{'InputName': 'raw_data', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://hermione-sagemaker/TRAIN_RAW', 'LocalPath': '/opt/ml/processing/input/raw_data', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]\n", - "Outputs: [{'OutputName': 'expectations', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://hermione-sagemaker/PREPROCESSING/EXPECTATIONS', 'LocalPath': '/opt/ml/processing/output/expectations', 'S3UploadMode': 'EndOfJob'}}, {'OutputName': 'preprocessing', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://hermione-sagemaker/PREPROCESSING/PREPROCESSING', 'LocalPath': '/opt/ml/processing/output/preprocessing', 'S3UploadMode': 'EndOfJob'}}, {'OutputName': 'train_data', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://hermione-sagemaker/PREPROCESSING/TRAIN_PROCESSED', 'LocalPath': '/opt/ml/processing/output/processed/train', 'S3UploadMode': 'EndOfJob'}}, {'OutputName': 'val_data', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://hermione-sagemaker/PREPROCESSING/VAL_PROCESSED', 'LocalPath': '/opt/ml/processing/output/processed/val', 'S3UploadMode': 'EndOfJob'}}]\n", - "......................................................\n", - "\u001b[34mINFO:root:Starting the preprocessing\u001b[0m\n", - "\u001b[34mINFO:root:step_train: True\u001b[0m\n", - "\u001b[34mINFO:root:Reading the inputs\u001b[0m\n", - "\u001b[34mINFO:root:Reading file: /opt/ml/processing/input/raw_data/raw_train.csv\u001b[0m\n", - "\u001b[34mINFO:root:Data Quality\u001b[0m\n", - "\u001b[34mINFO:great_expectations.data_asset.data_asset:#01110 expectation(s) included in expectation_suite. Omitting 1 expectation(s) that failed when last run; set discard_failed_expectations=False to include them. result_format settings filtered.\u001b[0m\n", - "\u001b[34mINFO:root:Preprocessing\u001b[0m\n", - "\u001b[34mINFO:root:Cleaning data\u001b[0m\n", - "\u001b[34mINFO:root:One hot encoding\u001b[0m\n", - "\u001b[34mWARNING:py.warnings:/usr/local/lib/python3.8/dist-packages/category_encoders/utils.py:21: FutureWarning: is_categorical is deprecated and will be removed in a future version. 
Use is_categorical_dtype instead\n", - " elif pd.api.types.is_categorical(cols):\n", - "\u001b[0m\n", - "\u001b[34mINFO:root:Divide train and test\u001b[0m\n", - "\u001b[34mINFO:root:Normalizing\u001b[0m\n", - "\u001b[34mWARNING:py.warnings:/usr/local/lib/python3.8/dist-packages/pandas/core/indexing.py:1738: SettingWithCopyWarning: \u001b[0m\n", - "\u001b[34mA value is trying to be set on a copy of a slice from a DataFrame.\u001b[0m\n", - "\u001b[34mTry using .loc[row_indexer,col_indexer] = value instead\n", - "\u001b[0m\n", - "\u001b[34mSee the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", - " self._setitem_single_column(loc, value[:, i].tolist(), pi)\n", - "\u001b[0m\n", - "\u001b[34mINFO:root:Normalizing\u001b[0m\n", - "\u001b[34mINFO:root:shape train (393, 7) val (99, 7)\u001b[0m\n", - "\u001b[34mINFO:root:Saving\u001b[0m\n", - "CPU times: user 1.02 s, sys: 104 ms, total: 1.13 s\n", - "Wall time: 9min 14s\n" - ] - } - ], - "source": [ - "%%time\n", - "# Runs the processor to access the ECR image and process the training data\n", - "processor.run(inputs=inputs,\n", - " outputs= outputs,\n", - " arguments=[\"--step\", \"train\"] \n", - " )" - ] - }, - { - "cell_type": "markdown", - "id": "a0b0636e", - "metadata": {}, - "source": [ - "## Processor - Inference" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "4e1df020", - "metadata": {}, - "outputs": [], - "source": [ - "# Receives a raw data in S3, the preprocessing and great expectation objects created in the training\n", - "inputs=[\n", - " ProcessingInput(source=paths['test_raw'],\n", - " destination='/opt/ml/processing/input/raw_data', \n", - " input_name='raw_data'),\n", - " ProcessingInput(source=paths['preprocessing'], \n", - " destination='/opt/ml/processing/input/preprocessing', \n", - " input_name='preprocessing'),\n", - " ProcessingInput(source=paths['expectations'], \n", - " destination='/opt/ml/processing/input/expectations', \n", - " input_name='expectations')\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "4fa3439a", - "metadata": {}, - "outputs": [], - "source": [ - "# Returns the processed inference data and validations, and saves them in S3\n", - "outputs = [\n", - " ProcessingOutput(\n", - " source=\"/opt/ml/processing/output/processed/inference\",\n", - " destination=paths['inference_processed'],\n", - " output_name=\"inference_data\",\n", - " ),\n", - " ProcessingOutput(\n", - " source=\"/opt/ml/processing/output/validations\",\n", - " destination=paths['validations'],\n", - " output_name=\"validations\",\n", - " )\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "c399b969", - "metadata": {}, - "outputs": [], - "source": [ - "# Creates the processor to access the ECR image\n", - "processor = Processor(image_uri=image_uri,\n", - " role=role,\n", - " instance_count=1,\n", - " instance_type=instance_type_inference)" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "8cb61e97", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Job Name: hermione-processor-2021-05-25-21-13-13-987\n", - "Inputs: [{'InputName': 'raw_data', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://hermione-sagemaker/TEST_RAW', 'LocalPath': '/opt/ml/processing/input/raw_data', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, 
{'InputName': 'preprocessing', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://hermione-sagemaker/PREPROCESSING/PREPROCESSING', 'LocalPath': '/opt/ml/processing/input/preprocessing', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'expectations', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://hermione-sagemaker/PREPROCESSING/EXPECTATIONS', 'LocalPath': '/opt/ml/processing/input/expectations', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]\n", - "Outputs: [{'OutputName': 'inference_data', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://hermione-sagemaker/PREPROCESSING/INFERENCE_PROCESSED', 'LocalPath': '/opt/ml/processing/output/processed/inference', 'S3UploadMode': 'EndOfJob'}}, {'OutputName': 'validations', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://hermione-sagemaker/PREPROCESSING/VALIDATIONS', 'LocalPath': '/opt/ml/processing/output/validations', 'S3UploadMode': 'EndOfJob'}}]\n", - "............................................................\n", - "\u001b[34mINFO:root:Starting the preprocessing\u001b[0m\n", - "\u001b[34mINFO:root:step_train: False\u001b[0m\n", - "\u001b[34mINFO:root:Reading the inputs\u001b[0m\n", - "\u001b[34mINFO:root:Reading file: /opt/ml/processing/input/raw_data/raw_test.csv\u001b[0m\n", - "\u001b[34mINFO:root:Data Quality\u001b[0m\n", - "\u001b[34mINFO:root:Preprocessing\u001b[0m\n", - "\u001b[34mINFO:root:Cleaning data\u001b[0m\n", - "\u001b[34mINFO:root:One hot encoding\u001b[0m\n", - "\u001b[34mINFO:root:Normalizing\u001b[0m\n", - "\u001b[34mINFO:root:shape (222, 7)\u001b[0m\n", - "\u001b[34mINFO:root:Saving\u001b[0m\n", - "CPU times: user 1.19 s, sys: 38.4 ms, total: 1.23 s\n", - "Wall time: 10min 14s\n" - ] - } - ], - "source": [ - "%%time\n", - "# Runs the processor to access the ECR image and process the inference data\n", - "processor.run(inputs=inputs,\n", - " outputs= outputs,\n", - " arguments=[\"--step\", \"test\"] \n", - " )" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "conda_python3", - "language": "python", - "name": "conda_python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.13" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/.ipynb_checkpoints/Sagemaker_StepFunctions_Inference-checkpoint.ipynb b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/.ipynb_checkpoints/Sagemaker_StepFunctions_Inference-checkpoint.ipynb deleted file mode 100644 index 1c9af76..0000000 --- a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/.ipynb_checkpoints/Sagemaker_StepFunctions_Inference-checkpoint.ipynb +++ /dev/null @@ -1,737 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Build machine learning workflow to predict new data with Amazon SageMaker and AWS Step Functions" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This script creates a Step Function state machine to preprocess the inference data and predict with the images in ECR." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Import modules" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import uuid\n", - "import boto3\n", - "import sagemaker\n", - "from sagemaker.amazon.amazon_estimator import get_image_uri\n", - "from sagemaker.s3 import S3Uploader\n", - "from sagemaker import get_execution_role\n", - "from sagemaker.sklearn.processing import SKLearnProcessor\n", - "from sagemaker.processing import Processor, ProcessingInput, ProcessingOutput\n", - "import stepfunctions\n", - "from stepfunctions.steps import (\n", - " Chain,\n", - " ProcessingStep,\n", - " TransformStep\n", - ")\n", - "from stepfunctions.inputs import ExecutionInput\n", - "from stepfunctions.workflow import Workflow" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Setup" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Modify according to your configurations." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "# Bucket name in S3\n", - "bucket = \"hermione-sagemaker\"" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "# Set session\n", - "region_name=\"us-east-1\"\n", - "boto3.setup_default_session(region_name=region_name)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "# Get user role\n", - "role = get_execution_role()" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "# Role to create and execute step functions\n", - "# paste the AmazonSageMaker-StepFunctionsWorkflowExecutionRole ARN\n", - "workflow_execution_role = \"\"" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "# SageMaker expects unique names for each job, model and endpoint.\n", - "# Otherwise, the execution will fail. 
The ExecutionInput generates\n", - "# names dynamically for each execution.\n", - "execution_input = ExecutionInput(\n", - "    schema={\n", - "        \"PreprocessingJobName\": str,\n", - "        \"TransformJobName\": str \n", - "    }\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "# Get AWS Account ID\n", - "account_number = boto3.client(\"sts\").get_caller_identity()[\"Account\"]" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "# Processor image name previously uploaded to ECR\n", - "image_name_processor = \"hermione-processor\"" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "# Inference image name previously uploaded to ECR\n", - "image_name_inference = \"hermione-inference\"" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "# Input and output paths to execute train and inference\n", - "paths = {\n", - "    'expectations': f\"s3://{bucket}/PREPROCESSING/EXPECTATIONS\",\n", - "    'preprocessing': f\"s3://{bucket}/PREPROCESSING/PREPROCESSING\",\n", - "    'test_raw': f\"s3://{bucket}/TEST_RAW\",\n", - "    'inference_processed': f\"s3://{bucket}/PREPROCESSING/INFERENCE_PROCESSED\",\n", - "    'validations': f\"s3://{bucket}/PREPROCESSING/VALIDATIONS\",\n", - "    'model': f\"s3://{bucket}/PREPROCESSING/MODEL/Hermione-train-2021-05-26-12-41-29-505/output/model.tar.gz\",\n", - "    'output_path': f\"s3://{bucket}/PREPROCESSING/OUTPUT\"\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "# instance to run the code\n", - "instance_type_preprocessing=\"ml.t3.medium\"\n", - "instance_type_inference=\"ml.m5.large\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Preprocessing Step" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "# Processor image previously uploaded to ECR\n", - "image_uri_processor = f\"{account_number}.dkr.ecr.{region_name}.amazonaws.com/{image_name_processor}\"" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "# Creates the processor to access the ECR image\n", - "processor = Processor(image_uri=image_uri_processor,\n", - "                     role=role,\n", - "                     instance_count=1,\n", - "                     instance_type=instance_type_preprocessing)" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "# Creates input and output objects for ProcessingStep\n", - "inputs=[\n", - "    ProcessingInput(source=paths['test_raw'],\n", - "                  destination='/opt/ml/processing/input/raw_data', \n", - "                  input_name='raw_data'),\n", - "    ProcessingInput(source=paths['preprocessing'], \n", - "                  destination='/opt/ml/processing/input/preprocessing', \n", - "                  input_name='preprocessing'),\n", - "    ProcessingInput(source=paths['expectations'], \n", - "                  destination='/opt/ml/processing/input/expectations', \n", - "                  input_name='expectations')\n", - "]\n", - "outputs = [\n", - "    ProcessingOutput(\n", - "        source=\"/opt/ml/processing/output/processed/inference\",\n", - "        destination=paths['inference_processed'],\n", - "        output_name=\"inference_data\",\n", - "    ),\n", - "    ProcessingOutput(\n", - "        source=\"/opt/ml/processing/output/validations\",\n", - "        destination=paths['validations'],\n", - "        output_name=\"validations\",\n", - "    )\n", - "]" - ] - 
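To make the ExecutionInput contract concrete: every execution must supply a value for each schema key declared above, and Step Functions substitutes those values into the job definitions at run time. A small sketch of what this notebook does further down (the name prefixes are illustrative):

    import uuid

    # One fresh name per execution; SageMaker rejects duplicate job names.
    execution_inputs = {
        "PreprocessingJobName": "Hermione-Preprocessing-{}".format(uuid.uuid1().hex),
        "TransformJobName": "Hermione-Inference-{}".format(uuid.uuid1().hex),
    }
    # Later: branching_workflow.execute(inputs=execution_inputs)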
}, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "# Creates the ProcessingStep\n", - "processing_step = ProcessingStep(\n", - " \"SageMaker Preprocessing step\",\n", - " processor=processor,\n", - " job_name=execution_input[\"PreprocessingJobName\"],\n", - " inputs=inputs,\n", - " outputs=outputs,\n", - " container_arguments=[\"--step\", \"test\"]\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Inference Step" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [], - "source": [ - "# Inference image previous uploaded in ECR\n", - "image_uri_inference = f\"{account_number}.dkr.ecr.{region_name}.amazonaws.com/{image_name_inference}\"" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [], - "source": [ - "# Creates input and output objects for TransformStep\n", - "input_path = paths['inference_processed']\n", - "model_path = paths['model']\n", - "output_path = paths['output_path']" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [], - "source": [ - "# Creates the model to access the ECR image\n", - "model = sagemaker.model.Model(\n", - " image_uri = image_uri_inference,\n", - " model_data=model_path,\n", - " role=role)" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [], - "source": [ - "# Creates a transformer object from the trained model\n", - "transformer = model.transformer(\n", - " instance_count=1,\n", - " instance_type=instance_type_inference, \n", - " output_path=output_path,\n", - " accept = 'text/csv')" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [], - "source": [ - "# Creates the TransformStep\n", - "transform_step = TransformStep(\n", - " \"Inference Step\",\n", - " transformer=transformer,\n", - " job_name=execution_input[\"TransformJobName\"],\n", - " data=input_path,\n", - " content_type='text/csv',\n", - " wait_for_completion=True,\n", - " model_name=model.name\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Create Workflow and Execute" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [], - "source": [ - "# Creates Fail state to mark the workflow failed in case any of the steps fail.\n", - "failed_state_sagemaker_processing_failure = stepfunctions.steps.states.Fail(\n", - " \"ML Workflow failed\", cause=\"SageMakerProcessingJobFailed\"\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [], - "source": [ - "# Adds the Error handling in the workflow\n", - "catch_state_processing = stepfunctions.steps.states.Catch(\n", - " error_equals=[\"States.TaskFailed\"],\n", - " next_step=failed_state_sagemaker_processing_failure,\n", - ")\n", - "\n", - "processing_step.add_catch(catch_state_processing)\n", - "transform_step.add_catch(catch_state_processing)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Creates workflow with Pre-Processing Job and Transform Job\n", - "workflow_graph = Chain([processing_step, transform_step])\n", - "branching_workflow = Workflow(\n", - " name=\"SFN_Hermione_Inference\",\n", - " definition=workflow_graph,\n", - " role=workflow_execution_role,\n", - ")\n", - "branching_workflow.create()" - ] - }, - { - "cell_type": "code", - "execution_count": 24, 
- "metadata": {}, - "outputs": [], - "source": [ - "# Generates unique names for Pre-Processing Job and Training Job\n", - "# Each job requires a unique name\n", - "preprocessing_job_name = \"Hermione-Preprocessing-{}\".format(\n", - " uuid.uuid1().hex\n", - ") \n", - "inference_job_name = \"Hermione-Inference-{}\".format(\n", - " uuid.uuid1().hex\n", - ") " - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - " \n", - " \n", - "
\n", - "
    \n", - "
  • \n", - "
    \n", - " Success\n", - "
  • \n", - "
  • \n", - "
    \n", - " Failed\n", - "
  • \n", - "
  • \n", - "
    \n", - " Cancelled\n", - "
  • \n", - "
  • \n", - "
    \n", - " In Progress\n", - "
  • \n", - "
  • \n", - "
    \n", - " Caught Error\n", - "
  • \n", - "
\n", - "
\n", - "\n", - " \n", - " Inspect in AWS Step Functions \n", - "
\n", - "\n", - "\n" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Executes the workflow\n", - "execution = branching_workflow.execute(\n", - " inputs={\n", - " \"PreprocessingJobName\": preprocessing_job_name,\n", - " \"TransformJobName\": inference_job_name\n", - " }\n", - ")\n", - "execution_output = execution.get_output(wait=False)\n", - "execution.render_progress()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Results" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
SurvivedAgePclass_1Pclass_2Pclass_3Sex_1Sex_2predict
01.00.0072880.00.01.01.00.01.0
10.00.3717010.01.00.00.01.00.0
20.00.7612470.01.00.00.01.00.0
30.00.3340040.00.01.00.01.00.0
40.00.5727571.00.00.00.01.00.0
...........................
2170.00.2083440.00.01.00.01.00.0
2180.00.2334760.00.01.00.01.00.0
2190.00.0198540.00.01.01.00.01.0
2201.00.2209101.00.00.01.00.01.0
2211.00.6481530.01.00.01.00.01.0
\n", - "

222 rows × 8 columns

\n", - "
" - ], - "text/plain": [ - " Survived Age Pclass_1 Pclass_2 Pclass_3 Sex_1 Sex_2 predict\n", - "0 1.0 0.007288 0.0 0.0 1.0 1.0 0.0 1.0\n", - "1 0.0 0.371701 0.0 1.0 0.0 0.0 1.0 0.0\n", - "2 0.0 0.761247 0.0 1.0 0.0 0.0 1.0 0.0\n", - "3 0.0 0.334004 0.0 0.0 1.0 0.0 1.0 0.0\n", - "4 0.0 0.572757 1.0 0.0 0.0 0.0 1.0 0.0\n", - ".. ... ... ... ... ... ... ... ...\n", - "217 0.0 0.208344 0.0 0.0 1.0 0.0 1.0 0.0\n", - "218 0.0 0.233476 0.0 0.0 1.0 0.0 1.0 0.0\n", - "219 0.0 0.019854 0.0 0.0 1.0 1.0 0.0 1.0\n", - "220 1.0 0.220910 1.0 0.0 0.0 1.0 0.0 1.0\n", - "221 1.0 0.648153 0.0 1.0 0.0 1.0 0.0 1.0\n", - "\n", - "[222 rows x 8 columns]" - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import pandas as pd\n", - "pd.read_csv('s3://hermione-sagemaker/PREPROCESSING/OUTPUT/inference.csv.out')" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "conda_python3", - "language": "python", - "name": "conda_python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.13" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/.ipynb_checkpoints/Sagemaker_StepFunctions_Train-checkpoint.ipynb b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/.ipynb_checkpoints/Sagemaker_StepFunctions_Train-checkpoint.ipynb deleted file mode 100644 index a4c655a..0000000 --- a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/.ipynb_checkpoints/Sagemaker_StepFunctions_Train-checkpoint.ipynb +++ /dev/null @@ -1,540 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Build machine learning workflow to train a model with Amazon SageMaker and AWS Step Functions" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This script creates a Step Function state machine to preprocess the training data and train a model with the images in ECR." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Import modules" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import uuid\n", - "import boto3\n", - "import sagemaker\n", - "from sagemaker import get_execution_role\n", - "from sagemaker.processing import Processor, ProcessingInput, ProcessingOutput\n", - "import stepfunctions\n", - "from stepfunctions.inputs import ExecutionInput\n", - "from stepfunctions.workflow import Workflow\n", - "from stepfunctions.steps import (\n", - " TrainingStep, \n", - " Chain,\n", - " ProcessingStep,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Setup" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Modify according to your configurations." 
- ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "# Bucket name in S3\n", - "bucket = \"hermione-sagemaker\"" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "# Set session\n", - "region_name=\"us-east-1\"\n", - "boto3.setup_default_session(region_name=region_name)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "# Get user role\n", - "role = get_execution_role()" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "# Role to create and execute step functions\n", - "# paste the AmazonSageMaker-StepFunctionsWorkflowExecutionRole ARN\n", - "workflow_execution_role = \"\"" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "# SageMaker expects unique names for each job, model and endpoint.\n", - "# Otherwise, the execution will fail. The ExecutionInput generates\n", - "# names dynamically for each execution.\n", - "execution_input = ExecutionInput(\n", - "    schema={\n", - "        \"PreprocessingJobName\": str,\n", - "        \"TrainingJobName\": str\n", - "    }\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "# Get AWS Account ID\n", - "account_number = boto3.client(\"sts\").get_caller_identity()[\"Account\"]" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "# Processor image name previously uploaded to ECR\n", - "image_name_processor = \"hermione-processor\"" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "# Training image name previously uploaded to ECR\n", - "image_name_train = \"hermione-train\"" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "# Input and output paths to execute\n", - "paths = {\n", - "    'train_raw': f\"s3://{bucket}/TRAIN_RAW\",\n", - "    'expectations': f\"s3://{bucket}/PREPROCESSING/EXPECTATIONS\",\n", - "    'preprocessing': f\"s3://{bucket}/PREPROCESSING/PREPROCESSING\",\n", - "    'train_processed': f\"s3://{bucket}/PREPROCESSING/TRAIN_PROCESSED\",\n", - "    'val_processed': f\"s3://{bucket}/PREPROCESSING/VAL_PROCESSED\",\n", - "    'model': f\"s3://{bucket}/PREPROCESSING/MODEL\"\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "# instance to run the code\n", - "instance_type_preprocessing=\"ml.t3.medium\"\n", - "instance_type_train=\"ml.m5.large\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Preprocessing Step" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "# Processor image previously uploaded to ECR\n", - "image_uri_processor = f\"{account_number}.dkr.ecr.{region_name}.amazonaws.com/{image_name_processor}\"" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "# Creates the processor to access the ECR image\n", - "processor = Processor(image_uri=image_uri_processor,\n", - "                     role=role,\n", - "                     instance_count=1,\n", - "                     instance_type=instance_type_preprocessing)" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "# Creates input and output objects for 
ProcessingStep\n", - "inputs=[\n", - "    ProcessingInput(source=paths['train_raw'], \n", - "                    destination='/opt/ml/processing/input/raw_data', \n", - "                    input_name=\"raw_data\")\n", - "]\n", - "outputs = [\n", - "    ProcessingOutput(\n", - "        source=\"/opt/ml/processing/output/expectations\",\n", - "        destination=paths['expectations'],\n", - "        output_name=\"expectations\",\n", - "    ),\n", - "    ProcessingOutput(\n", - "        source=\"/opt/ml/processing/output/preprocessing\",\n", - "        destination=paths['preprocessing'],\n", - "        output_name=\"preprocessing\",\n", - "    ),\n", - "    ProcessingOutput(\n", - "        source=\"/opt/ml/processing/output/processed/train\",\n", - "        destination=paths['train_processed'],\n", - "        output_name=\"train_data\",\n", - "    ),\n", - "    ProcessingOutput(\n", - "        source=\"/opt/ml/processing/output/processed/val\",\n", - "        destination=paths['val_processed'],\n", - "        output_name=\"val_data\",\n", - "    )\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "# Creates the ProcessingStep\n", - "processing_step = ProcessingStep(\n", - "    \"Preprocessing step\",\n", - "    processor=processor,\n", - "    job_name=execution_input[\"PreprocessingJobName\"],\n", - "    inputs=inputs,\n", - "    outputs=outputs,\n", - "    container_arguments=[\"--step\", \"train\"]\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## TrainingStep" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [], - "source": [ - "# Training image previously uploaded to ECR\n", - "image_uri_train = f\"{account_number}.dkr.ecr.{region_name}.amazonaws.com/{image_name_train}\"" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [], - "source": [ - "# Creates input and output objects for TrainingStep\n", - "train_config = sagemaker.inputs.TrainingInput(\n", - "    paths['train_processed'],\n", - "    content_type='text/csv',\n", - ")\n", - "val_config = sagemaker.inputs.TrainingInput(\n", - "    paths['val_processed'],\n", - "    content_type='text/csv'\n", - ")\n", - "output_path = paths['model']" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [], - "source": [ - "# Creates the estimator to access the ECR image\n", - "est = sagemaker.estimator.Estimator(\n", - "    image_uri_train,\n", - "    role, \n", - "    instance_count=1, \n", - "    instance_type=instance_type_train,\n", - "    volume_size = 30,\n", - "    output_path = output_path,\n", - "    base_job_name = \"Hermione-Train\",\n", - "    use_spot_instances=True,  # Use spot instances\n", - "    max_run = 24*60*60,\n", - "    max_wait = 24*60*60       # timeout in seconds. 
Required if use_spot_instances == True\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [], - "source": [ - "# Creates the TrainingStep\n", - "training_step = TrainingStep(\n", - " 'TrainStep',\n", - " estimator=est,\n", - " data={\n", - " 'train': train_config,\n", - " 'validation': val_config\n", - " }, \n", - " job_name=execution_input[\"TrainingJobName\"] \n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Create Workflow and Execute" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [], - "source": [ - "# Creates Fail state to mark the workflow failed in case any of the steps fail.\n", - "failed_state_sagemaker_processing_failure = stepfunctions.steps.states.Fail(\n", - " \"ML Workflow failed\", cause=\"SageMakerProcessingJobFailed\"\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [], - "source": [ - "# Adds the Error handling in the workflow\n", - "catch_state_processing = stepfunctions.steps.states.Catch(\n", - " error_equals=[\"States.TaskFailed\"],\n", - " next_step=failed_state_sagemaker_processing_failure,\n", - ")\n", - "\n", - "processing_step.add_catch(catch_state_processing)\n", - "training_step.add_catch(catch_state_processing)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Creates workflow with Pre-Processing Job and Training Job\n", - "workflow_graph = Chain([processing_step, training_step])\n", - "branching_workflow = Workflow(\n", - " name=\"SFN_Hermione_Train\",\n", - " definition=workflow_graph,\n", - " role=workflow_execution_role,\n", - ")\n", - "branching_workflow.create()" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [], - "source": [ - "# Generates unique names for Pre-Processing Job and Training Job\n", - "# Each job requires a unique name\n", - "preprocessing_job_name = \"Hermione-Preprocessing-{}\".format(\n", - " uuid.uuid1().hex\n", - ") \n", - "training_job_name = \"Hermione-Training-{}\".format(\n", - " uuid.uuid1().hex\n", - ") " - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", - " \n", - " \n", - "
\n", - "
    \n", - "
  • \n", - "
    \n", - " Success\n", - "
  • \n", - "
  • \n", - "
    \n", - " Failed\n", - "
  • \n", - "
  • \n", - "
    \n", - " Cancelled\n", - "
  • \n", - "
  • \n", - "
    \n", - " In Progress\n", - "
  • \n", - "
  • \n", - "
    \n", - " Caught Error\n", - "
  • \n", - "
\n", - "
\n", - "\n", - " \n", - " Inspect in AWS Step Functions \n", - "
\n", - "\n", - "\n" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Executes the workflow\n", - "execution = branching_workflow.execute(\n", - " inputs={\n", - " \"PreprocessingJobName\": preprocessing_job_name,\n", - " \"TrainingJobName\": training_job_name\n", - " }\n", - ")\n", - "execution_output = execution.get_output(wait=False)\n", - "execution.render_progress()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "conda_python3", - "language": "python", - "name": "conda_python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.13" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/.ipynb_checkpoints/Sagemaker_Train-checkpoint.ipynb b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/.ipynb_checkpoints/Sagemaker_Train-checkpoint.ipynb deleted file mode 100644 index b0a796f..0000000 --- a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/.ipynb_checkpoints/Sagemaker_Train-checkpoint.ipynb +++ /dev/null @@ -1,393 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "577c4f6b", - "metadata": {}, - "source": [ - "# Sagemaker Train" - ] - }, - { - "cell_type": "markdown", - "id": "501ef5b6", - "metadata": {}, - "source": [ - "This script creates and trains the model with the uploaded image in ECR." - ] - }, - { - "cell_type": "markdown", - "id": "e66b3975", - "metadata": {}, - "source": [ - "## Import modules" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "d658fb44", - "metadata": {}, - "outputs": [], - "source": [ - "import time\n", - "import boto3\n", - "import sagemaker\n", - "from sagemaker import get_execution_role" - ] - }, - { - "cell_type": "markdown", - "id": "64036230", - "metadata": {}, - "source": [ - "## Setup" - ] - }, - { - "cell_type": "markdown", - "id": "28411012", - "metadata": {}, - "source": [ - "Modify according to your configurations." 
- ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "7e937373", - "metadata": {}, - "outputs": [], - "source": [ - "# Bucket name in S3\n", - "bucket = \"hermione-sagemaker\"" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "16450249", - "metadata": {}, - "outputs": [], - "source": [ - "# Set session\n", - "region_name=\"us-east-1\"\n", - "boto3.setup_default_session(region_name=region_name)" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "2e144eb8", - "metadata": {}, - "outputs": [], - "source": [ - "# Get user role\n", - "role = get_execution_role()" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "50b4a590", - "metadata": {}, - "outputs": [], - "source": [ - "# Get AWS Account ID\n", - "account_number = boto3.client(\"sts\").get_caller_identity()[\"Account\"]" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "8d56e6ca", - "metadata": {}, - "outputs": [], - "source": [ - "# Image previous uploaded in ECR\n", - "image_name = \"hermione-train\"\n", - "image_uri = f\"{account_number}.dkr.ecr.{region_name}.amazonaws.com/{image_name}\"" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "id": "e710ea0a", - "metadata": {}, - "outputs": [], - "source": [ - "# Input and output paths to execute train\n", - "paths = {\n", - " 'train_processed': f\"s3://{bucket}/PREPROCESSING/TRAIN_PROCESSED\",\n", - " 'val_processed': f\"s3://{bucket}/PREPROCESSING/VAL_PROCESSED\",\n", - " 'model': f\"s3://{bucket}/PREPROCESSING/MODEL\"\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "id": "f8a27026", - "metadata": {}, - "outputs": [], - "source": [ - "# instance to run the code\n", - "instance_type=\"ml.m5.large\"" - ] - }, - { - "cell_type": "markdown", - "id": "b6efb8ce", - "metadata": {}, - "source": [ - "## Train" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "id": "ed9cb39b", - "metadata": {}, - "outputs": [], - "source": [ - "# Receives the processed train data in S3\n", - "train_config = sagemaker.inputs.TrainingInput(\n", - " paths['train_processed'],\n", - " content_type='text/csv',\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "id": "34f144e0", - "metadata": {}, - "outputs": [], - "source": [ - "# Receives the processed validation data in S3\n", - "val_config = sagemaker.inputs.TrainingInput(\n", - " paths['val_processed'],\n", - " content_type='text/csv'\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "id": "a0bbbf7d", - "metadata": {}, - "outputs": [], - "source": [ - "# Saves the model object in S3\n", - "output_path = paths['model']" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "id": "299813d5", - "metadata": {}, - "outputs": [], - "source": [ - "# Metrics to visualize in the Monitor\n", - "metrics = [\n", - " {\n", - " \"Name\": \"accuracy\",\n", - " \"Regex\": \"accuracy=(.*?);\",\n", - " },\n", - " {\n", - " \"Name\": \"f1\",\n", - " \"Regex\": \"f1=(.*?);\",\n", - " },\n", - " {\n", - " \"Name\": \"precision\",\n", - " \"Regex\": \"precision=(.*?);\",\n", - " },\n", - " {\n", - " \"Name\": \"recall\",\n", - " \"Regex\": \"recall=(.*?);\",\n", - " },\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "id": "4ad41d36", - "metadata": {}, - "outputs": [], - "source": [ - "# Creates the estimator to access the ECR image\n", - "est = sagemaker.estimator.Estimator(\n", - " image_uri,\n", - " role, \n", - " instance_count=1, \n", - " 
instance_type=instance_type,\n", - " volume_size = 30,\n", - " output_path = output_path,\n", - " base_job_name = \"Hermione-train\",\n", - " use_spot_instances=True,\n", - " max_run = 24*60*60,\n", - " max_wait = 24*60*60, # timeout in seconds. Required if use_spot_instances == True\n", - " metric_definitions=metrics\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "id": "62c1894f", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2021-05-26 12:41:29 Starting - Starting the training job...\n", - "2021-05-26 12:41:52 Starting - Launching requested ML instancesProfilerReport-1622032889: InProgress\n", - "......\n", - "2021-05-26 12:42:52 Starting - Preparing the instances for training......\n", - "2021-05-26 12:43:52 Downloading - Downloading input data\n", - "2021-05-26 12:43:52 Training - Downloading the training image.....\u001b[34m2021-05-26 09:44:41,407 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed)\u001b[0m\n", - "\n", - "2021-05-26 12:45:00 Uploading - Uploading generated training model\n", - "2021-05-26 12:45:00 Completed - Training job completed\n", - "\u001b[34m2021-05-26 09:44:47,642 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed)\u001b[0m\n", - "\u001b[34m2021-05-26 09:44:47,653 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed)\u001b[0m\n", - "\u001b[34m2021-05-26 09:44:47,663 sagemaker-training-toolkit INFO Invoking user script\n", - "\u001b[0m\n", - "\u001b[34mTraining Env:\n", - "\u001b[0m\n", - "\u001b[34m{\n", - " \"additional_framework_parameters\": {},\n", - " \"channel_input_dirs\": {\n", - " \"validation\": \"/opt/ml/input/data/validation\",\n", - " \"train\": \"/opt/ml/input/data/train\"\n", - " },\n", - " \"current_host\": \"algo-1\",\n", - " \"framework_module\": null,\n", - " \"hosts\": [\n", - " \"algo-1\"\n", - " ],\n", - " \"hyperparameters\": {},\n", - " \"input_config_dir\": \"/opt/ml/input/config\",\n", - " \"input_data_config\": {\n", - " \"validation\": {\n", - " \"ContentType\": \"text/csv\",\n", - " \"TrainingInputMode\": \"File\",\n", - " \"S3DistributionType\": \"FullyReplicated\",\n", - " \"RecordWrapperType\": \"None\"\n", - " },\n", - " \"train\": {\n", - " \"ContentType\": \"text/csv\",\n", - " \"TrainingInputMode\": \"File\",\n", - " \"S3DistributionType\": \"FullyReplicated\",\n", - " \"RecordWrapperType\": \"None\"\n", - " }\n", - " },\n", - " \"input_dir\": \"/opt/ml/input\",\n", - " \"is_master\": true,\n", - " \"job_name\": \"Hermione-train-2021-05-26-12-41-29-505\",\n", - " \"log_level\": 20,\n", - " \"master_hostname\": \"algo-1\",\n", - " \"model_dir\": \"/opt/ml/model\",\n", - " \"module_dir\": \"/opt/ml/code\",\n", - " \"module_name\": \"train\",\n", - " \"network_interface_name\": \"eth0\",\n", - " \"num_cpus\": 2,\n", - " \"num_gpus\": 0,\n", - " \"output_data_dir\": \"/opt/ml/output/data\",\n", - " \"output_dir\": \"/opt/ml/output\",\n", - " \"output_intermediate_dir\": \"/opt/ml/output/intermediate\",\n", - " \"resource_config\": {\n", - " \"current_host\": \"algo-1\",\n", - " \"hosts\": [\n", - " \"algo-1\"\n", - " ],\n", - " \"network_interface_name\": \"eth0\"\n", - " },\n", - " \"user_entry_point\": \"train.py\"\u001b[0m\n", - "\u001b[34m}\n", - "\u001b[0m\n", - "\u001b[34mEnvironment variables:\n", - "\u001b[0m\n", - "\u001b[34mSM_HOSTS=[\"algo-1\"]\u001b[0m\n", - "\u001b[34mSM_NETWORK_INTERFACE_NAME=eth0\u001b[0m\n", - "\u001b[34mSM_HPS={}\u001b[0m\n", - 
"\u001b[34mSM_USER_ENTRY_POINT=train.py\u001b[0m\n", - "\u001b[34mSM_FRAMEWORK_PARAMS={}\u001b[0m\n", - "\u001b[34mSM_RESOURCE_CONFIG={\"current_host\":\"algo-1\",\"hosts\":[\"algo-1\"],\"network_interface_name\":\"eth0\"}\u001b[0m\n", - "\u001b[34mSM_INPUT_DATA_CONFIG={\"train\":{\"ContentType\":\"text/csv\",\"RecordWrapperType\":\"None\",\"S3DistributionType\":\"FullyReplicated\",\"TrainingInputMode\":\"File\"},\"validation\":{\"ContentType\":\"text/csv\",\"RecordWrapperType\":\"None\",\"S3DistributionType\":\"FullyReplicated\",\"TrainingInputMode\":\"File\"}}\u001b[0m\n", - "\u001b[34mSM_OUTPUT_DATA_DIR=/opt/ml/output/data\u001b[0m\n", - "\u001b[34mSM_CHANNELS=[\"train\",\"validation\"]\u001b[0m\n", - "\u001b[34mSM_CURRENT_HOST=algo-1\u001b[0m\n", - "\u001b[34mSM_MODULE_NAME=train\u001b[0m\n", - "\u001b[34mSM_LOG_LEVEL=20\u001b[0m\n", - "\u001b[34mSM_FRAMEWORK_MODULE=\u001b[0m\n", - "\u001b[34mSM_INPUT_DIR=/opt/ml/input\u001b[0m\n", - "\u001b[34mSM_INPUT_CONFIG_DIR=/opt/ml/input/config\u001b[0m\n", - "\u001b[34mSM_OUTPUT_DIR=/opt/ml/output\u001b[0m\n", - "\u001b[34mSM_NUM_CPUS=2\u001b[0m\n", - "\u001b[34mSM_NUM_GPUS=0\u001b[0m\n", - "\u001b[34mSM_MODEL_DIR=/opt/ml/model\u001b[0m\n", - "\u001b[34mSM_MODULE_DIR=/opt/ml/code\u001b[0m\n", - "\u001b[34mSM_TRAINING_ENV={\"additional_framework_parameters\":{},\"channel_input_dirs\":{\"train\":\"/opt/ml/input/data/train\",\"validation\":\"/opt/ml/input/data/validation\"},\"current_host\":\"algo-1\",\"framework_module\":null,\"hosts\":[\"algo-1\"],\"hyperparameters\":{},\"input_config_dir\":\"/opt/ml/input/config\",\"input_data_config\":{\"train\":{\"ContentType\":\"text/csv\",\"RecordWrapperType\":\"None\",\"S3DistributionType\":\"FullyReplicated\",\"TrainingInputMode\":\"File\"},\"validation\":{\"ContentType\":\"text/csv\",\"RecordWrapperType\":\"None\",\"S3DistributionType\":\"FullyReplicated\",\"TrainingInputMode\":\"File\"}},\"input_dir\":\"/opt/ml/input\",\"is_master\":true,\"job_name\":\"Hermione-train-2021-05-26-12-41-29-505\",\"log_level\":20,\"master_hostname\":\"algo-1\",\"model_dir\":\"/opt/ml/model\",\"module_dir\":\"/opt/ml/code\",\"module_name\":\"train\",\"network_interface_name\":\"eth0\",\"num_cpus\":2,\"num_gpus\":0,\"output_data_dir\":\"/opt/ml/output/data\",\"output_dir\":\"/opt/ml/output\",\"output_intermediate_dir\":\"/opt/ml/output/intermediate\",\"resource_config\":{\"current_host\":\"algo-1\",\"hosts\":[\"algo-1\"],\"network_interface_name\":\"eth0\"},\"user_entry_point\":\"train.py\"}\u001b[0m\n", - "\u001b[34mSM_USER_ARGS=[]\u001b[0m\n", - "\u001b[34mSM_OUTPUT_INTERMEDIATE_DIR=/opt/ml/output/intermediate\u001b[0m\n", - "\u001b[34mSM_CHANNEL_VALIDATION=/opt/ml/input/data/validation\u001b[0m\n", - "\u001b[34mSM_CHANNEL_TRAIN=/opt/ml/input/data/train\u001b[0m\n", - "\u001b[34mPYTHONPATH=/usr/local/bin:/opt/ml/code:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/lib/python38.zip:/usr/lib/python3.8:/usr/lib/python3.8/lib-dynload:/usr/local/lib/python3.8/dist-packages:/usr/lib/python3/dist-packages\n", - "\u001b[0m\n", - "\u001b[34mInvoking script with the following command:\n", - "\u001b[0m\n", - "\u001b[34m/usr/bin/python3 train.py\n", - "\n", - "\u001b[0m\n", - "\u001b[34m/usr/local/lib/python3.8/dist-packages/interpret_community/common/gpu_kmeans.py:30: UserWarning: cuML is required to use GPU explainers. Check https://rapids.ai/start.html for more information on how to install it.\n", - " warnings.warn(\u001b[0m\n", - "\u001b[34mcuML is required to use GPU explainers. 
Check https://rapids.ai/start.html for more information on how to install it.\u001b[0m\n", - "\u001b[34mINFO:root:Starting the training\u001b[0m\n", - "\u001b[34mINFO:root:Reading the inputs\u001b[0m\n", - "\u001b[34mINFO:root:Training the model\u001b[0m\n", - "\u001b[34mINFO:root:Saving\u001b[0m\n", - "\u001b[34mINFO:root:accuracy=0.7373737373737373; f1=0.6976744186046512; precision=0.6382978723404256; recall=0.7692307692307693;\u001b[0m\n", - "\u001b[34mINFO:root:Training complete.\u001b[0m\n", - "\u001b[34m2021-05-26 09:44:51,898 sagemaker-training-toolkit INFO Reporting training SUCCESS\u001b[0m\n", - "Training seconds: 85\n", - "Billable seconds: 36\n", - "Managed Spot Training savings: 57.6%\n", - "CPU times: user 450 ms, sys: 19.9 ms, total: 470 ms\n", - "Wall time: 3min 42s\n" - ] - } - ], - "source": [ - "%%time\n", - "# Train the model and validate\n", - "est.fit({'train':train_config, 'validation':val_config}, wait=True, logs=True)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "conda_python3", - "language": "python", - "name": "conda_python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.13" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/preprocessing/.ipynb_checkpoints/dataquality-checkpoint.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/preprocessing/.ipynb_checkpoints/dataquality-checkpoint.py deleted file mode 100644 index 5ce7b61..0000000 --- a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/preprocessing/.ipynb_checkpoints/dataquality-checkpoint.py +++ /dev/null @@ -1,60 +0,0 @@ -import pandas as pd -import great_expectations as ge - -class DataQuality: - """ - Class to perform data quality checks before training - """ - def __init__(self, continuous_cols=None, discrete_cat_cols=None): - """ - Constructor - - Parameters - ---------- - continuous_cols : array - Receives an array with the name of the continuous columns - discrete_cat_cols : array - Receives an array with the name of the discrete/categorical columns - Returns - ------- - DataQuality - """ - self.continuous_cols = continuous_cols - self.discrete_cat_cols = discrete_cat_cols - - def perform(self, - df: pd.DataFrame): - """ - Perform data quality checks - - Parameters - ---------- - df : pd.Dataframe - Dataframe to be processed - - Returns - ------- - json - """ - df_ge = ge.dataset.PandasDataset(df) - cols = df_ge.columns - df_ge.expect_table_columns_to_match_ordered_list(cols) - for col in cols: - df_ge.expect_column_values_to_not_be_null(col) - cut_off = 2 - if self.continuous_cols is not None: - for col in self.continuous_cols: - measures = df_ge[col].describe() - df_ge.expect_column_values_to_be_of_type(col, 'int64') - df_ge.expect_column_mean_to_be_between(col, measures['mean'] - cut_off * measures['std'], measures['mean'] + cut_off * measures['std']) - df_ge.expect_column_max_to_be_between(col, measures['max'] - cut_off * measures['std'], measures['max'] + cut_off * measures['std']) - df_ge.expect_column_min_to_be_between(col, measures['min'] - cut_off * measures['std'], measures['min'] + cut_off * measures['std']) - expected_partition = ge.dataset.util.continuous_partition_data(df_ge[col]) - df_ge.expect_column_bootstrapped_ks_test_p_value_to_be_greater_than(col, expected_partition) - if 
self.discrete_cat_cols is not None: - for col in self.discrete_cat_cols: - possible_cat = df_ge[col].unique() - df_ge.expect_column_values_to_be_in_set(col, possible_cat) - expected_partition = ge.dataset.util.categorical_partition_data(df_ge[col]) - df_ge.expect_column_chisquare_test_p_value_to_be_greater_than(col, expected_partition) - return df_ge \ No newline at end of file diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/preprocessing/.ipynb_checkpoints/normalization-checkpoint.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/preprocessing/.ipynb_checkpoints/normalization-checkpoint.py deleted file mode 100644 index 6d5822b..0000000 --- a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/preprocessing/.ipynb_checkpoints/normalization-checkpoint.py +++ /dev/null @@ -1,159 +0,0 @@ -import pandas as pd -import numpy as np -from sklearn.preprocessing import StandardScaler, normalize, MinMaxScaler -from scipy.stats import zscore - -class Normalizer: - - def __init__(self, norm_cols: dict): - """ - Constructor - - Parameters - ---------- - norm_cols : dict - Receives dict with the name of the normalization to be - performed and which are the columns - Ex: norm_cols = {'zscore': ['salary', 'price'], - 'min-max': ['height', 'age']} - - Returns - ------- - Normalizer - """ - self.norm_cols = norm_cols - self.col_names = [name for norm in norm_cols for name in norm_cols[norm]] - self.norms = {'min-max': MinMaxScaler, - 'standard': StandardScaler} - self.fitted = False - - def statistics(self, df : pd.DataFrame): - """ - Calculates dataframe statistics - - Parameters - ---------- - df : dataframe to calculate the statistics for each column - - Returns - ------- - None - """ - zip_cols = lambda result: zip(result.index.values, result.values) - self.col_min = {col: value for col, value in zip_cols(df[self.col_names].min())} - self.col_max = {col: value for col, value in zip_cols(df[self.col_names].max())} - self.col_std = {col: value for col, value in zip_cols(df[self.col_names].std())} - self.col_mean = {col: value for col, value in zip_cols(df[self.col_names].mean())} - self.col_median = {col: value for col, value in zip_cols(df[self.col_names].median())} - - def __apply_func(self, X, normalization): - """ - Fits the normalization object - - Parameters - ---------- - X : array - Data to be normalized - normalization : Normalization - Normalization to be applied - - Returns - ------- - Normalization - """ - normalization.fit(X) - return normalization - - def fit(self, df: pd.DataFrame): - """ - Generates normalization object for each column - - Parameters - ---------- - df : pd.DataFrame - dataframe with columns to be normalized - - Returns - ------- - None - """ - self.statistics(df) - self.normalization = dict() - for norm in self.norm_cols: - if norm in ['zscore', 'log10']: - continue - for col in self.norm_cols[norm]: - self.normalization[col] = self.__apply_func(df[col].values.reshape(-1, 1), self.norms[norm]()) - self.fitted = True - - def transform(self, df: pd.DataFrame): - """ - Apply normalization to each column - - Parameters - ---------- - df : pd.DataFrame - dataframe with columns to be normalized - - Returns - ------- - pd.DataFrame - """ - if not self.fitted: - raise Exception("Not yet fitted.") - - for norm in self.norm_cols: - if norm == 'zscore': - for col in self.norm_cols[norm]: - df.loc[:,col] = (df[col].values - self.col_mean[col])/self.col_std[col] - elif norm == 'log10': - for col in self.norm_cols[norm]: - 
df.loc[:,col] = np.log10(df[col].values) - else: - for col in self.norm_cols[norm]: - df.loc[:,col] = self.normalization[col].transform(df[col].values.reshape(-1, 1)) - return df - - def inverse_transform(self, df: pd.DataFrame): - """ - Apply the denormalized to each column - - Parameters - ---------- - df : pd.DataFrame - dataframe with columns to be denormalized - - Returns - ------- - pd.DataFrame - """ - if not self.fitted: - raise Exception("Not yet trained.") - - for norm in self.norm_cols: - if norm == 'zscore': - for col in self.norm_cols[norm]: - df.loc[:,col] = df[col].apply(lambda z: self.col_std[col]*z + self.col_mean[col]) - elif norm == 'log10': - for col in self.norm_cols[norm]: - df.loc[:,col] = df[col].apply(lambda x: 10 ** x) - else: - for col in self.norm_cols[norm]: - df.loc[:,col] = self.normalization[col].inverse_transform(df[col].values.reshape(-1, 1)) - return df - - def fit_transform(self, df: pd.DataFrame): - """ - Creates object and apply it normalization - - Parameters - ---------- - df : pd.DataFrame - dataframe with columns to be normalized - - Returns - ------- - pd.DataFrame - """ - self.fit(df) - return self.transform(df) diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/preprocessing/.ipynb_checkpoints/preprocessing-checkpoint.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/preprocessing/.ipynb_checkpoints/preprocessing-checkpoint.py deleted file mode 100644 index dea90fa..0000000 --- a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/preprocessing/.ipynb_checkpoints/preprocessing-checkpoint.py +++ /dev/null @@ -1,141 +0,0 @@ -import pandas as pd - -from ml.preprocessing.normalization import Normalizer -from sklearn.preprocessing import OneHotEncoder -from sklearn.model_selection import train_test_split -from category_encoders import * -import logging - -logging.getLogger().setLevel(logging.INFO) - -class Preprocessing: - """ - Class to perform data preprocessing before training - """ - - def __init__(self, norm_cols=None, oneHot_cols=None): - """ - Constructor - - Parameters - ---------- - norm_cols : dict - Receives dict with the name of the normalization to be - performed and which are the columns - Ex: norm_cols = {'zscore': ['salary', 'price'], - 'min-max': ['heigth', 'age']} - oneHot_cols : array - Receives an array with columns names to be categorized with One Hot Encoding - Returns - ------- - Preprocessing - """ - self.norm_cols = norm_cols - self.oneHot_cols = oneHot_cols - self.ohe = OneHotEncoder(handle_unknown='ignore') - - def clean_data(self, df: pd.DataFrame): - """ - Perform data cleansing. 
- - Parameters - ---------- - df : pd.Dataframe - Dataframe to be processed - - Returns - ------- - pd.Dataframe - Cleaned Data Frame - """ - logging.info("Cleaning data") - df_copy = df.copy() - df_copy['Pclass'] = df_copy.Pclass.astype('object') - df_copy = df_copy.dropna() - return df_copy - - def categ_encoding_oneHot(self, df: pd.DataFrame, step_train = False): - """ - Perform encoding of the categorical variables using One Hot Encoding - - Parameters - ---------- - df : pd.Dataframe - Dataframe to be processed - step_train : bool - if True, the fit function is executed - - Returns - ------- - pd.Dataframe - Encoded Data Frame - """ - logging.info("One hot encoding") - df_copy = df.copy() - - if step_train: - self.ohe.fit(df_copy[self.oneHot_cols]) - - arr = self.ohe.transform(df_copy[self.oneHot_cols]) - df_copy = df_copy.join(arr).drop(self.oneHot_cols, axis=1) - return df_copy - - def normalize(self, df: pd.DataFrame, step_train = False): - """ - Apply normalization to the selected columns - - Parameters - ---------- - df : pd.DataFrame - dataframe with columns to be normalized - step_train : bool - if True, the Normalizer is created and applied, - otherwise it is only applied - - Returns - ------- - pd.DataFrame - Normalized dataframe - """ - logging.info("Normalizing") - if step_train: - self.norm = Normalizer(self.norm_cols) - df = self.norm.fit_transform(df) - else: - df = self.norm.transform(df.copy()) - return df - - def execute(self, df, step_train = False, val_size = 0.2): - """ - Apply all preprocessing steps on the Dataframe - - Parameters - ---------- - df : pd.DataFrame - dataframe with columns to be normalized - step_train : bool - if True, data is splited in train and val - step_train : val_size - Size of the validation dataset - - Returns - ------- - pd.DataFrame - - One Preprocessed dataframe, if step_train is False - - Two Preprocessed dataframes, if step_train is True - """ - df = self.clean_data(df) - df = self.categ_encoding_oneHot(df, step_train) - - if step_train: - logging.info("Divide train and test") - X_train, X_val = train_test_split(df, test_size=val_size, random_state=123) - X_train = self.normalize(X_train, step_train = True) - X_val = self.normalize(X_val, step_train = False) - logging.info(f"shape train {X_train.shape} val {X_val.shape}") - return X_train, X_val - else: - X = self.normalize(df, step_train = False) - logging.info(f"shape {X.shape}") - return X - diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/preprocessing/.ipynb_checkpoints/text_vectorizer-checkpoint.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/preprocessing/.ipynb_checkpoints/text_vectorizer-checkpoint.py deleted file mode 100644 index 674458e..0000000 --- a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/preprocessing/.ipynb_checkpoints/text_vectorizer-checkpoint.py +++ /dev/null @@ -1,201 +0,0 @@ -from sklearn.feature_extraction.text import CountVectorizer -from sklearn.feature_extraction.text import TfidfVectorizer -import numpy as np -import pandas as pd - -class TextVectorizer: - - def __init__(self, vectorizer_cols : dict, word2vec=None): - """ - Constructor - - Parameters - ---------- - vectorizer_cols : dict - Receives a dict with the name of the vectorizer to be - performed and which are the columns - Ex: vectorizer_cols = {'embedding_median': ['col'], - 'embedding_mean': ['col'], - 'tf_idf': ['col'], - 'bag_of_words' : [col]} - Returns - ------- - Normalization - """ - self.word2vec = word2vec - self.index_ini_fim = 
len(self.word2vec.index2word) if word2vec is not None else 0
-        self.vectorizer_cols = vectorizer_cols
-        self.vectorizer_vects = {'bag_of_words': self.bag_of_words,
-                                 'tf_idf': self.tf_idf_vect}
-        self.fitted = False
-
-    def fit(self, df: pd.DataFrame):
-        """
-        Generates the vectorizer object for each column. The text must be preprocessed.
-
-        Parameters
-        ----------
-        df : pd.DataFrame
-            dataframe with columns to be vectorized
-
-        Returns
-        -------
-        None
-        """
-        self.vectorizers_fitted = dict()
-        for vectorizer in self.vectorizer_cols:
-            if vectorizer in ['index', 'embedding_median', 'embedding_mean']:
-                continue
-            self.vectorizers_fitted[vectorizer] = {}
-            for col in self.vectorizer_cols[vectorizer]:
-                self.vectorizers_fitted[vectorizer][col] = self.vectorizer_vects[vectorizer](df[col].values)
-        self.fitted = True
-
-    def transform(self, df: pd.DataFrame):
-        """
-        Apply the vectorizer object to each column. The text must be preprocessed.
-
-        Parameters
-        ----------
-        df : pd.DataFrame
-            dataframe with columns to be vectorized
-
-        Returns
-        -------
-        pd.DataFrame
-        """
-        if not self.fitted:
-            raise Exception("Not yet trained.")
-
-        for vectorizer in self.vectorizer_cols:
-            if vectorizer == 'index':
-                for col in self.vectorizer_cols[vectorizer]:
-                    df.loc[:, col+"_"+vectorizer] = df[col].apply(lambda x: self.embedding(x, 3))
-            elif vectorizer == 'embedding_median':
-                for col in self.vectorizer_cols[vectorizer]:
-                    df.loc[:, col+"_"+vectorizer] = df[col].apply(lambda x: self.embedding(x, 1))
-            elif vectorizer == 'embedding_mean':
-                for col in self.vectorizer_cols[vectorizer]:
-                    df.loc[:, col+"_"+vectorizer] = df[col].apply(lambda x: self.embedding(x, 2))
-            elif (vectorizer == 'bag_of_words') | (vectorizer == 'tf_idf'):
-                for col in self.vectorizer_cols[vectorizer]:
-                    values = self.vectorizers_fitted[vectorizer][col].transform(df[col])
-                    df.loc[:, col+"_"+vectorizer] = pd.Series(values.toarray().tolist())
-
-        return df
-
-    def embedding(self, X, typ_transform=1):
-        """
-        Apply the embedding to X. The text must be preprocessed.
-
-        Parameters
-        ----------
-        X : pd.Series
-            row to be encoded
-        typ_transform : int
-            type of transformation
-            1 - apply embedding median
-            2 - apply embedding mean
-            3 - apply index
-
-        Returns
-        -------
-        array
-        """
-        if X is None or type(X) == float:
-            return None
-        vector = []
-        if typ_transform == 1:  # median
-            vector = np.median([self.word2vec[x] for x in X.split() if x in self.word2vec], axis=0)
-        elif typ_transform == 2:  # mean
-            vector = np.mean([self.word2vec[x] for x in X.split() if x in self.word2vec], axis=0)
-        elif typ_transform == 3:  # indexing
-            idx = self.word2vec.index2word
-            set_idx = set(idx)
-            indexes = [idx.index(token) for token in X.split() if token in set_idx]
-            indexes = [self.index_ini_fim] + indexes + [self.index_ini_fim]
-            # Create vector
-            X_length = len(indexes)
-            vector = np.zeros(X_length, dtype=np.int64)
-            vector[:len(indexes)] = indexes
-        else:
-            vector = []
-        return vector
-
-    def bag_of_words(self, corpus):
-        """
-        Generate the bag-of-words model
-
-        Parameters
-        ----------
-        corpus : array
-            texts used to fit the bag-of-words model
-        Returns
-        -------
-        model
-        """
-        vectorizer = CountVectorizer()
-        model = vectorizer.fit(corpus)
-        return model
-
-    def tf_idf_vect(self, corpus):
-        """
-        Generate the TF-IDF model
-
-        Parameters
-        ----------
-        corpus : array
-            texts used to fit the TF-IDF model
-        Returns
-        -------
-        model
-        """
-        vectorizer = TfidfVectorizer()
-        model = vectorizer.fit(corpus)
-        return model
-
-    def inverse_transform(self, df: pd.DataFrame):
-        """
-        Apply the inverse_transform of the vectorizer to each column
-        Options: index, bag_of_words and tf_idf
-
-        Parameters
-        ----------
-        df : pd.DataFrame
-            dataframe with columns to be de-vectorized
-
-        Returns
-        -------
-        pd.DataFrame
-        """
-        if not self.fitted:
-            raise Exception("Not yet trained.")
-
-        for vectorizer in self.vectorizer_cols:
-            if vectorizer == 'index':
-                for col in self.vectorizer_cols[vectorizer]:
-                    df.loc[:, col+"_remove_"+vectorizer] = df[col].apply(lambda x: self.unvectorize(x))
-            elif (vectorizer == 'bag_of_words') | (vectorizer == 'tf_idf'):
-                for col in self.vectorizer_cols[vectorizer]:
-                    values = self.vectorizers_fitted[vectorizer][col].inverse_transform(df[col])
-                    df.loc[:, col+"_remove_"+vectorizer] = pd.Series([v.tolist() for v in values])
-
-        return df
-
-    def unvectorize(self, vector):
-        """
-        Turn a vector of indexes back into the original tokens
-
-        Parameters
-        ----------
-        vector : array
-            array with indexes
-
-        Returns
-        -------
-        str
-        """
-        idx = self.word2vec.index2word
-        tokens = [idx[index] for index in vector if index != self.index_ini_fim]
-        X = " ".join(token for token in tokens)
-        return X
\ No newline at end of file
diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/tests/.ipynb_checkpoints/README-checkpoint.md b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/tests/.ipynb_checkpoints/README-checkpoint.md
deleted file mode 100644
index ae81f9e..0000000
--- a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/tests/.ipynb_checkpoints/README-checkpoint.md
+++ /dev/null
@@ -1,41 +0,0 @@
-# Hermione test files
-
-In this folder, you can develop unit tests for your Data Science project.
-
-Unit testing is a regular process in software development but, unfortunately, not so common in Data Science projects. To ensure your code quality and that the project runs flawlessly at all times, it is extremely important that you write unit tests, especially if you are not working alone but in a Data Science team.
-
-The tests in the implemented example project check, for instance, that the project has its minimum directory structure, that your dataset is imported correctly, that the dataset has no missing values, and that the columns that should exist after preprocessing are indeed there.
-
-There are no "written in stone" rules for good testing in Data Science. You just have to figure out which tests are best for your project.
-
-## How to run the tests
-
-When working locally, you should run your tests before pushing to a remote repository or sharing your code with others. To do that, **ensure that you are inside the `tests` folder**.
-
-```bash
-cd src/tests
-```
-
-Then, run the `pytest` command.
-
-```bash
-pytest
-```
-
-If you want a coverage report, run:
-
-```bash
-coverage run -m pytest
-coverage report -m
-```
-
-Both the `coverage` and `pytest` libraries are already in the `requirements.txt` file.
-
-## Include tests in CI/CD files
-
-If you are working with a remote repository, it is good practice to write a CI/CD `.yml` file. For more information, visit
-
-- [CI/CD for Machine Learning](https://www.infoq.com/presentations/ci-cd-ml/)
-- [CI/CD for Machine Learning & AI](https://blog.paperspace.com/ci-cd-for-machine-learning-ai/)
-- [Accelerate MLOps: using CI/CD with machine learning models](https://algorithmia.com/blog/accelerate-mlops-using-ci-cd-with-machine-learning-models)
diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/tests/.ipynb_checkpoints/test_project-checkpoint.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/tests/.ipynb_checkpoints/test_project-checkpoint.py
deleted file mode 100644
index 2d6936f..0000000
--- a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/tests/.ipynb_checkpoints/test_project-checkpoint.py
+++ /dev/null
@@ -1,54 +0,0 @@
-import os
-import pytest
-import pandas as pd
-import sys
-sys.path.append('..')
-
-@pytest.fixture(scope='module')
-def read_data():
-    from ml.data_source.spreadsheet import Spreadsheet
-    yield Spreadsheet().get_data('../../data/raw/train.csv')
-
-@pytest.fixture(scope='module')
-def cleaned_data(read_data):
-    from ml.preprocessing.preprocessing import Preprocessing
-    p = Preprocessing()
-    yield p.clean_data(read_data)
-
-def test_tree():
-    """
-    Test if the project has a good minimum structure
-    """
-    assert os.path.exists(os.path.join('..','..', 'data', 'raw'))
-    assert os.path.exists(os.path.join('..','..', 'output'))
-    assert os.path.exists(os.path.join('..','..', 'src', 'api'))
-    assert os.path.exists(os.path.join('..','..', 'src', 'config'))
-    assert os.path.exists(os.path.join('..','..', 'src', 'ml', 'data_source'))
-    assert os.path.exists(os.path.join('..','..', 'src', 'ml', 'model'))
-    assert os.path.exists(os.path.join('..','..', 'src', 'ml', 'notebooks'))
-    assert os.path.exists(os.path.join('..','..', 'src', 'ml', 'preprocessing'))
-    assert os.path.exists(os.path.join('..','..', 'src', 'tests'))
-
-def test_spreadsheet(read_data):
-    """
-    Test that the spreadsheet is imported correctly
-    """
-    assert read_data.shape[0] > 1
-
-
-def test_clean_data(cleaned_data):
-    """
-    Test that the df is cleaned correctly
-    """
-    assert cleaned_data.Pclass.dtype == 'object'
-    assert pd.isnull(cleaned_data.Age).sum() == 0
-
-def test_categ_encoding(cleaned_data):
-    """
-    Test that column Pclass is one-hot encoded correctly
-    """
-    from ml.preprocessing.preprocessing import Preprocessing
-    p = Preprocessing()
-    df = p.categ_encoding(cleaned_data)
-    names = ['Survived', 'Age', 'Pclass_1', 'Pclass_2', 'Pclass_3', 'Sex_female',
'Sex_male']
-    assert all(name in df.columns for name in names)
\ No newline at end of file

From 5ad72b639370bca5485127f4cbca12bf540b83f7 Mon Sep 17 00:00:00 2001
From: karenstemartins
Date: Wed, 2 Jun 2021 21:08:36 +0000
Subject: [PATCH 09/10] Add util file

---
 .../__IMPLEMENTED_SAGEMAKER__/src/util.py     | 49 +++++++++++++++++++
 1 file changed, 49 insertions(+)
 create mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/util.py

diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/util.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/util.py
new file mode 100644
index 0000000..02c7bde
--- /dev/null
+++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/util.py
@@ -0,0 +1,49 @@
+import os
+import sys
+import collections
+import copy
+import json
+import numpy as np
+import pandas as pd
+import re
+from shutil import copyfile
+import time
+import yaml
+import io
+
+def create_dirs(dirpath):
+    """Creating directories."""
+    if not os.path.exists(dirpath):
+        os.makedirs(dirpath)
+
+def load_yaml(filepath):
+    with open(filepath, 'r') as stream:
+        return yaml.safe_load(stream)
+
+
+def load_json(filepath):
+    """Load a json file."""
+    with open(filepath, "r", encoding='utf8') as fp:
+        obj = json.load(fp)
+    return obj
+
+
+def save_json(obj, filepath):
+    """Save a dictionary to a json file."""
+    with open(filepath, "w") as fp:
+        json.dump(obj, fp, indent=4)
+
+def wrap_text(text):
+    """Pretty box print."""
+    box_width = len(text) + 2
+    print('\n╒{}╕'.format('═' * box_width))
+    print('│ {} │'.format(text.upper()))
+    print('╘{}╛'.format('═' * box_width))
+
+
+def load_data(data_csv):
+    """Load data from CSV to Pandas DataFrame."""
+    df = pd.read_csv(data_csv, header=0)
+    wrap_text("Raw data")
+    print(df.head(5))
+    return df

From 26bd7725592ecfcd03f139eb633b49bfdf35d1b1 Mon Sep 17 00:00:00 2001
From: karenstemartins
Date: Fri, 23 Jul 2021 15:09:18 +0000
Subject: [PATCH 10/10] Code and comment standardization
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../__IMPLEMENTED_SAGEMAKER__/README.tpl.md   |  10 +-
 .../inference/handler.py                      | 109 +++--
 .../inference/main.py                         |   6 +-
 .../processor/preprocessor.py                 | 128 +++---
 .../src/ml/model/metrics.py                   | 248 ++++++------
 .../src/ml/model/trainer.py                   | 351 +++++++++++-----
 .../src/ml/model/wrapper.py                   |   2 +-
 ...ssor.ipynb => 1_Sagemaker_Processor.ipynb} |  90 ++---
 ...er_Train.ipynb => 2_Sagemaker_Train.ipynb} | 121 +++---
 .../ml/notebooks/3_Sagemaker_Inference.ipynb  | 374 ++++++++++++++++++
 ...
=> 4_Sagemaker_StepFunctions_Train.ipynb} | 0 ...5_Sagemaker_StepFunctions_Inference.ipynb} | 0 .../ml/notebooks/Sagemaker_Inference.ipynb | 322 --------------- .../__IMPLEMENTED_SAGEMAKER__/train/train.py | 88 +++-- 14 files changed, 1084 insertions(+), 765 deletions(-) rename hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/{Sagemaker_Processor.ipynb => 1_Sagemaker_Processor.ipynb} (92%) rename hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/{Sagemaker_Train.ipynb => 2_Sagemaker_Train.ipynb} (81%) create mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/3_Sagemaker_Inference.ipynb rename hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/{Sagemaker_StepFunctions_Train.ipynb => 4_Sagemaker_StepFunctions_Train.ipynb} (100%) rename hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/{Sagemaker_StepFunctions_Inference.ipynb => 5_Sagemaker_StepFunctions_Inference.ipynb} (100%) delete mode 100644 hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/Sagemaker_Inference.ipynb diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/README.tpl.md b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/README.tpl.md index 60aa98d..882eefe 100644 --- a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/README.tpl.md +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/README.tpl.md @@ -116,9 +116,9 @@ The bash command will access the Dockerfile in the folder, create the image and To test the images in ECR, execute the following notebooks: -- project-name/src/ml/notebooks/Sagemaker_Processor.ipynb -- project-name/src/ml/notebooks/Sagemaker_Train.ipynb -- project-name/src/ml/notebooks/Sagemaker_Inference.ipynb +- project-name/src/ml/notebooks/1_Sagemaker_Processor.ipynb +- project-name/src/ml/notebooks/2_Sagemaker_Train.ipynb +- project-name/src/ml/notebooks/3_Sagemaker_Inference.ipynb ## Stepfunctions @@ -240,5 +240,5 @@ Next, create and attach another new policy to the role you created: To create and test the Step Functions state machines, execute the following notebooks: -- project-name/src/ml/notebooks/Sagemaker_StepFunctions_Train.ipynb -- project-name/src/ml/notebooks/Sagemaker_StepFunctions_Inference.ipynb \ No newline at end of file +- project-name/src/ml/notebooks/4_Sagemaker_StepFunctions_Train.ipynb +- project-name/src/ml/notebooks/5_Sagemaker_StepFunctions_Inference.ipynb \ No newline at end of file diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/inference/handler.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/inference/handler.py index b6bdc50..b0a0ce5 100644 --- a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/inference/handler.py +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/inference/handler.py @@ -3,9 +3,9 @@ import os import logging -import pandas as pd from joblib import load from six import StringIO +import pandas as pd from ml.model.wrapper import Wrapper from sagemaker_inference.default_inference_handler import DefaultInferenceHandler @@ -17,49 +17,106 @@ # Path to access the model MODEL_DIR = '/opt/ml/model' -def _csv_to_pandas(string_like): # type: (str) -> pd.DataFrame - """Convert a CSV object to a pandas DataFrame. - Args: - string_like (str): CSV string. - - Returns: - (pd.DataFrame): pandas DataFrame - """ + +def _csv_to_pandas(string_like): + """ + Convert a CSV object to a pandas DataFrame. + + Parameters + ---------- + string_like : String + CSV string. 
+ + Returns + ------- + pd.DataFrame : pandas DataFrame + """ stream = StringIO(string_like) res = pd.read_csv(stream) return res + class HandlerService(DefaultHandlerService, DefaultInferenceHandler): """ - Execute the inference step in the virtual environment - + Execute the inference step in the virtual environment + """ def __init__(self): op = transformer.Transformer(default_inference_handler=self) super(HandlerService, self).__init__(transformer=op) - - # Loads the model from the disk + def default_model_fn(self, model_dir): - logging.info('Loading the model') + """ + Loads the model from the disk + + Parameters + ---------- + model_dir : string + Path of the model + + Returns + ------- + pkl : model + """ + logging.info('Loading the model') return load(os.path.join(MODEL_DIR, "model.pkl")) - - # Parse and check the format of the input data + def default_input_fn(self, input_data, content_type): + """ + Parse and check the format of the input data + + Parameters + ---------- + input_data : string + CSV string + content_type : string + Type of the file + + Returns + ------- + pd.DataFrame : pandas DataFrame + """ global colunas if content_type != "text/csv": raise Exception("Invalid content-type: %s" % content_type) - return _csv_to_pandas(input_data) - - # Run our model and do the prediction + return _csv_to_pandas(input_data) + def default_predict_fn(self, df, model): - logging.info('Predicting...') - resultados = model.predict(df,included_input=True) - logging.info('Prediction Complete') + """ + Run our model and do the prediction + + Parameters + ---------- + df : pd.DataFrame + Data to be predicted + model : pkl + Model to predict the data + + Returns + ------- + pd.DataFrame : pandas DataFrame + """ + logging.info('Predicting...') + resultados = model.predict(df, included_input=True) + logging.info('Prediction Complete') return resultados.reset_index(drop=True).T.reset_index().T - - # Gets the prediction output and format it to be returned to the user + def default_output_fn(self, prediction, accept): - logging.info('Saving') + """ + Gets the prediction output and format it to be returned to the user + + Parameters + ---------- + prediction : pd.DataFrame + Predicted dataset + accept : string + Output type + + Returns + ------- + CSV : CSV file + """ + logging.info('Saving') if accept != "text/csv": raise Exception("Invalid accept: %s" % accept) - return encoder.encode(prediction, accept) \ No newline at end of file + return encoder.encode(prediction, accept) diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/inference/main.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/inference/main.py index 9ff9b2a..803a9e9 100644 --- a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/inference/main.py +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/inference/main.py @@ -1,12 +1,10 @@ -import argparse import sys import os +import argparse import logging from sagemaker_inference import model_server logging.getLogger().setLevel(logging.INFO) - if __name__ == "__main__": - - model_server.start_model_server(handler_service="serving.handler") \ No newline at end of file + model_server.start_model_server(handler_service="serving.handler") diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/processor/preprocessor.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/processor/preprocessor.py index c78c24b..bb269eb 100644 --- a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/processor/preprocessor.py +++ 
b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/processor/preprocessor.py @@ -1,71 +1,101 @@ -from ml.preprocessing.preprocessing import Preprocessing -from ml.preprocessing.dataquality import DataQuality -from ml.data_source.spreadsheet import Spreadsheet -import great_expectations as ge -from datetime import date -import pandas as pd import argparse import logging +from datetime import date + +import pandas as pd import glob import json from joblib import dump, load +import great_expectations as ge + +from ml.preprocessing.preprocessing import Preprocessing +from ml.preprocessing.dataquality import DataQuality +from ml.data_source.spreadsheet import Spreadsheet logging.getLogger().setLevel('INFO') -if __name__=='__main__': +path_input = '/opt/ml/processing/input/' +path_output = '/opt/ml/processing/output/' +date = date.today().strftime('%Y%m%d') + +def data_quality(df, step_train): + """ + If True, it creates the DataQuality object, + otherwise it loads an existing one + + Parameters + ---------- + df : pd.Dataframe + Train or test dataset + step_train : boolean + Train or test + """ - Execute the processor step in the virtual environment - + if step_train: + dq = DataQuality(discrete_cat_cols=['Sex', 'Pclass', 'Survived']) + df_ge = dq.perform(df) + df_ge.save_expectation_suite(path_output + + 'expectations/expectations.json') + else: + df_ge = ge.dataset.PandasDataset(df) + ge_val = df_ge.validate(expectation_suite=path_input + + 'expectations/expectations.json', + only_return_failures=False) + with open(f'{path_output}validations/{date}.json', 'w') as f: + json.dump(ge_val.to_json_dict(), f) + + +def preprocessing(df, step_train): + """ + If True, it creates the Preprocessing object, + otherwise it loads an existing one + + Parameters + ---------- + df : pd.Dataframe + Train or test dataset + step_train : boolean + Train or test + + """ + if step_train: + norm_cols = {'min-max': ['Age']} + oneHot_cols = ['Pclass', 'Sex'] + p = Preprocessing(norm_cols, oneHot_cols) + train, test_train = p.execute(df, step_train=True, val_size=0.2) + logging.info("Saving") + dump(p, path_output+'preprocessing/preprocessing.pkl') + train.to_csv(path_output+'processed/train/train.csv', index=False) + test_train.to_csv(path_output+'processed/val/val.csv', index=False) + else: + p = load(path_input+'preprocessing/preprocessing.pkl') + test = p.execute(df, step_train=False) + logging.info("Saving") + test.to_csv(path_output+'processed/inference/inference.csv', + index=False) + + +if __name__ == '__main__': + """ + Execute the processor step in the virtual environment + """ logging.info('Starting the preprocessing') - + # Read the step argument (train or test) parser = argparse.ArgumentParser() parser.add_argument('--step', type=str, default='train') - args = parser.parse_args() + args = parser.parse_args() step_train = True if args.step == "train" else False logging.info(f'step_train: {step_train}') - + logging.info('Reading the inputs') - file = glob.glob("/opt/ml/processing/input/raw_data/*.csv")[0] + file = glob.glob(path_input+"raw_data/*.csv")[0] logging.info(f'Reading file: {file}') df = Spreadsheet().get_data(file) - + logging.info("Data Quality") - # If True, it creates the DataQuality object, otherwise it loads an existing one - if step_train: - dq = DataQuality(discrete_cat_cols=['Sex', 'Pclass']) - df_ge = dq.perform(df, target='Survived') - df_ge.save_expectation_suite('/opt/ml/processing/output/expectations/expectations.json') - else: - date = date.today().strftime('%Y%m%d') - 
df_without_target = df.copy() - if 'Survived' in df_without_target.columns: - df_without_target.drop(columns=['Survived'], inplace=True) - df_ge = ge.dataset.PandasDataset(df_without_target) - ge_val = df_ge.validate(expectation_suite='/opt/ml/processing/input/expectations/expectations.json', only_return_failures=False) - with open(f'/opt/ml/processing/output/validations/{date}.json', 'w') as f: - json.dump(ge_val.to_json_dict(), f) + data_quality(df, step_train) logging.info("Preprocessing") - # If True, it creates the Preprocessing object, otherwise it loads an existing one - if step_train: - norm_cols = {'min-max': ['Age']} - oneHot_cols = ['Pclass','Sex'] - p = Preprocessing(norm_cols, oneHot_cols) - train, test_train = p.execute(df, step_train = True, val_size = 0.2) - else: - p = load("/opt/ml/processing/input/preprocessing/preprocessing.pkl") - test = p.execute(df, step_train = False) - - logging.info("Saving") - # If True, it saves the Preprocessing to be used later in the inference step - if step_train: - dump(p, '/opt/ml/processing/output/preprocessing/preprocessing.pkl') - - # If True, it saves the train and val files, otherwise it saves only the inference file - if step_train: - train.to_csv('/opt/ml/processing/output/processed/train/train.csv', index=False) - test_train.to_csv('/opt/ml/processing/output/processed/val/val.csv', index=False) - else: - test.to_csv('/opt/ml/processing/output/processed/inference/inference.csv', index=False) \ No newline at end of file + preprocessing(df, step_train) diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/model/metrics.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/model/metrics.py index 34cd079..f9ed342 100644 --- a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/model/metrics.py +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/model/metrics.py @@ -3,210 +3,220 @@ from sklearn.metrics import make_scorer from sklearn.model_selection import cross_validate + class Metrics: - + @classmethod def smape(cls, A, F): """ - Calculates the smape value between the real and the predicted - - Parameters - ---------- + Calculates the smape value between the real and the predicted + + Parameters + ---------- A : array Target values F : array Predicted values - - Returns - ------- - float: smape value - """ + + Returns + ------- + float: smape value + """ return 100/len(A) * np.sum(np.abs(F - A) / (np.abs(A) + np.abs(F))) - + @classmethod def __custom_score(cls, y_true, y_pred): """ - Creates a custom metric - - Parameters - ---------- + Creates a custom metric + + Parameters + ---------- y_true : array Target values y_pred : array Predicted values - - Returns - ------- - sklearn.metrics - """ + + Returns + ------- + sklearn.metrics + """ #return sklearn.metrics.fbeta_score(y_true, y_pred, 2) pass - + @classmethod def customized(cls, y_true, y_pred): """ - Creates a custom metric - - Parameters - ---------- + Creates a custom metric + + Parameters + ---------- y_true : array Target values y_pred : array Predicted values - - Returns - ------- - float - """ + + Returns + ------- + float + """ custom_metric = make_scorer(cls.__custom_score, greater_is_better=True) return custom_metric - + @classmethod def mape(cls, y_true, y_pred): """ - Calculates the map value between the real and the predicted - - Parameters - ---------- + Calculates the map value between the real and the predicted + + Parameters + ---------- y_true : array Target values y_pred : array Predicted values - - Returns - 
------- - float : value of mape - """ + + Returns + ------- + float : value of mape + """ y_true, y_pred = np.array(y_true), np.array(y_pred) return np.mean(np.abs(((y_true+1) - (y_pred+1)) / (y_true+1))) * 100 - @classmethod def regression(cls, y_true, y_pred): """ - Calculates some metrics for regression problems - - Parameters - ---------- + Calculates some metrics for regression problems + + Parameters + ---------- y_true : array Target values y_pred : array Predicted values - - Returns - ------- - dict : metrics results - """ - results = {'mean_absolute_error': round(mean_absolute_error(y_true, y_pred), 7), - 'root_mean_squared_error': round(np.sqrt(mean_squared_error(y_true, y_pred)), 7), - 'r2': round(r2_score(y_true, y_pred), 7), - 'smape': round(cls.smape(y_true, y_pred), 7), - 'mape': round(cls.mape(y_true, y_pred), 7) - } + + Returns + ------- + dict : metrics results + """ + results = {'mean_absolute_error': round(mean_absolute_error( + y_true, y_pred), 7), + 'root_mean_squared_error': round(np.sqrt( + mean_squared_error(y_true, y_pred)), 7), + 'r2': round(r2_score(y_true, y_pred), 7), + 'smape': round(cls.smape(y_true, y_pred), 7), + 'mape': round(cls.mape(y_true, y_pred), 7) + } return results - + @classmethod - def crossvalidation(cls, model, X, y, classification: bool, cv=5, agg=np.mean): + def crossvalidation(cls, model, X, y, classification: bool, + cv=5, agg=np.mean): if classification: if len(set(y)) > 2: - metrics = ['accuracy','f1_weighted', 'recall_weighted','precision_weighted'] + metrics = ['accuracy', 'f1_weighted', + 'recall_weighted', 'precision_weighted'] else: - metrics = ['accuracy','f1', 'recall','precision', 'roc_auc'] + metrics = ['accuracy', 'f1', 'recall', 'precision', 'roc_auc'] else: - metrics = ['mean_absolute_error', 'r2', 'root_mean_squared_error', 'smape', 'mape'] - res_metrics = cross_validate(model, X, y, cv=cv, return_train_score=False, scoring=metrics) - results = {metric.replace("test_", ""): round(agg(res_metrics[metric]),7) for metric in res_metrics} + metrics = ['mean_absolute_error', 'r2', 'root_mean_squared_error', + 'smape', 'mape'] + res_metrics = cross_validate(model, X, y, cv=cv, + return_train_score=False, + scoring=metrics) + results = {metric.replace("test_", ""): round(agg( + res_metrics[metric]), 7) + for metric in res_metrics} return results @classmethod def __multiclass_classification(cls, y_true, y_pred): """ - Calculates some metrics for multiclass classification problems - - Parameters - ---------- + Calculates some metrics for multiclass classification problems + + Parameters + ---------- y_true : array Target values y_pred : array Predicted values - - Returns - ------- - dict : metrics results - """ - results = {'accuracy': accuracy_score(y_true, y_pred), - 'f1': f1_score(y_true, y_pred, average='weighted'), - 'precision': precision_score(y_true, y_pred, average='weighted'), - 'recall': recall_score(y_true, y_pred, average='weighted'), - } + + Returns + ------- + dict : metrics results + """ + results = {'accuracy': accuracy_score(y_true, y_pred), + 'f1': f1_score(y_true, y_pred, average='weighted'), + 'precision': precision_score(y_true, y_pred, + average='weighted'), + 'recall': recall_score(y_true, y_pred, + average='weighted')} return results - + @classmethod def __binary_classification(cls, y_true, y_pred, y_probs): """ - Calculates some metrics for binary classification problems - - Parameters - ---------- + Calculates some metrics for binary classification problems + + Parameters + ---------- y_true : array Target 
values y_pred : array Predicted values - - Returns - ------- - dict : metrics results - """ - results = {'accuracy': accuracy_score(y_true, y_pred), - 'f1': f1_score(y_true, y_pred), - 'precision': precision_score(y_true, y_pred), - 'recall': recall_score(y_true, y_pred), - 'roc_auc': roc_auc_score(y_true, y_probs) - } + + Returns + ------- + dict : metrics results + """ + results = {'accuracy': accuracy_score(y_true, y_pred), + 'f1': f1_score(y_true, y_pred), + 'precision': precision_score(y_true, y_pred), + 'recall': recall_score(y_true, y_pred), + 'roc_auc': roc_auc_score(y_true, y_probs)} return results - + @classmethod def classification(cls, y_true, y_pred, y_probs): """ - Checks which classification method will be applied: binary or multiclass - - Parameters - ---------- + Checks which classification method will be applied: + binary or multiclass + + Parameters + ---------- y_true : array Target values y_pred : array Predicted values y_probs : array Probabilities values - - Returns - ------- - dict: metrics results - """ + + Returns + ------- + dict: metrics results + """ if len(set(y_true)) > 2: results = cls.__multiclass_classification(y_true, y_pred) else: results = cls.__binary_classification(y_true, y_pred, y_probs) return results - - + @classmethod def clusterization(cls, X, labels): """ - Calculates some metrics on clustering quality - - Parameters - ---------- + Calculates some metrics on clustering quality + + Parameters + ---------- X : array[array], shape (n_linha, n_colunas) Matrix with the values that were used in the cluster labels : array, shape (n_linha, 1) - Vector with labels selected by the clustering method (eg KMeans) - - Returns - ------- - dict : metrics results - """ - results = {'silhouette': silhouette_score(X, labels, metric='euclidean'), - 'calinski_harabaz': calinski_harabaz_score(X, labels) - } - return results \ No newline at end of file + Vector with labels selected by the clustering method + (eg KMeans) + + Returns + ------- + dict : metrics results + """ + results = {'silhouette': silhouette_score(X, labels, + metric='euclidean'), + 'calinski_harabaz': calinski_harabaz_score(X, labels)} + return results diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/model/trainer.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/model/trainer.py index 1266611..e73706c 100644 --- a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/model/trainer.py +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/model/trainer.py @@ -1,104 +1,253 @@ -from abc import ABC, abstractmethod -from ml.model.wrapper import Wrapper -from ml.model.metrics import Metrics -import statsmodels.formula.api as smf -from sklearn.model_selection import train_test_split -import numpy as np - -class Trainer(ABC): - def __init__(self): - """ - Constructor - - Parameters - ---------- +from joblib import dump, load +from datetime import date +import mlflow.pyfunc +from mlflow import pyfunc +from interpret.ext.blackbox import TabularExplainer, MimicExplainer +from interpret.ext.glassbox import * +import pandas as pd + +from util import load_yaml, load_json + + +class Wrapper(mlflow.pyfunc.PythonModel): + def __init__(self, model=None, metrics=None, columns=None): + """ + Constructor + + Parameters + ---------- + model : object + If it's just a model: enter all parameters + if it is more than one model: do not enter parameters + and use the add method to add each of the models + metrics : dict + Dictionary with the metrics of the result + of the 
model + columns : list + list with columns names + Returns + ------- + WrapperModel + """ + self.artifacts = dict() + self.artifacts["model"] = model + self.artifacts["metrics"] = metrics + self.artifacts["columns"] = columns + self.artifacts["creation_date"] = date.today() + + def predict(self, model_input, included_input=False): + """ + Method that returns the result of the prediction on a dataset + + Parameters + ---------- + df : pd.DataFrame + Data to be predicted + + Returns + ------- + list + """ + df_processed = model_input.copy() + model = self.artifacts["model"] + columns = self.artifacts["columns"] + result = model.predict(df_processed[columns]) + if included_input: + model_input['predict'] = result + result = model_input + return result + + def predict_proba(self, model_input, binary=False): + """ + Method that returns the result of the prediction on a dataset + + Parameters + ---------- + df : pd.DataFrame + data to be predicted + + Returns + ------- + list + """ + df_processed = model_input.copy() + model = self.artifacts["model"] + columns = self.artifacts["columns"] + if binary: + return model.predict_proba(df_processed[columns])[:, 1] + else: + return model.predict_proba(df_processed[columns]) + + def save_model(self, path): + """ + Saves the model object to a specific path + + Parameters + ---------- + path : str + path where the model object will be saved + + Returns + ------- None - - Returns - ------- - Trainer - """ - - @abstractmethod - def train(self): - """ - Abstract method that should be implemented in every class that inherits TrainerModel - Parameters - ---------- + """ + dump(self, path) + + @staticmethod + def load_model(path): + """ + Loads the model object in a specific path + + Parameters + ---------- + path : str + path where the model object will be loaded. 
+ + Returns + ------- None - - Returns - ------- - None - """ - pass - -class TrainerSklearn(Trainer): - - def train(self, train, val, y_name, - classification: bool, - algorithm, - columns = None, - **params): - """ - Method that builds the Sklearn model - - Parameters - ---------- - train : pd.Dataframe - data to train the model - val : pd.Dataframe - data to validate the model - y_name : str - target name - algorithm : Sklearn algorithm - algorithm to be trained - classification : bool - if True, classification model training takes place, otherwise Regression - columns : array - columns name to be used in the train - - Returns - ------- - Wrapper - """ - model = algorithm(**params) #model - y_train = train[y_name] - y_val = val[y_name] - X_train = train[columns] - X_val = val[columns] - model.fit(X_train,y_train) - y_pred = model.predict(X_val) - y_probs = model.predict_proba(X_val)[:,1] - if classification: - res_metrics = Metrics.classification(y_val.values, y_pred, y_probs) - else: - res_metrics = Metrics.regression(y_val.values, y_pred) - model = Wrapper(model, res_metrics, X_train.columns) - return model - - -class TrainerSklearnUnsupervised(Trainer): - - def train(self, X, - algorithm, - **params): - """ - Method that builds the Sklearn model - - Parameters - ---------- - model_name : str - model name - - Returns - ------- - Wrapper - """ - model = algorithm(**params) #model - columns = list(X.columns) - model.fit(X) - labels = model.predict(X) - res_metrics = Metrics.clusterization(X, labels) - model = Wrapper(model, res_metrics, columns) + """ + model = load(path) return model + + def save(self, path): + """ + Save model as a Wrapper class + + Parameters + ---------- + path : str + path where the model object will be loaded. + + Returns + ------- + None + """ + path_artifacts = path + "_artifacts.pkl" + dump(self.artifacts, path_artifacts) + content = load_json("config/arquivos.json") + conda_env = load_yaml(content["path_yaml"]) + mlflow.pyfunc.save_model( + path=path, + python_model=self, + artifacts={"model": path_artifacts}, + conda_env=conda_env, + ) + + def get_metrics(self): + """ + Return metrics + + Parameters + ---------- + self : object Wrapper + + Returns + ------- + dict + """ + return self.artifacts["metrics"] + + def get_columns(self): + """ + Return columns + + Parameters + ---------- + self : object Wrapper + + Returns + ------- + list + """ + return self.artifacts["columns"] + + def get_model(self): + """ + Return model + + Parameters + ---------- + self : object Wrapper + + Returns + ------- + dict + """ + return self.artifacts["model"] + + def train_interpret(self, X, model="tabular"): + """ + Train a interpret model + + Parameters + ---------- + self : object Wrapper + X : pd.DataFrame + Data that were used in the train for interpret + model : string, optional + Model to use for the interpret [tabular,mimic_LGBME, + mimic_Linear,mimic_SGDE,mimic_Dec_Tree] + Returns + ------- + None + """ + mimic_models = { + "mimic_LGBME": LGBMExplainableModel, + "mimic_Linear": LinearExplainableModel, + "mimic_SGDE": SGDExplainableModel, + "mimic_Dec_Tree": DecisionTreeExplainableModel, + } + if model == "tabular": + explainer = TabularExplainer( + self.artifacts["model"], X, features=self.artifacts["columns"] + ) + else: + explainer = MimicExplainer( + self.artifacts["model"], + X, + mimic_models[model], + augment_data=True, + max_num_of_augmentations=10, + features=self.artifacts["columns"], + ) + self.artifacts["explainer"] = explainer + + def local_interpret(self, X, 
n_feat=3, norm=True): + """ + Return a local interpret for each row in data + + Parameters + ---------- + self : object Wrapper + X : array[array], shape (n_linha, n_colunas) + Matrix with the data that were used to return interpret + n_feat : int, optional + Number of features to return + norm : bool, optional + if True, do normalization in the features importances + + Returns + ------- + pd.DataFrame + """ + local_explanation = self.artifacts["explainer"].explain_local(X) + n_obs = X.shape[0] + predictions = self.artifacts["model"].predict(X) + local_values = local_explanation.get_ranked_local_values() + local_values = [local_values[predictions[i]][i] for i in range(n_obs)] + local_names = local_explanation.get_ranked_local_names() + local_names = [local_names[predictions[i]][i] for i in range(n_obs)] + if norm: + local_values = [ + [(i - min(l)) / (max(l) - min(l)) for i in l] for l in local_values + ] + result = [ + (local_names[i][:n_feat] + local_values[i][:n_feat]) for i in range(n_obs) + ] + column_names = [ + f"Importance_{item}_{str(i)}" + for item in ["Name", "Value"] + for i in range(n_feat) + ] + return pd.DataFrame(result, columns=column_names) diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/model/wrapper.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/model/wrapper.py index 8f812cf..7aeaf19 100644 --- a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/model/wrapper.py +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/model/wrapper.py @@ -249,4 +249,4 @@ def local_interpret(self, X, n_feat=3, norm=True): for item in ["Name", "Value"] for i in range(n_feat) ] - return pd.DataFrame(result, columns=column_names) \ No newline at end of file + return pd.DataFrame(result, columns=column_names) diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/Sagemaker_Processor.ipynb b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/1_Sagemaker_Processor.ipynb similarity index 92% rename from hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/Sagemaker_Processor.ipynb rename to hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/1_Sagemaker_Processor.ipynb index ebd6aae..98961ea 100644 --- a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/Sagemaker_Processor.ipynb +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/1_Sagemaker_Processor.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "d1dd5820", + "id": "0080c0d0", "metadata": {}, "source": [ "# Sagemaker Processor" @@ -10,7 +10,7 @@ }, { "cell_type": "markdown", - "id": "fbaaa9e6", + "id": "7d7b0036", "metadata": {}, "source": [ "This script generates the train, val and inference files with the processor previous uploaded in ECR." @@ -18,7 +18,7 @@ }, { "cell_type": "markdown", - "id": "864a2e0f", + "id": "3f2a0229", "metadata": {}, "source": [ "## Import modules" @@ -27,7 +27,7 @@ { "cell_type": "code", "execution_count": 1, - "id": "fa0d1522", + "id": "6e679a79", "metadata": {}, "outputs": [], "source": [ @@ -40,7 +40,7 @@ }, { "cell_type": "markdown", - "id": "43887859", + "id": "a9066e74", "metadata": {}, "source": [ "## Setup" @@ -48,7 +48,7 @@ }, { "cell_type": "markdown", - "id": "4422ac46", + "id": "60ec8b7b", "metadata": {}, "source": [ "Modify according to your configurations." 
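The notebook's code cells are unchanged context in this diff, so they do not appear above. For orientation, they amount to pointing a SageMaker `Processor` at the image pushed to ECR and running it over the raw data. A minimal sketch, assuming a placeholder account id, image tag and instance type (the S3 paths, input/output names and the `--step` argument match the job description logged further down):

```python
import boto3
import sagemaker
from sagemaker.processing import Processor, ProcessingInput, ProcessingOutput

region = boto3.session.Session().region_name
role = sagemaker.get_execution_role()
# Assumed ECR address; account id and tag are placeholders
image_uri = f"<account_id>.dkr.ecr.{region}.amazonaws.com/hermione-processor:latest"

processor = Processor(image_uri=image_uri,
                      role=role,
                      instance_count=1,
                      instance_type="ml.t3.medium")  # assumed instance type

# "--step train" makes preprocessor.py fit and save the expectations and
# preprocessing artifacts; "--step test" reuses them for the inference file.
processor.run(arguments=["--step", "train"],
              inputs=[ProcessingInput(
                  source="s3://hermione-sagemaker/TRAIN_RAW",
                  destination="/opt/ml/processing/input/raw_data",
                  input_name="raw_data")],
              outputs=[ProcessingOutput(
                  source="/opt/ml/processing/output/processed/train",
                  destination="s3://hermione-sagemaker/PREPROCESSING/TRAIN_PROCESSED",
                  output_name="train_data")])
```

Running the same processor with `arguments=["--step", "test"]`, plus the saved `expectations` and `preprocessing` artifacts as additional inputs, produces the inference file instead, as the second job description below shows.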
@@ -57,7 +57,7 @@ { "cell_type": "code", "execution_count": 2, - "id": "4d423fcf", + "id": "9d9b2d23", "metadata": {}, "outputs": [], "source": [ @@ -68,7 +68,7 @@ { "cell_type": "code", "execution_count": 3, - "id": "b503dba8", + "id": "1cd1aa77", "metadata": {}, "outputs": [], "source": [ @@ -80,7 +80,7 @@ { "cell_type": "code", "execution_count": 4, - "id": "c00d86d1", + "id": "464d9cec", "metadata": {}, "outputs": [], "source": [ @@ -91,7 +91,7 @@ { "cell_type": "code", "execution_count": 5, - "id": "667c8bb6", + "id": "a0649d24", "metadata": {}, "outputs": [], "source": [ @@ -102,7 +102,7 @@ { "cell_type": "code", "execution_count": 6, - "id": "3b02cf9e", + "id": "f71c6f3c", "metadata": {}, "outputs": [], "source": [ @@ -114,7 +114,7 @@ { "cell_type": "code", "execution_count": 7, - "id": "32c8ab3d", + "id": "db98e9a2", "metadata": {}, "outputs": [], "source": [ @@ -133,8 +133,8 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "eac4ac37", + "execution_count": 8, + "id": "52ba34ff", "metadata": {}, "outputs": [], "source": [ @@ -146,8 +146,8 @@ }, { "cell_type": "code", - "execution_count": 8, - "id": "1b175317", + "execution_count": 9, + "id": "b1744737", "metadata": {}, "outputs": [], "source": [ @@ -158,7 +158,7 @@ }, { "cell_type": "markdown", - "id": "a9bcf199", + "id": "281216e9", "metadata": {}, "source": [ "## Processor - Train" @@ -166,8 +166,8 @@ }, { "cell_type": "code", - "execution_count": 9, - "id": "becf4d16", + "execution_count": 10, + "id": "3191cd98", "metadata": {}, "outputs": [], "source": [ @@ -181,8 +181,8 @@ }, { "cell_type": "code", - "execution_count": 10, - "id": "2ccaf4a1", + "execution_count": 11, + "id": "9998dd3a", "metadata": {}, "outputs": [], "source": [ @@ -214,8 +214,8 @@ }, { "cell_type": "code", - "execution_count": 11, - "id": "e0287211", + "execution_count": 12, + "id": "a0d4af1b", "metadata": {}, "outputs": [], "source": [ @@ -228,8 +228,8 @@ }, { "cell_type": "code", - "execution_count": 12, - "id": "854dc0d7", + "execution_count": 13, + "id": "065f6fca", "metadata": {}, "outputs": [ { @@ -237,11 +237,10 @@ "output_type": "stream", "text": [ "\n", - "Job Name: hermione-processor-2021-05-25-21-03-59-873\n", + "Job Name: hermione-processor-2021-07-22-19-53-22-425\n", "Inputs: [{'InputName': 'raw_data', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://hermione-sagemaker/TRAIN_RAW', 'LocalPath': '/opt/ml/processing/input/raw_data', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]\n", "Outputs: [{'OutputName': 'expectations', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://hermione-sagemaker/PREPROCESSING/EXPECTATIONS', 'LocalPath': '/opt/ml/processing/output/expectations', 'S3UploadMode': 'EndOfJob'}}, {'OutputName': 'preprocessing', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://hermione-sagemaker/PREPROCESSING/PREPROCESSING', 'LocalPath': '/opt/ml/processing/output/preprocessing', 'S3UploadMode': 'EndOfJob'}}, {'OutputName': 'train_data', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://hermione-sagemaker/PREPROCESSING/TRAIN_PROCESSED', 'LocalPath': '/opt/ml/processing/output/processed/train', 'S3UploadMode': 'EndOfJob'}}, {'OutputName': 'val_data', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://hermione-sagemaker/PREPROCESSING/VAL_PROCESSED', 'LocalPath': '/opt/ml/processing/output/processed/val', 'S3UploadMode': 'EndOfJob'}}]\n", - "......................................................\n", - "\u001b[34mINFO:root:Starting the preprocessing\u001b[0m\n", 
+ "......................................................\u001b[34mINFO:root:Starting the preprocessing\u001b[0m\n", "\u001b[34mINFO:root:step_train: True\u001b[0m\n", "\u001b[34mINFO:root:Reading the inputs\u001b[0m\n", "\u001b[34mINFO:root:Reading file: /opt/ml/processing/input/raw_data/raw_train.csv\u001b[0m\n", @@ -255,7 +254,7 @@ "\u001b[0m\n", "\u001b[34mINFO:root:Divide train and test\u001b[0m\n", "\u001b[34mINFO:root:Normalizing\u001b[0m\n", - "\u001b[34mWARNING:py.warnings:/usr/local/lib/python3.8/dist-packages/pandas/core/indexing.py:1738: SettingWithCopyWarning: \u001b[0m\n", + "\u001b[34mWARNING:py.warnings:/usr/local/lib/python3.8/dist-packages/pandas/core/indexing.py:1835: SettingWithCopyWarning: \u001b[0m\n", "\u001b[34mA value is trying to be set on a copy of a slice from a DataFrame.\u001b[0m\n", "\u001b[34mTry using .loc[row_indexer,col_indexer] = value instead\n", "\u001b[0m\n", @@ -265,8 +264,9 @@ "\u001b[34mINFO:root:Normalizing\u001b[0m\n", "\u001b[34mINFO:root:shape train (393, 7) val (99, 7)\u001b[0m\n", "\u001b[34mINFO:root:Saving\u001b[0m\n", - "CPU times: user 1.02 s, sys: 104 ms, total: 1.13 s\n", - "Wall time: 9min 14s\n" + "\n", + "CPU times: user 1.09 s, sys: 71.1 ms, total: 1.16 s\n", + "Wall time: 9min 48s\n" ] } ], @@ -281,7 +281,7 @@ }, { "cell_type": "markdown", - "id": "0f54bf21", + "id": "5db80626", "metadata": {}, "source": [ "## Processor - Inference" @@ -289,8 +289,8 @@ }, { "cell_type": "code", - "execution_count": 13, - "id": "bb2a86dc", + "execution_count": 10, + "id": "8d08c6c9", "metadata": {}, "outputs": [], "source": [ @@ -310,8 +310,8 @@ }, { "cell_type": "code", - "execution_count": 14, - "id": "c3e8dd48", + "execution_count": 11, + "id": "4273ba95", "metadata": {}, "outputs": [], "source": [ @@ -332,8 +332,8 @@ }, { "cell_type": "code", - "execution_count": 15, - "id": "62de176e", + "execution_count": 12, + "id": "b4d816d3", "metadata": {}, "outputs": [], "source": [ @@ -346,8 +346,8 @@ }, { "cell_type": "code", - "execution_count": 16, - "id": "e9255f5a", + "execution_count": 13, + "id": "28aa9b95", "metadata": {}, "outputs": [ { @@ -355,11 +355,10 @@ "output_type": "stream", "text": [ "\n", - "Job Name: hermione-processor-2021-05-25-21-13-13-987\n", + "Job Name: hermione-processor-2021-07-22-19-40-48-848\n", "Inputs: [{'InputName': 'raw_data', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://hermione-sagemaker/TEST_RAW', 'LocalPath': '/opt/ml/processing/input/raw_data', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'preprocessing', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://hermione-sagemaker/PREPROCESSING/PREPROCESSING', 'LocalPath': '/opt/ml/processing/input/preprocessing', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'expectations', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://hermione-sagemaker/PREPROCESSING/EXPECTATIONS', 'LocalPath': '/opt/ml/processing/input/expectations', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]\n", "Outputs: [{'OutputName': 'inference_data', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://hermione-sagemaker/PREPROCESSING/INFERENCE_PROCESSED', 'LocalPath': '/opt/ml/processing/output/processed/inference', 'S3UploadMode': 'EndOfJob'}}, {'OutputName': 'validations', 'AppManaged': False, 'S3Output': {'S3Uri': 
's3://hermione-sagemaker/PREPROCESSING/VALIDATIONS', 'LocalPath': '/opt/ml/processing/output/validations', 'S3UploadMode': 'EndOfJob'}}]\n", - "............................................................\n", - "\u001b[34mINFO:root:Starting the preprocessing\u001b[0m\n", + "...........................................................\u001b[34mINFO:root:Starting the preprocessing\u001b[0m\n", "\u001b[34mINFO:root:step_train: False\u001b[0m\n", "\u001b[34mINFO:root:Reading the inputs\u001b[0m\n", "\u001b[34mINFO:root:Reading file: /opt/ml/processing/input/raw_data/raw_test.csv\u001b[0m\n", @@ -370,8 +369,9 @@ "\u001b[34mINFO:root:Normalizing\u001b[0m\n", "\u001b[34mINFO:root:shape (222, 7)\u001b[0m\n", "\u001b[34mINFO:root:Saving\u001b[0m\n", - "CPU times: user 1.19 s, sys: 38.4 ms, total: 1.23 s\n", - "Wall time: 10min 14s\n" + "\n", + "CPU times: user 1.18 s, sys: 39.6 ms, total: 1.22 s\n", + "Wall time: 10min 15s\n" ] } ], diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/Sagemaker_Train.ipynb b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/2_Sagemaker_Train.ipynb similarity index 81% rename from hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/Sagemaker_Train.ipynb rename to hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/2_Sagemaker_Train.ipynb index b0a796f..5951690 100644 --- a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/Sagemaker_Train.ipynb +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/2_Sagemaker_Train.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "577c4f6b", + "id": "0481ea58", "metadata": {}, "source": [ "# Sagemaker Train" @@ -10,7 +10,7 @@ }, { "cell_type": "markdown", - "id": "501ef5b6", + "id": "c14f3a6e", "metadata": {}, "source": [ "This script creates and trains the model with the uploaded image in ECR." @@ -18,7 +18,7 @@ }, { "cell_type": "markdown", - "id": "e66b3975", + "id": "737135a7", "metadata": {}, "source": [ "## Import modules" @@ -26,8 +26,8 @@ }, { "cell_type": "code", - "execution_count": 15, - "id": "d658fb44", + "execution_count": 1, + "id": "010b1646", "metadata": {}, "outputs": [], "source": [ @@ -39,7 +39,7 @@ }, { "cell_type": "markdown", - "id": "64036230", + "id": "ed6ec079", "metadata": {}, "source": [ "## Setup" @@ -47,7 +47,7 @@ }, { "cell_type": "markdown", - "id": "28411012", + "id": "ff8d388c", "metadata": {}, "source": [ "Modify according to your configurations." 
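As in the processor notebook, the training cells are elided here as unchanged context. They wrap the ECR training image in a SageMaker `Estimator` and fit it on the processed train and validation channels. A minimal sketch, assuming a placeholder image address, instance type and output path (the channel names, `fit` call and managed-spot usage match the logs below):

```python
import boto3
import sagemaker
from sagemaker.estimator import Estimator
from sagemaker.inputs import TrainingInput

region = boto3.session.Session().region_name
role = sagemaker.get_execution_role()
# Assumed ECR address; account id and tag are placeholders
image_uri = f"<account_id>.dkr.ecr.{region}.amazonaws.com/hermione-train:latest"

train_config = TrainingInput("s3://hermione-sagemaker/PREPROCESSING/TRAIN_PROCESSED",
                             content_type="text/csv")
val_config = TrainingInput("s3://hermione-sagemaker/PREPROCESSING/VAL_PROCESSED",
                           content_type="text/csv")

est = Estimator(image_uri=image_uri,
                role=role,
                instance_count=1,
                instance_type="ml.m5.large",  # assumed instance type
                use_spot_instances=True,      # source of the spot-savings log line
                max_run=3600,
                max_wait=7200,                # must be >= max_run for spot jobs
                output_path="s3://hermione-sagemaker/TRAINING/OUTPUT")  # assumed

# The channels map to /opt/ml/input/data/{train,validation} in the container
est.fit({"train": train_config, "validation": val_config}, wait=True, logs=True)
```

With `wait=True, logs=True` the cell blocks and streams the container logs, which is where the accuracy/f1 metrics and the "Managed Spot Training savings" figure in these outputs come from.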
@@ -55,8 +55,8 @@ }, { "cell_type": "code", - "execution_count": 16, - "id": "7e937373", + "execution_count": 2, + "id": "6278a767", "metadata": {}, "outputs": [], "source": [ @@ -66,8 +66,8 @@ }, { "cell_type": "code", - "execution_count": 17, - "id": "16450249", + "execution_count": 3, + "id": "1fe9ed45", "metadata": {}, "outputs": [], "source": [ @@ -78,8 +78,8 @@ }, { "cell_type": "code", - "execution_count": 18, - "id": "2e144eb8", + "execution_count": 4, + "id": "f6216acf", "metadata": {}, "outputs": [], "source": [ @@ -89,8 +89,8 @@ }, { "cell_type": "code", - "execution_count": 19, - "id": "50b4a590", + "execution_count": 5, + "id": "c9a8d55b", "metadata": {}, "outputs": [], "source": [ @@ -100,8 +100,8 @@ }, { "cell_type": "code", - "execution_count": 20, - "id": "8d56e6ca", + "execution_count": 6, + "id": "f281ac39", "metadata": {}, "outputs": [], "source": [ @@ -112,8 +112,8 @@ }, { "cell_type": "code", - "execution_count": 21, - "id": "e710ea0a", + "execution_count": 7, + "id": "4eee7169", "metadata": {}, "outputs": [], "source": [ @@ -127,8 +127,8 @@ }, { "cell_type": "code", - "execution_count": 22, - "id": "f8a27026", + "execution_count": 8, + "id": "44002452", "metadata": {}, "outputs": [], "source": [ @@ -138,7 +138,7 @@ }, { "cell_type": "markdown", - "id": "b6efb8ce", + "id": "6aa3f5a8", "metadata": {}, "source": [ "## Train" @@ -146,8 +146,8 @@ }, { "cell_type": "code", - "execution_count": 23, - "id": "ed9cb39b", + "execution_count": 9, + "id": "77e64d0c", "metadata": {}, "outputs": [], "source": [ @@ -160,8 +160,8 @@ }, { "cell_type": "code", - "execution_count": 24, - "id": "34f144e0", + "execution_count": 10, + "id": "33726510", "metadata": {}, "outputs": [], "source": [ @@ -174,8 +174,8 @@ }, { "cell_type": "code", - "execution_count": 25, - "id": "a0bbbf7d", + "execution_count": 11, + "id": "1f0350b8", "metadata": {}, "outputs": [], "source": [ @@ -185,8 +185,8 @@ }, { "cell_type": "code", - "execution_count": 26, - "id": "299813d5", + "execution_count": 12, + "id": "0832ebb9", "metadata": {}, "outputs": [], "source": [ @@ -213,8 +213,8 @@ }, { "cell_type": "code", - "execution_count": 27, - "id": "4ad41d36", + "execution_count": 13, + "id": "7a2931e1", "metadata": {}, "outputs": [], "source": [ @@ -236,26 +236,23 @@ }, { "cell_type": "code", - "execution_count": 28, - "id": "62c1894f", + "execution_count": 14, + "id": "d12aa777", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "2021-05-26 12:41:29 Starting - Starting the training job...\n", - "2021-05-26 12:41:52 Starting - Launching requested ML instancesProfilerReport-1622032889: InProgress\n", - "......\n", - "2021-05-26 12:42:52 Starting - Preparing the instances for training......\n", - "2021-05-26 12:43:52 Downloading - Downloading input data\n", - "2021-05-26 12:43:52 Training - Downloading the training image.....\u001b[34m2021-05-26 09:44:41,407 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed)\u001b[0m\n", - "\n", - "2021-05-26 12:45:00 Uploading - Uploading generated training model\n", - "2021-05-26 12:45:00 Completed - Training job completed\n", - "\u001b[34m2021-05-26 09:44:47,642 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed)\u001b[0m\n", - "\u001b[34m2021-05-26 09:44:47,653 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed)\u001b[0m\n", - "\u001b[34m2021-05-26 09:44:47,663 sagemaker-training-toolkit INFO Invoking user script\n", + "2021-07-22 20:15:35 Starting - Starting the 
training job...\n", + "2021-07-22 20:15:59 Starting - Launching requested ML instancesProfilerReport-1626984935: InProgress\n", + "...\n", + "2021-07-22 20:16:35 Starting - Preparing the instances for training.........\n", + "2021-07-22 20:18:00 Downloading - Downloading input data...\n", + "2021-07-22 20:18:20 Training - Downloading the training image.....\u001b[34m2021-07-22 17:19:11,614 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed)\u001b[0m\n", + "\u001b[34m2021-07-22 17:19:11,630 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed)\u001b[0m\n", + "\u001b[34m2021-07-22 17:19:11,640 sagemaker-training-toolkit INFO No GPUs detected (normal if no gpus installed)\u001b[0m\n", + "\u001b[34m2021-07-22 17:19:11,648 sagemaker-training-toolkit INFO Invoking user script\n", "\u001b[0m\n", "\u001b[34mTraining Env:\n", "\u001b[0m\n", @@ -288,7 +285,7 @@ " },\n", " \"input_dir\": \"/opt/ml/input\",\n", " \"is_master\": true,\n", - " \"job_name\": \"Hermione-train-2021-05-26-12-41-29-505\",\n", + " \"job_name\": \"Hermione-train-2021-07-22-20-15-35-496\",\n", " \"log_level\": 20,\n", " \"master_hostname\": \"algo-1\",\n", " \"model_dir\": \"/opt/ml/model\",\n", @@ -332,7 +329,7 @@ "\u001b[34mSM_NUM_GPUS=0\u001b[0m\n", "\u001b[34mSM_MODEL_DIR=/opt/ml/model\u001b[0m\n", "\u001b[34mSM_MODULE_DIR=/opt/ml/code\u001b[0m\n", - "\u001b[34mSM_TRAINING_ENV={\"additional_framework_parameters\":{},\"channel_input_dirs\":{\"train\":\"/opt/ml/input/data/train\",\"validation\":\"/opt/ml/input/data/validation\"},\"current_host\":\"algo-1\",\"framework_module\":null,\"hosts\":[\"algo-1\"],\"hyperparameters\":{},\"input_config_dir\":\"/opt/ml/input/config\",\"input_data_config\":{\"train\":{\"ContentType\":\"text/csv\",\"RecordWrapperType\":\"None\",\"S3DistributionType\":\"FullyReplicated\",\"TrainingInputMode\":\"File\"},\"validation\":{\"ContentType\":\"text/csv\",\"RecordWrapperType\":\"None\",\"S3DistributionType\":\"FullyReplicated\",\"TrainingInputMode\":\"File\"}},\"input_dir\":\"/opt/ml/input\",\"is_master\":true,\"job_name\":\"Hermione-train-2021-05-26-12-41-29-505\",\"log_level\":20,\"master_hostname\":\"algo-1\",\"model_dir\":\"/opt/ml/model\",\"module_dir\":\"/opt/ml/code\",\"module_name\":\"train\",\"network_interface_name\":\"eth0\",\"num_cpus\":2,\"num_gpus\":0,\"output_data_dir\":\"/opt/ml/output/data\",\"output_dir\":\"/opt/ml/output\",\"output_intermediate_dir\":\"/opt/ml/output/intermediate\",\"resource_config\":{\"current_host\":\"algo-1\",\"hosts\":[\"algo-1\"],\"network_interface_name\":\"eth0\"},\"user_entry_point\":\"train.py\"}\u001b[0m\n", + 
"\u001b[34mSM_TRAINING_ENV={\"additional_framework_parameters\":{},\"channel_input_dirs\":{\"train\":\"/opt/ml/input/data/train\",\"validation\":\"/opt/ml/input/data/validation\"},\"current_host\":\"algo-1\",\"framework_module\":null,\"hosts\":[\"algo-1\"],\"hyperparameters\":{},\"input_config_dir\":\"/opt/ml/input/config\",\"input_data_config\":{\"train\":{\"ContentType\":\"text/csv\",\"RecordWrapperType\":\"None\",\"S3DistributionType\":\"FullyReplicated\",\"TrainingInputMode\":\"File\"},\"validation\":{\"ContentType\":\"text/csv\",\"RecordWrapperType\":\"None\",\"S3DistributionType\":\"FullyReplicated\",\"TrainingInputMode\":\"File\"}},\"input_dir\":\"/opt/ml/input\",\"is_master\":true,\"job_name\":\"Hermione-train-2021-07-22-20-15-35-496\",\"log_level\":20,\"master_hostname\":\"algo-1\",\"model_dir\":\"/opt/ml/model\",\"module_dir\":\"/opt/ml/code\",\"module_name\":\"train\",\"network_interface_name\":\"eth0\",\"num_cpus\":2,\"num_gpus\":0,\"output_data_dir\":\"/opt/ml/output/data\",\"output_dir\":\"/opt/ml/output\",\"output_intermediate_dir\":\"/opt/ml/output/intermediate\",\"resource_config\":{\"current_host\":\"algo-1\",\"hosts\":[\"algo-1\"],\"network_interface_name\":\"eth0\"},\"user_entry_point\":\"train.py\"}\u001b[0m\n", "\u001b[34mSM_USER_ARGS=[]\u001b[0m\n", "\u001b[34mSM_OUTPUT_INTERMEDIATE_DIR=/opt/ml/output/intermediate\u001b[0m\n", "\u001b[34mSM_CHANNEL_VALIDATION=/opt/ml/input/data/validation\u001b[0m\n", @@ -344,21 +341,21 @@ "\u001b[34m/usr/bin/python3 train.py\n", "\n", "\u001b[0m\n", - "\u001b[34m/usr/local/lib/python3.8/dist-packages/interpret_community/common/gpu_kmeans.py:30: UserWarning: cuML is required to use GPU explainers. Check https://rapids.ai/start.html for more information on how to install it.\n", - " warnings.warn(\u001b[0m\n", - "\u001b[34mcuML is required to use GPU explainers. 
Check https://rapids.ai/start.html for more information on how to install it.\u001b[0m\n", "\u001b[34mINFO:root:Starting the training\u001b[0m\n", "\u001b[34mINFO:root:Reading the inputs\u001b[0m\n", "\u001b[34mINFO:root:Training the model\u001b[0m\n", "\u001b[34mINFO:root:Saving\u001b[0m\n", - "\u001b[34mINFO:root:accuracy=0.7373737373737373; f1=0.6976744186046512; precision=0.6382978723404256; recall=0.7692307692307693;\u001b[0m\n", + "\u001b[34mINFO:root:accuracy=0.7373737373737373; f1=0.6976744186046512; precision=0.6382978723404256; recall=0.7692307692307693;\u001b[0m\n", "\u001b[34mINFO:root:Training complete.\u001b[0m\n", - "\u001b[34m2021-05-26 09:44:51,898 sagemaker-training-toolkit INFO Reporting training SUCCESS\u001b[0m\n", - "Training seconds: 85\n", - "Billable seconds: 36\n", - "Managed Spot Training savings: 57.6%\n", - "CPU times: user 450 ms, sys: 19.9 ms, total: 470 ms\n", - "Wall time: 3min 42s\n" + "\u001b[34m2021-07-22 17:19:17,315 sagemaker-training-toolkit INFO Reporting training SUCCESS\u001b[0m\n", + "\n", + "2021-07-22 20:19:30 Uploading - Uploading generated training model\n", + "2021-07-22 20:19:30 Completed - Training job completed\n", + "Training seconds: 96\n", + "Billable seconds: 39\n", + "Managed Spot Training savings: 59.4%\n", + "CPU times: user 491 ms, sys: 48.5 ms, total: 539 ms\n", + "Wall time: 4min 12s\n" ] } ], @@ -367,6 +364,14 @@ "# Train the model and validate\n", "est.fit({'train':train_config, 'validation':val_config}, wait=True, logs=True)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bf57258c", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/3_Sagemaker_Inference.ipynb b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/3_Sagemaker_Inference.ipynb new file mode 100644 index 0000000..525a5a0 --- /dev/null +++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/3_Sagemaker_Inference.ipynb @@ -0,0 +1,374 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "4558d673", + "metadata": {}, + "source": [ + "# Sagemaker Inference" + ] + }, + { + "cell_type": "markdown", + "id": "733a4c1b", + "metadata": {}, + "source": [ + "This script predicts new data with the uploaded image in ECR." + ] + }, + { + "cell_type": "markdown", + "id": "73ec63de", + "metadata": {}, + "source": [ + "## Import modules" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "9f4bb4b1", + "metadata": {}, + "outputs": [], + "source": [ + "import time\n", + "import boto3\n", + "import sagemaker\n", + "from sagemaker import get_execution_role" + ] + }, + { + "cell_type": "markdown", + "id": "cf4f0baf", + "metadata": {}, + "source": [ + "## Setup" + ] + }, + { + "cell_type": "markdown", + "id": "a36daf9a", + "metadata": {}, + "source": [ + "Modify according to your configurations." 
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "ff34a81c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Bucket name in S3\n",
+ "bucket = \"hermione-sagemaker\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "aa6732aa",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Set session\n",
+ "region_name=\"us-east-1\"\n",
+ "boto3.setup_default_session(region_name=region_name)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "0515bb41",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Get user role\n",
+ "role = get_execution_role()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "ef2ae3ae",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Get AWS Account ID\n",
+ "account_number = boto3.client(\"sts\").get_caller_identity()[\"Account\"]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "31861461",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Image previously uploaded to ECR\n",
+ "image_name = \"hermione-inference\"\n",
+ "image_uri = f\"{account_number}.dkr.ecr.{region_name}.amazonaws.com/{image_name}\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "1eec0163",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Input and output paths to execute inference\n",
+ "paths = {\n",
+ " 'inference_processed': f\"s3://{bucket}/PREPROCESSING/INFERENCE_PROCESSED/inference.csv\",\n",
+ " 'model': f\"s3://{bucket}/PREPROCESSING/MODEL/Hermione-train-2021-05-26-12-41-29-505/output/model.tar.gz\",\n",
+ " 'output_path': f\"s3://{bucket}/PREPROCESSING/OUTPUT\"\n",
+ "}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "76ce3950",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Instance type to run the code\n",
+ "instance_type=\"ml.m5.large\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f44e5b91",
+ "metadata": {},
+ "source": [
+ "## Inference"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "a78cd291",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Receives the processed inference data in S3\n",
+ "input_path = paths['inference_processed']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "c8f2a674",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Receives the model created during the training in S3\n",
+ "model_path = paths['model']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "6ec78d16",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Saves the prediction in S3\n",
+ "output_path = paths['output_path']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "c167eff0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Creates the model to access the ECR image\n",
+ "model = sagemaker.model.Model(\n",
+ " image_uri= image_uri,\n",
+ " model_data=model_path,\n",
+ " role=role)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "id": "0b2651c1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Creates a transformer object from the trained model\n",
+ "transformer = model.transformer(\n",
+ " instance_count=1,\n",
+ " instance_type=instance_type, \n",
+ " output_path=output_path,\n",
+ " accept = 'text/csv')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "id": "1c5bd0b8",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "..........................\u001b[34mWarning: MMS is using non-default JVM parameters: -XX:-UseContainerSupport\u001b[0m\n",
+
"\u001b[35mWarning: MMS is using non-default JVM parameters: -XX:-UseContainerSupport\u001b[0m\n", + "\u001b[34m2021-07-22 20:28:24,272 [INFO ] main com.amazonaws.ml.mms.ModelServer - \u001b[0m\n", + "\u001b[34mMMS Home: /usr/local/lib/python3.8/dist-packages\u001b[0m\n", + "\u001b[34mCurrent directory: /\u001b[0m\n", + "\u001b[34mTemp directory: /tmp\u001b[0m\n", + "\u001b[34mNumber of GPUs: 0\u001b[0m\n", + "\u001b[34mNumber of CPUs: 2\u001b[0m\n", + "\u001b[34mMax heap size: 1726 M\u001b[0m\n", + "\u001b[34mPython executable: /usr/bin/python3\u001b[0m\n", + "\u001b[34mConfig file: /etc/sagemaker-mms.properties\u001b[0m\n", + "\u001b[34mInference address: http://0.0.0.0:8080\u001b[0m\n", + "\u001b[34mManagement address: http://0.0.0.0:8080\u001b[0m\n", + "\u001b[34mModel Store: /.sagemaker/mms/models\u001b[0m\n", + "\u001b[34mInitial Models: ALL\u001b[0m\n", + "\u001b[34mLog dir: /logs\u001b[0m\n", + "\u001b[34mMetrics dir: /logs\u001b[0m\n", + "\u001b[34mNetty threads: 0\u001b[0m\n", + "\u001b[34mNetty client threads: 0\u001b[0m\n", + "\u001b[34mDefault workers per model: 2\u001b[0m\n", + "\u001b[34mBlacklist Regex: N/A\u001b[0m\n", + "\u001b[34mMaximum Response Size: 6553500\u001b[0m\n", + "\u001b[34mMaximum Request Size: 6553500\u001b[0m\n", + "\u001b[34mPreload model: false\u001b[0m\n", + "\u001b[34mPrefer direct buffer: false\u001b[0m\n", + "\u001b[34m2021-07-22 20:28:24,384 [WARN ] W-9000-model com.amazonaws.ml.mms.wlm.WorkerLifeCycle - attachIOStreams() threadName=W-9000-model\u001b[0m\n", + "\u001b[34m2021-07-22 20:28:24,452 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - model_service_worker started with args: --sock-type unix --sock-name /tmp/.mms.sock.9000 --handler serving.handler --model-path /.sagemaker/mms/models/model --model-name model --preload-model false --tmp-dir /tmp\u001b[0m\n", + "\u001b[35m2021-07-22 20:28:24,272 [INFO ] main com.amazonaws.ml.mms.ModelServer - \u001b[0m\n", + "\u001b[35mMMS Home: /usr/local/lib/python3.8/dist-packages\u001b[0m\n", + "\u001b[35mCurrent directory: /\u001b[0m\n", + "\u001b[35mTemp directory: /tmp\u001b[0m\n", + "\u001b[35mNumber of GPUs: 0\u001b[0m\n", + "\u001b[35mNumber of CPUs: 2\u001b[0m\n", + "\u001b[35mMax heap size: 1726 M\u001b[0m\n", + "\u001b[35mPython executable: /usr/bin/python3\u001b[0m\n", + "\u001b[35mConfig file: /etc/sagemaker-mms.properties\u001b[0m\n", + "\u001b[35mInference address: http://0.0.0.0:8080\u001b[0m\n", + "\u001b[35mManagement address: http://0.0.0.0:8080\u001b[0m\n", + "\u001b[35mModel Store: /.sagemaker/mms/models\u001b[0m\n", + "\u001b[35mInitial Models: ALL\u001b[0m\n", + "\u001b[35mLog dir: /logs\u001b[0m\n", + "\u001b[35mMetrics dir: /logs\u001b[0m\n", + "\u001b[35mNetty threads: 0\u001b[0m\n", + "\u001b[35mNetty client threads: 0\u001b[0m\n", + "\u001b[35mDefault workers per model: 2\u001b[0m\n", + "\u001b[35mBlacklist Regex: N/A\u001b[0m\n", + "\u001b[35mMaximum Response Size: 6553500\u001b[0m\n", + "\u001b[35mMaximum Request Size: 6553500\u001b[0m\n", + "\u001b[35mPreload model: false\u001b[0m\n", + "\u001b[35mPrefer direct buffer: false\u001b[0m\n", + "\u001b[35m2021-07-22 20:28:24,384 [WARN ] W-9000-model com.amazonaws.ml.mms.wlm.WorkerLifeCycle - attachIOStreams() threadName=W-9000-model\u001b[0m\n", + "\u001b[35m2021-07-22 20:28:24,452 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - model_service_worker started with args: --sock-type unix --sock-name /tmp/.mms.sock.9000 --handler serving.handler --model-path /.sagemaker/mms/models/model 
--model-name model --preload-model false --tmp-dir /tmp\u001b[0m\n", + "\u001b[34m2021-07-22 20:28:24,454 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Listening on port: /tmp/.mms.sock.9000\u001b[0m\n", + "\u001b[34m2021-07-22 20:28:24,454 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - [PID] 24\u001b[0m\n", + "\u001b[34m2021-07-22 20:28:24,455 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - MMS worker started.\u001b[0m\n", + "\u001b[34m2021-07-22 20:28:24,455 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Python runtime: 3.8.10\u001b[0m\n", + "\u001b[34m2021-07-22 20:28:24,456 [INFO ] main com.amazonaws.ml.mms.wlm.ModelManager - Model model loaded.\u001b[0m\n", + "\u001b[34m2021-07-22 20:28:24,460 [INFO ] main com.amazonaws.ml.mms.ModelServer - Initialize Inference server with: EpollServerSocketChannel.\u001b[0m\n", + "\u001b[34m2021-07-22 20:28:24,472 [INFO ] W-9000-model com.amazonaws.ml.mms.wlm.WorkerThread - Connecting to: /tmp/.mms.sock.9000\u001b[0m\n", + "\u001b[34m2021-07-22 20:28:24,476 [INFO ] W-9000-model com.amazonaws.ml.mms.wlm.WorkerThread - Connecting to: /tmp/.mms.sock.9000\u001b[0m\n", + "\u001b[34m2021-07-22 20:28:24,536 [INFO ] main com.amazonaws.ml.mms.ModelServer - Inference API bind to: http://0.0.0.0:8080\u001b[0m\n", + "\u001b[34mModel server started.\u001b[0m\n", + "\u001b[34m2021-07-22 20:28:24,555 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Connection accepted: /tmp/.mms.sock.9000.\u001b[0m\n", + "\u001b[34m2021-07-22 20:28:24,555 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Connection accepted: /tmp/.mms.sock.9000.\u001b[0m\n", + "\u001b[34m2021-07-22 20:28:24,567 [WARN ] pool-2-thread-1 com.amazonaws.ml.mms.metrics.MetricCollector - worker pid is not available yet.\u001b[0m\n", + "\u001b[35m2021-07-22 20:28:24,454 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Listening on port: /tmp/.mms.sock.9000\u001b[0m\n", + "\u001b[35m2021-07-22 20:28:24,454 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - [PID] 24\u001b[0m\n", + "\u001b[35m2021-07-22 20:28:24,455 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - MMS worker started.\u001b[0m\n", + "\u001b[35m2021-07-22 20:28:24,455 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Python runtime: 3.8.10\u001b[0m\n", + "\u001b[35m2021-07-22 20:28:24,456 [INFO ] main com.amazonaws.ml.mms.wlm.ModelManager - Model model loaded.\u001b[0m\n", + "\u001b[35m2021-07-22 20:28:24,460 [INFO ] main com.amazonaws.ml.mms.ModelServer - Initialize Inference server with: EpollServerSocketChannel.\u001b[0m\n", + "\u001b[35m2021-07-22 20:28:24,472 [INFO ] W-9000-model com.amazonaws.ml.mms.wlm.WorkerThread - Connecting to: /tmp/.mms.sock.9000\u001b[0m\n", + "\u001b[35m2021-07-22 20:28:24,476 [INFO ] W-9000-model com.amazonaws.ml.mms.wlm.WorkerThread - Connecting to: /tmp/.mms.sock.9000\u001b[0m\n", + "\u001b[35m2021-07-22 20:28:24,536 [INFO ] main com.amazonaws.ml.mms.ModelServer - Inference API bind to: http://0.0.0.0:8080\u001b[0m\n", + "\u001b[35mModel server started.\u001b[0m\n", + "\u001b[35m2021-07-22 20:28:24,555 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Connection accepted: /tmp/.mms.sock.9000.\u001b[0m\n", + "\u001b[35m2021-07-22 20:28:24,555 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Connection accepted: /tmp/.mms.sock.9000.\u001b[0m\n", + 
"\u001b[35m2021-07-22 20:28:24,567 [WARN ] pool-2-thread-1 com.amazonaws.ml.mms.metrics.MetricCollector - worker pid is not available yet.\u001b[0m\n", + "\u001b[34m2021-07-22 20:28:27,441 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - generated new fontManager\u001b[0m\n", + "\u001b[34m2021-07-22 20:28:27,450 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - generated new fontManager\u001b[0m\n", + "\u001b[34m2021-07-22 20:28:27,839 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Loading the model\u001b[0m\n", + "\u001b[34m2021-07-22 20:28:27,854 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Loading the model\u001b[0m\n", + "\u001b[35m2021-07-22 20:28:27,441 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - generated new fontManager\u001b[0m\n", + "\u001b[35m2021-07-22 20:28:27,450 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - generated new fontManager\u001b[0m\n", + "\u001b[35m2021-07-22 20:28:27,839 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Loading the model\u001b[0m\n", + "\u001b[35m2021-07-22 20:28:27,854 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Loading the model\u001b[0m\n", + "\u001b[34m2021-07-22 20:28:27,886 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Model model loaded io_fd=0242a9fffefeff83-0000000a-00000000-2860f330bbe7ac20-d219266e\u001b[0m\n", + "\u001b[34m2021-07-22 20:28:27,898 [INFO ] W-9000-model com.amazonaws.ml.mms.wlm.WorkerThread - Backend response time: 3268\u001b[0m\n", + "\u001b[34m2021-07-22 20:28:27,900 [WARN ] W-9000-model com.amazonaws.ml.mms.wlm.WorkerLifeCycle - attachIOStreams() threadName=W-model-1\u001b[0m\n", + "\u001b[34m2021-07-22 20:28:27,916 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Model model loaded io_fd=0242a9fffefeff83-0000000a-00000001-9aea1030bbe7ac23-7076a78a\u001b[0m\n", + "\u001b[34m2021-07-22 20:28:27,916 [INFO ] W-9000-model com.amazonaws.ml.mms.wlm.WorkerThread - Backend response time: 3285\u001b[0m\n", + "\u001b[34m2021-07-22 20:28:27,916 [WARN ] W-9000-model com.amazonaws.ml.mms.wlm.WorkerLifeCycle - attachIOStreams() threadName=W-model-2\u001b[0m\n", + "\u001b[35m2021-07-22 20:28:27,886 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Model model loaded io_fd=0242a9fffefeff83-0000000a-00000000-2860f330bbe7ac20-d219266e\u001b[0m\n", + "\u001b[35m2021-07-22 20:28:27,898 [INFO ] W-9000-model com.amazonaws.ml.mms.wlm.WorkerThread - Backend response time: 3268\u001b[0m\n", + "\u001b[35m2021-07-22 20:28:27,900 [WARN ] W-9000-model com.amazonaws.ml.mms.wlm.WorkerLifeCycle - attachIOStreams() threadName=W-model-1\u001b[0m\n", + "\u001b[35m2021-07-22 20:28:27,916 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Model model loaded io_fd=0242a9fffefeff83-0000000a-00000001-9aea1030bbe7ac23-7076a78a\u001b[0m\n", + "\u001b[35m2021-07-22 20:28:27,916 [INFO ] W-9000-model com.amazonaws.ml.mms.wlm.WorkerThread - Backend response time: 3285\u001b[0m\n", + "\u001b[35m2021-07-22 20:28:27,916 [WARN ] W-9000-model com.amazonaws.ml.mms.wlm.WorkerLifeCycle - attachIOStreams() threadName=W-model-2\u001b[0m\n", + "\u001b[34m2021-07-22 20:28:31,830 [INFO ] pool-1-thread-4 ACCESS_LOG - /169.254.255.130:60460 \"GET /ping HTTP/1.1\" 200 15\u001b[0m\n", + "\u001b[34m2021-07-22 20:28:31,840 [INFO ] epollEventLoopGroup-3-2 ACCESS_LOG - /169.254.255.130:60464 \"GET 
/execution-parameters HTTP/1.1\" 404 1\u001b[0m\n", + "\u001b[34m2021-07-22 20:28:31,965 [INFO ] W-model-1-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Predicting...\u001b[0m\n", + "\u001b[34m2021-07-22 20:28:31,981 [INFO ] W-model-1-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Prediction Complete\u001b[0m\n", + "\u001b[34m2021-07-22 20:28:31,983 [INFO ] W-model-1-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Saving\u001b[0m\n", + "\u001b[34m2021-07-22 20:28:31,985 [INFO ] W-9000-model com.amazonaws.ml.mms.wlm.WorkerThread - Backend response time: 26\u001b[0m\n", + "\u001b[34m2021-07-22 20:28:31,986 [INFO ] W-9000-model ACCESS_LOG - /169.254.255.130:60468 \"POST /invocations HTTP/1.1\" 200 30\u001b[0m\n", + "\u001b[35m2021-07-22 20:28:31,830 [INFO ] pool-1-thread-4 ACCESS_LOG - /169.254.255.130:60460 \"GET /ping HTTP/1.1\" 200 15\u001b[0m\n", + "\u001b[35m2021-07-22 20:28:31,840 [INFO ] epollEventLoopGroup-3-2 ACCESS_LOG - /169.254.255.130:60464 \"GET /execution-parameters HTTP/1.1\" 404 1\u001b[0m\n", + "\u001b[35m2021-07-22 20:28:31,965 [INFO ] W-model-1-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Predicting...\u001b[0m\n", + "\u001b[35m2021-07-22 20:28:31,981 [INFO ] W-model-1-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Prediction Complete\u001b[0m\n", + "\u001b[35m2021-07-22 20:28:31,983 [INFO ] W-model-1-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Saving\u001b[0m\n", + "\u001b[35m2021-07-22 20:28:31,985 [INFO ] W-9000-model com.amazonaws.ml.mms.wlm.WorkerThread - Backend response time: 26\u001b[0m\n", + "\u001b[35m2021-07-22 20:28:31,986 [INFO ] W-9000-model ACCESS_LOG - /169.254.255.130:60468 \"POST /invocations HTTP/1.1\" 200 30\u001b[0m\n", + "\u001b[32m2021-07-22T20:28:31.846:[sagemaker logs]: MaxConcurrentTransforms=1, MaxPayloadInMB=6, BatchStrategy=MULTI_RECORD\u001b[0m\n", + "\n", + "CPU times: user 602 ms, sys: 31.4 ms, total: 634 ms\n", + "Wall time: 4min 43s\n" + ] + } + ], + "source": [ + "%%time\n", + "# Predicts the data\n", + "transformer.transform(data=input_path, data_type='S3Prefix', content_type='text/csv', split_type='Line')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "79b282ec", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "conda_python3", + "language": "python", + "name": "conda_python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/Sagemaker_StepFunctions_Train.ipynb b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/4_Sagemaker_StepFunctions_Train.ipynb similarity index 100% rename from hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/Sagemaker_StepFunctions_Train.ipynb rename to hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/4_Sagemaker_StepFunctions_Train.ipynb diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/Sagemaker_StepFunctions_Inference.ipynb b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/5_Sagemaker_StepFunctions_Inference.ipynb similarity index 100% rename from hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/Sagemaker_StepFunctions_Inference.ipynb rename 
to hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/5_Sagemaker_StepFunctions_Inference.ipynb diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/Sagemaker_Inference.ipynb b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/Sagemaker_Inference.ipynb deleted file mode 100644 index aa21796..0000000 --- a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/src/ml/notebooks/Sagemaker_Inference.ipynb +++ /dev/null @@ -1,322 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "616d65aa", - "metadata": {}, - "source": [ - "# Sagemaker Inference" - ] - }, - { - "cell_type": "markdown", - "id": "aee7320a", - "metadata": {}, - "source": [ - "This script predicts new data with the uploaded image in ECR." - ] - }, - { - "cell_type": "markdown", - "id": "ea32612e", - "metadata": {}, - "source": [ - "## Import modules" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "3f188c9f", - "metadata": {}, - "outputs": [], - "source": [ - "import time\n", - "import boto3\n", - "import sagemaker\n", - "from sagemaker import get_execution_role" - ] - }, - { - "cell_type": "markdown", - "id": "430e1eb4", - "metadata": {}, - "source": [ - "## Setup" - ] - }, - { - "cell_type": "markdown", - "id": "ebe50488", - "metadata": {}, - "source": [ - "Modify according to your configurations." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "8893b148", - "metadata": {}, - "outputs": [], - "source": [ - "# Bucket name in S3\n", - "bucket = \"hermione-sagemaker\"" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "a6ba2451", - "metadata": {}, - "outputs": [], - "source": [ - "# Set session\n", - "region_name=\"us-east-1\"\n", - "boto3.setup_default_session(region_name=region_name)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "797c5fa6", - "metadata": {}, - "outputs": [], - "source": [ - "# Get user role\n", - "role = get_execution_role()" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "d8148140", - "metadata": {}, - "outputs": [], - "source": [ - "# Get AWS Account ID\n", - "account_number = boto3.client(\"sts\").get_caller_identity()[\"Account\"]" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "1b1fba48", - "metadata": {}, - "outputs": [], - "source": [ - "# Image previous uploaded in ECR\n", - "image_name = \"hermione-inference\"\n", - "image_uri = f\"{account_number}.dkr.ecr.{region_name}.amazonaws.com/{image_name}\"" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "f907e610", - "metadata": {}, - "outputs": [], - "source": [ - "# Input and output paths to execute inference\n", - "paths = {\n", - " 'inference_processed': f\"s3://{bucket}/PREPROCESSING/INFERENCE_PROCESSED/inference.csv\",\n", - " 'model': f\"s3://{bucket}/PREPROCESSING/MODEL/Hermione-train-2021-05-26-12-41-29-505/output/model.tar.gz\",\n", - " 'output_path': f\"s3://{bucket}/PREPROCESSING/OUTPUT\"\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "f5fdfdd8", - "metadata": {}, - "outputs": [], - "source": [ - "# instance to run the code\n", - "instance_type=\"ml.m5.large\"" - ] - }, - { - "cell_type": "markdown", - "id": "55fe64d7", - "metadata": {}, - "source": [ - "## Inference" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "60b7dc56", - "metadata": {}, - "outputs": [], - "source": [ - "# Receives the processed inference data in S3\n", - "input_path = paths['inference_processed']" - ] - }, - 
{ - "cell_type": "code", - "execution_count": 10, - "id": "e3dc913c", - "metadata": {}, - "outputs": [], - "source": [ - "# Receives the model created during the training in S3\n", - "model_path = paths['model']" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "5b69f31c", - "metadata": {}, - "outputs": [], - "source": [ - "# Saves the prediction in S3\n", - "output_path = paths['output_path']" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "29f7ce88", - "metadata": {}, - "outputs": [], - "source": [ - "# Creates the model to access the ECR image\n", - "model = sagemaker.model.Model(\n", - " image_uri= image_uri,\n", - " model_data=model_path,\n", - " role=role)" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "aacdf22a", - "metadata": {}, - "outputs": [], - "source": [ - "# Creates a transformer object from the trained model\n", - "transformer = model.transformer(\n", - " instance_count=1,\n", - " instance_type=instance_type, \n", - " output_path=output_path,\n", - " accept = 'text/csv')" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "6452e276", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - ".........................\u001b[34m2021-05-26 12:57:00,312 [INFO ] main com.amazonaws.ml.mms.ModelServer - \u001b[0m\n", - "\u001b[34mMMS Home: /usr/local/lib/python3.8/dist-packages\u001b[0m\n", - "\u001b[34mCurrent directory: /\u001b[0m\n", - "\u001b[34mTemp directory: /tmp\u001b[0m\n", - "\u001b[34mNumber of GPUs: 0\u001b[0m\n", - "\u001b[34mNumber of CPUs: 2\u001b[0m\n", - "\u001b[34mMax heap size: 857 M\u001b[0m\n", - "\u001b[34mPython executable: /usr/bin/python3\u001b[0m\n", - "\u001b[34mConfig file: /etc/sagemaker-mms.properties\u001b[0m\n", - "\u001b[34mInference address: http://0.0.0.0:8080\u001b[0m\n", - "\u001b[34mManagement address: http://0.0.0.0:8080\u001b[0m\n", - "\u001b[34mModel Store: /.sagemaker/mms/models\u001b[0m\n", - "\u001b[34mInitial Models: ALL\u001b[0m\n", - "\u001b[34mLog dir: /logs\u001b[0m\n", - "\u001b[34mMetrics dir: /logs\u001b[0m\n", - "\u001b[34mNetty threads: 0\u001b[0m\n", - "\u001b[34mNetty client threads: 0\u001b[0m\n", - "\u001b[34mDefault workers per model: 2\u001b[0m\n", - "\u001b[34mBlacklist Regex: N/A\u001b[0m\n", - "\u001b[34mMaximum Response Size: 6553500\u001b[0m\n", - "\u001b[34mMaximum Request Size: 6553500\u001b[0m\n", - "\u001b[34mPreload model: false\u001b[0m\n", - "\u001b[34mPrefer direct buffer: false\u001b[0m\n", - "\u001b[34m2021-05-26 12:57:00,419 [WARN ] W-9000-model com.amazonaws.ml.mms.wlm.WorkerLifeCycle - attachIOStreams() threadName=W-9000-model\u001b[0m\n", - "\u001b[34m2021-05-26 12:57:00,506 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - model_service_worker started with args: --sock-type unix --sock-name /tmp/.mms.sock.9000 --handler serving.handler --model-path /.sagemaker/mms/models/model --model-name model --preload-model false --tmp-dir /tmp\u001b[0m\n", - "\u001b[34m2021-05-26 12:57:00,508 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Listening on port: /tmp/.mms.sock.9000\u001b[0m\n", - "\u001b[34m2021-05-26 12:57:00,509 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - [PID] 23\u001b[0m\n", - "\u001b[34m2021-05-26 12:57:00,509 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - MMS worker started.\u001b[0m\n", - "\u001b[34m2021-05-26 12:57:00,509 [INFO ] W-9000-model-stdout 
com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Python runtime: 3.8.5\u001b[0m\n", - "\u001b[34m2021-05-26 12:57:00,512 [INFO ] main com.amazonaws.ml.mms.wlm.ModelManager - Model model loaded.\u001b[0m\n", - "\u001b[34m2021-05-26 12:57:00,517 [INFO ] main com.amazonaws.ml.mms.ModelServer - Initialize Inference server with: EpollServerSocketChannel.\u001b[0m\n", - "\u001b[34m2021-05-26 12:57:00,536 [INFO ] W-9000-model com.amazonaws.ml.mms.wlm.WorkerThread - Connecting to: /tmp/.mms.sock.9000\u001b[0m\n", - "\u001b[34m2021-05-26 12:57:00,536 [INFO ] W-9000-model com.amazonaws.ml.mms.wlm.WorkerThread - Connecting to: /tmp/.mms.sock.9000\u001b[0m\n", - "\u001b[34m2021-05-26 12:57:00,607 [INFO ] main com.amazonaws.ml.mms.ModelServer - Inference API bind to: http://0.0.0.0:8080\u001b[0m\n", - "\u001b[34m2021-05-26 12:57:00,613 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Connection accepted: /tmp/.mms.sock.9000.\u001b[0m\n", - "\u001b[34m2021-05-26 12:57:00,614 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Connection accepted: /tmp/.mms.sock.9000.\u001b[0m\n", - "\u001b[34mModel server started.\u001b[0m\n", - "\u001b[34m2021-05-26 12:57:00,636 [WARN ] pool-2-thread-1 com.amazonaws.ml.mms.metrics.MetricCollector - worker pid is not available yet.\u001b[0m\n", - "\u001b[34m2021-05-26 12:57:02,508 [WARN ] W-9000-model-stderr com.amazonaws.ml.mms.wlm.WorkerLifeCycle - /usr/local/lib/python3.8/dist-packages/interpret_community/common/gpu_kmeans.py:30: UserWarning: cuML is required to use GPU explainers. Check https://rapids.ai/start.html for more information on how to install it.\u001b[0m\n", - "\u001b[34m2021-05-26 12:57:02,510 [WARN ] W-9000-model-stderr com.amazonaws.ml.mms.wlm.WorkerLifeCycle - warnings.warn(\u001b[0m\n", - "\u001b[34m2021-05-26 12:57:02,510 [WARN ] W-9000-model-stderr com.amazonaws.ml.mms.wlm.WorkerLifeCycle - /usr/local/lib/python3.8/dist-packages/interpret_community/common/gpu_kmeans.py:30: UserWarning: cuML is required to use GPU explainers. Check https://rapids.ai/start.html for more information on how to install it.\u001b[0m\n", - "\u001b[34m2021-05-26 12:57:02,510 [WARN ] W-9000-model-stderr com.amazonaws.ml.mms.wlm.WorkerLifeCycle - warnings.warn(\u001b[0m\n", - "\u001b[34m2021-05-26 12:57:03,375 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - generated new fontManager\u001b[0m\n", - "\u001b[34m2021-05-26 12:57:03,393 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - generated new fontManager\u001b[0m\n", - "\u001b[34m2021-05-26 12:57:03,635 [WARN ] W-9000-model-stderr com.amazonaws.ml.mms.wlm.WorkerLifeCycle - cuML is required to use GPU explainers. Check https://rapids.ai/start.html for more information on how to install it.\u001b[0m\n", - "\u001b[34m2021-05-26 12:57:03,658 [WARN ] W-9000-model-stderr com.amazonaws.ml.mms.wlm.WorkerLifeCycle - cuML is required to use GPU explainers. 
Check https://rapids.ai/start.html for more information on how to install it.\u001b[0m\n", - "\u001b[34m2021-05-26 12:57:03,690 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Loading the model\u001b[0m\n", - "\u001b[34m2021-05-26 12:57:03,715 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Loading the model\u001b[0m\n", - "\u001b[34m2021-05-26 12:57:03,741 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Model model loaded io_fd=0242a9fffefeff83-00000009-00000002-e6c9db643cbfeb7b-a47635f7\u001b[0m\n", - "\u001b[34m2021-05-26 12:57:03,750 [INFO ] W-9000-model com.amazonaws.ml.mms.wlm.WorkerThread - Backend response time: 3046\u001b[0m\n", - "\u001b[34m2021-05-26 12:57:03,752 [WARN ] W-9000-model com.amazonaws.ml.mms.wlm.WorkerLifeCycle - attachIOStreams() threadName=W-model-1\u001b[0m\n", - "\u001b[34m2021-05-26 12:57:03,768 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Model model loaded io_fd=0242a9fffefeff83-00000009-00000001-f549db643cbfeb7b-e2a66100\u001b[0m\n", - "\u001b[34m2021-05-26 12:57:03,768 [INFO ] W-9000-model com.amazonaws.ml.mms.wlm.WorkerThread - Backend response time: 3065\u001b[0m\n", - "\u001b[34m2021-05-26 12:57:03,769 [WARN ] W-9000-model com.amazonaws.ml.mms.wlm.WorkerLifeCycle - attachIOStreams() threadName=W-model-2\u001b[0m\n", - "\u001b[34m2021-05-26 12:57:09,272 [INFO ] pool-1-thread-4 ACCESS_LOG - /169.254.255.130:59054 \"GET /ping HTTP/1.1\" 200 11\u001b[0m\n", - "\u001b[35m2021-05-26 12:57:09,272 [INFO ] pool-1-thread-4 ACCESS_LOG - /169.254.255.130:59054 \"GET /ping HTTP/1.1\" 200 11\u001b[0m\n", - "\u001b[34m2021-05-26 12:57:09,353 [INFO ] epollEventLoopGroup-3-2 ACCESS_LOG - /169.254.255.130:59058 \"GET /execution-parameters HTTP/1.1\" 404 2\u001b[0m\n", - "\u001b[34m2021-05-26 12:57:09,462 [INFO ] W-model-1-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Predicting...\u001b[0m\n", - "\u001b[34m2021-05-26 12:57:09,486 [INFO ] W-model-1-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Prediction Complete\u001b[0m\n", - "\u001b[34m2021-05-26 12:57:09,491 [INFO ] W-model-1-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Saving\u001b[0m\n", - "\u001b[34m2021-05-26 12:57:09,494 [INFO ] W-9000-model com.amazonaws.ml.mms.wlm.WorkerThread - Backend response time: 37\u001b[0m\n", - "\u001b[34m2021-05-26 12:57:09,494 [INFO ] W-9000-model ACCESS_LOG - /169.254.255.130:59068 \"POST /invocations HTTP/1.1\" 200 42\u001b[0m\n", - "\u001b[35m2021-05-26 12:57:09,353 [INFO ] epollEventLoopGroup-3-2 ACCESS_LOG - /169.254.255.130:59058 \"GET /execution-parameters HTTP/1.1\" 404 2\u001b[0m\n", - "\u001b[35m2021-05-26 12:57:09,462 [INFO ] W-model-1-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Predicting...\u001b[0m\n", - "\u001b[35m2021-05-26 12:57:09,486 [INFO ] W-model-1-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Prediction Complete\u001b[0m\n", - "\u001b[35m2021-05-26 12:57:09,491 [INFO ] W-model-1-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - Saving\u001b[0m\n", - "\u001b[35m2021-05-26 12:57:09,494 [INFO ] W-9000-model com.amazonaws.ml.mms.wlm.WorkerThread - Backend response time: 37\u001b[0m\n", - "\u001b[35m2021-05-26 12:57:09,494 [INFO ] W-9000-model ACCESS_LOG - /169.254.255.130:59068 \"POST /invocations HTTP/1.1\" 200 42\u001b[0m\n", - "\u001b[32m2021-05-26T12:57:09.364:[sagemaker logs]: MaxConcurrentTransforms=1, MaxPayloadInMB=6, BatchStrategy=MULTI_RECORD\u001b[0m\n", - "\n", - "CPU times: user 547 ms, sys: 59 ms, total: 606 ms\n", - "Wall time: 4min 
43s\n"
- ]
- }
- ],
- "source": [
- "%%time\n",
- "# Predicts the data\n",
- "transformer.transform(data=input_path, data_type='S3Prefix', content_type='text/csv', split_type='Line')"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "conda_python3",
- "language": "python",
- "name": "conda_python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.6.13"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/train/train.py b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/train/train.py
index bc7b4cd..183ee0a 100644
--- a/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/train/train.py
+++ b/hermione/module_templates/__IMPLEMENTED_SAGEMAKER__/train/train.py
@@ -2,13 +2,15 @@
 sys.path.append("src/")
 
 import os
+from util import *
 import traceback
-import pandas as pd
+
 import logging
+import pandas as pd
+
 from sklearn.metrics import *
 from ml.model.trainer import TrainerSklearn
 from sklearn.ensemble import RandomForestClassifier
-from util import *
 
 logging.getLogger().setLevel('INFO')
 
@@ -21,64 +23,80 @@
 error_path = os.path.join(prefix, 'output')
 model_path = os.environ['SM_MODEL_DIR']
 
+
+def read_input(file_path):
+    """
+    Read all the files in a channel directory
+    into a single pandas DataFrame
+
+    Parameters
+    ----------
+    file_path : string
+        Path of the directory containing the input files
+
+    Returns
+    -------
+    pd.DataFrame : concatenated input data
+    """
+    input_files = [os.path.join(file_path, file)
+                   for file in os.listdir(file_path)]
+    if len(input_files) == 0:
+        raise ValueError(('There are no files in {}.\n' +
+                          'This usually indicates that the channel ({}) was \
+                          incorrectly specified,\n' +
+                          'the data specification in S3 was incorrectly \
+                          specified or the role specified\n' +
+                          'does not have permission to access \
+                          the data.').format(file_path, channel_name))
+    raw_data = [pd.read_csv(file) for file in input_files]
+    return pd.concat(raw_data)
+
+
 def train():
     """
-    Execute the train step in the virtual environment
-    
+    Execute the train step in the virtual environment
+
     """
     logging.info('Starting the training')
     try:
         logging.info('Reading the inputs')
-        # Take the set of train files and read them all into a single pandas dataframe
-        input_files = [ os.path.join(training_path, file) for file in os.listdir(training_path) ]
-        if len(input_files) == 0:
-            raise ValueError(('There are no files in {}.\n' +
-                              'This usually indicates that the channel ({}) was incorrectly specified,\n' +
-                              'the data specification in S3 was incorrectly specified or the role specified\n' +
-                              'does not have permission to access the data.').format(training_path, channel_name))
-        raw_data = [ pd.read_csv(file) for file in input_files ]
-        train = pd.concat(raw_data)
-        
-        # Take the set of val files and read them all into a single pandas dataframe
-        input_files = [ os.path.join(val_path, file) for file in os.listdir(val_path) ]
-        if len(input_files) == 0:
-            raise ValueError(('There are no files in {}.\n' +
-                              'This usually indicates that the channel ({}) was incorrectly specified,\n' +
-                              'the data specification in S3 was incorrectly specified or the role specified\n' +
-                              'does not have permission to access the data.').format(val_path, channel_name))
-        raw_data = [ pd.read_csv(file) for file in input_files ]
-        val = pd.concat(raw_data)
-        
+        train = read_input(training_path)
+        val = read_input(val_path)
+
         # Define the target and columns to be used in the training
         target = "Survived"
         columns = train.columns.drop(target)
 
         logging.info("Training the model")
-        model = TrainerSklearn().train(train, val, target, classification=True,
+        model = TrainerSklearn().train(train, val, target, classification=True,
                                        algorithm=RandomForestClassifier,
                                        columns=columns)
-        
+
         # Save the model and metrics
         logging.info("Saving")
         model.save_model(os.path.join(model_path, 'model.pkl'))
         metrics = model.artifacts["metrics"]
-        logging.info(f"accuracy={metrics['accuracy']}; f1={metrics['f1']}; precision={metrics['precision']}; recall={metrics['recall']};")
-        pd.DataFrame(model.artifacts["metrics"].items(), columns=['Metric', 'Value']).to_csv(os.path.join(model_path, 'metrics.csv'), index=False)
+        logging.info(f"accuracy={metrics['accuracy']}; "
+                     f"f1={metrics['f1']}; "
+                     f"precision={metrics['precision']}; "
+                     f"recall={metrics['recall']};")
+        pd.DataFrame(model.artifacts["metrics"].items(),
+                     columns=['Metric', 'Value']).to_csv(
+                     os.path.join(model_path, 'metrics.csv'), index=False)
         logging.info('Training complete.')
-        
+
     except Exception as e:
-        # Write out an error file. This will be returned as the failureReason in the
-        # DescribeTrainingJob result.
+        # Write out an error file
        trc = traceback.format_exc()
         with open(os.path.join(error_path, 'failure'), 'w') as s:
             s.write('Exception during training: ' + str(e) + '\n' + trc)
-        # Printing this causes the exception to be in the training job logs, as well.
-        logging.info('Exception during training: ' + str(e) + '\n' + trc, file=sys.stderr)
-        # A non-zero exit code causes the training job to be marked as Failed.
+        # Log the exception so it also appears in the training job logs
+        # (logging.info takes no file= argument, so logging.error is used)
+        logging.error('Exception during training: ' + str(e) + '\n' + trc)
+        # A non-zero exit code causes the training job to be marked as Failed
        sys.exit(255)
 
+
 if __name__ == '__main__':
     train()
-    # A zero exit code causes the job to be marked a Succeeded.
     sys.exit(0)
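
A minimal local sketch of the refactored training flow above, not part of the patch itself: it shows how the new read_input helper composes with TrainerSklearn outside a SageMaker job. The directories data/train and data/validation are hypothetical stand-ins for the channel mounts (SM_CHANNEL_TRAIN / SM_CHANNEL_VALIDATION) and are assumed to hold CSVs with a Survived column; src/ is assumed to be on the import path.

import os
import sys
sys.path.append("src/")

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from ml.model.trainer import TrainerSklearn


def read_input(file_path):
    """Read every file in a channel directory into a single DataFrame."""
    input_files = [os.path.join(file_path, f) for f in os.listdir(file_path)]
    if not input_files:
        raise ValueError('There are no files in {}.'.format(file_path))
    return pd.concat(pd.read_csv(f) for f in input_files)


# Hypothetical local stand-ins for the SageMaker channel directories
train_df = read_input("data/train")
val_df = read_input("data/validation")

target = "Survived"
model = TrainerSklearn().train(train_df, val_df, target, classification=True,
                               algorithm=RandomForestClassifier,
                               columns=train_df.columns.drop(target))

model.save_model("model.pkl")          # written locally instead of SM_MODEL_DIR
print(model.artifacts["metrics"])      # accuracy, f1, precision, recall

Because train.py only touches the channel directories and SM_MODEL_DIR, a dry run like this can catch import and schema errors before a billed training instance is launched.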