From 25c5008466b31f50143cabcf01272eeba447c4c5 Mon Sep 17 00:00:00 2001 From: Thomas Schaffter Date: Fri, 6 Dec 2024 13:30:24 -0800 Subject: [PATCH] Add AWS Lambda for upcoming data integration (ARCH-356) (#72) * update docs on setup tools * define lambda role and function * update path to Dockerfile * update README * trigger the lambda every 5 minutes * use plural form of the unit * Remove lambda fct architecture * Migrate data integration code to L2 constructs * Add @dataclass to DataIntegrationProps * Add docstrings * Replace `_lambda` by `lambda_` * Add docstrings * Add docstrings --- README.md | 2 +- app.py | 16 +++++ cdk_docker/data-integration-lambda/Dockerfile | 1 + openchallenges/data_integration_lambda.py | 62 +++++++++++++++++ openchallenges/data_integration_props.py | 15 +++++ openchallenges/data_integration_stack.py | 66 +++++++++++++++++++ requirements.txt | 2 + 7 files changed, 163 insertions(+), 1 deletion(-) create mode 100644 cdk_docker/data-integration-lambda/Dockerfile create mode 100644 openchallenges/data_integration_lambda.py create mode 100644 openchallenges/data_integration_props.py create mode 100644 openchallenges/data_integration_stack.py diff --git a/README.md b/README.md index aa71846..872fb23 100644 --- a/README.md +++ b/README.md @@ -38,7 +38,7 @@ also include a Python virtual environment where all the Python packages needed are already installed. 
If you decide the develop outside of the dev container, some of the development -tools can be installed by running: +tools can be installed manually by running: ```console ./tools/setup.sh diff --git a/app.py b/app.py index 54c6929..d207739 100644 --- a/app.py +++ b/app.py @@ -1,4 +1,5 @@ import aws_cdk as cdk +from aws_cdk.aws_scheduler_alpha import ScheduleExpression from openchallenges.bucket_stack import BucketStack from openchallenges.network_stack import NetworkStack @@ -7,6 +8,8 @@ from openchallenges.service_stack import LoadBalancedServiceStack from openchallenges.load_balancer_stack import LoadBalancerStack from openchallenges.service_props import ServiceProps, ContainerVolume +from openchallenges.data_integration_stack import DataIntegrationStack +from openchallenges.data_integration_props import DataIntegrationProps import openchallenges.utils as utils app = cdk.App() @@ -328,6 +331,19 @@ app, f"{stack_name_prefix}-load-balancer", network_stack.vpc ) +data_integration_props = DataIntegrationProps( + schedule=ScheduleExpression.cron( + minute="*/5", + hour="*", + day="*", + month="*", + time_zone=cdk.TimeZone.AMERICA_LOS_ANGELES, + ) +) +data_integration_stack = DataIntegrationStack( + app, f"{stack_name_prefix}-data-integration", data_integration_props +) + api_docs_props = ServiceProps( "openchallenges-api-docs", 8010, diff --git a/cdk_docker/data-integration-lambda/Dockerfile b/cdk_docker/data-integration-lambda/Dockerfile new file mode 100644 index 0000000..6665871 --- /dev/null +++ b/cdk_docker/data-integration-lambda/Dockerfile @@ -0,0 +1 @@ +FROM ghcr.io/sage-bionetworks/sandbox-lambda-python:sha-b38dc22 diff --git a/openchallenges/data_integration_lambda.py b/openchallenges/data_integration_lambda.py new file mode 100644 index 0000000..5a03836 --- /dev/null +++ b/openchallenges/data_integration_lambda.py @@ -0,0 +1,62 @@ +from aws_cdk import aws_iam as iam +from aws_cdk import aws_lambda as lambda_ +from constructs import Construct + + +class 
DataIntegrationLambda(Construct): + """ + A CDK construct to define an AWS Lambda function for data integration. + + This construct creates an IAM role with the necessary permissions and a Docker-based + Lambda function for handling data integration tasks. + """ + + def __init__(self, scope: Construct, id: str) -> None: + """ + Initializes the construct. + + Builds the IAM role that allows the Lambda function to execute basic AWS + operations, then builds the Lambda function that uses this role. + + The results are stored in `lambda_role` and `lambda_function`. + """ + super().__init__(scope, id) + + self.lambda_role = self._build_lambda_role() + self.lambda_function = self._build_lambda_function(self.lambda_role) + + def _build_lambda_role(self) -> iam.Role: + return iam.Role( + self, + "LambdaRole", + assumed_by=iam.ServicePrincipal("lambda.amazonaws.com"), + managed_policies=[ + iam.ManagedPolicy.from_aws_managed_policy_name( + managed_policy_name=("service-role/AWSLambdaBasicExecutionRole") + ) + ], + ) + + def _build_lambda_function(self, role: iam.Role) -> lambda_.Function: + """ + Builds the Docker-based AWS Lambda function. + + The Lambda function uses a Docker image built from a local directory. + + Args: + role (iam.Role): The IAM role to associate with the Lambda function. + + Returns: + lambda_.Function: The Docker-based AWS Lambda function. + """ + return lambda_.DockerImageFunction( + self, + "LambdaFunction", + code=lambda_.DockerImageCode.from_image_asset( + # Directory relative to where you execute cdk deploy contains a + # Dockerfile with build instructions. 
+ directory="cdk_docker/data-integration-lambda" + ), + role=role, + memory_size=128, + ) diff --git a/openchallenges/data_integration_props.py b/openchallenges/data_integration_props.py new file mode 100644 index 0000000..6d8dd13 --- /dev/null +++ b/openchallenges/data_integration_props.py @@ -0,0 +1,15 @@ +from dataclasses import dataclass +from aws_cdk.aws_scheduler_alpha import ScheduleExpression + + +@dataclass +class DataIntegrationProps: + """ + Data integration properties. + + Attributes: + schedule (ScheduleExpression): The schedule for triggering the data integration. + """ + + schedule: ScheduleExpression + """The schedule for triggering the data integration.""" diff --git a/openchallenges/data_integration_stack.py b/openchallenges/data_integration_stack.py new file mode 100644 index 0000000..ca651e3 --- /dev/null +++ b/openchallenges/data_integration_stack.py @@ -0,0 +1,66 @@ +import aws_cdk as cdk +from aws_cdk import ( + aws_scheduler_alpha as scheduler_alpha, + aws_scheduler_targets_alpha as scheduler_targets, +) +from openchallenges.data_integration_lambda import DataIntegrationLambda +from openchallenges.data_integration_props import DataIntegrationProps +from constructs import Construct + + +class DataIntegrationStack(cdk.Stack): + """ + Defines an AWS CDK stack for data integration. + + This stack sets up the resources required for scheduling and executing + data integration tasks using AWS Lambda and EventBridge Scheduler. + + The stack includes: + - A Lambda function for data integration. + - An EventBridge Scheduler schedule to trigger the Lambda function. + - An EventBridge Scheduler group for organizing schedules. + + Attributes: + scope (Construct): The parent construct. + id (str): The unique identifier for this stack. + props (DataIntegrationProps): The properties for the data integration, including the schedule. 
+ """ + + def __init__( + self, scope: Construct, id: str, props: DataIntegrationProps, **kwargs + ) -> None: + """ + Initializes the DataIntegrationStack. + + Arguments: + scope (Construct): The parent construct for this stack. + id (str): The unique identifier for this stack. + props (DataIntegrationProps): The properties required for data integration, + including the schedule. + **kwargs: Additional arguments passed to the base `cdk.Stack` class. + """ + super().__init__(scope, id, **kwargs) + + data_integration_lambda = DataIntegrationLambda(self, "data-integration-lambda") + + target = scheduler_targets.LambdaInvoke( + data_integration_lambda.lambda_function, + input=scheduler_alpha.ScheduleTargetInput.from_object({}), + ) + + # Create a group for the schedule (maybe we want to add more schedules + # to this group in the future) + schedule_group = scheduler_alpha.Group( + self, + "group", + group_name="schedule-group", + ) + + scheduler_alpha.Schedule( + self, + "schedule", + schedule=props.schedule, + target=target, + group=schedule_group, + description="This is a cron-based schedule that will run every 5 minutes", + ) diff --git a/requirements.txt b/requirements.txt index 92c10ed..b1c51a6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,5 @@ aws-cdk-lib==2.139.0 +aws-cdk.aws-scheduler-alpha==2.139.0a0 +aws-cdk.aws-scheduler-targets-alpha==2.139.0a0 constructs>=10.0.0,<11.0.0 boto3>=1.34.1