diff --git a/src/Dockerfile b/src/Dockerfile index 865d7f2e..e071165d 100644 --- a/src/Dockerfile +++ b/src/Dockerfile @@ -1,3 +1,4 @@ +# This Dockerfile is used to build the datapump layer for the lambdas. FROM public.ecr.aws/lambda/python:3.10 ENV WORKDIR /opt @@ -8,18 +9,13 @@ RUN mkdir -p /opt/python # Make the dir and to install all packages into packages/ COPY . $WORKDIR -# installing dependencies to build package +# Installing dependencies to build package. This implicitly uses setup.py for the +# dependency list. RUN pip install . -t python -# This next line needs to be changed (just increment the number) in order -# to change the hash of the file and get TF to realize it needs to be -# redeployed. Ticket for a better solution: -# https://gfw.atlassian.net/browse/GTC-1250 -# change 35 - RUN yum install -y zip geos-devel -# Precompile all python packages and remove .py files +# Remove any precompiled files and __pycache__ dirs RUN find python/ -type f -name '*.pyc' -print0 | xargs -0 rm -rf RUN find python/ -type d -a -name '__pycache__' -print0 | xargs -0 rm -rf diff --git a/terraform/modules/datapump/data.tf b/terraform/modules/datapump/data.tf index ed99fd72..ea9c25a2 100644 --- a/terraform/modules/datapump/data.tf +++ b/terraform/modules/datapump/data.tf @@ -20,11 +20,48 @@ data "template_file" "sfn_datapump" { } } -module "py310_datapump_021" { - source = "git::https://github.com/wri/gfw-lambda-layers.git//terraform/modules/lambda_layer" - bucket = var.pipelines_bucket - name = "datapump-${terraform.workspace}" - module_version = "0.2.1" - runtime = "python3.10" - layer_path = "${var.lambda_layers_path}/" -} \ No newline at end of file +# Terraform to create and upload layer.zip of the datapump source code +# and dependencies. + +locals { + layer_name = substr("python3.10-datapump-${terraform.workspace}_0.2.1", 0, 64) + +} + +# Build the Docker image and copy ZIP file to local folder +# Always build the zip file so we can do a hash on the entire source. 
+resource "null_resource" "build" { + triggers = { + curtime = timestamp() + } + + provisioner "local-exec" { + command = "${path.module}/scripts/build.sh ${var.lambda_layers_path} ${local.layer_name}" + interpreter = ["bash", "-c"] + } +} + +data "external" "layer_sha256" { + program = [ "${path.module}/scripts/hash.sh", "${var.lambda_layers_path}/layer.zip"] + depends_on = [null_resource.build] +} + +resource "aws_s3_bucket_object" "py310_datapump_021" { + bucket = var.pipelines_bucket + key = "lambda_layers/${local.layer_name}.zip" + source = "${var.lambda_layers_path}/layer.zip" + # This is what decides if the s3 upload of the layer will happen, + # though terraform seems to do its own hash of the zip file as well. + etag = lookup(data.external.layer_sha256.result, "hash") +} + +resource "aws_lambda_layer_version" "py310_datapump_021" { + layer_name = replace(local.layer_name, ".", "") + s3_bucket = aws_s3_bucket_object.py310_datapump_021.bucket + s3_key = aws_s3_bucket_object.py310_datapump_021.key + compatible_runtimes = ["python3.10"] + # This decides if the actual layer will be replaced in the lambda, + # though terraform seems use its own etag of the zip file on S3 as well, + # which means we always update the zip file. 
+ source_code_hash = lookup(data.external.layer_sha256.result, "hash") +} diff --git a/terraform/modules/datapump/lambdas.tf b/terraform/modules/datapump/lambdas.tf index 4fb147c3..1b30ef78 100644 --- a/terraform/modules/datapump/lambdas.tf +++ b/terraform/modules/datapump/lambdas.tf @@ -10,7 +10,7 @@ resource "aws_lambda_function" "dispatcher" { publish = true tags = local.tags layers = [ - module.py310_datapump_021.layer_arn, + aws_lambda_layer_version.py310_datapump_021.arn, var.numpy_lambda_layer_arn, var.rasterio_lambda_layer_arn, var.shapely_lambda_layer_arn @@ -39,7 +39,7 @@ resource "aws_lambda_function" "executor" { timeout = var.lambda_params.timeout publish = true tags = local.tags - layers = [module.py310_datapump_021.layer_arn] + layers = [aws_lambda_layer_version.py310_datapump_021.arn] environment { variables = { ENV = var.environment @@ -68,7 +68,7 @@ resource "aws_lambda_function" "postprocessor" { publish = true tags = local.tags layers = [ - module.py310_datapump_021.layer_arn, + aws_lambda_layer_version.py310_datapump_021.arn, var.numpy_lambda_layer_arn, var.rasterio_lambda_layer_arn, var.shapely_lambda_layer_arn @@ -82,4 +82,4 @@ resource "aws_lambda_function" "postprocessor" { DATAPUMP_TABLE_NAME = aws_dynamodb_table.datapump.name } } -} \ No newline at end of file +} diff --git a/terraform/modules/datapump/scripts/build.sh b/terraform/modules/datapump/scripts/build.sh new file mode 100755 index 00000000..79bb2aa4 --- /dev/null +++ b/terraform/modules/datapump/scripts/build.sh @@ -0,0 +1,33 @@ +#!/usr/bin/env bash + +# This is the same build script as in gfw-lambda-layers/terraform/modules/lambda_layer/scripts/build.sh +# It builds and runs a docker as specified in ${1}/Dockerfile to create a layer.zip. 
+
+set -e
+
+LAYER_PATH="${1}"
+IMAGE="globalforestwatch/${2}"
+
+echo -n "${LAYER_PATH}" > "${LAYER_PATH}/foo.txt"
+date >> "${LAYER_PATH}/foo.txt"
+CONTAINER_NAME="container_$(sha1sum "${LAYER_PATH}/foo.txt" | cut -c 1-8)"
+
+pushd "${LAYER_PATH}"
+
+echo "BUILD image ${IMAGE}"
+docker build --no-cache -t "${IMAGE}" .
+
+echo "CREATE container ${CONTAINER_NAME}"
+docker run -itd --name "${CONTAINER_NAME}" "${IMAGE}" /bin/bash
+
+echo "COPY ZIP package to host"
+docker cp "${CONTAINER_NAME}":"/opt/layer.zip" layer.zip
+
+echo "STOP container"
+docker stop "${CONTAINER_NAME}"
+docker wait "${CONTAINER_NAME}"
+
+echo "REMOVE container"
+docker rm -f "${CONTAINER_NAME}"
+
+popd
diff --git a/terraform/modules/datapump/scripts/hash.sh b/terraform/modules/datapump/scripts/hash.sh
new file mode 100755
index 00000000..19925ca7
--- /dev/null
+++ b/terraform/modules/datapump/scripts/hash.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+# This does a hash of the zip file, but that includes all the modified times of the
+# files, which keep changing, even when the file names and contents are the same. I
+# tried generating a hash using only filenames and contents, but terraform seems to
+# create its own hash of the layer.zip file as well, so basically we're always going
+# to update the layer.zip no matter what, which seems fine.
+hash=$(sha256sum "$1" | cut -d' ' -f1)
+
+echo '{ "hash": "'"$hash"'" }'