diff --git a/.circleci/config.yml b/.circleci/config.yml index 5b33f20..1f46ff4 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -18,42 +18,53 @@ jobs: name: run pre-commit command: ~/.local/bin/pre-commit run --all-files - deploy: - docker: - - image: nikolaik/python-nodejs:python3.7-nodejs12 + deploy-staging: + machine: + image: ubuntu-2004:202010-01 working_directory: ~/covid-api steps: - checkout - - setup_remote_docker + - run: + name: use python 3 + command: | + pyenv global 3.8.5 + - run: name: install dependencies command: | pip install -e .["deploy"] --user npm install -g cdk - - run: - name: install docker-ce-cli + - deploy: + name: develop branch deployed to staging cdk stack command: | - apt-get update - apt-get install apt-transport-https \ - ca-certificates curl gnupg2 software-properties-common -y - curl -fsSL https://download.docker.com/linux/debian/gpg | apt-key add - - add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/debian buster stable" - apt-get update - apt-get install docker-ce-cli -y + if [ "${CIRCLE_BRANCH}" == "develop" ]; then + STAGE='staging' cdk deploy covid-api-dataset-metadata-generator-staging --region us-east-1 --require-approval never + STAGE='staging' VPC_ID='vpc-0fa3007e738c7bbdf' cdk deploy covid-api-lambda-staging --region us-east-1 --require-approval never + fi + + deploy-production: + machine: + image: ubuntu-2004:202010-01 + working_directory: ~/covid-api + steps: + - checkout + - run: + name: use python 3 + command: | + pyenv global 3.8.5 - run: - name: create lambda package + name: install dependencies command: | - docker build . -t lambda:latest -f Dockerfiles/lambda/Dockerfile - docker run --name lambda lambda:latest echo "container up" - docker cp lambda:/tmp/package.zip package.zip - docker stop lambda + pip install -e .["deploy"] --user + npm install -g cdk - deploy: - name: master branch deployed to cdk stack + name: master branch deployed to production cdk stack command: | if [ "${CIRCLE_BRANCH}" == "master" ]; then + STAGE='prod' cdk deploy covid-api-dataset-metadata-generator-prod --region us-east-1 --require-approval never STAGE='prod' cdk deploy covid-api-lambda-prod --region us-east-1 --require-approval never fi @@ -62,7 +73,14 @@ workflows: test_and_deploy: jobs: - test - - deploy: + - deploy-staging: + requires: + - test + filters: + branches: + # only: /^feature\/.*/ + only: develop + - deploy-production: requires: - test filters: diff --git a/Dockerfiles/ecs/Dockerfile b/Dockerfiles/ecs/Dockerfile index b859207..b175090 100644 --- a/Dockerfiles/ecs/Dockerfile +++ b/Dockerfiles/ecs/Dockerfile @@ -1,4 +1,4 @@ -FROM laurents/uvicorn-gunicorn-fastapi:python3.7-slim +FROM tiangolo/uvicorn-gunicorn:python3.8 # Ref https://github.com/tiangolo/uvicorn-gunicorn-fastapi-docker/issues/15 # Cuts image size by 50% # FROM tiangolo/uvicorn-gunicorn-fastapi:python3.7 @@ -10,4 +10,5 @@ COPY covid_api/ /app/covid_api/ COPY setup.py /app/setup.py RUN pip install -e /app/. --no-cache-dir + CMD ["/start-reload.sh"] diff --git a/Dockerfiles/lambda/Dockerfile b/Dockerfiles/lambda/Dockerfile index b8358fc..8cbf4e9 100644 --- a/Dockerfiles/lambda/Dockerfile +++ b/Dockerfiles/lambda/Dockerfile @@ -1,14 +1,15 @@ FROM lambci/lambda:build-python3.7 -WORKDIR /tmp # We install covid_api and mangum +WORKDIR /app COPY README.md /app/README.md COPY covid_api/ /app/covid_api/ COPY setup.py /app/setup.py -RUN pip install /app/. "mangum>=0.9.0" -t /var/task --no-binary numpy +RUN pip install --upgrade pip +RUN pip install . 
"mangum>=0.9.0" -t /var/task --no-binary numpy, pydantic # Reduce package size and remove useless files RUN cd /var/task && find . -type f -name '*.pyc' | while read f; do n=$(echo $f | sed 's/__pycache__\///' | sed 's/.cpython-[2-3][0-9]//'); cp $f $n; done; @@ -18,7 +19,5 @@ RUN find /var/task -type d -a -name 'tests' -print0 | xargs -0 rm -rf RUN rm -rdf /var/task/numpy/doc/ RUN rm -rdf /var/task/stack -RUN cd /var/task && zip -r9q /tmp/package.zip * -COPY lambda/handler.py handler.py -RUN zip -r9q /tmp/package.zip handler.py \ No newline at end of file +COPY lambda/handler.py /var/task/handler.py diff --git a/cdk.context.json b/cdk.context.json new file mode 100644 index 0000000..6f6924f --- /dev/null +++ b/cdk.context.json @@ -0,0 +1,45 @@ +{ + "vpc-provider:account=853558080719:filter.vpc-id=vpc-0fa3007e738c7bbdf:region=us-east-1:returnAsymmetricSubnets=true": { + "vpcId": "vpc-0fa3007e738c7bbdf", + "vpcCidrBlock": "10.0.0.0/16", + "availabilityZones": [], + "subnetGroups": [ + { + "name": "Private", + "type": "Private", + "subnets": [ + { + "subnetId": "subnet-04bc4ca3d119f6f6b", + "cidr": "10.0.128.0/18", + "availabilityZone": "us-east-1a", + "routeTableId": "rtb-0a01309e2f528c2bd" + }, + { + "subnetId": "subnet-0bcd0f2d9b9ac1c56", + "cidr": "10.0.192.0/18", + "availabilityZone": "us-east-1b", + "routeTableId": "rtb-05251cbc837438e6c" + } + ] + }, + { + "name": "Public", + "type": "Public", + "subnets": [ + { + "subnetId": "subnet-009875640f64d198a", + "cidr": "10.0.0.0/18", + "availabilityZone": "us-east-1a", + "routeTableId": "rtb-0b1d4d54a9d962398" + }, + { + "subnetId": "subnet-0e033da6876bf7014", + "cidr": "10.0.64.0/18", + "availabilityZone": "us-east-1b", + "routeTableId": "rtb-074500f27775c6bda" + } + ] + } + ] + } +} diff --git a/covid_api/api/api_v1/endpoints/detections.py b/covid_api/api/api_v1/endpoints/detections.py index 075980a..669b29e 100644 --- a/covid_api/api/api_v1/endpoints/detections.py +++ b/covid_api/api/api_v1/endpoints/detections.py @@ -12,7 +12,7 @@ router = APIRouter() # TODO: unhardcoded types and dates -MLTypes = Enum("MLTypes", [(ml, ml) for ml in ["ship", "multiple", "plane"]]) # type: ignore +MLTypes = Enum("MLTypes", [(ml, ml) for ml in ["ship", "multiple", "plane", "vehicles"]]) # type: ignore @router.get( diff --git a/covid_api/api/utils.py b/covid_api/api/utils.py index 79c86c4..177164d 100644 --- a/covid_api/api/utils.py +++ b/covid_api/api/utils.py @@ -740,4 +740,6 @@ def site_date_to_scenes(site: str, date: str): json.loads(row["scene_id"].replace("'", '"')) ) - return site_date_to_scenes_dict[f"{site}-{date}"] + # deduplicate scene list (in case multiple datasets contains the same + # scene id) + return list(set(site_date_to_scenes_dict[f"{site}-{date}"])) diff --git a/covid_api/core/config.py b/covid_api/core/config.py index fb9bad2..7b57ade 100644 --- a/covid_api/core/config.py +++ b/covid_api/core/config.py @@ -19,6 +19,14 @@ MEMCACHE_PASSWORD = os.environ.get("MEMCACHE_PASSWORD") INDICATOR_BUCKET = os.environ.get("INDICATOR_BUCKET", "covid-eo-data") + +DATASET_METADATA_FILENAME = os.environ.get( + "DATASET_METADATA_FILENAME", "dev-dataset-metadata.json" +) +DATASET_METADATA_GENERATOR_FUNCTION_NAME = os.environ.get( + "DATASET_METADATA_GENERATOR_FUNCTION_NAME", "dev-dataset-metadata-generator" +) + DT_FORMAT = "%Y-%m-%d" MT_FORMAT = "%Y%m" PLANET_API_KEY = os.environ.get("PLANET_API_KEY") diff --git a/covid_api/db/static/datasets/__init__.py b/covid_api/db/static/datasets/__init__.py index 92b1201..d058321 100644 --- 
a/covid_api/db/static/datasets/__init__.py +++ b/covid_api/db/static/datasets/__init__.py @@ -1,11 +1,18 @@ """ covid_api static datasets """ +import json import os -from copy import deepcopy -from typing import Any, Dict, List, Set +from typing import List +import botocore + +from covid_api.core.config import ( + DATASET_METADATA_FILENAME, + DATASET_METADATA_GENERATOR_FUNCTION_NAME, + INDICATOR_BUCKET, +) from covid_api.db.static.errors import InvalidIdentifier from covid_api.db.static.sites import sites -from covid_api.db.utils import get_dataset_domain, get_dataset_folders_by_spotlight +from covid_api.db.utils import invoke_lambda, s3_get from covid_api.models.static import DatasetInternal, Datasets, GeoJsonSource data_dir = os.path.join(os.path.dirname(__file__)) @@ -16,17 +23,44 @@ class DatasetManager(object): def __init__(self): """Load all datasets in a dict.""" + + pass + + def _data(self): datasets = [ os.path.splitext(f)[0] for f in os.listdir(data_dir) if f.endswith(".json") ] - - self._data = { + return { dataset: DatasetInternal.parse_file( os.path.join(data_dir, f"{dataset}.json") ) for dataset in datasets } + def _load_domain_metadata(self): + try: + return json.loads( + s3_get(bucket=INDICATOR_BUCKET, key=DATASET_METADATA_FILENAME) + ) + except botocore.errorfactory.ClientError as e: + + if e.response["Error"]["Code"] == "NoSuchKey": + print( + "No datasets domain metadata file found, requesting generation" + " of a new file. This may take several minutes." + ) + # invoke_lambda should return the output of the lambda's execution + # however there are issues with accessing the output object within the + # "Payload" returned by the lambda_invocation (see docstring). + # Instead the thread is held while the lambda executes and then + # loads the metadata from s3. + invoke_lambda( + lambda_function_name=DATASET_METADATA_GENERATOR_FUNCTION_NAME + ) + return json.loads( + s3_get(bucket=INDICATOR_BUCKET, key=DATASET_METADATA_FILENAME) + ) + def get(self, spotlight_id: str, api_url: str) -> Datasets: """ Fetches all the datasets avilable for a given spotlight. If the @@ -34,12 +68,26 @@ def get(self, spotlight_id: str, api_url: str) -> Datasets: all datasets that are NOT spotlight specific. Raises an `InvalidIdentifier` exception if the provided spotlight_id does not exist. + + Params: + ------- + spotlight_id (str): spotlight id to return datasets for + api_url(str): {scheme}://{host} of request originator in order + to return correctly formated source urls + + Returns: + ------- + (Datasets) pydantic model contains a list of datasets' metadata """ - global_datasets = self._get_global_datasets() - global_datasets = self._overload_domain(datasets=global_datasets) + + global_datasets = self._process( + self._load_domain_metadata()["global"], + api_url=api_url, + spotlight_id="global", + ) if spotlight_id == "global": - return self._prep_output(global_datasets, api_url=api_url) + return Datasets(datasets=[dataset.dict() for dataset in global_datasets]) # Verify that the requested spotlight exists try: @@ -47,211 +95,111 @@ def get(self, spotlight_id: str, api_url: str) -> Datasets: except InvalidIdentifier: raise - # Append "EUPorts" to the spotlight ID's if the requested spotlight id - # was one of "du" or "gh", since certain datasets group both spotlights - # under a single value: "EUPorts". 
It's then necessary to search, - # and extract domain for each option ("du"/"gh" and "EUPorts") separately - - spotlight_ids = [site.id] - if site.id in ["du", "gh"]: - spotlight_ids.append("EUPorts") - - spotlight_datasets = {} - - for spotlight_id in spotlight_ids: - # find all "folders" in S3 containing keys for the given spotlight - # each "folder" corresponds to a dataset. - spotlight_dataset_folders = get_dataset_folders_by_spotlight( - spotlight_id=spotlight_id - ) - # filter the dataset items by those corresponding the folders above - # and add the datasets to the previously filtered `global` datasets - datasets = self._filter_datasets_by_folders( - folders=spotlight_dataset_folders - ) - - datasets = self._overload_spotlight_id( - datasets=datasets, spotlight_id=spotlight_id - ) - - datasets = self._overload_domain( - datasets=datasets, spotlight_id=spotlight_id - ) - spotlight_datasets.update(datasets) + spotlight_datasets = self._process( + self._load_domain_metadata()[site.id], + api_url=api_url, + spotlight_id=site.id, + ) - if spotlight_id == "tk": - spotlight_datasets["water-chlorophyll"].source.tiles = [ - tile.replace("&rescale=-100%2C100", "") - for tile in spotlight_datasets["water-chlorophyll"].source.tiles + return Datasets( + datasets=[ + dataset.dict() for dataset in [*global_datasets, *spotlight_datasets] ] - - # global datasets are returned for all spotlights - spotlight_datasets.update(global_datasets) - - return self._prep_output(spotlight_datasets, api_url=api_url) + ) def get_all(self, api_url: str) -> Datasets: """Fetch all Datasets. Overload domain with S3 scanned domain""" - self._data = self._overload_domain(datasets=self._data) - return self._prep_output(self._data, api_url=api_url) + # print(self._load_domain_metadata()) + datasets = self._process( + datasets_domains_metadata=self._load_domain_metadata()["_all"], + api_url=api_url, + ) + return Datasets(datasets=[dataset.dict() for dataset in datasets]) def list(self) -> List[str]: """List all datasets""" - return list(self._data.keys()) + return list(self._data().keys()) + + def _format_urls(self, tiles: List[str], api_url: str, spotlight_id: str = None): + if spotlight_id: + return [ + tile.replace("{api_url}", api_url).replace( + "{spotlightId}", spotlight_id + ) + for tile in tiles + ] + return [tile.replace("{api_url}", api_url) for tile in tiles] - def _prep_output(self, output_datasets: dict, api_url: str): + def _process( + self, datasets_domains_metadata: dict, api_url: str, spotlight_id: str = None + ): """ - Replaces the `source` of the detections-* datasets with geojson data types and - inserts the url base of the source tile url. - The deepcopy of the the data to output is necessary to avoid modifying the - underlying objects, which would affect the result of subsequent API calls. + Processes datasets to be returned to the API consumer: + - Updates dataset domains for all returned datasets + - Inserts api url into source urls + - Inserts spotlight id into source url (if a spotlight id is provided) Params: ------- - output_datasets (dict): Dataset metadata objects to return to API consumer. + output_datasets (dict): Dataset domains for the datasets to be returned. api_url (str): Base url, of the form {schema}://{host}, extracted from the request, to prepend all tile source urls with. 
+ spotlight_id (Optional[str]): + Spotlight ID (if requested), to be inserted into the source urls Returns: -------- - (dict) : datasets metadata object, ready to return to the API consumer - """ - output_datasets = deepcopy(output_datasets) - for dataset in output_datasets.values(): - dataset.source.tiles = [ - tile.replace("{api_url}", api_url) for tile in dataset.source.tiles - ] - - if dataset.background_source: - dataset.background_source.tiles = [ - tile.replace("{api_url}", api_url) - for tile in dataset.background_source.tiles - ] - if dataset.compare: - dataset.compare.source.tiles = [ - tile.replace("{api_url}", api_url) - for tile in dataset.compare.source.tiles - ] - if dataset.id in ["detections-ship", "detections-plane"]: - dataset.source = GeoJsonSource( - type=dataset.source.type, data=dataset.source.tiles[0] - ) - - return Datasets( - datasets=[dataset.dict() for dataset in output_datasets.values()] - ) - - @staticmethod - def _overload_spotlight_id(datasets: dict, spotlight_id: str): - """ - Returns the provided `datasets` objects with an updated value for - each dataset's `source.tiles` and `background_source.tiles` keys. - The string "{spotlightId}" in the `tiles` URL(s) is replaced with the - actual value of the spotlightId (eg: "ny", "sf", "tk") - Params: - ------ - datasets (dict): dataset metadata objects for which to overload - `source.tiles` and `background_source.tiles` keys. - spotlight_id ([dict]): spotlight id value with which to replace - "{spotlightId}" - - Returns: - ------ - dict: the `datasets` object, with an updated `source.tiles` and - `background_source.tiles` values for each dataset in the `datasets` object. - """ - for _, dataset in datasets.items(): - dataset.source.tiles = [ - url.replace("{spotlightId}", spotlight_id) - for url in dataset.source.tiles - ] - - if not dataset.background_source: - continue - - dataset.background_source.tiles = [ - url.replace("{spotlightId}", spotlight_id) - for url in dataset.background_source.tiles - ] - return datasets - - @staticmethod - def _overload_domain(datasets: dict, spotlight_id: str = None): - """ - Returns the provided `datasets` object with an updated value for - each dataset's `domain` key. - The domain is extracted by listing keys in S3 belonging to that - dataset (and spotlight, if provided) and extracting the dates from - those keys. - - Params: - ------ - datasets (dict): dataset metadata objects for which to overload - `domain` keys. - spotlight_id (Optional[str]): spotlight_id to further precise `domain` - search - - Returns: - ------ - dict: the `datasets` object, with an updated `domain` value for each - dataset in the `datasets` object. + (list) : datasets metadata objects (to be serialized as a pydantic Datasets + model) """ + output_datasets = { + k: v + for k, v in self._data().items() + if k in datasets_domains_metadata.keys() + } - for _, dataset in datasets.items(): - - # No point in searching for files in S3 if the dataset isn't stored there! 
- if not dataset.s3_location: - continue + for k, dataset in output_datasets.items(): - domain_args: Dict[str, Any] = { - "dataset_folder": dataset.s3_location, - "is_periodic": dataset.is_periodic, - } + # overload domain with domain returned from s3 file + dataset.domain = datasets_domains_metadata[k]["domain"] + # format url to contain the correct API host and + # spotlight id (if a spotlight was requested) + format_url_params = dict(api_url=api_url) if spotlight_id: - domain_args["spotlight_id"] = spotlight_id - - dataset.domain = get_dataset_domain(**domain_args) - - return datasets + format_url_params.update(dict(spotlight_id=spotlight_id)) - def _filter_datasets_by_folders(self, folders: Set[str]) -> Dict: - """ - Returns all datasets corresponding to a set of folders (eg: for - folders {"BMHD_30M_MONTHLY", "xco2"} this method would return the - "Nightlights HD" and the "CO2" dataset metadata objects) - - Params: - ------- - folders (Set[str]): folders to filter datasets - - Returns: - -------- - Dict : Metadata objects for the datasets corresponding to the - folders provided. - """ - # deepcopy is necessary because the spotlight and domain overriding was - # affecting the original dataset metadata items and returning the same values - # in subsequent API requests for different spotlights - return { - k: v for k, v in deepcopy(self._data).items() if v.s3_location in folders - } + dataset.source.tiles = self._format_urls( + tiles=dataset.source.tiles, **format_url_params + ) + if dataset.background_source: + dataset.background_source.tiles = self._format_urls( + tiles=dataset.background_source.tiles, **format_url_params + ) + if dataset.compare: + dataset.compare.source.tiles = self._format_urls( + tiles=dataset.compare.source.tiles, **format_url_params + ) + # source URLs of background tiles for `detections-*` datasets are + # handled differently in the front end so the the `source` objects + # get updated here + if k in [ + "detections-ship", + "detections-plane", + "detections-vehicles", + ]: + dataset.source = GeoJsonSource( + type=dataset.source.type, data=dataset.source.tiles[0] + ).dict() - def _get_global_datasets(self): - """ - Returns all datasets which do not reference a specific spotlight, by - filtering out datasets where the "source.tiles" value contains either - `spotlightId`. 
- """ + if spotlight_id == "tk" and k == "water-chlorophyll": + dataset.source.tiles = [ + tile.replace("&rescale=-100%2C100", "") + for tile in dataset.source.tiles + ] - return { - k: v - for k, v in self._data.items() - if not any( - i in v.source.tiles[0] for i in ["{spotlightId}", "greatlakes", "togo"] - ) - } + return output_datasets.values() datasets = DatasetManager() diff --git a/covid_api/db/static/datasets/agriculture.json b/covid_api/db/static/datasets/agriculture.json index e149122..37013fb 100644 --- a/covid_api/db/static/datasets/agriculture.json +++ b/covid_api/db/static/datasets/agriculture.json @@ -24,7 +24,8 @@ "water-chlorophyll", "water-spm", "detections-ship", - "detections-plane" + "detections-plane", + "detections-vehicles" ], "enabled": false, "swatch": { diff --git a/covid_api/db/static/datasets/co2-diff.json b/covid_api/db/static/datasets/co2-diff.json index ec64426..fd39573 100644 --- a/covid_api/db/static/datasets/co2-diff.json +++ b/covid_api/db/static/datasets/co2-diff.json @@ -24,7 +24,8 @@ "water-chlorophyll", "water-spm", "detections-ship", - "detections-plane" + "detections-plane", + "detections-vehicles" ], "enabled": false, "swatch": { diff --git a/covid_api/db/static/datasets/co2.json b/covid_api/db/static/datasets/co2.json index fbe16c3..c5f963f 100644 --- a/covid_api/db/static/datasets/co2.json +++ b/covid_api/db/static/datasets/co2.json @@ -23,7 +23,8 @@ "water-chlorophyll", "water-spm", "detections-ship", - "detections-plane" + "detections-plane", + "detections-vehicles" ], "enabled": false, "compare": { diff --git a/covid_api/db/static/datasets/detections-plane.json b/covid_api/db/static/datasets/detections-plane.json index df7ae64..9fa94a1 100644 --- a/covid_api/db/static/datasets/detections-plane.json +++ b/covid_api/db/static/datasets/detections-plane.json @@ -29,7 +29,8 @@ "detection-multi", "water-chlorophyll", "water-spm", - "detections-ship" + "detections-ship", + "detections-vehicles" ], "enabled": false, "swatch": { diff --git a/covid_api/db/static/datasets/detections-ship.json b/covid_api/db/static/datasets/detections-ship.json index 3e0cf96..bfaf832 100644 --- a/covid_api/db/static/datasets/detections-ship.json +++ b/covid_api/db/static/datasets/detections-ship.json @@ -29,7 +29,8 @@ "detection-multi", "water-chlorophyll", "water-spm", - "detections-plane" + "detections-plane", + "detections-vehicles" ], "enabled": false, "swatch": { diff --git a/covid_api/db/static/datasets/detections-vehicles.json b/covid_api/db/static/datasets/detections-vehicles.json new file mode 100644 index 0000000..6269cc0 --- /dev/null +++ b/covid_api/db/static/datasets/detections-vehicles.json @@ -0,0 +1,41 @@ +{ + "id": "detections-vehicles", + "name": "Vehicle", + "type": "inference-timeseries", + "s3_location": "detections-vehicles", + "is_periodic": false, + "time_unit": "day", + "source": { + "type": "geojson", + "tiles": [ + "{api_url}/detections/vehicles/{spotlightId}/{date}.geojson" + ] + }, + "background_source": { + "type": "raster", + "tiles": [ + "{api_url}/{z}/{x}/{y}@1x?url=s3://covid-eo-data/detections-vehicles/background/{spotlightId}/{date}.tif" + ] + }, + "exclusive_with": [ + "agriculture", + "no2", + "co2-diff", + "co2", + "gibs-population", + "car-count", + "nightlights-viirs", + "nightlights-hd", + "detection-multi", + "water-chlorophyll", + "water-spm", + "detections-ship", + "detections-plane" + ], + "enabled": false, + "swatch": { + "color": "#C0C0C0", + "name": "Grey" + }, + "info": "Vehicles detected each day in PlanetScope imagery 
are shown in orange." +} \ No newline at end of file diff --git a/covid_api/db/static/datasets/nightlights-hd.json b/covid_api/db/static/datasets/nightlights-hd.json index a44d25f..ceeb37a 100644 --- a/covid_api/db/static/datasets/nightlights-hd.json +++ b/covid_api/db/static/datasets/nightlights-hd.json @@ -23,7 +23,8 @@ "water-chlorophyll", "water-spm", "detections-ship", - "detections-plane" + "detections-plane", + "detections-vehicles" ], "swatch": { "color": "#C0C0C0", diff --git a/covid_api/db/static/datasets/nightlights-viirs.json b/covid_api/db/static/datasets/nightlights-viirs.json index ee00ca7..5b429e9 100644 --- a/covid_api/db/static/datasets/nightlights-viirs.json +++ b/covid_api/db/static/datasets/nightlights-viirs.json @@ -23,7 +23,8 @@ "water-chlorophyll", "water-spm", "detections-ship", - "detections-plane" + "detections-plane", + "detections-vehicles" ], "swatch": { "color": "#C0C0C0", diff --git a/covid_api/db/static/datasets/no2.json b/covid_api/db/static/datasets/no2.json index 5057bb3..fa9a500 100644 --- a/covid_api/db/static/datasets/no2.json +++ b/covid_api/db/static/datasets/no2.json @@ -26,7 +26,8 @@ "water-chlorophyll", "water-spm", "detections-ship", - "detections-plane" + "detections-plane", + "detections-vehicles" ], "enabled": true, "compare": { diff --git a/covid_api/db/static/datasets/water-chlorophyll.json b/covid_api/db/static/datasets/water-chlorophyll.json index f7859c1..ee9d07b 100644 --- a/covid_api/db/static/datasets/water-chlorophyll.json +++ b/covid_api/db/static/datasets/water-chlorophyll.json @@ -23,7 +23,8 @@ "detection-multi", "water-spm", "detections-ship", - "detections-plane" + "detections-plane", + "detections-vehicles" ], "swatch": { "color": "#154F8D", diff --git a/covid_api/db/static/datasets/water-spm.json b/covid_api/db/static/datasets/water-spm.json index 46ed091..a0d0de0 100644 --- a/covid_api/db/static/datasets/water-spm.json +++ b/covid_api/db/static/datasets/water-spm.json @@ -23,7 +23,8 @@ "detection-multi", "water-chlorophyll", "detections-ship", - "detections-plane" + "detections-plane", + "detections-vehicles" ], "swatch": { "color": "#154F8D", diff --git a/covid_api/db/utils.py b/covid_api/db/utils.py index 0255d72..80fefac 100644 --- a/covid_api/db/utils.py +++ b/covid_api/db/utils.py @@ -2,141 +2,86 @@ import csv import json -import re from datetime import datetime -from typing import Dict, List, Optional, Set +from typing import Dict, List import boto3 +from botocore import config -from covid_api.core.config import DT_FORMAT, INDICATOR_BUCKET, MT_FORMAT +from covid_api.core.config import DT_FORMAT, INDICATOR_BUCKET from covid_api.models.static import IndicatorObservation s3 = boto3.client("s3") +_lambda = boto3.client( + "lambda", + region_name="us-east-1", + config=config.Config( + read_timeout=900, connect_timeout=900, retries={"max_attempts": 0} + ), +) -def gather_s3_keys( - spotlight_id: Optional[str] = None, prefix: Optional[str] = None, -) -> Set[str]: - """ - Returns a set of S3 keys. If no args are provided, the keys will represent - the entire S3 bucket. - Params: - ------- - spotlight_id (Optional[str]): - Id of a spotlight to filter keys by - prefix (Optional[str]): - S3 Prefix under which to gather keys, used to specifcy a specific - dataset folder to search within. 
- - Returns: - ------- - set(str) - - """ - keys: set = set() - - list_objects_args = {"Bucket": INDICATOR_BUCKET} - - if prefix: - list_objects_args["Prefix"] = prefix - - response = s3.list_objects_v2(**list_objects_args) - keys.update({x["Key"] for x in response.get("Contents", [])}) - - while response["IsTruncated"]: - - list_objects_args["ContinuationToken"] = response["NextContinuationToken"] - response = s3.list_objects_v2(**list_objects_args) - - keys.update({x["Key"] for x in response.get("Contents", [])}) - - if not spotlight_id: - return keys - - return { - key - for key in keys - if re.search( - rf"""[^a-zA-Z0-9]({spotlight_id})[^a-zA-Z0-9]""", key, re.IGNORECASE, - ) - } - -def get_dataset_folders_by_spotlight(spotlight_id: str) -> Set[str]: - """ - Returns the S3 prefix of datasets containing files for the given spotlight +def invoke_lambda( + lambda_function_name: str, payload: dict = None, invocation_type="RequestResponse" +): + """Invokes a lambda function using the boto3 lambda client. Params: - ------ - spotlight_id (str): id of spotlight to search for + ------- + lambda_function_name (str): name of the lambda to invoke + payload (Optional[dict]): data to invoke the lambda function with (will be accessible + in the lambda handler function under the `event` param) + invocation_type (Optional[str] = ["RequestResponse", "Event", "DryRun"]): + RequestResponse will run the lambda synchronously (holding up the thread + until the lambda responds) + Event will run asynchronously + DryRun will only verify that the user/role has the correct permissions to invoke + the lambda function Returns: -------- - set(str) - """ - keys = gather_s3_keys(spotlight_id=spotlight_id) - return {k.split("/")[0] for k in keys} + (dict) Lambda invocation response, see: + https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/lambda.html#Lambda.Client.invoke + - NOTE: + The current configuration specifies a RequestResponse invocation, which does + indeed run synchronously, but returns a 202 (Accepted) status when + it should return a 200 status. A 202 status is expected from the `Event` invocation + type (indicating the lambda was initiated but we don't know its status) -def get_dataset_domain( - dataset_folder: str, is_periodic: bool, spotlight_id: str = None, -): - """ - Returns a domain for a given dataset as identified by a folder. If a - time_unit is passed as a function parameter, the function will assume - that the domain is periodic and with only return the min/max dates, - otherwise ALL dates available for that dataset/spotlight will be returned. + - NOTE: + The current configuration should directly return the lambda output under + response["Payload"]: StreamingBody, however the byte string currently being returned + contains lambda invocation/runtime details from the logs. (eg: - Params: - ------ - dataset_folder (str): dataset folder to search within - time_unit (Optional[str]): time_unit from the dataset's metadata json file - spotlight_id (Optional[str]): a dictionary containing the - `spotlight_id` of a spotlight to restrict the - domain search to.
- - Return: - ------ - List[datetime] - """ - s3_keys_args = {"prefix": dataset_folder} - if spotlight_id: - s3_keys_args["spotlight_id"] = spotlight_id - - keys = gather_s3_keys(**s3_keys_args) - dates = [] - - for key in keys: - result = re.search( - # matches either dates like: YYYYMM or YYYY_MM_DD - r"""[^a-zA-Z0-9]((?P(\d{6}))|""" - r"""((?P\d{4})_(?P\d{2})_(?P\d{2})))[^a-zA-Z0-9]""", - key, - re.IGNORECASE, - ) - if not result: - continue + ``` + START RequestId: 7c61eb52-735d-1ce4-0df2-a975197924eb Version: 1 + END RequestId: 7c61eb52-735d-1ce4-0df2-a975197924eb + REPORT RequestId: 7c61eb52-735d-1ce4-0df2-a975197924eb Init Duration: 232.54 ms Duration: 3.02 ms Billed Duration: 100 ms Memory Size: 128 MB Max Memory Used: 33 MB - date = None - try: - if result.group("MT_DATE"): - date = datetime.strptime(result.group("MT_DATE"), MT_FORMAT) - else: - datestring = ( - f"""{result.group("YEAR")}-{result.group("MONTH")}""" - f"""-{result.group("DAY")}""" - ) - date = datetime.strptime(datestring, DT_FORMAT) - except ValueError: - # Invalid date value matched - continue + {"result":"success","input":"test"} - dates.append(date.strftime("%Y-%m-%dT%H:%M:%SZ")) + ``` + when we only expect the JSON object: {"result":"success", "input":"test"} to be returned + ) - if is_periodic and len(dates): - return [min(dates), max(dates)] + To load just the lambda output use: - return sorted(set(dates)) + ``` + response = r["Payload"].read().decode("utf-8") + lambda_output = json.loads( + response[response.index("{") : (response.index("}") + 1)] + ) + ``` + where r is the output of this function. + """ + lambda_invoke_params = dict( + FunctionName=lambda_function_name, InvocationType=invocation_type + ) + if payload: + lambda_invoke_params.update(dict(Payload=json.dumps(payload))) + return _lambda.invoke(**lambda_invoke_params) def s3_get(bucket: str, key: str): diff --git a/docker-compose.yml b/docker-compose.yml index c1532e0..103227a 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,4 +1,4 @@ -version: '3' +version: '3.2' services: api: diff --git a/lambda/dataset_metadata_generator/__init__.py b/lambda/dataset_metadata_generator/__init__.py new file mode 100644 index 0000000..1b3b9d2 --- /dev/null +++ b/lambda/dataset_metadata_generator/__init__.py @@ -0,0 +1 @@ +""" Dataset metadata generator module.""" diff --git a/lambda/dataset_metadata_generator/src/__init__.py b/lambda/dataset_metadata_generator/src/__init__.py new file mode 100644 index 0000000..1b3b9d2 --- /dev/null +++ b/lambda/dataset_metadata_generator/src/__init__.py @@ -0,0 +1 @@ +""" Dataset metadata generator module.""" diff --git a/lambda/dataset_metadata_generator/src/main.py b/lambda/dataset_metadata_generator/src/main.py new file mode 100644 index 0000000..3cb95ff --- /dev/null +++ b/lambda/dataset_metadata_generator/src/main.py @@ -0,0 +1,258 @@ +""" Dataset metadata generator lambda. 
""" +import datetime +import json +import os +import re +from typing import Any, Dict, List, Optional, Union + +import boto3 + +BASE_PATH = os.path.dirname(os.path.abspath(__file__)) +DATASETS_JSON_FILEPATH = os.path.join(BASE_PATH, "datasets") +SITES_JSON_FILEPATH = os.path.join(BASE_PATH, "sites") + + +BUCKET_NAME = os.environ["DATA_BUCKET_NAME"] +DATASET_METADATA_FILENAME = os.environ["DATASET_METADATA_FILENAME"] + + +s3 = boto3.resource("s3") +bucket = s3.Bucket(BUCKET_NAME) + +DT_FORMAT = "%Y-%m-%d" +MT_FORMAT = "%Y%m" + + +def handler(event, context): + """ + Params: + ------- + event (dict): + content (dict): + + Both params are standard lambda handler invocation params but not used within this + lambda's code. + + Returns: + ------- + (string): JSON-encoded dict with top level keys for each of the possible + queries that can be run against the `/datasets` endpoint (key: _all_ contains + result of the LIST operation, each of other keys contain the result of + GET /datasets/{spotlight_id | "global"}) + """ + + # TODO: defined TypedDicts for these! + datasets = _gather_json_data(DATASETS_JSON_FILEPATH) + sites = _gather_json_data(SITES_JSON_FILEPATH) + + result = json.dumps(_gather_datasets_metadata(datasets, sites)) + + bucket.put_object( + Body=result, Key=DATASET_METADATA_FILENAME, ContentType="application/json", + ) + return result + + +def _gather_datasets_metadata(datasets: List[dict], sites: List[dict]): + """Reads through the s3 bucket to generate a file that contains + the datasets for each given spotlight option (_all, global, tk, ny, sf, + la, be, du, gh) and their respective domain for each spotlight + + Params: + ------- + datasets (List[dict]): list of dataset metadata objects (contains fields + like: s3_location, time_unit, swatch, exclusive_with, etc), to use + to generate the result of each of the possible `/datasets` endpoint + queries. + sites (List[dict]): list of site metadata objects + + Returns: + -------- + (dict): python object with result of each possible query against the `/datasets` + endpoint with each dataset's associated domain. 
+ """ + + metadata: Dict[str, dict] = {} + + for dataset in datasets: + if not dataset.get("s3_location"): + continue + + domain_args = { + "dataset_folder": dataset["s3_location"], + "is_periodic": dataset.get("is_periodic"), + "time_unit": dataset.get("time_unit"), + } + + domain = _get_dataset_domain(**domain_args) + + metadata.setdefault("_all", {}).update({dataset["id"]: {"domain": domain}}) + + if _is_global_dataset(dataset): + + metadata.setdefault("global", {}).update( + {dataset["id"]: {"domain": domain}} + ) + continue + + for site in sites: + + domain_args["spotlight_id"] = site["id"] + + if site["id"] in ["du", "gh"]: + domain_args["spotlight_id"] = ["du", "gh", "EUPorts"] + + # skip adding dataset to metadata object if no dates were found for the given + # spotlight (indicates dataset is not valid for that spotlight) + try: + domain = _get_dataset_domain(**domain_args) + except NoKeysFoundForSpotlight: + continue + + metadata.setdefault(site["id"], {}).update( + {dataset["id"]: {"domain": domain}} + ) + return metadata + + +def _gather_json_data(dirpath: str) -> List[dict]: + """Gathers all JSON files from within a diven directory""" + + results = [] + + for filename in os.listdir(dirpath): + if not filename.endswith(".json"): + continue + with open(os.path.join(dirpath, filename)) as f: + results.append(json.load(f)) + return results + + +def _is_global_dataset(dataset: dict) -> bool: + """Returns wether the given dataset is spotlight specific (FALSE) + or non-spotlight specific (TRUE)""" + return not any( + [ + i in dataset["source"]["tiles"][0] + for i in ["{spotlightId}", "greatlakes", "togo"] + ] + ) + + +def _gather_s3_keys( + spotlight_id: Optional[Union[str, List]] = None, prefix: Optional[str] = "", +) -> List[str]: + """ + Returns a set of S3 keys. If no args are provided, the keys will represent + the entire S3 bucket. + Params: + ------- + spotlight_id (Optional[str]): + Id of a spotlight to filter keys by + prefix (Optional[str]): + S3 Prefix under which to gather keys, used to specifcy a specific + dataset folder to search within. + + Returns: + ------- + set(str) + + """ + + keys = [x.key for x in bucket.objects.filter(Prefix=prefix)] + + if not spotlight_id: + return keys + + if isinstance(spotlight_id, list): + spotlight_id = "|".join([s for s in spotlight_id]) + + pattern = re.compile(rf"""[^a-zA-Z0-9]({spotlight_id})[^a-zA-Z0-9]""") + return list({key for key in keys if pattern.search(key, re.IGNORECASE,)}) + + +def _get_dataset_domain( + dataset_folder: str, + is_periodic: bool, + spotlight_id: Optional[Union[str, List]] = None, + time_unit: Optional[str] = "day", +): + """ + Returns a domain for a given dataset as identified by a folder. If a + time_unit is passed as a function parameter, the function will assume + that the domain is periodic and with only return the min/max dates, + otherwise ALL dates available for that dataset/spotlight will be returned. + + Params: + ------ + dataset_folder (str): dataset folder to search within + time_unit (Optional[str]): time_unit from the dataset's metadata json file + spotlight_id (Optional[str]): a dictionary containing the + `spotlight_id` of a spotlight to restrict the + domain search to. 
+ time_unit (Optional[str] - one of ["day", "month"]): + Whether the {date} object in the S3 filenames should be matched + to YYYY_MM_DD (day) or YYYYMM (month) + + Return: + ------ + List[datetime] + """ + s3_keys_args: Dict[str, Any] = {"prefix": dataset_folder} + if spotlight_id: + s3_keys_args["spotlight_id"] = spotlight_id + + keys = _gather_s3_keys(**s3_keys_args) + + if not keys: + raise NoKeysFoundForSpotlight + + dates = [] + + for key in keys: + + # matches either dates like: YYYYMM or YYYY_MM_DD + pattern = re.compile( + r"[^a-zA-Z0-9]((?P<YEAR>\d{4})_(?P<MONTH>\d{2})_(?P<DAY>\d{2}))[^a-zA-Z0-9]" + ) + if time_unit == "month": + pattern = re.compile( + r"[^a-zA-Z0-9](?P<YEAR>(\d{4}))(?P<MONTH>(\d{2}))[^a-zA-Z0-9]" + ) + + result = pattern.search(key) + + if not result: + continue + + date = None + try: + date = datetime.datetime( + int(result.group("YEAR")), + int(result.group("MONTH")), + int(result.groupdict().get("DAY", 1)), + ) + + except ValueError: + # Invalid date value matched - skip date + continue + + # Some files happen to have 6 consecutive digits (likely an ID of sorts) + # that sometimes gets matched as a date. This further restriction of + # matched timestamps will reduce the number of "false" positives (although + # IDs between 201011 and 203011 will slip by) + if not datetime.datetime(2010, 1, 1) < date < datetime.datetime(2030, 1, 1): + continue + + dates.append(date.strftime("%Y-%m-%dT%H:%M:%SZ")) + + if is_periodic and len(dates): + return [min(dates), max(dates)] + + return sorted(set(dates)) + + +class NoKeysFoundForSpotlight(Exception): + """Exception to be raised if no keys are found for a given spotlight""" + + pass diff --git a/lambda/dataset_metadata_generator/tests/__init__.py b/lambda/dataset_metadata_generator/tests/__init__.py new file mode 100644 index 0000000..57827a2 --- /dev/null +++ b/lambda/dataset_metadata_generator/tests/__init__.py @@ -0,0 +1,268 @@ +""" +Dataset metadata generator lambda test class. This file contains dataset and site metadata +used by the unit tests.
The data in this file should be updated to reflect and modification +in metadata content or format of the actual metadatda files (under `covid_api/db/static/`) +""" +DATASETS = [ + { + "id": "co2", + "name": "CO₂ (Avg)", + "type": "raster-timeseries", + "time_unit": "day", + "s3_location": "xco2-mean", + "is_periodic": True, + "source": { + "type": "raster", + "tiles": [ + "{api_url}/{z}/{x}/{y}@1x?url=s3://covid-eo-data/xco2-mean/xco2_16day_mean.{date}.tif&resampling_method=bilinear&bidx=1&rescale=0.000408%2C0.000419&color_map=rdylbu_r&color_formula=gamma r {gamma}" + ], + }, + "exclusive_with": [ + "agriculture", + "no2", + "co2-diff", + "gibs-population", + "car-count", + "nightlights-viirs", + "nightlights-hd", + "detection-multi", + "water-chlorophyll", + "water-spm", + "detections-ship", + "detections-plane", + "detections-vehicles", + ], + "enabled": False, + "compare": { + "enabled": True, + "help": "Compare with baseline", + "year_diff": 0, + "map_label": "{date}: Base vs Mean", + "source": { + "type": "raster", + "tiles": [ + "{api_url}/{z}/{x}/{y}@1x?url=s3://covid-eo-data/xco2-base/xco2_16day_base.{date}.tif&resampling_method=bilinear&bidx=1&rescale=0.000408%2C0.000419&color_map=rdylbu_r&color_formula=gamma r {gamma}" + ], + }, + }, + "swatch": {"color": "#189C54", "name": "Dark Green"}, + "legend": { + "type": "gradient-adjustable", + "min": "< 408 ppm", + "max": "> 419 ppm", + "stops": [ + "#313695", + "#588cbf", + "#a3d2e5", + "#e8f6e8", + "#fee89c", + "#fba55c", + "#e24932", + ], + }, + "info": "This layer shows the average background concentration of carbon dioxide (CO₂) in our atmosphere for 2020. Redder colors indicate more CO₂. Bluer colors indicate less CO₂.", + }, + { + "id": "detections-plane", + "name": "Airplanes", + "type": "inference-timeseries", + "s3_location": "detections-plane", + "is_periodic": False, + "time_unit": "day", + "source": { + "type": "geojson", + "tiles": ["{api_url}/detections-plane/{spotlightId}/{date}.geojson"], + }, + "background_source": { + "type": "raster", + "tiles": ["{api_url}/planet/{z}/{x}/{y}?date={date}&site={spotlightId}"], + }, + "exclusive_with": [ + "agriculture", + "no2", + "co2-diff", + "co2", + "gibs-population", + "car-count", + "nightlights-viirs", + "nightlights-hd", + "detection-multi", + "water-chlorophyll", + "water-spm", + "detections-ship", + "detections-vehicles", + ], + "enabled": False, + "swatch": {"color": "#C0C0C0", "name": "Grey"}, + "info": "Grounded airplanes detected each day in PlanetScope imagery are shown in orange.", + }, + { + "id": "nightlights-hd", + "name": "Nightlights HD", + "type": "raster-timeseries", + "s3_location": "bmhd_30m_monthly", + "is_periodic": True, + "time_unit": "month", + "source": { + "type": "raster", + "tiles": [ + "{api_url}/{z}/{x}/{y}@1x?url=s3://covid-eo-data/bmhd_30m_monthly/BMHD_VNP46A2_{spotlightId}_{date}_cog.tif&resampling_method=bilinear&bidx=1%2C2%2C3" + ], + }, + "exclusive_with": [ + "agriculture", + "no2", + "co2-diff", + "co2", + "gibs-population", + "car-count", + "nightlights-viirs", + "detection-multi", + "water-chlorophyll", + "water-spm", + "detections-ship", + "detections-plane", + "detections-vehicles", + ], + "swatch": {"color": "#C0C0C0", "name": "Grey"}, + "legend": { + "type": "gradient", + "min": "less", + "max": "more", + "stops": ["#08041d", "#1f0a46", "#52076c", "#f57c16", "#f7cf39"], + }, + "info": "The High Definition Nightlights dataset is processed to eliminate light sources, including moonlight reflectance and other interferences. 
Darker colors indicate fewer night lights and less activity. Lighter colors indicate more night lights and more activity.", + }, + { + "id": "nightlights-viirs", + "name": "Nightlights VIIRS", + "type": "raster-timeseries", + "time_unit": "day", + "s3_location": "bm_500m_daily", + "is_periodic": True, + "source": { + "type": "raster", + "tiles": [ + "{api_url}/{z}/{x}/{y}@1x?url=s3://covid-eo-data/bm_500m_daily/VNP46A2_V011_{spotlightId}_{date}_cog.tif&resampling_method=nearest&bidx=1&rescale=0%2C100&color_map=viridis" + ], + }, + "exclusive_with": [ + "agriculture", + "no2", + "co2-diff", + "co2", + "gibs-population", + "car-count", + "nightlights-hd", + "detection-multi", + "water-chlorophyll", + "water-spm", + "detections-ship", + "detections-plane", + "detections-vehicles", + ], + "swatch": {"color": "#C0C0C0", "name": "Grey"}, + "legend": { + "type": "gradient", + "min": "less", + "max": "more", + "stops": ["#440357", "#3b508a", "#208f8c", "#5fc961", "#fde725"], + }, + "info": "Darker colors indicate fewer night lights and less activity. Lighter colors indicate more night lights and more activity. Check out the HD dataset to see a light-corrected version of this dataset.", + }, + { + "id": "water-chlorophyll", + "name": "Chlorophyll", + "type": "raster-timeseries", + "time_unit": "day", + "is_periodic": False, + "s3_location": "oc3_chla_anomaly", + "source": { + "type": "raster", + "tiles": [ + "{api_url}/{z}/{x}/{y}@1x?url=s3://covid-eo-data/oc3_chla_anomaly/anomaly-chl-{spotlightId}-{date}.tif&resampling_method=bilinear&bidx=1&rescale=-100%2C100&color_map=rdbu_r" + ], + }, + "exclusive_with": [ + "agriculture", + "no2", + "co2-diff", + "co2", + "gibs-population", + "car-count", + "nightlights-viirs", + "nightlights-hd", + "detection-multi", + "water-spm", + "detections-ship", + "detections-plane", + "detections-vehicles", + ], + "swatch": {"color": "#154F8D", "name": "Deep blue"}, + "legend": { + "type": "gradient", + "min": "less", + "max": "more", + "stops": ["#3A88BD", "#C9E0ED", "#E4EEF3", "#FDDCC9", "#DE725B", "#67001F"], + }, + "info": "Chlorophyll is an indicator of algae growth. Redder colors indicate increases in chlorophyll-a and worse water quality. Bluer colors indicate decreases in chlorophyll-a and improved water quality. 
White areas indicate no change.", + }, +] +SITES = [ + { + "id": "du", + "label": "Port of Dunkirk", + "center": [2.250141, 51.02986], + "polygon": { + "type": "Polygon", + "coordinates": [ + [ + [2.08355962, 51.03423481], + [2.14826632, 50.96553938], + [2.41646888, 51.02097784], + [2.38289168, 51.07488218], + [2.32298564, 51.08773119], + [2.15844656, 51.05891125], + [2.08355962, 51.03423481], + ] + ], + }, + "bounding_box": [2.008355962, 50.96553938, 2.41646888, 51.08773119], + }, + { + "id": "ny", + "label": "New York", + "center": [-73.09, 41.0114], + "polygon": { + "type": "Polygon", + "coordinates": [ + [ + [-71.74516, 41.54467], + [-74.43395, 41.54943], + [-74.43219, 40.47812], + [-71.74516, 40.48343], + [-71.74516, 41.54467], + ] + ], + }, + "bounding_box": [-74.43395, 40.47812, -71.74516, 41.54467], + }, + { + "id": "tk", + "label": "Tokyo", + "center": [139.78, 35.61], + "polygon": { + "type": "Polygon", + "coordinates": [ + [ + [139.37, 35.33], + [140.19, 35.33], + [140.19, 35.85], + [139.37, 35.85], + [139.37, 35.33], + ] + ], + }, + "bounding_box": [139.37, 35.33, 140.19, 35.85], + }, +] diff --git a/lambda/dataset_metadata_generator/tests/conftest.py b/lambda/dataset_metadata_generator/tests/conftest.py new file mode 100644 index 0000000..1ed9f43 --- /dev/null +++ b/lambda/dataset_metadata_generator/tests/conftest.py @@ -0,0 +1,40 @@ +""" +Test configuration class for the dataset metadata generator lambda's unit tests +""" + +import pytest + +from . import DATASETS, SITES + + +@pytest.fixture(autouse=True) +def aws_credentials(monkeypatch): + """Make sure we use monkeypatch env.""" + monkeypatch.setenv("DISABLE_CACHE", "YESPLEASE") + monkeypatch.setenv("AWS_ACCESS_KEY_ID", "jqt") + monkeypatch.setenv("AWS_SECRET_ACCESS_KEY", "rde") + monkeypatch.setenv("DATA_BUCKET_NAME", "covid-eo-data") + monkeypatch.setenv("DATASET_METADATA_FILENAME", "dev-dataset-metadata.json") + + +@pytest.fixture +def gather_datasets_metadata(): + """Yield the main function to unit test""" + # Why is this imported here? 
+ # See: https://github.com/spulec/moto#what-about-those-pesky-imports + + from ..src.main import _gather_datasets_metadata + + return _gather_datasets_metadata + + +@pytest.fixture +def datasets(): + """Dataset metadata items""" + return DATASETS + + +@pytest.fixture +def sites(): + """Site metadata items""" + return SITES diff --git a/lambda/dataset_metadata_generator/tests/test_metadata_generator.py b/lambda/dataset_metadata_generator/tests/test_metadata_generator.py new file mode 100644 index 0000000..4076cd4 --- /dev/null +++ b/lambda/dataset_metadata_generator/tests/test_metadata_generator.py @@ -0,0 +1,139 @@ +"""Test class for metadata generator lambda""" +from datetime import datetime + +import boto3 +from moto import mock_s3 + + +@mock_s3 +def _setup_s3(): + + s3 = boto3.resource("s3") + + bucket = s3.Bucket("covid-eo-data") + bucket.create() + s3_keys = [ + ("indicators/test/super.csv", b"test"), + ("xco2-mean/GOSAT_XCO2_2019_01_01_be_BG_circle_cog.tif", b"test"), + ("xco2-mean/GOSAT_XCO2_2019_04_01_be_BG_circle_cog.tif", b"test"), + ("xco2-mean/GOSAT_XCO2_2019_06_01_be_BG_circle_cog.tif", b"test"), + ("oc3_chla_anomaly/anomaly-chl-tk-2020_01_29.tif", b"test"), + ("oc3_chla_anomaly/anomaly-chl-tk-2020_02_05.tif", b"test"), + ("oc3_chla_anomaly/anomaly-chl-tk-2020_03_02.tif", b"test"), + ("bm_500m_daily/VNP46A2_V011_be_2020_01_01_cog.tif", b"test"), + ("bm_500m_daily/VNP46A2_V011_be_2020_02_29_cog.tif", b"test"), + ("bm_500m_daily/VNP46A2_V011_be_2020_03_20_cog.tif", b"test"), + ("bm_500m_daily/VNP46A2_V011_EUPorts_2020_01_01_cog.tif", b"test"), + ("bm_500m_daily/VNP46A2_V011_EUPorts_2020_02_29_cog.tif", b"test"), + ("bm_500m_daily/VNP46A2_V011_EUPorts_2020_03_20_cog.tif", b"test"), + ("bmhd_30m_monthly/BMHD_VNP46A2_du_202005_cog.tif", b"test"), + ("bmhd_30m_monthly/BMHD_VNP46A2_du_202006_cog.tif", b"test"), + ("bmhd_30m_monthly/BMHD_VNP46A2_du_202007_cog.tif", b"test"), + ("OMNO2d_HRM/OMI_trno2_0.10x0.10_200401_Col3_V4.nc.tif", b"test"), + ("OMNO2d_HRM/OMI_trno2_0.10x0.10_200708_Col3_V4.nc.tif", b"test"), + ("OMNO2d_HRM/OMI_trno2_0.10x0.10_200901_Col3_V4.nc.tif", b"test"), + ("detections-plane/ny/2020_01_09.geojson", b"test"), + ("detections-plane/ny/2020_01_21.geojson", b"test"), + ("detections-plane/ny/2020_02_02.geoson", b"test"), + ("detections-ship/ny/2020_01_09.geojson", b"test"), + ("detections-ship/ny/2020_01_21.geojson", b"test"), + ("detections-ship/ny/2020_02_02.geoson", b"test"), + ("indicators/test/super.csv", b"test"), + ] + for key, content in s3_keys: + bucket.put_object(Body=content, Key=key) + return bucket + + +@mock_s3 +def test_datasets(gather_datasets_metadata, datasets, sites): + """Tests for basic (/) query""" + + _setup_s3() + + content = gather_datasets_metadata(datasets, sites) + + assert content is not None + + assert "global" in content.keys() + assert "tk" in content.keys() + + +@mock_s3 +def test_global_datasets(gather_datasets_metadata, datasets, sites): + """Test for correct extraction of global datasets""" + + _setup_s3() + + content = gather_datasets_metadata(datasets, sites) + + assert content is not None + + assert "global" in content + assert set(content["global"].keys()) == {"co2"} + + assert "_all" in content + assert set(content["_all"].keys()) == { + "co2", + "detections-plane", + "nightlights-hd", + "nightlights-viirs", + "water-chlorophyll", + } + + +@mock_s3 +def test_periodic_daily_global_datasets(gather_datasets_metadata, datasets, sites): + """Test domain of periodic (domain only contains start and stop + date) global datasets""" + 
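Note: for reference, the object `_gather_datasets_metadata` returns for the mocked bucket above looks roughly like the sketch below (illustrative only, not part of the diff; the variable name is hypothetical, and the dates follow from the keys created in `_setup_s3()`). Periodic datasets keep only the min/max dates, non-periodic datasets keep every matched date; this is what the remaining assertions check against.

# Illustrative sketch of the generated metadata object for the mocked bucket.
EXPECTED_METADATA_SHAPE = {
    "_all": {
        "co2": {"domain": ["2019-01-01T00:00:00Z", "2019-06-01T00:00:00Z"]},  # periodic: [min, max]
        "detections-plane": {  # non-periodic: every matched date
            "domain": [
                "2020-01-09T00:00:00Z",
                "2020-01-21T00:00:00Z",
                "2020-02-02T00:00:00Z",
            ]
        },
        # nightlights-hd, nightlights-viirs and water-chlorophyll follow the same pattern
    },
    "global": {"co2": {"domain": ["2019-01-01T00:00:00Z", "2019-06-01T00:00:00Z"]}},
    "ny": {
        "detections-plane": {
            "domain": [
                "2020-01-09T00:00:00Z",
                "2020-01-21T00:00:00Z",
                "2020-02-02T00:00:00Z",
            ]
        }
    },
    "tk": {
        "water-chlorophyll": {
            "domain": [
                "2020-01-29T00:00:00Z",
                "2020-02-05T00:00:00Z",
                "2020-03-02T00:00:00Z",
            ]
        }
    },
    # "du"/"gh" also pick up keys tagged "EUPorts" (see test_euports_datasets below)
}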
+ _setup_s3() + + content = gather_datasets_metadata(datasets, sites) + + assert content is not None + + dataset_info = content["global"]["co2"] + + assert dataset_info["domain"][0] == datetime.strftime( + datetime(2019, 1, 1), "%Y-%m-%dT%H:%M:%SZ" + ) + assert dataset_info["domain"][1] == datetime.strftime( + datetime(2019, 6, 1), "%Y-%m-%dT%H:%M:%SZ" + ) + + +@mock_s3 +def test_non_periodic_daily_spotlight_dataset( + gather_datasets_metadata, datasets, sites +): + """Test non periodic (domain has all available dates) spotlight + sepecific datasets + """ + + _setup_s3() + + content = gather_datasets_metadata(datasets, sites) + + assert content is not None + assert "ny" in content + + dataset_info = content["ny"]["detections-plane"] + + assert len(dataset_info["domain"]) > 2 + + +@mock_s3 +def test_euports_datasets(gather_datasets_metadata, datasets, sites): + """Test that an EUPorts datasets (du) searchs both for it's own spotlight id + AND EUPorts""" + + _setup_s3() + + content = gather_datasets_metadata(datasets, sites) + + assert "du" in content + assert set(content["du"].keys()) == { + "nightlights-hd", + "nightlights-viirs", + } diff --git a/package.json b/package.json new file mode 100644 index 0000000..0967ef4 --- /dev/null +++ b/package.json @@ -0,0 +1 @@ +{} diff --git a/setup.py b/setup.py index c875435..08ab014 100644 --- a/setup.py +++ b/setup.py @@ -14,6 +14,7 @@ "email-validator", "fiona", "shapely", + "rasterio==1.1.8", "rasterstats", "geojson-pydantic", "boto3", @@ -25,15 +26,16 @@ "deploy": [ "docker", "attrs", - "aws-cdk.core==1.72.0", - "aws-cdk.aws_lambda==1.72.0", - "aws-cdk.aws_apigatewayv2==1.72.0", - "aws-cdk.aws_ecs==1.72.0", - "aws-cdk.aws_ec2==1.72.0", - "aws-cdk.aws_autoscaling==1.72.0", - "aws-cdk.aws_ecs_patterns==1.72.0", - "aws-cdk.aws_iam==1.72.0", - "aws-cdk.aws_elasticache==1.72.0", + "aws-cdk.core>=1.72.0", + "aws-cdk.aws_lambda>=1.72.0", + "aws-cdk.aws_apigatewayv2>=1.72.0", + "aws-cdk.aws_apigatewayv2_integrations>=1.72.0", + "aws-cdk.aws_ecs>=1.72.0", + "aws-cdk.aws_ec2>=1.72.0", + "aws-cdk.aws_autoscaling>=1.72.0", + "aws-cdk.aws_ecs_patterns>=1.72.0", + "aws-cdk.aws_iam>=1.72.0", + "aws-cdk.aws_elasticache>=1.72.0", ], "test": ["moto[iam]", "mock", "pytest", "pytest-cov", "pytest-asyncio", "requests"], } @@ -41,7 +43,7 @@ setup( name="covid_api", - version="0.3.5", + version="0.4.0", description=u"", long_description=long_description, long_description_content_type="text/markdown", diff --git a/stack/app.py b/stack/app.py index 9581c36..d804173 100644 --- a/stack/app.py +++ b/stack/app.py @@ -1,16 +1,21 @@ """Construct App.""" import os +import shutil from typing import Any, Union import config + +# import docker from aws_cdk import aws_apigatewayv2 as apigw +from aws_cdk import aws_apigatewayv2_integrations as apigw_integrations from aws_cdk import aws_ec2 as ec2 from aws_cdk import aws_ecs as ecs from aws_cdk import aws_ecs_patterns as ecs_patterns from aws_cdk import aws_elasticache as escache +from aws_cdk import aws_events, aws_events_targets from aws_cdk import aws_iam as iam -from aws_cdk import aws_lambda, core +from aws_cdk import aws_lambda, aws_s3, core iam_policy_statement = iam.PolicyStatement( actions=["s3:*"], resources=[f"arn:aws:s3:::{config.BUCKET}*"] @@ -44,18 +49,23 @@ def __init__( self, scope: core.Construct, id: str, + dataset_metadata_filename: str, + dataset_metadata_generator_function_name: str, memory: int = 1024, timeout: int = 30, concurrent: int = 100, - env: dict = {}, code_dir: str = "./", **kwargs: Any, ) -> None: 
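Note: a sketch (not part of the diff, names are illustrative) of why the `*kwargs` to `**kwargs` fix just below and the new `env=` arguments passed in `stack/app.py` belong together: `ec2.Vpc.from_lookup` can only resolve an existing VPC on a stack whose account and region are known at synth time, and the lookup result is what gets cached in the new `cdk.context.json`.

# Minimal sketch of the environment-aware stack pattern this diff adopts.
import os

from aws_cdk import aws_ec2 as ec2
from aws_cdk import core


class ExampleVpcLookupStack(core.Stack):
    def __init__(self, scope: core.Construct, id: str, **kwargs) -> None:
        # **kwargs (not *kwargs) so that env= actually reaches core.Stack
        super().__init__(scope, id, **kwargs)
        # from_lookup needs a concrete account/region; results are cached in cdk.context.json
        self.vpc = ec2.Vpc.from_lookup(self, f"{id}-vpc", vpc_id=os.environ["VPC_ID"])


app = core.App()
ExampleVpcLookupStack(
    app,
    "example-vpc-lookup",
    env=dict(
        account=os.environ["CDK_DEFAULT_ACCOUNT"],
        region=os.environ["CDK_DEFAULT_REGION"],
    ),
)
app.synth()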
"""Define stack.""" - super().__init__(scope, id, *kwargs) + super().__init__(scope, id, **kwargs) # add cache - vpc = ec2.Vpc(self, f"{id}-vpc") + if config.VPC_ID: + vpc = ec2.Vpc.from_lookup(self, f"{id}-vpc", vpc_id=config.VPC_ID,) + else: + vpc = ec2.Vpc(self, f"{id}-vpc") + sb_group = escache.CfnSubnetGroup( self, f"{id}-subnet-group", @@ -95,6 +105,9 @@ def __init__( LOG_LEVEL="error", MEMCACHE_HOST=cache.attr_configuration_endpoint_address, MEMCACHE_PORT=cache.attr_configuration_endpoint_port, + DATASET_METADATA_FILENAME=dataset_metadata_filename, + DATASET_METADATA_GENERATOR_FUNCTION_NAME=dataset_metadata_generator_function_name, + PLANET_API_KEY=os.environ["PLANET_API_KEY"], ) ) @@ -117,30 +130,24 @@ def __init__( apigw.HttpApi( self, f"{id}-endpoint", - default_integration=apigw.LambdaProxyIntegration(handler=lambda_function), + default_integration=apigw_integrations.LambdaProxyIntegration( + handler=lambda_function + ), ) def create_package(self, code_dir: str) -> aws_lambda.Code: """Build docker image and create package.""" - # print('building lambda package via docker') - # print(f'code dir: {code_dir}') - # client = docker.from_env() - # print('docker client up') - # client.images.build( - # path=code_dir, - # dockerfile="Dockerfiles/lambda/Dockerfile", - # tag="lambda:latest", - # ) - # print('docker image built') - # client.containers.run( - # image="lambda:latest", - # command="/bin/sh -c 'cp /tmp/package.zip /local/package.zip'", - # remove=True, - # volumes={os.path.abspath(code_dir): {"bind": "/local/", "mode": "rw"}}, - # user=0, - # ) - return aws_lambda.Code.asset(os.path.join(code_dir, "package.zip")) + return aws_lambda.Code.from_asset( + path=os.path.abspath(code_dir), + bundling=core.BundlingOptions( + image=core.BundlingDockerImage.from_asset( + path=os.path.abspath(code_dir), + file="Dockerfiles/lambda/Dockerfile", + ), + command=["bash", "-c", "cp -R /var/task/. 
/asset-output/."], + ), + ) class covidApiECSStack(core.Stack): @@ -154,14 +161,18 @@ def __init__( memory: Union[int, float] = 512, mincount: int = 1, maxcount: int = 50, - env: dict = {}, + task_env: dict = {}, code_dir: str = "./", **kwargs: Any, ) -> None: """Define stack.""" - super().__init__(scope, id, *kwargs) + super().__init__(scope, id, **kwargs) - vpc = ec2.Vpc(self, f"{id}-vpc", max_azs=2) + # add cache + if config.VPC_ID: + vpc = ec2.Vpc.from_lookup(self, f"{id}-vpc", vpc_id=config.VPC_ID,) + else: + vpc = ec2.Vpc(self, f"{id}-vpc") cluster = ecs.Cluster(self, f"{id}-cluster", vpc=vpc) @@ -174,7 +185,7 @@ def __init__( LOG_LEVEL="error", ) ) - task_env.update(env) + task_env.update(task_env) fargate_service = ecs_patterns.ApplicationLoadBalancedFargateService( self, @@ -223,8 +234,96 @@ def __init__( ) +class covidApiDatasetMetadataGeneratorStack(core.Stack): + """Dataset metadata generator stack - comprises a lambda and a Cloudwatch + event that triggers a new lambda execution every 24hrs""" + + def __init__( + self, + scope: core.Construct, + id: str, + dataset_metadata_filename: str, + dataset_metadata_generator_function_name: str, + code_dir: str = "./", + **kwargs: Any, + ) -> None: + """Define stack.""" + super().__init__(scope, id, *kwargs) + + base = os.path.abspath(os.path.join("covid_api", "db", "static")) + lambda_deployment_package_location = os.path.abspath( + os.path.join(code_dir, "lambda", "dataset_metadata_generator") + ) + for e in ["datasets", "sites"]: + self.copy_metadata_files_to_lambda_deployment_package( + from_dir=os.path.join(base, e), + to_dir=os.path.join(lambda_deployment_package_location, "src", e), + ) + + data_bucket = aws_s3.Bucket.from_bucket_name( + self, id=f"{id}-data-bucket", bucket_name=config.BUCKET + ) + + dataset_metadata_updater_function = aws_lambda.Function( + self, + f"{id}-metadata-updater-lambda", + runtime=aws_lambda.Runtime.PYTHON_3_8, + code=aws_lambda.Code.from_asset(lambda_deployment_package_location), + handler="src.main.handler", + environment={ + "DATASET_METADATA_FILENAME": dataset_metadata_filename, + "DATA_BUCKET_NAME": data_bucket.bucket_name, + }, + function_name=dataset_metadata_generator_function_name, + timeout=core.Duration.minutes(5), + ) + + for e in ["datasets", "sites"]: + shutil.rmtree(os.path.join(lambda_deployment_package_location, "src", e)) + + data_bucket.grant_read_write(dataset_metadata_updater_function) + + aws_events.Rule( + self, + f"{id}-metadata-update-daily-trigger", + # triggers everyday + schedule=aws_events.Schedule.rate(duration=core.Duration.days(1)), + targets=[ + aws_events_targets.LambdaFunction(dataset_metadata_updater_function) + ], + ) + + def copy_metadata_files_to_lambda_deployment_package(self, from_dir, to_dir): + """Copies dataset metadata files to the lambda deployment package + so that the dataset domain extractor lambda has access to the necessary + metadata items at runtime + Params: + ------- + from_dir (str): relative filepath from which to copy all `.json` files + to_dir (str): relative filepath to copy `.json` files to + Return: + ------- + None + """ + files = [ + os.path.abspath(os.path.join(d, f)) + for d, _, fnames in os.walk(from_dir) + for f in fnames + if f.endswith(".json") + ] + + try: + os.mkdir(to_dir) + except FileExistsError: + pass + + for f in files: + shutil.copy(f, to_dir) + + app = core.App() + # Tag infrastructure for key, value in { "Project": config.PROJECT_NAME, @@ -243,7 +342,11 @@ def __init__( memory=config.TASK_MEMORY, 
     mincount=config.MIN_ECS_INSTANCES,
     maxcount=config.MAX_ECS_INSTANCES,
-    env=config.ENV,
+    extra_env=config.TASK_ENV,
+    env=dict(
+        account=os.environ["CDK_DEFAULT_ACCOUNT"],
+        region=os.environ["CDK_DEFAULT_REGION"],
+    ),
 )
 
 lambda_stackname = f"{config.PROJECT_NAME}-lambda-{config.STAGE}"
@@ -253,7 +356,22 @@ def __init__(
     memory=config.MEMORY,
     timeout=config.TIMEOUT,
     concurrent=config.MAX_CONCURRENT,
-    env=config.ENV,
+    dataset_metadata_filename=config.DATASET_METADATA_FILENAME,
+    dataset_metadata_generator_function_name=config.DATASET_METADATA_GENERATOR_FUNCTION_NAME,
+    env=dict(
+        account=os.environ["CDK_DEFAULT_ACCOUNT"],
+        region=os.environ["CDK_DEFAULT_REGION"],
+    ),
+)
+
+dataset_metadata_generator_stackname = (
+    f"{config.PROJECT_NAME}-dataset-metadata-generator-{config.STAGE}"
+)
+covidApiDatasetMetadataGeneratorStack(
+    app,
+    dataset_metadata_generator_stackname,
+    dataset_metadata_filename=config.DATASET_METADATA_FILENAME,
+    dataset_metadata_generator_function_name=config.DATASET_METADATA_GENERATOR_FUNCTION_NAME,
 )
 
 app.synth()
diff --git a/stack/config.py b/stack/config.py
index af8d068..81c73f1 100644
--- a/stack/config.py
+++ b/stack/config.py
@@ -9,7 +9,12 @@
 BUCKET = "covid-eo-data"
 
 # Additional environement variable to set in the task/lambda
-ENV: dict = dict()
+TASK_ENV: dict = dict()
+
+# Existing VPC to point ECS/LAMBDA stacks towards. Defaults to creating a new
+# VPC if no ID is supplied.
+VPC_ID = os.environ.get("VPC_ID")
+
 
 ################################################################################
 #                                                                              #
@@ -42,3 +47,17 @@
 CACHE_NODE_TYPE = "cache.m5.large"
 CACHE_ENGINE = "memcached"
 CACHE_NODE_NUM = 1
+
+################################################################################
+#                                                                              #
+#                          DATASET METADATA GENERATOR                          #
+#                                                                              #
+################################################################################
+DATASET_METADATA_FILENAME = (
+    "dataset-metadata.json" if STAGE == "prod" else "dev-dataset-metadata.json"
+)
+DATASET_METADATA_GENERATOR_FUNCTION_NAME = (
+    "dataset-metadata-generator"
+    if STAGE == "prod"
+    else "dev-dataset-metadata-generator"
+)
diff --git a/tests/conftest.py b/tests/conftest.py
index f5329d4..e2b6285 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -10,11 +10,16 @@
 
 
 @pytest.fixture(autouse=True)
-def app(monkeypatch) -> TestClient:
-    """Make sure we use monkeypatch env."""
+def aws_credentials(monkeypatch):
     monkeypatch.setenv("DISABLE_CACHE", "YESPLEASE")
     monkeypatch.setenv("AWS_ACCESS_KEY_ID", "jqt")
     monkeypatch.setenv("AWS_SECRET_ACCESS_KEY", "rde")
+
+
+@pytest.fixture
+def app() -> TestClient:
+    """Return a TestClient for the covid_api app."""
+
     from covid_api.main import app
 
     return TestClient(app)
@@ -25,3 +30,10 @@
     prefix = os.path.join(os.path.dirname(__file__), "fixtures")
     assert src_path.startswith("https://myurl.com/")
     return rasterio.open(os.path.join(prefix, "cog.tif"))
+
+
+@pytest.fixture
+def dataset_manager():
+    from covid_api.db.static.datasets import DatasetManager
+
+    return DatasetManager
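The conftest.py changes above split environment setup from app creation: `aws_credentials` is an autouse fixture, so the fake AWS keys and `DISABLE_CACHE` are already in place when the `app` fixture imports `covid_api.main`, and `dataset_manager` likewise defers its import until moto is active. Below is a minimal sketch of how a test is expected to combine these fixtures; the test itself is hypothetical, only the fixture names come from conftest.py.

    from moto import mock_s3


    @mock_s3
    def test_fixture_wiring_example(app, dataset_manager):
        # `aws_credentials` has already run (autouse), so any boto3 client created
        # from here on talks to moto's fake AWS rather than a real account.
        assert app is not None

        # `dataset_manager` returns the DatasetManager class, not an instance, so
        # each test decides when to instantiate it inside the mocked context.
        assert callable(dataset_manager)
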
diff --git a/tests/routes/v1/test_datasets.py b/tests/routes/v1/test_datasets.py
index 207d51c..85c5021 100644
--- a/tests/routes/v1/test_datasets.py
+++ b/tests/routes/v1/test_datasets.py
@@ -1,180 +1,130 @@
 """Test /v1/datasets endpoints"""
+
 import json
-from datetime import datetime
+from unittest.mock import patch
 
 import boto3
+import botocore
 from moto import mock_s3
 
 from covid_api.core.config import INDICATOR_BUCKET
 
+DATASET_METADATA_FILENAME = "dev-dataset-metadata.json"
+DATASET_METADATA_GENERATOR_FUNCTION_NAME = "dev-dataset-metadata-generator"
+
 
 @mock_s3
 def _setup_s3(empty=False):
-    s3 = boto3.client("s3")
-    s3.create_bucket(Bucket=INDICATOR_BUCKET)
-
+    s3 = boto3.resource("s3")
+    bucket = s3.Bucket(INDICATOR_BUCKET)
+    bucket.create()
     if empty:
-        return s3
-
+        return bucket
     s3_keys = [
-        "xco2-mean/GOSAT_XCO2_201901_be_BG_circle_cog.tif",
-        "xco2-mean/GOSAT_XCO2_201904_be_BG_circle_cog.tif",
-        "xco2-mean/GOSAT_XCO2_201906_be_BG_circle_cog.tif",
-        "oc3_chla_anomaly/anomaly-chl-tk-2020_01_29.tif",
-        "oc3_chla_anomaly/anomaly-chl-tk-2020_02_05.tif",
-        "oc3_chla_anomaly/anomaly-chl-tk-2020_03_02.tif",
-        "bm_500m_daily/VNP46A2_V011_be_2020_01_01_cog.tif",
-        "bm_500m_daily/VNP46A2_V011_be_2020_02_29_cog.tif",
-        "bm_500m_daily/VNP46A2_V011_be_2020_03_20_cog.tif",
-        "bm_500m_daily/VNP46A2_V011_EUPorts_2020_01_01_cog.tif",
-        "bm_500m_daily/VNP46A2_V011_EUPorts_2020_02_29_cog.tif",
-        "bm_500m_daily/VNP46A2_V011_EUPorts_2020_03_20_cog.tif",
-        "bmhd_30m_monthly/BMHD_VNP46A2_du_202005_cog.tif",
-        "bmhd_30m_monthly/BMHD_VNP46A2_du_202006_cog.tif",
-        "bmhd_30m_monthly/BMHD_VNP46A2_du_202007_cog.tif",
-        "OMNO2d_HRM/OMI_trno2_0.10x0.10_200401_Col3_V4.nc.tif",
-        "OMNO2d_HRM/OMI_trno2_0.10x0.10_200708_Col3_V4.nc.tif",
-        "OMNO2d_HRM/OMI_trno2_0.10x0.10_200901_Col3_V4.nc.tif",
-        "detections-plane/ny/2020_01_09.geojson",
-        "detections-plane/ny/2020_01_21.geojson",
-        "detections-plane/ny/2020_02_02.geoson",
-        "detections-ship/ny/2020_01_09.geojson",
-        "detections-ship/ny/2020_01_21.geojson",
-        "detections-ship/ny/2020_02_02.geoson",
-        "indicators/test/super.csv",
+        ("indicators/test/super.csv", b"test"),
+        (
+            DATASET_METADATA_FILENAME,
+            json.dumps(
+                {
+                    "_all": {
+                        "co2": {
+                            "domain": ["2019-01-01T00:00:00Z", "2020-01-01T00:00:00Z"]
+                        },
+                        "detections-plane": {
+                            "domain": [
+                                "2019-01-01T00:00:00Z",
+                                "2019-10-10T00:00:00Z",
+                                "2020-01-01T00:00:00Z",
+                            ]
+                        },
+                    },
+                    "global": {
+                        "co2": {
+                            "domain": ["2019-01-01T00:00:00Z", "2020-01-01T00:00:00Z"]
+                        }
+                    },
+                    "tk": {
+                        "detections-plane": {
+                            "domain": [
+                                "2019-01-01T00:00:00Z",
+                                "2019-10-10T00:00:00Z",
+                                "2020-01-01T00:00:00Z",
+                            ]
+                        }
+                    },
+                    "ny": {
+                        "detections-ship": {
+                            "domain": [
+                                "2019-01-01T00:00:00Z",
+                                "2019-10-10T00:00:00Z",
+                                "2020-01-01T00:00:00Z",
+                            ]
+                        }
+                    },
+                }
+            ),
+        ),
     ]
-    for key in s3_keys:
-        s3.put_object(
-            Bucket=INDICATOR_BUCKET, Key=key, Body=b"test",
-        )
-
-    return s3
-
-
-@mock_s3
-def test_databases(app):
-
-    _setup_s3()
-
-    response = app.get("/v1/datasets")
-
-    assert response.status_code == 200
-
-    content = json.loads(response.content)
-    assert "datasets" in content
-    assert len(content["datasets"]) > 0
-
-
-@mock_s3
-def test_datasets_monthly(app):
-
-    _setup_s3()
-
-    response = app.get("/v1/datasets/be")
-    assert response.status_code == 200
-
-    content = json.loads(response.content)
-    assert "datasets" in content
-
-    dataset_info = [d for d in content["datasets"] if d["id"] == "co2"][0]
-    assert dataset_info["domain"][0] == datetime.strftime(
-        datetime(2019, 1, 1), "%Y-%m-%dT%H:%M:%SZ"
-    )
-    assert dataset_info["domain"][1] == datetime.strftime(
-        datetime(2019, 6, 1), "%Y-%m-%dT%H:%M:%SZ"
-    )
+    for key, content in s3_keys:
+        bucket.put_object(Body=content, Key=key)
+    return bucket
 
 
 @mock_s3
-def test_euports_dataset(app):
+def test_metadata_file_generation_triggered_if_not_found(
+    app, dataset_manager, monkeypatch
+):
 
-    _setup_s3()
-
-    response = app.get("/v1/datasets/du")
-    assert response.status_code == 200
-
-    content = json.loads(response.content)
-    assert "datasets" in content
-
-    dataset_info = [d for d in content["datasets"] if d["id"] == "nightlights-hd"][0]
-    assert dataset_info["domain"][0] == datetime.strftime(
-        datetime(2020, 5, 1), "%Y-%m-%dT%H:%M:%SZ"
-    )
-    assert dataset_info["domain"][1] == datetime.strftime(
-        datetime(2020, 7, 1), "%Y-%m-%dT%H:%M:%SZ"
-    )
-
-    assert "_du_" in dataset_info["source"]["tiles"][0]
-
-    # Dunkirk has two different datasets under two different spotlight names:
-    # "du" and "EUports" - both need to be tested individually
+    _setup_s3(empty=True)
 
-    dataset_info = [d for d in content["datasets"] if d["id"] == "nightlights-viirs"][0]
-    assert "_EUPorts_" in dataset_info["source"]["tiles"][0]
+    with patch("covid_api.db.static.datasets.invoke_lambda") as mocked_invoke_lambda:
+
+        mocked_invoke_lambda.return_value = {"result": "success"}
+        # Loading the datasets will invoke the mocked lambda and then attempt to
+        # load the metadata file from S3 once the lambda has finished executing.
+        # Since the mocked lambda doesn't actually write anything to S3, that read
+        # will fail. This is fine here: the test only needs to ascertain that the
+        # lambda was in fact triggered.
+        try:
+            dataset_manager()._load_domain_metadata()
+        except botocore.exceptions.ClientError as e:
+            if e.response["Error"]["Code"] == "404":
+                pass
+
+        mocked_invoke_lambda.assert_called_with(
+            lambda_function_name=DATASET_METADATA_GENERATOR_FUNCTION_NAME
+        )
 
 
 @mock_s3
-def test_detections_datasets(app):
-    """test /datasets endpoint"""
-
-    # aws mocked resources
+def test_datasets(app):
     _setup_s3()
+    response = app.get("/v1/datasets")
 
-    response = app.get("v1/datasets/ny")
     assert response.status_code == 200
-
     content = json.loads(response.content)
-    assert "datasets" in content
-    dataset_info = [d for d in content["datasets"] if d["id"] == "detections-plane"][0]
-    assert len(dataset_info["domain"]) > 2
+    assert "co2" in [d["id"] for d in content["datasets"]]
+    assert "detections-plane" in [d["id"] for d in content["datasets"]]
 
 
 @mock_s3
-def test_datasets_daily(app):
-    """test /datasets endpoint"""
-
-    # aws mocked resources
+def test_spotlight_datasets(app):
     _setup_s3()
+    response = app.get("/v1/datasets/tk")
 
-    response = app.get("/v1/datasets/tk")
     assert response.status_code == 200
 
     content = json.loads(response.content)
-    assert "datasets" in content
-
-    dataset_info = [d for d in content["datasets"] if d["id"] == "water-chlorophyll"][0]
-    assert len(dataset_info["domain"]) > 2
-    assert dataset_info["domain"][0] == datetime.strftime(
-        datetime(2020, 1, 29), "%Y-%m-%dT%H:%M:%SZ"
-    )
-    assert dataset_info["domain"][-1] == datetime.strftime(
-        datetime(2020, 3, 2), "%Y-%m-%dT%H:%M:%SZ"
-    )
-
-    assert "&rescale=-100%2C100" not in dataset_info["source"]["tiles"][0]
+    assert "co2" in [d["id"] for d in content["datasets"]]
+    assert "detections-plane" in [d["id"] for d in content["datasets"]]
+    assert "detections-ship" not in [d["id"] for d in content["datasets"]]
 
 
 @mock_s3
-def test_global_datasets(app):
-    """test /datasets endpoint"""
-
-    # aws mocked resources
+def test_incorrect_dataset_id(app):
     _setup_s3()
-    response = app.get("/v1/datasets/global")
-    assert response.status_code == 200
-
-    content = json.loads(response.content)
-    assert "datasets" in content
-
-    dataset_info = [d for d in content["datasets"] if d["id"] == "no2"][0]
-    assert len(dataset_info["domain"]) == 2
-
-
-@mock_s3
-def test_incorrect_dataset_id(app):
-    _setup_s3(empty=True)
     response = app.get("/v1/datasets/NOT_A_VALID_DATASET")
     assert response.status_code == 404
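The new test_metadata_file_generation_triggered_if_not_found above exercises a look-up-or-generate flow inside `_load_domain_metadata`: read the pre-generated metadata file from S3 and, if it is missing, invoke the generator lambda and read again. The sketch below is illustrative only — the real logic lives in covid_api/db/static/datasets/__init__.py and the boto3 wiring here is an assumption; only `invoke_lambda`, its `lambda_function_name` keyword, and the config names are taken from this diff.

    import json

    import boto3
    import botocore

    from covid_api.core import config
    from covid_api.db.static.datasets import invoke_lambda


    def load_domain_metadata_sketch() -> dict:
        """Fetch the dataset metadata file, regenerating it if it is absent."""
        s3 = boto3.client("s3")
        bucket, key = config.INDICATOR_BUCKET, config.DATASET_METADATA_FILENAME
        try:
            obj = s3.get_object(Bucket=bucket, Key=key)
        except botocore.exceptions.ClientError:
            # No pre-generated file yet: have the generator lambda build it, then
            # read the freshly written object. This second read is what fails in
            # the test above, because the mocked lambda writes nothing to S3.
            invoke_lambda(
                lambda_function_name=config.DATASET_METADATA_GENERATOR_FUNCTION_NAME
            )
            obj = s3.get_object(Bucket=bucket, Key=key)
        return json.loads(obj["Body"].read())
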
diff --git a/tests/routes/v1/test_sites.py b/tests/routes/v1/test_sites.py
index 400d36a..0fbd3f6 100644
--- a/tests/routes/v1/test_sites.py
+++ b/tests/routes/v1/test_sites.py
@@ -6,31 +6,32 @@
 from covid_api.core.config import INDICATOR_BUCKET
 
 
+@mock_s3
+def _setup_s3():
+    s3 = boto3.resource("s3")
+    bucket = s3.Bucket(INDICATOR_BUCKET)
+    bucket.create()
+    s3_keys = [
+        ("indicators/test/super.csv", b"test"),
+    ]
+    for key, content in s3_keys:
+        bucket.put_object(Body=content, Key=key)
+    return bucket
+
+
 @mock_s3
 def test_sites(app):
     """test /sites endpoint"""
-    # aws mocked resources
-    s3 = boto3.client("s3")
-    s3.create_bucket(Bucket=INDICATOR_BUCKET)
-    s3.put_object(
-        Bucket=INDICATOR_BUCKET, Key="indicators/test/super.csv", Body=b"test"
-    )
-
+    _setup_s3()
     response = app.get("/v1/sites")
     assert response.status_code == 200
 
 
 @mock_s3
 def test_site_id(app):
     """test /sites/{id} endpoint"""
-    # aws mocked resources
-    s3 = boto3.client("s3")
-    s3.create_bucket(Bucket=INDICATOR_BUCKET)
-    s3.put_object(
-        Bucket=INDICATOR_BUCKET, Key="indicators/test/super.csv", Body=b"test"
-    )
-
+    _setup_s3()
     response = app.get("/v1/sites/be")
     assert response.status_code == 200
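Beyond the daily CloudWatch Events rule defined in the stack, the generator lambda can also be invoked by hand when the metadata file needs an immediate refresh. This is an optional convenience rather than part of the change set; the function name below assumes a non-prod deployment (stack/config.py uses "dataset-metadata-generator" for prod), and us-east-1 is assumed as the deployment region.

    import boto3

    client = boto3.client("lambda", region_name="us-east-1")
    client.invoke(
        FunctionName="dev-dataset-metadata-generator",
        InvocationType="Event",  # asynchronous, same as the scheduled trigger
    )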