diff --git a/.gitignore b/.gitignore
index c8da3fc..c1e1558 100755
--- a/.gitignore
+++ b/.gitignore
@@ -6,14 +6,16 @@ data
 incoming_data
 zenodo
 *.qgz
+*.bak
 
 # Secrets
 *_TOKEN
+CDSAPI_*
 
 # Python
 __pycache__
 *.py[c,i]
 *.egg-info
 
-# Mac
-._*
\ No newline at end of file
+# Mac
+._*
diff --git a/README.md b/README.md
index dec664f..93b9330 100644
--- a/README.md
+++ b/README.md
@@ -36,19 +36,28 @@ micromamba activate datapkg
 
 The data packages are produced using a [`snakemake`](https://snakemake.readthedocs.io/) workflow.
 
-The workflow expects `ZENODO_TOKEN` to be set as an environment variable - this
-must be set before running any workflow steps.
+The workflow expects `ZENODO_TOKEN`, `CDSAPI_KEY` and `CDSAPI_URL` to be set as
+environment variables - these must be set before running any workflow steps.
 
-If not interacting with Zenodo, this can be a dummy string:
+If not interacting with Zenodo or the Copernicus Climate Data Store, these can
+be dummy strings:
 
 ```bash
 echo "placeholder" > ZENODO_TOKEN
+echo "https://cds-beta.climate.copernicus.eu/api" > CDSAPI_URL
+echo "test" > CDSAPI_KEY
 ```
 
+See [Climate Data Store API
+docs](https://cds-beta.climate.copernicus.eu/how-to-api#use-the-cds-api-client-for-data-access)
+and [Zenodo API docs](https://developers.zenodo.org/#introduction) for access details.
+
-Export from the file to the environment:
+Export from the files to the environment:
 
 ```bash
 export ZENODO_TOKEN=$(cat ZENODO_TOKEN)
+export CDSAPI_KEY=$(cat CDSAPI_KEY)
+export CDSAPI_URL=$(cat CDSAPI_URL)
 ```
 
 Check what will be run, if we ask for everything produced by the rule `all`,
diff --git a/Snakefile b/Snakefile
old mode 100644
new mode 100755
index aab754e..38bf703
--- a/Snakefile
+++ b/Snakefile
@@ -1,5 +1,6 @@
 import json
 import shutil
+from datetime import datetime
 from pathlib import Path
 from glob import glob
 
@@ -9,9 +10,9 @@ import pandas
 import requests
 import shapely
 
-DATAPKG_VERSION = "0.1.0"
-# ZENODO_URL = "sandbox.zenodo.org"
-ZENODO_URL = "zenodo.org"
+DATAPKG_VERSION = "0.2.0"
+ZENODO_URL = "sandbox.zenodo.org"
+# ZENODO_URL = "zenodo.org"
 
 BOUNDARIES = irv_datapkg.read_boundaries(Path("."))
 BOUNDARY_LU = BOUNDARIES.set_index("CODE_A3")
@@ -19,6 +20,8 @@ BOUNDARY_LU = BOUNDARIES.set_index("CODE_A3")
 
 envvars:
     "ZENODO_TOKEN",
+    "CDSAPI_URL",
+    "CDSAPI_KEY"
 
 
 def boundary_geom(iso3):
@@ -73,12 +76,14 @@ rule checksums:
         "data/{ISO3}/openstreetmap/openstreetmap_roads-tertiary__{ISO3}.gpkg",
         "data/{ISO3}/storm.csv",
         "data/{ISO3}/wri_powerplants/wri-powerplants__{ISO3}.gpkg",
+        "data/{ISO3}/copernicus_lulc/copernicus_lulc__{ISO3}.tif",
+        "data/{ISO3}/copernicus_dem/copernicus_dem__{ISO3}.tif",
     output:
         checksums="data/{ISO3}/md5sum.txt",
     shell:
         """
         cd data/{wildcards.ISO3}
-        md5sum **/*.* | grep "tif\|gpkg" | sort -k 2 > md5sum.txt
+        md5sum **/*.* | grep "tif\\|gpkg" | sort -k 2 > md5sum.txt
         """
 
 
@@ -113,4 +118,6 @@ include: "rules/jrc_ghsl.smk"
 include: "rules/openstreetmap.smk"
 include: "rules/storm.smk"
 include: "rules/wri_powerplants.smk"
+include: "rules/copernicus_lulc.smk"
+include: "rules/copernicus_dem.smk"
 include: "rules/zenodo.smk"
diff --git a/environment.yml b/environment.yml
index ae189e2..5e1a6c5 100644
--- a/environment.yml
+++ b/environment.yml
@@ -1,18 +1,21 @@
 name: datapkg
 channels:
-  - conda-forge
-  - defaults
+  - nodefaults
+  - bioconda # for snakemake
+  - conda-forge # for most other packages
 dependencies:
-  - python=3.11
+  - python=3.12
   - pip
   - pip:
-      - zenodo_get>=1.5.1 # download from Zenodo
+      - zenodo_get>=1.6.1 # download from Zenodo
+      - cdsapi>=0.7.2 # copernicus data api
       - -e . # irv_datapkg helper
+  - awscli # connect to AWS, download from S3
   - black # Python formatting
   - gdal>=3.3 # command-line tools for spatial data
-  - geopandas>=0.14.0 # geospatial dataframes
+  - geopandas>=1.0.1 # geospatial dataframes
   - osmium-tool==1.16.0 # openstreetmap extracts
   - pyyaml # read YAML files
   - pyogrio # faster geospatial i/o
-  - bioconda::snakemake==7.32.4 # workflow management
-  - bioconda::snakefmt # Snakefile formatting
+  - snakemake==8.25.5 # workflow management
+  - snakefmt # Snakefile formatting
diff --git a/metadata/copernicus_dem.yml b/metadata/copernicus_dem.yml
new file mode 100755
index 0000000..47252d8
--- /dev/null
+++ b/metadata/copernicus_dem.yml
@@ -0,0 +1,29 @@
+# yaml-language-server: $schema=irv-datapkg-schema.json
+name: copernicus_dem
+description: Copernicus 90m Digital Elevation Model
+version: 2023_1
+dataset_name: copernicus_glo90
+data_author: European Union and ESA
+data_title: Copernicus GLO-90
+data_title_long: Copernicus Global Digital Elevation Model (WorldDEM GLO-90)
+
+data_summary: >
+  The Copernicus DEM is a Digital Surface Model (DSM) which represents the
+  surface of the Earth including buildings, infrastructure and vegetation.
+  GLO-90 provides worldwide coverage at 90 meters. Note that ocean areas do not
+  have tiles; there, height values can be assumed equal to zero. Data is provided
+  as Cloud Optimized GeoTIFFs and comes from the Copernicus DEM 2021 release.
+
+data_citation: >
+  Copernicus DEM - Global Digital Elevation Model (2021)
+  https://doi.org/10.5270/ESA-c5d3d65 (produced using Copernicus WorldDEM™-90 ©
+  DLR e.V. 2010-2014 and © Airbus Defence and Space GmbH 2014-2018 provided
+  under COPERNICUS by the European Union and ESA; all rights reserved)
+
+data_license:
+  name: Copernicus-DEM-EULA
+  title: Licence for Copernicus DEM instance COP-DEM-GLO-90-F Global 90m Full, Free & Open
+  path: https://esa.maps.eox.at/copernicus-dem_eula.pdf
+
+data_origin_url: https://dataspace.copernicus.eu/explore-data/data-collections/copernicus-contributing-missions/collections-description/COP-DEM
+data_formats: ["GeoTIFF"]
diff --git a/metadata/copernicus_lulc.yml b/metadata/copernicus_lulc.yml
new file mode 100755
index 0000000..796e1bc
--- /dev/null
+++ b/metadata/copernicus_lulc.yml
@@ -0,0 +1,65 @@
+# yaml-language-server: $schema=irv-datapkg-schema.json
+name: copernicus_lulc
+description: Copernicus Land Cover Classification
+version: v2.1.1
+dataset_name: copernicus_lulc
+data_author: Copernicus Climate Change Service, Climate Data Store
+data_title: Copernicus Land Cover Classification
+data_title_long: Land cover classification gridded maps from 1992 to present derived from satellite observations
+data_summary: >
+  This dataset provides global maps describing the land surface into 22 classes,
+  which have been defined using the United Nations Food and Agriculture
+  Organization's (UN FAO) Land Cover Classification System (LCCS). In addition
+  to the land cover (LC) maps, four quality flags are produced to document the
+  reliability of the classification and change detection.
+
+  In order to ensure continuity, these land cover maps are consistent with the
+  series of global annual LC maps from the 1990s to 2015 produced by the
+  European Space Agency (ESA) Climate Change Initiative (CCI), which are also
+  available on the ESA CCI LC viewer.
+
+  To produce this dataset, the entire Medium Resolution Imaging Spectrometer
+  (MERIS) Full and Reduced Resolution archive from 2003 to 2012 was first
+  classified into a unique 10-year baseline LC map. This is then back- and
+  up-dated using change detected from (i) Advanced Very-High-Resolution
+  Radiometer (AVHRR) time series from 1992 to 1999, (ii) SPOT-Vegetation
+  (SPOT-VGT) time series from 1998 to 2012 and (iii) PROBA-Vegetation (PROBA-V)
+  and Sentinel-3 OLCI (S3 OLCI) time series from 2013.
+
+  Beyond the climate-modelling communities, this dataset's long-term
+  consistency, yearly updates, and high thematic detail on a global scale have
+  made it attractive for a multitude of applications such as land accounting,
+  forest monitoring and desertification, in addition to scientific research.
+
+  The products are made available to the public by ESA and the consortium. You
+  may use one or several CCI-LC products land cover map for educational and/or
+  scientific purposes, without any fee on the condition that you credit the ESA
+  Climate Change Initiative and in particular its Land Cover project as the
+  source of the CCI-LC database. Should you write any scientific publication on
+  the results of research activities that use one or several CCI-LC products as
+  input, you shall acknowledge the ESA CCI Land Cover project in the text of the
+  publication and provide the project with an electronic copy of the publication
+  (contact@esa-landcover-cci.org). If you wish to use one or several CCI-LC
+  products in advertising or in any commercial promotion, you shall acknowledge
+  the ESA CCI Land Cover project and you must submit the layout to the project
+  for approval beforehand (contact@esa-landcover-cci.org).
+
+  © ESA Climate Change Initiative - Land Cover led by UCLouvain (2017)
+
+  Generated using Copernicus Climate Change Service information [2024]. Neither
+  the European Commission nor ECMWF is responsible for any use that may be made
+  of the Copernicus information or data it contains.
+
+data_citation: >
+  Copernicus Climate Change Service, Climate Data Store, (2019): Land cover
+  classification gridded maps from 1992 to present derived from satellite
+  observation. Copernicus Climate Change Service (C3S) Climate Data Store (CDS).
+  DOI: 10.24381/cds.006f2c9a (Accessed on 09-AUG-2024)
+
+data_license:
+  name: ESA-CCI
+  title: ESA CCI Land Cover licence
+  path: https://object-store.os-api.cci2.ecmwf.int/cci2-prod-catalogue/licences/satellite-land-cover/satellite-land-cover_8423d13d3dfd95bbeca92d9355516f21de90d9b40083a915ead15a189d6120fa.pdf
+
+data_origin_url: https://cds-beta.climate.copernicus.eu/datasets/satellite-land-cover?tab=overview
+data_formats: ["GeoTIFF"]
diff --git a/pyproject.toml b/pyproject.toml
index 99e8121..4ceb127 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -13,7 +13,13 @@ classifiers = [
     "License :: OSI Approved :: MIT License",
     "Operating System :: OS Independent",
 ]
-dependencies = ["geopandas>=0.13", "shapely>=2.0", "pyproj", "GDAL>=3.3"]
+dependencies = [
+    "geopandas>=0.13",
+    "shapely>=2.0",
+    "pyproj",
+    "GDAL>=3.3",
+    "cdsapi"
+]
 
 [project.urls]
 "Homepage" = "https://github.com/nismod/irv-datapkg"
diff --git a/rules/copernicus_dem.smk b/rules/copernicus_dem.smk
new file mode 100755
index 0000000..7bf15bb
--- /dev/null
+++ b/rules/copernicus_dem.smk
@@ -0,0 +1,80 @@
+#
+# Copernicus DEM
+#
+rule list_dem_glo90:
+    output:
+        txt="incoming_data/copernicus_dem/COP-DEM_GLO-90-DGED__2023_1.txt",
+    shell:
+        """
+        out_dir=$(dirname {output.txt})
+        mkdir -p $out_dir
+
+        curl -k -H "accept: csv" \
+            https://prism-dem-open.copernicus.eu/pd-desk-open-access/publicDemURLs/COP-DEM_GLO-90-DGED__2023_1 \
+            > {output.txt}
+        """
+
+rule download_dem_glo90:
+    input:
+        txt=rules.list_dem_glo90.output.txt
+    output:
+        dir=directory("incoming_data/copernicus_dem/archive"),
+    shell:
+        """
+        mkdir -p {output.dir}.tmp
+        pushd {output.dir}.tmp
+
+        cat ../COP-DEM_GLO-90-DGED__2023_1.txt | parallel 'wget --no-clobber {{}}'
+
+        popd
+        mv {output.dir}.tmp {output.dir}
+        """
+
+rule extract_dem_glo90:
+    input:
+        dir=rules.download_dem_glo90.output.dir,
+    output:
+        dir=directory("incoming_data/copernicus_dem/tiles"),
+    shell:
+        """
+        pushd incoming_data/copernicus_dem
+        mkdir -p tiles
+
+        # Extract the DEM GeoTIFF from each tile archive
+        find -type f -name '*.tar' | \
+            sed 's/.\\/archive\\///' | \
+            sed 's/.tar//' | \
+            parallel -j 1 \
+                tar xvf \
+                {{}}.tar \
+                --skip-old-files \
+                --strip-components=2 \
+                -C ./tiles/ \
+                {{}}/DEM/{{}}_DEM.tif
+        popd
+        """
+
+rule convert_dem_glo90:
+    input:
+        dir=rules.extract_dem_glo90.output.dir,
+    output:
+        tiff="incoming_data/copernicus_dem/copernicus_dem.tif",
+    shell:
+        """
+        pushd incoming_data/copernicus_dem
+        # Build list
+        find -type f -name '*.tif' > tileList.txt
+
+        # Build VRT
+        gdalbuildvrt -input_file_list tileList.txt copernicus_dem.vrt
+
+        # Combine to big TIFF
+        gdal_translate \
+            -co "COMPRESS=LZW" \
+            -co "TILED=yes" \
+            -co "BIGTIFF=YES" \
+            -of "GTiff" \
+            copernicus_dem.vrt \
+            copernicus_dem.tif
+        popd
+        """
diff --git a/rules/copernicus_lulc.smk b/rules/copernicus_lulc.smk
new file mode 100755
index 0000000..fd6973d
--- /dev/null
+++ b/rules/copernicus_lulc.smk
@@ -0,0 +1,41 @@
+#
+# Copernicus LULC
+#
+
+rule download_lulc:
+    output:
+        archive="incoming_data/copernicus_lulc/archive.tgz",
+    run:
+        from irv_datapkg import download_from_CDS
+        download_from_CDS(
+            "satellite-land-cover",
+            {
+                'variable': 'all',
+                'year': ['2022'],
+                'version': ['v2_1_1'],
+                'format': 'tgz'
+            },
+            output.archive
+        )
+
+rule convert_lulc:
+    input:
+        archive=rules.download_lulc.output.archive,
+    output:
+        tif = "incoming_data/copernicus_lulc/copernicus_lulc.tif",
+    shell:
+        """
+        cd incoming_data/copernicus_lulc
+
+        tar xvzf $(basename {input.archive})
+
+        gdalwarp \
+            -of GTiff \
+            -co COMPRESS=LZW \
+            -ot Byte \
+            -te -180.0000000 -90.0000000 180.0000000 90.0000000 \
+            -tr 0.002777777777778 0.002777777777778 \
+            -t_srs EPSG:4326 \
+            NETCDF:C3S-LC-L4-LCCS-Map-300m-P1Y-2022-v2.1.1.nc:lccs_class \
+            copernicus_lulc.tif
+        """
diff --git a/rules/zenodo.smk b/rules/zenodo.smk
index 3e9f72b..342d214 100644
--- a/rules/zenodo.smk
+++ b/rules/zenodo.smk
@@ -27,8 +27,22 @@ rule create_deposition:
         deposition = r.json()
 
         # Save details
-        with open(output.json, "w") as fh:
-            json.dump(deposition, fh, indent=2)
+        write_deposition(output.json, deposition)
+
+def get_deposition(deposition_id):
+    params = {"access_token": os.environ["ZENODO_TOKEN"]}
+    r = requests.get(f"https://{ZENODO_URL}/api/deposit/depositions/{deposition_id}", params=params)
+    r.raise_for_status()
+    deposition = r.json()
+    return deposition
+
+def log_deposition(iso3, deposition, deposition_id):
+    with open(f"zenodo/{iso3}.deposition.{deposition_id}.{datetime.now().isoformat()}.json", "w") as fh:
+        json.dump(deposition, fh, indent=2)
+
+def write_deposition(fname, deposition):
+    with open(fname, "w") as fh:
+        json.dump(deposition, fh, indent=2)
 
 
 rule deposit:
@@ -40,6 +54,7 @@
         touch("zenodo/{ISO3}.deposited"),
     run:
         params = {"access_token": os.environ["ZENODO_TOKEN"]}
+        headers = {'Authorization': f"Bearer {os.environ['ZENODO_TOKEN']}"}
 
         with open(input.deposition, "r") as fh:
             deposition = json.load(fh)
@@ -48,6 +63,44 @@
             datapackage = json.load(fh)
 
         deposition_id = deposition["id"]
+
+        # Check and create a new version if the last one was submitted
+
+        # Get latest deposition
+        deposition = get_deposition(deposition_id)
+
+        log_deposition(wildcards.ISO3, deposition, deposition_id)
+
+        if deposition["submitted"]:
+            # Request a new deposition to draft a new version
+
+            # POST /api/deposit/depositions/:id/actions/newversion
+            # NOTE: this seems to fail if there's already a draft - workaround is to search for the draft and discard it manually
+            #       could search all depositions for unsubmitted and discard?
+            #       or could search for unsubmitted matching "conceptdoi" and use it?
+            r = requests.post(f'https://{ZENODO_URL}/api/deposit/depositions/{deposition_id}/actions/newversion', headers=headers)
+            r.raise_for_status()
+            response = r.json()
+
+            # Find draft deposition ID in response
+            deposition_id = response["links"]["latest_draft"].split("/")[-1]
+            deposition = get_deposition(deposition_id)
+            log_deposition(wildcards.ISO3, deposition, deposition_id)
+            # NOTE overwriting an input file (should be okay, it's marked as ancient)
+            write_deposition(input.deposition, deposition)
+
+        # List files
+        # GET /api/deposit/depositions/:id/files
+        r = requests.get(f"https://{ZENODO_URL}/api/deposit/depositions/{deposition_id}/files", headers=headers)
+        r.raise_for_status()
+        files = r.json()
+
+        # Delete each file
+        # DELETE /api/deposit/depositions/:id/files/:file_id
+        for file_ in files:
+            r = requests.delete(f"https://{ZENODO_URL}/api/deposit/depositions/{deposition_id}/files/{file_['id']}", headers=headers)
+            r.raise_for_status()
+
         bucket_url = deposition["links"]["bucket"]
 
         # Upload files
@@ -62,7 +115,7 @@
                 print(r.json())
                 r.raise_for_status()
 
-        # Set up metadata 
+        # Set up metadata
 
         centroid = boundary_geom(wildcards.ISO3).centroid
         place_name = BOUNDARY_LU.loc[wildcards.ISO3, "NAME"]
diff --git a/src/irv_datapkg/__init__.py b/src/irv_datapkg/__init__.py
old mode 100644
new mode 100755
index 10e7e77..6db2762
--- a/src/irv_datapkg/__init__.py
+++ b/src/irv_datapkg/__init__.py
@@ -1,12 +1,14 @@
-from dataclasses import dataclass
+import os
 import shlex
 import subprocess
+from dataclasses import dataclass
 from pathlib import Path
 from typing import Optional
 
+import cdsapi
 import geopandas
-import shapely
 import pyproj
+import shapely
 from osgeo import gdal
 from shapely.ops import transform
 
@@ -83,3 +85,54 @@
         cmd = cmd + f" -co {creation_option}"
 
     subprocess.run(shlex.split(cmd), check=True)
+
+
+def download_from_CDS(
+    dataset_name: str,
+    request: dict,
+    output_path: str,
+) -> None:
+    """
+    Download a resource from the Copernicus CDS API, given appropriate credentials.
+
+    Requires CDSAPI_URL and CDSAPI_KEY to be in the environment.
+    For more details see: https://cds.climate.copernicus.eu/api-how-to
+
+    Args:
+        dataset_name: Name of dataset to download
+        request: Dictionary defining request, could include:
+            variable: Name of variable to request
+            file_format: Desired file format e.g. zip
+            version: Version of dataset
+            year: Year of dataset applicability
+        output_path: Where to save the downloaded file
+    """
+    client = cdsapi.Client()
+
+    # N.B. Files are covered by licences which need to be manually accepted, e.g.
+    # https://cds.climate.copernicus.eu/cdsapp/#!/terms/satellite-land-cover
+    # https://cds.climate.copernicus.eu/cdsapp/#!/terms/vito-proba-v
+    #
+    # Ideally we could programmatically accept the necessary licence conditions;
+    # the code below is an attempt at that, but fails with an HTTP 403 ("not
+    # logged in") when trying to simulate a user acceptance.
+    #
+    # API_URL = os.environ.get("CDSAPI_URL")
+    # payloads = [
+    #     [{"terms_id":"vito-proba-v","revision":1}],
+    #     [{"terms_id":"satellite-land-cover","revision":1}],
+    # ]
+    # for payload in payloads:
+    #     client._api(
+    #         url=f"{API_URL.rstrip('/')}.ui/user/me/terms-and-conditions",
+    #         request=payload,
+    #         method="post"
+    #     )
+    #
+    # See https://github.com/ecmwf/cdsapi/blob/master/cdsapi/api.py
+
+    client.retrieve(
+        dataset_name,
+        request,
+        output_path,
+    )
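
Usage note: a minimal sketch (not part of the patch) of calling the new
`download_from_CDS` helper directly, mirroring the request made in
`rules/copernicus_lulc.smk`. It assumes `CDSAPI_URL` and `CDSAPI_KEY` are
exported as described in the README, and that the `satellite-land-cover`
licence has already been accepted via the CDS web interface.

```python
# Fetch the 2022 global land cover archive via the helper added in
# src/irv_datapkg/__init__.py. cdsapi.Client() reads CDSAPI_URL and
# CDSAPI_KEY from the environment.
from irv_datapkg import download_from_CDS

download_from_CDS(
    "satellite-land-cover",
    {
        "variable": "all",
        "year": ["2022"],
        "version": ["v2_1_1"],
        "format": "tgz",
    },
    "incoming_data/copernicus_lulc/archive.tgz",
)
```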
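API note: the draft-new-version cycle added to `rules/zenodo.smk` reduces to
two Zenodo deposit-API calls. The sketch below is hypothetical standalone
code, assuming `ZENODO_TOKEN` is set and the sandbox host as in the Snakefile;
endpoint paths follow https://developers.zenodo.org/.

```python
# POST .../actions/newversion on a submitted deposition, then follow the
# links.latest_draft URL to fetch the editable draft deposition.
import os

import requests

ZENODO_URL = "sandbox.zenodo.org"
HEADERS = {"Authorization": f"Bearer {os.environ['ZENODO_TOKEN']}"}


def draft_new_version(deposition_id: int) -> dict:
    """Return a draft deposition for a new version of a published record."""
    r = requests.post(
        f"https://{ZENODO_URL}/api/deposit/depositions/{deposition_id}/actions/newversion",
        headers=HEADERS,
    )
    r.raise_for_status()
    draft_id = r.json()["links"]["latest_draft"].split("/")[-1]
    r = requests.get(
        f"https://{ZENODO_URL}/api/deposit/depositions/{draft_id}",
        headers=HEADERS,
    )
    r.raise_for_status()
    return r.json()
```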