From 8b583bb53d46837a99e7196a862b02ae97a3489d Mon Sep 17 00:00:00 2001 From: David Huard Date: Tue, 15 Oct 2024 11:09:31 -0400 Subject: [PATCH] suggestions from review --- .gitignore | 4 ++ STACpopulator/extensions/base.py | 72 ++++++++++++++--------------- STACpopulator/extensions/cordex6.py | 23 --------- STACpopulator/extensions/thredds.py | 9 ++++ STACpopulator/populator_base.py | 52 +-------------------- tests/test_cordex.py | 24 ++++++++++ 6 files changed, 72 insertions(+), 112 deletions(-) create mode 100644 tests/test_cordex.py diff --git a/.gitignore b/.gitignore index 43259dc..57f1c4e 100644 --- a/.gitignore +++ b/.gitignore @@ -22,6 +22,10 @@ build ## Logs *.jsonl +*.json + +## Exclude schemas +!schemas/**/*.json # Old Submodule Path # Could be used locally diff --git a/STACpopulator/extensions/base.py b/STACpopulator/extensions/base.py index fe42947..25438fe 100644 --- a/STACpopulator/extensions/base.py +++ b/STACpopulator/extensions/base.py @@ -1,3 +1,28 @@ +""" +# Base classes for STAC extensions + +What we have: + - `Loader`, which returns attributes. + - An external json schema describing a subset of the attributes returned by the Loader. This schema might preclude + additional properties, so it cannot be applied wholesale to the Loader's output. (maybe overkill since not a lot of schemas can be found in the wild...) + - `data model` describing the content we want included in the catalog. It includes a subset of the schema properties, + as well as additional attributes desired by the catalog admins. + +Desiderata: + - Not having to replicate existing validation logic in the schema + - Not having to create a modified schema + - Being able to supplement the schema validation by pydantic validation logic + - Streamline the creation of new data models (reduce boilerplate, allow subclassing) + - Developer-friendly validation error messages + + +How-to: + - Instructions to create basic datamodel from schema (codegen) + + + +""" + from datetime import datetime import json import jsonschema @@ -25,47 +50,27 @@ from STACpopulator.extensions.datacube import DataCubeHelper from STACpopulator.extensions.thredds import THREDDSExtension, THREDDSHelper - - T = TypeVar("T", pystac.Collection, pystac.Item, pystac.Asset, item_assets.AssetDefinition) LOGGER = logging.getLogger(__name__) -""" -# Context -What we have: - - `Loader`, which returns attributes. - - An external json schema describing a subset of the attributes returned by the Loader. This schema might preclude - additional properties, so it cannot be applied wholesale to the Loader's output. (maybe overkill since not a lot of schemas can be found in the wild...) - - `data model` describing the content we want included in the catalog. It includes a subset of the schema properties, - as well as additional attributes desired by the catalog admins. -Desiderata: - - Not having to replicate existing validation logic in the schema - - Not having to create a modified schema - - Being able to supplement the schema validation by pydantic validation logic - - Streamline the creation of new data models (reduce boilerplate, allow subclassing) - - Developer-friendly validation error messages - - -How-to: - - Instructions to create basic datamodel from schema (codegen) - - - -""" class DataModel(BaseModel): """Base class for dataset properties going into the catalog. Subclass this with attributes. + + Attributes + ---------- + _prefix : str + If not None, a prefix for the properties in the catalog will be added. + _schema_uri : str + URI of the json schema to validate against. + _schema_exclude : list[str] + Properties not meant to be validated by json schema, but still included in the data model. """ - # Ideally, the catalog properties would be described by a jsonschema. _prefix: str = PrivateAttr() - - # URI of the json schema to validate against. _schema_uri: FilePath = PrivateAttr(None) - - # List of properties not meant to be validated by json schema. _schema_exclude: list[str] = PrivateAttr([]) model_config = ConfigDict(populate_by_name=True, extra="ignore") @@ -135,14 +140,6 @@ def uid(self) -> str: import uuid return str(uuid.uuid4()) - # TODO: Move this into the THREDDS extension? - # @field_validator("access_urls") - # @classmethod - # def validate_access_urls(cls, value): - # assert len(set(["HTTPServer", "OPENDAP"]).intersection(value.keys())) >= 1, ( - # "Access URLs must include HTTPServer or OPENDAP keys.") - # return value - def stac_item(self) -> "pystac.Item": """Create a STAC item and add extensions.""" item = pystac.Item( @@ -167,7 +164,6 @@ def stac_item(self) -> "pystac.Item": return json.loads(json.dumps(item.to_dict())) - def metadata_extension(self, item): """Add extension for the properties of the dataset to the STAC item. The extension class is created dynamically from the properties. diff --git a/STACpopulator/extensions/cordex6.py b/STACpopulator/extensions/cordex6.py index ee592e3..805f181 100644 --- a/STACpopulator/extensions/cordex6.py +++ b/STACpopulator/extensions/cordex6.py @@ -70,26 +70,3 @@ def uid(self) -> str: -# TODO: Remove before merging -def get_test_data(): - import requests - from siphon.catalog import TDSCatalog - import xncml - from STACpopulator.stac_utils import numpy_to_python_datatypes - - cat = TDSCatalog("https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/catalog/birdhouse/disk2/ouranos/CORDEX/CMIP6/DD/NAM-12/OURANOS/MPI-ESM1-2-LR/ssp370/r1i1p1f1/CRCM5/v1-r1/day/tas/v20231208/catalog.html") - - if cat.datasets.items(): - for item_name, ds in cat.datasets.items(): - url = ds.access_urls["NCML"] - r = requests.get(url) - attrs = xncml.Dataset.from_text(r.text).to_cf_dict() - attrs["attributes"] = numpy_to_python_datatypes(attrs["attributes"]) - attrs["access_urls"] = ds.access_urls - return attrs - -def test_item(): - attrs = get_test_data() - model = Cordex6DataModel.from_data(attrs) - model.stac_item() - diff --git a/STACpopulator/extensions/thredds.py b/STACpopulator/extensions/thredds.py index 19e2f44..8acc903 100644 --- a/STACpopulator/extensions/thredds.py +++ b/STACpopulator/extensions/thredds.py @@ -132,3 +132,12 @@ def links(self) -> list[pystac.Link]: url = self.access_urls[ServiceType.httpserver] link = magpie_resource_link(url) return [link] + + +# TODO: Validate services links exist ? +# @field_validator("access_urls") +# @classmethod +# def validate_access_urls(cls, value): +# assert len(set(["HTTPServer", "OPENDAP"]).intersection(value.keys())) >= 1, ( +# "Access URLs must include HTTPServer or OPENDAP keys.") +# return value diff --git a/STACpopulator/populator_base.py b/STACpopulator/populator_base.py index e4df7e8..0b7fe2f 100644 --- a/STACpopulator/populator_base.py +++ b/STACpopulator/populator_base.py @@ -226,55 +226,5 @@ def ingest(self) -> None: counter += 1 LOGGER.info(f"Processed {counter} data items. {failures} failures") - - - -class THREDDSRunner: - def __init__(self, populator): - self.populator = populator - self.parser = argparse.ArgumentParser() - self.add_parser_args(self.parser) - - @staticmethod - def add_parser_args(parser: argparse.ArgumentParser) -> None: - parser.description="STAC populator from a THREDDS catalog or NCML XML." - parser.add_argument("stac_host", help="STAC API URL") - parser.add_argument("href", help="URL to a THREDDS catalog or a NCML XML with CMIP6 metadata.") - parser.add_argument("--update", action="store_true", help="Update collection and its items") - parser.add_argument( - "--mode", - choices=["full", "single"], - default="full", - help="Operation mode, processing the full dataset or only the single reference.", - ) - parser.add_argument( - "--config", - type=str, - help=( - "Override configuration file for the populator. " - "By default, uses the adjacent configuration to the implementation class." - ), - ) - add_request_options(parser) - - def runner(self, ns: argparse.Namespace) -> int: - LOGGER.info(f"Arguments to call: {vars(ns)}") - - with Session() as session: - apply_request_options(session, ns) - if ns.mode == "full": - data_loader = THREDDSLoader(ns.href, session=session) - else: - # To be implemented - data_loader = ErrorLoader() - - c = self.populator( - ns.stac_host, data_loader, update=ns.update, session=session, config_file=ns.config, log_debug=ns.debug - ) - c.ingest() - return 0 - - def main(self, *args: str) -> int: - ns = self.parser.parse_args(args or None) - return self.runner(ns) + diff --git a/tests/test_cordex.py b/tests/test_cordex.py new file mode 100644 index 0000000..12b42d5 --- /dev/null +++ b/tests/test_cordex.py @@ -0,0 +1,24 @@ +from STACpopulator.extensions.cordex6 import Cordex6DataModel + + +def get_test_data(): + import requests + from siphon.catalog import TDSCatalog + import xncml + from STACpopulator.stac_utils import numpy_to_python_datatypes + + cat = TDSCatalog("https://pavics.ouranos.ca/twitcher/ows/proxy/thredds/catalog/birdhouse/disk2/ouranos/CORDEX/CMIP6/DD/NAM-12/OURANOS/MPI-ESM1-2-LR/ssp370/r1i1p1f1/CRCM5/v1-r1/day/tas/v20231208/catalog.html") + + if cat.datasets.items(): + for item_name, ds in cat.datasets.items(): + url = ds.access_urls["NCML"] + r = requests.get(url) + attrs = xncml.Dataset.from_text(r.text).to_cf_dict() + attrs["attributes"] = numpy_to_python_datatypes(attrs["attributes"]) + attrs["access_urls"] = ds.access_urls + return attrs + +def test_item(): + attrs = get_test_data() + model = Cordex6DataModel.from_data(attrs) + model.stac_item()