feat: update existing Collection (WIP) TDE-1359 #1224

Draft
wants to merge 9 commits into master
76 changes: 66 additions & 10 deletions scripts/stac/imagery/collection.py
@@ -1,12 +1,15 @@
import json
import os
from typing import Any

import ulid
from shapely import to_geojson
from shapely.geometry import shape
from shapely.geometry.base import BaseGeometry

from scripts.datetimes import format_rfc_3339_datetime_string, parse_rfc_3339_datetime
from scripts.files.files_helper import ContentType
from scripts.files.fs import read, write
from scripts.files.fs import exists, read, write
from scripts.json_codec import dict_to_json_bytes
from scripts.stac.imagery.capture_area import generate_capture_area
from scripts.stac.imagery.metadata_constants import (
@@ -38,6 +41,8 @@

class ImageryCollection:
stac: dict[str, Any]
capture_area: dict[str, Any] | None = None
published_location: str | None = None

def __init__(
self,
@@ -95,8 +100,39 @@ def __init__(

self.add_providers(merge_provider_roles(providers))

@classmethod
def from_file(cls, file_name: str, metadata: CollectionMetadata, updated_datetime: str) -> "ImageryCollection":
"""Load an ImageryCollection from a Collection file.

Args:
file_name: The s3 URL or local path of the Collection file to load.

Returns:
The loaded ImageryCollection.
"""
file_content = read(file_name)
stac_from_file = json.loads(file_content.decode("UTF-8"))
stac_from_file["updated"] = updated_datetime
collection = cls(
metadata=metadata,
created_datetime=stac_from_file["created"],
updated_datetime=stac_from_file["updated"],
linz_slug=stac_from_file["linz:slug"],
)
# Override the generated STAC with the content loaded from the original Collection
collection.stac = stac_from_file

collection.published_location = os.path.dirname(file_name)
capture_area_path = os.path.join(collection.published_location, CAPTURE_AREA_FILE_NAME)
# Some published datasets may not have a capture-area.geojson file (TDE-988)
if exists(capture_area_path):
collection.capture_area = json.loads(read(capture_area_path))

return collection
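
A minimal usage sketch of the new `from_file` classmethod (the path, metadata object, and timestamp below are hypothetical; the timestamp helper is the one already imported in this module):

```python
from datetime import datetime, timezone

from scripts.datetimes import format_rfc_3339_datetime_string
from scripts.stac.imagery.collection import ImageryCollection

# "metadata" is assumed to be a CollectionMetadata built earlier in the pipeline
updated = format_rfc_3339_datetime_string(datetime.now(timezone.utc))
collection = ImageryCollection.from_file(
    "s3://nz-imagery/some-dataset/collection.json", metadata, updated
)
# "created" keeps its published value, "updated" is replaced, and
# capture-area.geojson is loaded alongside when it exists (TDE-988).
```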

def add_capture_area(self, polygons: list[BaseGeometry], target: str, artifact_target: str = "/tmp") -> None:
"""Add the capture area of the Collection.
If the Collection is an update of a published dataset, the existing capture area is merged with the new one.
The `href` or path of the capture-area.geojson is always set to the relative `./capture-area.geojson`.

Args:
@@ -105,7 +141,9 @@ def add_capture_area(self, polygons: list[BaseGeometry], target: str, artifact_t
artifact_target: location where the capture-area.geojson artifact file will be saved.
This is useful for Argo Workflows in order to expose the file to the user for testing/validation purposes.
"""

# If updating a published dataset, merge the existing capture area with the new one
if self.capture_area:
polygons.append(shape(self.capture_area["geometry"]))
# The GSD is measured in meters (e.g., `0.3m`)
capture_area_document = generate_capture_area(polygons, self.metadata["gsd"])
capture_area_content: bytes = dict_to_json_bytes(capture_area_document)
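
For context, the merge step above can be sketched standalone (geometries below are hypothetical; `generate_capture_area` then unions the polygons at the dataset GSD):

```python
from shapely.geometry import box, mapping, shape

# Hypothetical published capture area and one new Item footprint
existing_capture_area = {"geometry": mapping(box(172.0, -34.2, 172.3, -33.9))}
polygons = [box(171.8, -34.4, 172.1, -34.0)]

# Merge: the published area is treated as one more polygon to dissolve
polygons.append(shape(existing_capture_area["geometry"]))
# generate_capture_area(polygons, gsd) would union these into a single geometry
```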
@@ -162,20 +200,38 @@ def add_capture_dates(self, source_directory: str) -> None:

def add_item(self, item: dict[Any, Any]) -> None:
"""Add an `Item` to the `links` of the `Collection`.
If the Item already exists in the Collection being updated, it is removed and replaced, and the capture area is updated accordingly.

Args:
item: STAC Item to add
"""
item_self_link = next((feat for feat in item["links"] if feat["rel"] == "self"), None)
if item_self_link:
self.stac["links"].append(
Link(
path=item_self_link["href"],
rel=Relation.ITEM,
media_type=StacMediaType.GEOJSON,
file_content=dict_to_json_bytes(item),
).stac
)
link_to_add = Link(
path=item_self_link["href"],
rel=Relation.ITEM,
media_type=StacMediaType.GEOJSON,
file_content=dict_to_json_bytes(item),
).stac

# Remove the existing Item's geometry from the capture area
existing_item = next((link for link in self.stac["links"] if link["href"] == link_to_add["href"]), None)
if existing_item and self.capture_area and self.published_location:
existing_item_stac = json.loads(
read(os.path.join(self.published_location, os.path.basename(existing_item["href"])))
)
existing_item_geometry = shape(existing_item_stac["geometry"])
capture_area_geometry = shape(self.capture_area["geometry"])
updated_capture_area_geometry = capture_area_geometry.difference(existing_item_geometry)

self.capture_area["geometry"] = json.loads(to_geojson(updated_capture_area_geometry))

# Remove old link if it exists
self.stac["links"] = [link for link in self.stac["links"] if link["href"] != link_to_add["href"]]
# Add the new link
self.stac["links"].append(link_to_add)

self.update_temporal_extent(item["properties"]["start_datetime"], item["properties"]["end_datetime"])
self.update_spatial_extent(item["bbox"])
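
The capture-area bookkeeping in `add_item` can be illustrated in isolation (shapes below are hypothetical; the real code reads the replaced Item's STAC from `published_location`):

```python
import json

from shapely import to_geojson
from shapely.geometry import box

# Hypothetical geometries: the published capture area and the footprint
# of the Item being replaced.
capture_area_geometry = box(171.0, -35.0, 173.0, -33.0)
existing_item_geometry = box(172.5, -34.0, 173.0, -33.5)

# Carve the replaced Item's footprint out; add_capture_area() later merges
# the replacement Item's footprint back in.
updated_geometry = capture_area_geometry.difference(existing_item_geometry)
capture_area = {"geometry": json.loads(to_geojson(updated_geometry))}
```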

33 changes: 16 additions & 17 deletions scripts/stac/imagery/create_stac.py
@@ -1,6 +1,6 @@
import json
import os
from typing import Any, TypeAlias, cast
from typing import Any, TypeAlias

from linz_logger import get_log
from shapely.geometry.base import BaseGeometry
@@ -24,6 +24,7 @@


# pylint: disable=too-many-arguments
# pylint: disable=too-many-locals
def create_collection(
collection_id: str,
linz_slug: str,
@@ -58,19 +59,21 @@ def create_collection(
Returns:
an ImageryCollection object
"""
existing_collection = {}
if odr_url:
existing_collection = get_published_file_contents(odr_url, "collection")

collection = ImageryCollection(
metadata=collection_metadata,
created_datetime=cast(str, existing_collection.get("created", current_datetime)),
updated_datetime=current_datetime,
linz_slug=linz_slug,
collection_id=collection_id,
providers=get_providers(licensors, producers),
add_title_suffix=add_title_suffix,
)
collection = ImageryCollection.from_file(
os.path.join(odr_url, "collection.json"), collection_metadata, current_datetime
)

else:
collection = ImageryCollection(
metadata=collection_metadata,
created_datetime=current_datetime,
updated_datetime=current_datetime,
linz_slug=linz_slug,
collection_id=collection_id,
providers=get_providers(licensors, producers),
add_title_suffix=add_title_suffix,
)

for item in stac_items:
collection.add_item(item)
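
A condensed sketch of how a caller selects between the two branches (argument values are hypothetical; the new test below shows a complete call):

```python
collection = create_collection(
    collection_id="test_collection",
    linz_slug="my-region_2024_0.3m",
    collection_metadata=collection_metadata,
    current_datetime=current_datetime,
    producers=[],
    licensors=[],
    stac_items=items,
    item_polygons=[],
    add_capture_dates=False,
    uri="s3://bucket/dataset/",
    odr_url=odr_url,  # falsy for a new dataset; the published path for a resupply
)
```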
@@ -212,7 +215,3 @@ def create_or_load_base_item(
)

return ImageryItem(id_, stac_asset, stac_processing)


def get_published_file_contents(odr_url: str, filename: str) -> JSON_Dict:
return cast(JSON_Dict, json.loads(read(os.path.join(odr_url, f"{filename}.json")).decode("UTF-8")))
80 changes: 80 additions & 0 deletions scripts/stac/imagery/tests/create_stac_test.py
@@ -258,6 +258,7 @@ def test_create_collection_resupply(
"type": "Collection",
"stac_version": STAC_VERSION,
"id": collection_id,
"linz:slug": fake_linz_slug,
"created": created_datetime_string,
"updated": created_datetime_string,
}
@@ -287,6 +288,85 @@ def test_create_collection_resupply(
assert collection.stac["updated"] == updated_datetime_string


def test_create_collection_resupply_add_items(
fake_collection_metadata: CollectionMetadata, fake_linz_slug: str, subtests: SubTests, tmp_path: Path
) -> None:
collection_id = "test_collection"
created_datetime = any_epoch_datetime()
created_datetime_string = format_rfc_3339_datetime_string(created_datetime)
existing_item_link = {
"rel": "item",
"href": "./item_a.json",
"type": "application/geo+json",
"file:checksum": "1220559708896b75aebab2bbadbc184f6b9ce22708adbb725e93bf3f08a38d2bc71e",
}

existing_collection_content = {
"type": "Collection",
"stac_version": STAC_VERSION,
"id": collection_id,
"linz:slug": fake_linz_slug,
"links": [
{
"rel": "root",
"href": "https://nz-imagery.s3.ap-southeast-2.amazonaws.com/catalog.json",
"type": "application/json",
},
{"rel": "self", "href": "./collection.json", "type": "application/json"},
existing_item_link,
],
"created": created_datetime_string,
"updated": created_datetime_string,
}
existing_collection_path = tmp_path / "collection.json"
existing_collection_path.write_text(json.dumps(existing_collection_content))

item_to_add = {
"type": "Feature",
"id": "item_b",
"links": [
{"href": "./item_b.json", "rel": "self", "type": "application/geo+json"},
{"href": "./collection.json", "rel": "collection", "type": "application/json"},
{"href": "./collection.json", "rel": "parent", "type": "application/json"},
],
"properties": {"start_datetime": "2024-09-02T12:00:00Z", "end_datetime": "2024-09-02T12:00:00Z"},
"bbox": [171.8256487, -34.3559317, 172.090076, -34.0291036],
}

item_to_add_link = {
"rel": "item",
"href": "./item_b.json",
"type": "application/geo+json",
"file:checksum": "12203040c94dda3807c4430b312e9b400604188a639f22cc8067136084662fc2618d",
}

updated_datetime_string = format_rfc_3339_datetime_string(created_datetime + timedelta(days=1))

collection = create_collection(
collection_id=collection_id,
linz_slug=fake_linz_slug,
collection_metadata=fake_collection_metadata,
current_datetime=updated_datetime_string,
producers=[],
licensors=[],
stac_items=[item_to_add],
item_polygons=[],
add_capture_dates=False,
uri="test",
odr_url=tmp_path.as_posix(),
)

with subtests.test("created datetime"):
assert collection.stac["created"] == existing_collection_content["created"]

with subtests.test("updated datetime"):
assert collection.stac["updated"] == updated_datetime_string

with subtests.test("links"):
assert item_to_add_link in collection.stac["links"]
assert existing_item_link in collection.stac["links"]


def test_create_item_with_odr_url(tmp_path: Path) -> None:
item_name = "empty"
existing_item_file = tmp_path / f"{item_name}.json"