Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adjust ItemProperties Validation. #131

Merged
2 changes: 2 additions & 0 deletions CHANGELOG.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
- Enforce required `type` key for `Collection` and `Catalog` models
- Add queryables link relation type (#123, @constantinius)
- Fix STAC API Query Extension operator names from ne->neq, le->lte, and ge->gte (#120, @philvarner)
- Better **datetime** parsing/validation by using Pydantic native types and remove `ciso8601` requirement (#131, @eseglem)
- move datetime validation in `StacCommonMetadata` model definition (#131, @eseglem)

3.0.0 (2024-01-25)
------------------
Expand Down
8 changes: 1 addition & 7 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,7 @@ keywords=["stac", "pydantic", "validation"]
authors=[{ name = "Arturo Engineering", email = "[email protected]"}]
license= { text = "MIT" }
requires-python=">=3.8"
dependencies = [
"click>=8.1.7",
"pydantic>=2.4.1",
"geojson-pydantic>=1.0.0",
"ciso8601~=2.3",
]
dependencies = ["click>=8.1.7", "pydantic>=2.4.1", "geojson-pydantic>=1.0.0"]
dynamic = ["version", "readme"]

[project.scripts]
Expand All @@ -37,7 +32,6 @@ repository ="https://github.com/stac-utils/stac-pydantic.git"

[project.optional-dependencies]
dev = [
"arrow>=1.2.3",
"pytest>=7.4.2",
"pytest-cov>=4.1.0",
"pytest-icdiff>=0.8",
Expand Down
79 changes: 43 additions & 36 deletions stac_pydantic/api/search.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,21 @@
from datetime import datetime as dt
from typing import Any, Dict, List, Optional, Tuple, Union, cast

from ciso8601 import parse_rfc3339
from geojson_pydantic.geometries import GeometryCollection # type: ignore
from geojson_pydantic.geometries import (
GeometryCollection,
LineString,
MultiLineString,
MultiPoint,
MultiPolygon,
Point,
Polygon,
)
from pydantic import BaseModel, Field, field_validator, model_validator
from pydantic import BaseModel, Field, TypeAdapter, field_validator, model_validator

from stac_pydantic.api.extensions.fields import FieldsExtension
from stac_pydantic.api.extensions.query import Operator
from stac_pydantic.api.extensions.sort import SortExtension
from stac_pydantic.shared import BBox
from stac_pydantic.shared import BBox, UtcDatetime

Intersection = Union[
Point,
Expand All @@ -28,6 +27,8 @@
GeometryCollection,
]

SearchDatetime = TypeAdapter(Optional[UtcDatetime])


class Search(BaseModel):
"""
Expand All @@ -43,23 +44,18 @@ class Search(BaseModel):
datetime: Optional[str] = None
limit: int = 10

# Private properties to store the parsed datetime values. Not part of the model schema.
_start_date: Optional[dt] = None
_end_date: Optional[dt] = None

# Properties to return the private values
@property
def start_date(self) -> Optional[dt]:
values = (self.datetime or "").split("/")
if len(values) == 1:
return None
if values[0] == ".." or values[0] == "":
return None
return parse_rfc3339(values[0])
return self._start_date

@property
def end_date(self) -> Optional[dt]:
values = (self.datetime or "").split("/")
if len(values) == 1:
return parse_rfc3339(values[0])
if values[1] == ".." or values[1] == "":
return None
return parse_rfc3339(values[1])
return self._end_date

# Check https://docs.pydantic.dev/dev-v2/migration/#changes-to-validators for more information.
@model_validator(mode="before")
Expand Down Expand Up @@ -102,32 +98,43 @@ def validate_bbox(cls, v: BBox) -> BBox:

@field_validator("datetime")
@classmethod
def validate_datetime(cls, v: str) -> str:
if "/" in v:
values = v.split("/")
else:
# Single date is interpreted as end date
values = ["..", v]

dates: List[dt] = []
for value in values:
if value == ".." or value == "":
continue

dates.append(parse_rfc3339(value))
def validate_datetime(cls, value: str) -> str:
vincentsarago marked this conversation as resolved.
Show resolved Hide resolved
# Split on "/" and replace no value or ".." with None
values = [v if v and v != ".." else None for v in value.split("/")]

# If there are more than 2 dates, it's invalid
if len(values) > 2:
raise ValueError(
"Invalid datetime range, must match format (begin_date, end_date)"
"Invalid datetime range. Too many values. Must match format: {begin_date}/{end_date}"
)

if not {"..", ""}.intersection(set(values)):
if dates[0] > dates[1]:
raise ValueError(
"Invalid datetime range, must match format (begin_date, end_date)"
)
# If there is only one date, insert a None for the start date
vincentsarago marked this conversation as resolved.
Show resolved Hide resolved
if len(values) == 1:
values.insert(0, None)

# Cast because pylance gets confused by the type adapter and annotated type
dates = cast(
List[Optional[dt]],
[
# Use the type adapter to validate the datetime strings, strict is necessary
# due to pydantic issues #8736 and #8762
SearchDatetime.validate_strings(v, strict=True) if v else None
for v in values
],
)

# If there is a start and end date, check that the start date is before the end date
if dates[0] and dates[1] and dates[0] > dates[1]:
raise ValueError(
"Invalid datetime range. Begin date after end date. "
"Must match format: {begin_date}/{end_date}"
)

return v
# Store the parsed dates
cls._start_date = dates[0]
cls._end_date = dates[1]
# Return the original string value
return value

@property
def spatial_filter(self) -> Optional[Intersection]:
Expand Down
46 changes: 5 additions & 41 deletions stac_pydantic/item.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,15 @@
from datetime import datetime as dt
from typing import Any, Dict, List, Optional, Union
from typing import Any, Dict, List, Optional

from ciso8601 import parse_rfc3339
from geojson_pydantic import Feature
from pydantic import (
AnyUrl,
ConfigDict,
Field,
field_serializer,
model_serializer,
model_validator,
)
from pydantic import AnyUrl, ConfigDict, Field, model_serializer, model_validator

from stac_pydantic.links import Links
from stac_pydantic.shared import (
DATETIME_RFC339,
SEMVER_REGEX,
Asset,
StacBaseModel,
StacCommonMetadata,
UtcDatetime,
)
from stac_pydantic.version import STAC_VERSION

Expand All @@ -28,39 +19,12 @@ class ItemProperties(StacCommonMetadata):
https://github.com/radiantearth/stac-spec/blob/v1.0.0/item-spec/item-spec.md#properties-object
"""

datetime: Union[dt, str] = Field(..., alias="datetime")
# Overide the datetime field to be required
datetime: Optional[UtcDatetime]

eseglem marked this conversation as resolved.
Show resolved Hide resolved
vincentsarago marked this conversation as resolved.
Show resolved Hide resolved
# Check https://docs.pydantic.dev/dev-v2/migration/#changes-to-config for more information.
model_config = ConfigDict(extra="allow")

@model_validator(mode="before")
@classmethod
def validate_datetime(cls, data: Dict[str, Any]) -> Dict[str, Any]:
datetime = data.get("datetime")
start_datetime = data.get("start_datetime")
end_datetime = data.get("end_datetime")

if not datetime or datetime == "null":
if not start_datetime and not end_datetime:
raise ValueError(
"start_datetime and end_datetime must be specified when datetime is null"
)

if isinstance(datetime, str):
data["datetime"] = parse_rfc3339(datetime)

if isinstance(start_datetime, str):
data["start_datetime"] = parse_rfc3339(start_datetime)

if isinstance(end_datetime, str):
data["end_datetime"] = parse_rfc3339(end_datetime)

return data

@field_serializer("datetime")
def serialize_datetime(self, v: dt, _info: Any) -> str:
return v.strftime(DATETIME_RFC339)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we still need the serializer to make sure we respect the RFC3339 spec

This could be one implementation (to make sure we keep microseconds, if provided)

    @field_serializer("datetime", "start_datetime", "end_datetime", when_used="always")
    def serialize_datetime(self, v: Optional[dt], _info: Any) -> str:
        if v:
            if str(v)[19] == ".":
                return v.strftime("%Y-%m-%dT%H:%M:%S.%fZ")
            else:
                return v.strftime("%Y-%m-%dT%H:%M:%SZ")

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Assuming the datetime object has all the required values, then the pydantic output will always be valid RFC3339. But that is not always the case. Pydantic allows in time without seconds, but then the output would not be valid. If input is validated through parse_rfc3339 then it would be fine to use no serializer and let pydantic handle it.

If, however, we want to accept a wider range of inputs (AwareDatetime) then a serializer is probably necessary. I believe the best solution would actually be to do .isoformat() on the datetime object, not an .strftime. As RFC3339 allows any timezone.

   time-numoffset  = ("+" / "-") time-hour ":" time-minute
   time-offset     = "Z" / time-numoffset

And the use of .strftime("%Y-%m-%dT%H:%M:%S.%fZ") is not guaranteed to be correct. Depending on the input timezone.

from ciso8601 import parse_rfc3339
from datetime import datetime, timezone
from pydantic import TypeAdapter
d = parse_rfc3339("2011-11-04T00:05:23.283+04:00")
d.strftime("%Y-%m-%dT%H:%M:%S.%fZ")
# '2011-11-04T00:05:23.283000Z' <- Not actually the correct timezone
d.isoformat()
# '2011-11-04T00:05:23.283000+04:00'
d.astimezone(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S.%fZ")
# '2011-11-03T20:05:23.283000Z'
d.astimezone(timezone.utc).isoformat()
# '2011-11-03T20:05:23.283000+00:00'
TypeAdapter(datetime).dump_json(d)
# b'"2011-11-04T00:05:23.283000+04:00"'
TypeAdapter(datetime).dump_json(d.astimezone(timezone.utc))
# b'"2011-11-03T20:05:23.283000Z"'

All of those are valid RFC3339 outputs, but one of them is lying about the timezone.

And now looking back at https://github.com/radiantearth/stac-spec/blob/v1.0.0/item-spec/item-spec.md#properties-object seems like there is actually a bit more of an issue. The spec requires UTC on top of the RFC3339. So even with parse_rfc3339 that part is not being enforced on the input. So it would depend on if that needs to be enforced on input or not.

I see a few possibilities:

def parse_rfc3339_asutc(value: Any) -> datetime:
    return parse_rfc3339(value).astimezone(timezone.utc)

def parse_utc_rfc3339(value: Any) -> datetime:
    d = parse_rfc3339(value)
    if not d.tzinfo == timezone.utc:
        raise ValueError("Input must be UTC")
    return d

# Flexible input as long as there is a timezone. Converts to UTC. Use isoformat to get valid RFC format. 
Annotated[AwareDatetime, AfterValidator(lambda d: d.astimezone(timezone.utc)), PlainSerializer(lambda d: d.isoformat())]
# Input must be RFC, any timzone. Converts to UTC. Pydantic serializer outputs valid RFC format.
Annotated[datetime, BeforeValidator(parse_rfc3339_asutc)]
# Input must be RFC and UTC. Pydantic serializer outputs valid RFC format.
Annotated[datetime, BeforeValidator(parse_utc_rfc3339)]

I would probably still go with Pydantic for parsing. Gives the most flexibility on input, and can still guarantee valid output.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree with @gadomski comments below in regards to "be permissive for inputs, strict on outputs."
Pystac solution for this uses dateutil.parser for converting strings and setting datetimes with missing timezone to UTC when parsing to json.

This has the advantage that any date format can be accepted including 2020, 2020-01 and 2020-01-01.

I added a possible solution in here

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@thomas-maschler if going the dateutil.parser route, it would probably make sense to just let pydantic do its thing instead. It should be more performant.

The way it is current written in this PR is to accept a datetime with a timezone, and convert it to UTC. Rather than accept a naive value and assume UTC. Though the spec here does say it must be in UTC. So, its not a horrible thing to assume either. Tons of possible solutions. I was trying to optimize on keeping as much as possible inside the pydantic rust code and minimal python code to maintain here.

Its funny how a spec trying to simplify things or add clarity can make things more complicated than necessary.


class Item(Feature, StacBaseModel):
"""
Expand Down
88 changes: 68 additions & 20 deletions stac_pydantic/shared.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,17 @@
from datetime import datetime
from datetime import timezone
from enum import Enum, auto
from typing import Any, Dict, List, Optional, Tuple, Union
from warnings import warn

from pydantic import BaseModel, ConfigDict, Field
from pydantic import (
AfterValidator,
AwareDatetime,
BaseModel,
ConfigDict,
Field,
model_validator,
)
from typing_extensions import Annotated, Self

from stac_pydantic.utils import AutoValueEnum

Expand All @@ -15,9 +23,14 @@

SEMVER_REGEX = r"^(0|[1-9]\d*)\.(0|[1-9]\d*)\.(0|[1-9]\d*)(?:-((?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*)(?:\.(?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?(?:\+([0-9a-zA-Z-]+(?:\.[0-9a-zA-Z-]+)*))?$"

# https://tools.ietf.org/html/rfc3339#section-5.6
# Unused, but leaving it here since it's used by dependencies
DATETIME_RFC339 = "%Y-%m-%dT%H:%M:%SZ"
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Removed from the tests. And, if there are downstream users which use this outside of that, they should do their own formatting, as there are issues with this format. Its missing ms and the Z part is not always correct.

# Allows for some additional flexibility in the input datetime format. As long as
# the input value has timezone information, it will be converted to UTC timezone.
UtcDatetime = Annotated[
# Input value must be in a format which has timezone information
AwareDatetime,
# Convert the input value to UTC timezone
AfterValidator(lambda d: d.astimezone(timezone.utc)),
]


class MimeTypes(str, Enum):
Expand Down Expand Up @@ -106,41 +119,76 @@ class Provider(StacBaseModel):
https://github.com/radiantearth/stac-spec/blob/v1.0.0/collection-spec/collection-spec.md#provider-object
"""

name: str = Field(..., alias="name", min_length=1)
name: str = Field(..., min_length=1)
description: Optional[str] = None
roles: Optional[List[str]] = None
url: Optional[str] = None


class StacCommonMetadata(StacBaseModel):
"""
https://github.com/radiantearth/stac-spec/blob/v1.0.0/item-spec/common-metadata.md#date-and-time-range
https://github.com/radiantearth/stac-spec/blob/v1.0.0/item-spec/common-metadata.md
"""

title: Optional[str] = Field(None, alias="title")
description: Optional[str] = Field(None, alias="description")
start_datetime: Optional[datetime] = Field(None, alias="start_datetime")
end_datetime: Optional[datetime] = Field(None, alias="end_datetime")
created: Optional[datetime] = Field(None, alias="created")
updated: Optional[datetime] = Field(None, alias="updated")
platform: Optional[str] = Field(None, alias="platform")
instruments: Optional[List[str]] = Field(None, alias="instruments")
constellation: Optional[str] = Field(None, alias="constellation")
mission: Optional[str] = Field(None, alias="mission")
providers: Optional[List[Provider]] = Field(None, alias="providers")
gsd: Optional[float] = Field(None, alias="gsd", gt=0)
# Basic
title: Optional[str] = None
description: Optional[str] = None
# Date and Time
datetime: Optional[UtcDatetime] = None
created: Optional[UtcDatetime] = None
updated: Optional[UtcDatetime] = None
# Date and Time Range
start_datetime: Optional[UtcDatetime] = None
end_datetime: Optional[UtcDatetime] = None
# Provider
providers: Optional[List[Provider]] = None
# Instrument
platform: Optional[str] = None
instruments: Optional[List[str]] = None
constellation: Optional[str] = None
mission: Optional[str] = None
gsd: Optional[float] = Field(None, gt=0)

@model_validator(mode="after")
def validate_datetime_or_start_end(self) -> Self:
# When datetime is null, start_datetime and end_datetime must be specified
if not self.datetime and (not self.start_datetime or not self.end_datetime):
raise ValueError(
"start_datetime and end_datetime must be specified when datetime is null"
)

return self

@model_validator(mode="after")
def validate_start_end(self) -> Self:
# Using one of start_datetime or end_datetime requires the use of the other
if (self.start_datetime and not self.end_datetime) or (
not self.start_datetime and self.end_datetime
):
raise ValueError(
"use of start_datetime or end_datetime requires the use of the other"
)
return self


class Asset(StacCommonMetadata):
"""
https://github.com/radiantearth/stac-spec/blob/v1.0.0/item-spec/item-spec.md#asset-object
"""

href: str = Field(..., alias="href", min_length=1)
href: str = Field(..., min_length=1)
type: Optional[str] = None
title: Optional[str] = None
description: Optional[str] = None
roles: Optional[List[str]] = None

model_config = ConfigDict(
populate_by_name=True, use_enum_values=True, extra="allow"
)

@model_validator(mode="after")
def validate_datetime_or_start_end(self) -> Self:
# Overriding the parent method to avoid requiring datetime or start/end_datetime
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

well IMO the Asset model should not herit from the StacCommonMetadata but from StacBaseModel

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we can do this in another PR

# Additional fields MAY be added on the Asset object, but are not required.
# https://github.com/radiantearth/stac-spec/blob/v1.0.0/item-spec/item-spec.md#additional-fields-for-assets
return self
4 changes: 2 additions & 2 deletions tests/api/extensions/test_fields.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from datetime import datetime
from datetime import datetime, timezone

from shapely.geometry import Polygon

Expand All @@ -15,7 +15,7 @@ def test_fields_filter_item():
item = Item(
id="test-fields-filter",
geometry=Polygon.from_bounds(0, 0, 0, 0),
properties={"datetime": datetime.utcnow(), "foo": "foo", "bar": "bar"},
properties={"datetime": datetime.now(timezone.utc), "foo": "foo", "bar": "bar"},
assets={},
eseglem marked this conversation as resolved.
Show resolved Hide resolved
links=[
{"href": "http://link", "rel": "self"},
Expand Down
Loading
Loading