Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

STAC metadata as multidimensional coordinates #230

Draft
wants to merge 31 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
dffd6cc
Starting to refactor towards nd coordinates
gjoseph92 Oct 14, 2023
db6d6cf
driveby: copy-paste error excluding bad assets
gjoseph92 Oct 25, 2023
0ee4c5c
eo:bands is actually a list
gjoseph92 Oct 25, 2023
f6de475
multiple working nd coords methods, but 10x slower
gjoseph92 Oct 25, 2023
01eaca0
wip appendage
gjoseph92 Oct 25, 2023
2a00846
locality working; fastest but still 2x slower
gjoseph92 Oct 26, 2023
81b50ed
unnest dicts in generator
gjoseph92 Oct 26, 2023
d023cbe
unpack per band assets in generator
gjoseph92 Oct 26, 2023
713e242
fix unnesting with empty dict values
gjoseph92 Oct 26, 2023
abe0f25
create typed arrays initially
gjoseph92 Oct 26, 2023
53b7149
starting to generalize for properties as well
gjoseph92 Oct 27, 2023
fe8af5b
remove unused stuff
gjoseph92 Oct 27, 2023
8d4120f
3d coords test. works except missing values.
gjoseph92 Oct 27, 2023
77ed8b5
crappy handling of deduping with nans
gjoseph92 Nov 8, 2023
1f1cfef
docstring for `items_to_coords`
gjoseph92 Nov 8, 2023
8104331
skip some fields
gjoseph92 Nov 8, 2023
eb33dc9
backwards compatible field de-prefixing
gjoseph92 Nov 15, 2023
a678278
fix small comments
gjoseph92 Nov 15, 2023
32894f3
fix coords utils 3d tests
gjoseph92 Nov 15, 2023
11cf046
remove `descalar_obj_array`
gjoseph92 Nov 15, 2023
b8a7666
out with the old!
gjoseph92 Nov 15, 2023
7d5e693
driveby: fix small type errors in stack.py
gjoseph92 Nov 15, 2023
3cdf195
fix type errors in test_coordinates
gjoseph92 Nov 15, 2023
4157066
driveby: fix NameError with multi band rasters
gjoseph92 Nov 15, 2023
3067db7
remove commented-out test
gjoseph92 Nov 15, 2023
112ca47
remove pytest-benchmark
gjoseph92 Nov 30, 2023
6fa11c7
factor out DtypeUpdatingArray and start testing
gjoseph92 Dec 19, 2023
9db39b3
convert NaN fill values to None
gjoseph92 Dec 19, 2023
f7b9e29
test for weird cases
gjoseph92 Dec 19, 2023
42bb76f
treat datetimes as objects for now
gjoseph92 Dec 19, 2023
11f9b4e
convert bool arrays back
gjoseph92 Dec 19, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
196 changes: 0 additions & 196 deletions stackstac/accumulate_metadata.py

This file was deleted.

180 changes: 180 additions & 0 deletions stackstac/coordinates.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,180 @@
from __future__ import annotations

from typing import Any, Collection, Dict, List, Literal, Tuple

import numpy as np
import pandas as pd
import xarray as xr

from stackstac.coordinates_utils import (
Coordinates,
items_to_coords,
unnested_items,
unpacked_per_band_asset_fields,
)

from .raster_spec import RasterSpec
from .stac_types import ItemSequence

ASSET_TABLE_DT = np.dtype(
[("url", object), ("bounds", "float64", 4), ("scale_offset", "float64", 2)]
)

# Asset fields which are a list with one item per band in the asset.
# For one-band assets, they should be a list of length 1.
# We'll unpack those 1-length lists, so the subfields can be flattened into
# top-level coordinates.
# This is how we get `eo:bands_common_name` or `raster:bands_scale` coordinates.
PER_BAND_ASSET_FIELDS = {
"eo:bands",
"raster:bands",
}


def to_coords(
items: ItemSequence,
asset_ids: List[str],
spec: RasterSpec,
xy_coords: Literal["center", "topleft", False] = "topleft",
properties: bool = True,
band_coords: bool = True,
) -> Tuple[Coordinates, List[str]]:
times = pd.to_datetime(
[item["properties"]["datetime"] for item in items],
infer_datetime_format=True,
errors="coerce",
)
if times.tz is not None:
# xarray can't handle tz-aware DatetimeIndexes, so we convert to UTC and drop the timezone
# https://github.com/pydata/xarray/issues/3291.
# The `tz is None` case is typically a manifestation of https://github.com/pandas-dev/pandas/issues/41047.
# Since all STAC timestamps should be UTC (https://github.com/radiantearth/stac-spec/issues/1095),
# we feel safe assuming that any tz-naive datetimes are already in UTC.
times = times.tz_convert(None)

dims = ["time", "band", "y", "x"]
coords = {
"time": times,
"id": xr.Variable("time", [item["id"] for item in items]),
"band": asset_ids,
}

if xy_coords is not False:
if xy_coords == "center":
pixel_center = True
elif xy_coords == "topleft":
pixel_center = False
else:
raise ValueError(
f"xy_coords must be 'center', 'topleft', or False, not {xy_coords!r}"
)

transform = spec.transform
# We generate the transform ourselves in `RasterSpec`, and it's always constructed to be rectilinear.
# Someday, this should not always be the case, in order to support non-rectilinear data without warping.
assert (
transform.is_rectilinear
), f"Non-rectilinear transform generated: {transform}"
minx, miny, maxx, maxy = spec.bounds
xres, yres = spec.resolutions_xy

if pixel_center:
half_xpixel, half_ypixel = xres / 2, yres / 2
minx, miny, maxx, maxy = (
minx + half_xpixel,
miny - half_ypixel,
maxx + half_xpixel,
maxy - half_ypixel,
)

height, width = spec.shape
# Wish pandas had an RangeIndex that supported floats...
# https://github.com/pandas-dev/pandas/issues/46484
xs = pd.Index(np.linspace(minx, maxx, width, endpoint=False), dtype="float64")
ys = pd.Index(np.linspace(maxy, miny, height, endpoint=False), dtype="float64")

coords["x"] = xs
coords["y"] = ys

if properties:
assert properties is True, (
"Passing specific properties is no longer supported. "
"The `properties` argument must only be True or False. "
"If you have a use case for this, please open an issue."
)
coords.update(items_to_property_coords(items))

if band_coords:
coords.update(items_to_band_coords(items, asset_ids))

# Add `epsg` last in case it's also a field in properties; our data model assumes it's a coordinate
coords["epsg"] = spec.epsg

return coords, dims


def items_to_property_coords(
items: ItemSequence,
skip_fields: Collection[str] = frozenset(["datetime", "id"]),
) -> Coordinates:
return items_to_coords(
(
((i,), k, v)
for i, item in enumerate(items)
# TODO: should we unnest properties?
for k, v in item["properties"].items()
if k not in skip_fields
),
shape=(len(items),),
dims=("time",),
)


def items_to_band_coords(
items: ItemSequence,
asset_ids: List[str],
skip_fields: Collection[str] = frozenset(["href", "type"]),
) -> Coordinates:
def fields_values_generator():
for ii, item in enumerate(items):
for ai, id in enumerate(asset_ids):
try:
asset = item["assets"][id]
except KeyError:
continue

for field, value in unnested_items(
unpacked_per_band_asset_fields(asset.items(), PER_BAND_ASSET_FIELDS)
):
field = rename_some_band_fields(field)
if field not in skip_fields:
yield (ii, ai), field, value

return items_to_coords(
fields_values_generator(),
shape=(len(items), len(asset_ids)),
dims=("time", "band"),
)


def rename_some_band_fields(field: str) -> str:
"""
Apply renamings to band fields for "convenience".

This is just for backwards compatibility.
These renamings should probably be removed for simplicity and consistency.
"""
if field == "sar:polarizations":
return "polarization"
return field.removeprefix("eo:bands_")


def spec_to_attrs(spec: RasterSpec) -> Dict[str, Any]:
attrs = {"spec": spec, "crs": f"epsg:{spec.epsg}", "transform": spec.transform}

resolutions = spec.resolutions_xy
if resolutions[0] == resolutions[1]:
attrs["resolution"] = resolutions[0]
else:
attrs["resolution_xy"] = resolutions
return attrs
Loading