Add optional caching to AreaDefinition.get_area_slices #553
Merged
Changes from all commits (10 commits, all by djhoese):
- 8c15774 Add cache directory and cache geometry slices configuration options
- 6fd85a1 Add initial proof of concept area slice caching
- 94ca7a8 Refactor JSON caching to resemble satpy caching decorators
- 2c80543 Drop musllinux builds (pyproj isn't providing them)
- e9f7fa0 Fix area slice tests after function refactor
- 2f9df8b Simplify area slice tests with parametrize
- 1de5f82 Add more area slice caching tests
- dc68bfe Fix missing future annotations import
- a224317 Rename cache_geom_slices to cache_geometry_slices
- b0a2579 Add more information to slice caching config documentation
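The diffs below gate the new behavior behind two configuration options, `cache_dir` and `cache_geometry_slices`. As a minimal sketch of how they might be enabled, assuming `pyresample.config` follows the donfig-style `set()`/`get()` API implied by the `pyresample.config.get()` calls in the diff (the directory path is only illustrative):

```python
import pyresample

# Sketch only: enable the options added by this PR. The path is illustrative,
# and the donfig-style set()/get() API is assumed from the config calls in the diff.
pyresample.config.set(cache_dir="/path/to/cache", cache_geometry_slices=True)

# The caching helper reads the key back with a default of False, so leaving it
# unset keeps the previous (uncached) behavior.
print(pyresample.config.get("cache_geometry_slices", False))
```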
New file (+128 lines): pyresample/_caching.py
"""Various tools for caching. | ||
|
||
These tools are rarely needed by users and are used where they make sense | ||
throughout pyresample. | ||
|
||
""" | ||
from __future__ import annotations | ||
|
||
import hashlib | ||
import json | ||
import os | ||
import warnings | ||
from functools import update_wrapper | ||
from glob import glob | ||
from pathlib import Path | ||
from typing import Any, Callable | ||
|
||
import pyresample | ||
|
||
|
||
class JSONCacheHelper: | ||
"""Decorator class to cache results to a JSON file on-disk.""" | ||
|
||
def __init__( | ||
self, | ||
func: Callable, | ||
cache_config_key: str, | ||
cache_version: int = 1, | ||
): | ||
self._callable = func | ||
self._cache_config_key = cache_config_key | ||
self._cache_version = cache_version | ||
self._uncacheable_arg_type_names = ("",) | ||
|
||
@staticmethod | ||
def cache_clear(cache_dir: str | None = None): | ||
"""Remove all on-disk files associated with this function. | ||
|
||
Intended to mimic the :func:`functools.cache` behavior. | ||
""" | ||
cache_dir = _get_cache_dir_from_config(cache_dir=cache_dir, cache_version="*") | ||
for json_file in glob(str(cache_dir / "*.json")): | ||
os.remove(json_file) | ||
|
||
def __call__(self, *args): | ||
"""Call decorated function and cache the result to JSON.""" | ||
should_cache = pyresample.config.get(self._cache_config_key, False) | ||
if not should_cache: | ||
return self._callable(*args) | ||
|
||
try: | ||
arg_hash = _hash_args(args) | ||
except TypeError as err: | ||
warnings.warn("Cannot cache function due to unhashable argument: " + str(err), | ||
stacklevel=2) | ||
return self._callable(*args) | ||
|
||
return self._run_and_cache(arg_hash, args) | ||
|
||
def _run_and_cache(self, arg_hash: str, args: tuple[Any]) -> Any: | ||
base_cache_dir = _get_cache_dir_from_config(cache_version=self._cache_version) | ||
json_path = base_cache_dir / f"{arg_hash}.json" | ||
if not json_path.is_file(): | ||
res = self._callable(*args) | ||
json_path.parent.mkdir(exist_ok=True) | ||
with open(json_path, "w") as json_cache: | ||
json.dump(res, json_cache, cls=_JSONEncoderWithSlice) | ||
|
||
# for consistency, always load the cached result | ||
with open(json_path, "r") as json_cache: | ||
res = json.load(json_cache, object_hook=_object_hook) | ||
return res | ||
|
||
|
||
def _get_cache_dir_from_config(cache_dir: str | None = None, cache_version: int | str = 1) -> Path: | ||
cache_dir = cache_dir or pyresample.config.get("cache_dir") | ||
if cache_dir is None: | ||
raise RuntimeError("Can't use JSON caching. No 'cache_dir' configured.") | ||
subdir = f"geometry_slices_v{cache_version}" | ||
return Path(cache_dir) / subdir | ||
|
||
|
||
def _hash_args(args: tuple[Any]) -> str: | ||
from pyresample.future.geometry import AreaDefinition, SwathDefinition | ||
from pyresample.geometry import AreaDefinition as LegacyAreaDefinition | ||
from pyresample.geometry import SwathDefinition as LegacySwathDefinition | ||
|
||
hashable_args = [] | ||
for arg in args: | ||
if isinstance(arg, (SwathDefinition, LegacySwathDefinition)): | ||
raise TypeError(f"Unhashable type ({type(arg)})") | ||
if isinstance(arg, (AreaDefinition, LegacyAreaDefinition)): | ||
arg = hash(arg) | ||
hashable_args.append(arg) | ||
arg_hash = hashlib.sha1() # nosec | ||
arg_hash.update(json.dumps(tuple(hashable_args)).encode("utf8")) | ||
return arg_hash.hexdigest() | ||
|
||
|
||
class _JSONEncoderWithSlice(json.JSONEncoder): | ||
def default(self, obj: Any) -> Any: | ||
if isinstance(obj, slice): | ||
return {"__slice__": True, "start": obj.start, "stop": obj.stop, "step": obj.step} | ||
return super().default(obj) | ||
|
||
|
||
def _object_hook(obj: object) -> Any: | ||
if isinstance(obj, dict) and obj.get("__slice__", False): | ||
return slice(obj["start"], obj["stop"], obj["step"]) | ||
return obj | ||
|
||
|
||
def cache_to_json_if(cache_config_key: str) -> Callable: | ||
"""Decorate a function and cache the results to a JSON file on disk. | ||
|
||
This caching only happens if the ``pyresample.config`` boolean value for | ||
the provided key is ``True`` as well as some other conditions. See | ||
:class:`JSONCacheHelper` for more information. Most importantly this | ||
decorator does not limit how many items can be cached and does not clear | ||
out old entries. It is up to the user to manage the size of the cache. | ||
|
||
""" | ||
def _decorator(func: Callable) -> Callable: | ||
zarr_cacher = JSONCacheHelper(func, cache_config_key) | ||
wrapper = update_wrapper(zarr_cacher, func) | ||
return wrapper | ||
|
||
return _decorator |
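To make the decorator's behavior concrete, here is a small hypothetical sketch. The function name and config key below are made up for illustration and are not part of the PR; it relies only on names defined in the file above.

```python
from pyresample._caching import JSONCacheHelper, cache_to_json_if


@cache_to_json_if("cache_my_lookup")  # hypothetical config key, off by default
def expensive_lookup(width, height):
    """Hypothetical stand-in for a slow computation with a JSON-friendly result."""
    return [slice(0, width), slice(0, height)]


# With "cache_my_lookup" unset (or False) the wrapped function runs normally.
print(expensive_lookup(10, 20))

# When the key is True and a cache_dir is configured, results are written to
# <cache_dir>/geometry_slices_v1/<sha1-of-args>.json and re-read on later calls.
# Old entries are never expired automatically; clearing is manual:
# JSONCacheHelper.cache_clear(cache_dir="/path/to/cache")  # illustrative path
```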
Second new file (+131 lines): geometry subsetting helpers (module filename not shown in this view)
"""Functions and tools for subsetting a geometry object.""" | ||
from __future__ import annotations | ||
|
||
import math | ||
from typing import TYPE_CHECKING, Any | ||
|
||
import numpy as np | ||
|
||
# this caching module imports the geometries so this subset module | ||
# must be imported inside functions in the geometry modules if needed | ||
# to avoid circular dependencies | ||
from pyresample._caching import cache_to_json_if | ||
from pyresample.boundary import Boundary | ||
from pyresample.geometry import get_geostationary_bounding_box_in_lonlats, logger | ||
from pyresample.utils import check_slice_orientation | ||
|
||
if TYPE_CHECKING: | ||
from pyresample import AreaDefinition | ||
|
||
|
||
@cache_to_json_if("cache_geometry_slices") | ||
def get_area_slices( | ||
src_area: AreaDefinition, | ||
area_to_cover: AreaDefinition, | ||
shape_divisible_by: int | None, | ||
) -> tuple[slice, slice]: | ||
"""Compute the slice to read based on an `area_to_cover`.""" | ||
if not _is_area_like(src_area): | ||
raise NotImplementedError(f"Only AreaDefinitions are supported, not {type(src_area)}") | ||
if not _is_area_like(area_to_cover): | ||
raise NotImplementedError(f"Only AreaDefinitions are supported, not {type(area_to_cover)}") | ||
|
||
# Intersection only required for two different projections | ||
proj_def_to_cover = area_to_cover.crs | ||
proj_def = src_area.crs | ||
if proj_def_to_cover == proj_def: | ||
logger.debug('Projections for data and slice areas are identical: %s', | ||
proj_def_to_cover) | ||
# Get slice parameters | ||
xstart, xstop, ystart, ystop = _get_slice_starts_stops(src_area, area_to_cover) | ||
|
||
x_slice = check_slice_orientation(slice(xstart, xstop)) | ||
y_slice = check_slice_orientation(slice(ystart, ystop)) | ||
x_slice = _ensure_integer_slice(x_slice) | ||
y_slice = _ensure_integer_slice(y_slice) | ||
return x_slice, y_slice | ||
|
||
data_boundary = _get_area_boundary(src_area) | ||
area_boundary = _get_area_boundary(area_to_cover) | ||
intersection = data_boundary.contour_poly.intersection( | ||
area_boundary.contour_poly) | ||
if intersection is None: | ||
logger.debug('Cannot determine appropriate slicing. ' | ||
"Data and projection area do not overlap.") | ||
raise NotImplementedError | ||
x, y = src_area.get_array_indices_from_lonlat( | ||
np.rad2deg(intersection.lon), np.rad2deg(intersection.lat)) | ||
x_slice = slice(np.ma.min(x), np.ma.max(x) + 1) | ||
y_slice = slice(np.ma.min(y), np.ma.max(y) + 1) | ||
x_slice = _ensure_integer_slice(x_slice) | ||
y_slice = _ensure_integer_slice(y_slice) | ||
if shape_divisible_by is not None: | ||
x_slice = _make_slice_divisible(x_slice, src_area.width, | ||
factor=shape_divisible_by) | ||
y_slice = _make_slice_divisible(y_slice, src_area.height, | ||
factor=shape_divisible_by) | ||
|
||
return (check_slice_orientation(x_slice), | ||
check_slice_orientation(y_slice)) | ||
|
||
|
||
def _is_area_like(area_obj: Any) -> bool: | ||
return hasattr(area_obj, "crs") and hasattr(area_obj, "area_extent") | ||
|
||
|
||
def _get_slice_starts_stops(src_area, area_to_cover): | ||
"""Get x and y start and stop points for slicing.""" | ||
llx, lly, urx, ury = area_to_cover.area_extent | ||
x, y = src_area.get_array_coordinates_from_projection_coordinates([llx, urx], [lly, ury]) | ||
|
||
# we use `round` because we want the *exterior* of the pixels to contain the area_to_cover's area extent. | ||
if (src_area.area_extent[0] > src_area.area_extent[2]) ^ (llx > urx): | ||
xstart = max(0, round(x[1])) | ||
xstop = min(src_area.width, round(x[0]) + 1) | ||
else: | ||
xstart = max(0, round(x[0])) | ||
xstop = min(src_area.width, round(x[1]) + 1) | ||
if (src_area.area_extent[1] > src_area.area_extent[3]) ^ (lly > ury): | ||
ystart = max(0, round(y[0])) | ||
ystop = min(src_area.height, round(y[1]) + 1) | ||
else: | ||
ystart = max(0, round(y[1])) | ||
ystop = min(src_area.height, round(y[0]) + 1) | ||
|
||
return xstart, xstop, ystart, ystop | ||
|
||
|
||
def _get_area_boundary(area_to_cover: AreaDefinition) -> Boundary: | ||
try: | ||
if area_to_cover.is_geostationary: | ||
return Boundary(*get_geostationary_bounding_box_in_lonlats(area_to_cover)) | ||
boundary_shape = max(max(*area_to_cover.shape) // 100 + 1, 3) | ||
return area_to_cover.boundary(frequency=boundary_shape, force_clockwise=True) | ||
except ValueError: | ||
raise NotImplementedError("Can't determine boundary of area to cover") | ||
|
||
|
||
def _make_slice_divisible(sli, max_size, factor=2): | ||
"""Make the given slice even in size.""" | ||
rem = (sli.stop - sli.start) % factor | ||
if rem != 0: | ||
adj = factor - rem | ||
if sli.stop + 1 + rem < max_size: | ||
sli = slice(sli.start, sli.stop + adj) | ||
elif sli.start > 0: | ||
sli = slice(sli.start - adj, sli.stop) | ||
else: | ||
sli = slice(sli.start, sli.stop - rem) | ||
|
||
return sli | ||
|
||
|
||
def _ensure_integer_slice(sli): | ||
start = sli.start | ||
stop = sli.stop | ||
step = sli.step | ||
return slice( | ||
math.floor(start) if start is not None else None, | ||
math.ceil(stop) if stop is not None else None, | ||
math.floor(step) if step is not None else None | ||
) |
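And an end-to-end sketch of what the PR enables through `AreaDefinition.get_area_slices`. The YAML file and area names are placeholders, and using `pyresample.config.set` as a context manager assumes the donfig-style API:

```python
import pyresample
from pyresample import load_area

# Placeholder area definitions; substitute your own YAML file and area names.
src_area = load_area("areas.yaml", "source_grid")
crop_area = load_area("areas.yaml", "target_crop")

# Slice caching is opt-in: both a cache directory and the boolean key are required.
with pyresample.config.set(cache_dir="/tmp/pyresample_cache",
                           cache_geometry_slices=True):
    # First call computes the slices and writes them to
    # /tmp/pyresample_cache/geometry_slices_v1/<sha1-of-args>.json.
    x_slice, y_slice = src_area.get_area_slices(crop_area)
    # Repeated calls with the same arguments load the cached JSON instead.
    x_slice, y_slice = src_area.get_area_slices(crop_area)

print(x_slice, y_slice)
```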
Conversations

I think we need to add a sentence or two on what kind of improvement we can expect on performance and in which situations.
Oh, your comment is reminding me: do you have a go-to benchmark for the gradient search that you can point me to or run yourself with this PR and this caching enabled? I think the gradient search is the only other part of pyresample that uses the area slices directly and I don't want to make it unnecessarily slow. That said, if it caches the slices for an area -> area resampling (the only thing gradient search supports right now), then it would probably make all future operations fast, especially since, if I remember correctly, satpy doesn't do `reduce_data` for gradient search.
OK, I added more information, but I got a little wordy, so let me know what you think.
@pnuu is using gradient search a lot, maybe he can help here?
Looks good with the explanation.
I don't have my usual test script at hand, but I can test this on Monday, if I remember...
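As a follow-up to the benchmarking question above, a rough, hypothetical timing sketch; the area names, cache path, and donfig-style config API are assumptions, and the numbers will depend entirely on the areas used:

```python
import time

import pyresample
from pyresample import load_area

src_area = load_area("areas.yaml", "source_grid")    # placeholder areas
crop_area = load_area("areas.yaml", "target_crop")


def seconds_per_call(repeats=5):
    """Average wall-clock time of get_area_slices over a few repeats."""
    start = time.perf_counter()
    for _ in range(repeats):
        src_area.get_area_slices(crop_area)
    return (time.perf_counter() - start) / repeats


uncached = seconds_per_call()
with pyresample.config.set(cache_dir="/tmp/pyresample_cache",
                           cache_geometry_slices=True):
    seconds_per_call()           # first pass warms the on-disk cache
    cached = seconds_per_call()

print(f"uncached: {uncached:.4f} s/call, cached: {cached:.4f} s/call")
```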