From b4fc95a178ec5cb908899875e48af6cb21efb3e3 Mon Sep 17 00:00:00 2001 From: Stefaan Lippens Date: Mon, 4 Sep 2023 18:45:38 +0200 Subject: [PATCH] Issue #115 CrossBackendSplitter: internalize backend_for_collection caching --- scripts/crossbackend-processing-poc.py | 1 - .../partitionedjobs/crossbackend.py | 18 +++++++++--------- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/scripts/crossbackend-processing-poc.py b/scripts/crossbackend-processing-poc.py index 25d33925..3d2f7b01 100644 --- a/scripts/crossbackend-processing-poc.py +++ b/scripts/crossbackend-processing-poc.py @@ -58,7 +58,6 @@ def main(): with TimingLogger(title=f"Connecting to {backend_url}", logger=_log): connection = openeo.connect(url=backend_url).authenticate_oidc() - @functools.lru_cache(maxsize=100) def backend_for_collection(collection_id) -> str: metadata = connection.describe_collection(collection_id) return metadata["summaries"][STAC_PROPERTY_FEDERATION_BACKENDS][0] diff --git a/src/openeo_aggregator/partitionedjobs/crossbackend.py b/src/openeo_aggregator/partitionedjobs/crossbackend.py index 909e22bc..74195348 100644 --- a/src/openeo_aggregator/partitionedjobs/crossbackend.py +++ b/src/openeo_aggregator/partitionedjobs/crossbackend.py @@ -48,14 +48,14 @@ def split( process_graph = process["process_graph"] # Extract necessary back-ends from `load_collection` usage - backend_usage = collections.Counter( - self.backend_for_collection(node["arguments"]["id"]) - for node in process_graph.values() - if node["process_id"] == "load_collection" - ) - _log.info( - f"Extracted backend usage from `load_collection` nodes: {backend_usage}" - ) + backend_per_collection: Dict[str, str] = { + cid: self.backend_for_collection(cid) + for cid in ( + node["arguments"]["id"] for node in process_graph.values() if node["process_id"] == "load_collection" + ) + } + backend_usage = collections.Counter(backend_per_collection.values()) + _log.info(f"Extracted backend usage from `load_collection` nodes: {backend_usage=} {backend_per_collection=}") primary_backend = backend_usage.most_common(1)[0][0] if backend_usage else None secondary_backends = {b for b in backend_usage if b != primary_backend} @@ -70,7 +70,7 @@ def split( for node_id, node in process_graph.items(): if node["process_id"] == "load_collection": - bid = self.backend_for_collection(node["arguments"]["id"]) + bid = backend_per_collection[node["arguments"]["id"]] if bid == primary_backend and not ( self._always_split and primary_has_load_collection ):