From 84625d82f3a11210e136f323054837222eaf2848 Mon Sep 17 00:00:00 2001
From: Kathryn Mazaitis
Date: Fri, 14 Feb 2025 15:21:17 -0500
Subject: [PATCH 1/4] Integrate cambium into new abstract archiver system

---
 src/pudl_archiver/archivers/nrelcambium.py | 79 ++++++++++++++++++++++
 1 file changed, 79 insertions(+)
 create mode 100644 src/pudl_archiver/archivers/nrelcambium.py

diff --git a/src/pudl_archiver/archivers/nrelcambium.py b/src/pudl_archiver/archivers/nrelcambium.py
new file mode 100644
index 00000000..19ebd9eb
--- /dev/null
+++ b/src/pudl_archiver/archivers/nrelcambium.py
@@ -0,0 +1,79 @@
+"""Download NREL Cambium Scenarios data."""
+
+import re
+
+from pudl_archiver.archivers.classes import (
+    ArchiveAwaitable,
+    ResourceInfo,
+    _download_file,
+)
+from pudl_archiver.archivers.nrelss import (
+    API_URL_FILE_DOWNLOAD,
+    API_URL_PROJECTS_LIST,
+    AbstractNrelScenarioArchiver,
+)
+from pudl_archiver.utils import retry_async
+
+
+class NrelCambiumArchiver(AbstractNrelScenarioArchiver):
+    """NREL Cambium archiver."""
+
+    name = "nrelcambium"
+    project_year_pattern = re.compile(r"Cambium (?P<year>\d{4})")
+    project_startswith = "Cambium 2022"
+    report_section = "long_description"
+    file_naming_order = ("scenario", "metric", "time_resolution", "location_type")
+
+    concurrency_limit = 1  # Cambium scenarios are large, so only handle one at a time
+
+    async def get_resources(self) -> ArchiveAwaitable:
+        """Download NREL Cambium resources.
+
+        Basic flow:
+        1. Fetch the list of projects and extract just the one for this archiver.
+        2. Pull out metadata: uuid, year, links to any PDF reports, and data files. PDF report URLs are not provided in a dedicated field in the project response, but are part of an HTML value for the description or citation in the project. Sometimes this field is simply blank, and we need to use a hard-coded exception. The data files don't have good filenames associated with them, so we make one.
+        3. Download each report and file for the project as separate resources.
+        """
+        project_records = await self.get_json(API_URL_PROJECTS_LIST)
+        scenario_project = [
+            p for p in project_records if p["name"].startswith(self.project_startswith)
+        ]
+        assert len(scenario_project) == 1
+        scenario_project = scenario_project.pop()
+        (
+            project_uuid,
+            project_year,
+            report_data,
+            file_ids,
+        ) = await self.collect_project_info(scenario_project)
+        assert project_uuid
+        for filename, url in report_data:
+            yield self.get_report_resource(filename, url)
+        for filename, file_id in file_ids:
+            yield self.get_file_resource(filename, project_uuid, file_id)
+
+    async def get_report_resource(self, filename, url) -> ResourceInfo:
+        """Retrieve and compress a PDF report and return it as ResourceInfo."""
+        self.logger.info(f"Downloading report {filename}")
+        zip_path = self.download_directory / f"{filename}.zip"
+        await self.download_and_zip_file(url, filename, zip_path)
+        return ResourceInfo(
+            local_path=zip_path,
+            partitions={},
+        )
+
+    async def get_file_resource(self, filename, uuid, file_id) -> ResourceInfo:
+        """Retrieve a data file and return it as ResourceInfo."""
+        self.logger.info(f"Downloading file {filename} {file_id} {uuid}")
+        download_path = self.download_directory / filename
+
+        await retry_async(
+            _download_file,
+            [self.session, API_URL_FILE_DOWNLOAD, download_path, True],
+            kwargs={"data": {"project_uuid": uuid, "file_ids": file_id}},
+            retry_base_s=20,
+        )
+        return ResourceInfo(
+            local_path=download_path,
+            partitions={},
+        )
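
Note on the yield pattern above: get_resources is an async generator that yields
unawaited coroutines rather than finished resources, so the caller decides how
many downloads run at once (concurrency_limit = 1 here). A minimal sketch of
that consumption pattern -- run_archiver is a hypothetical stand-in for the real
orchestrator, and its semaphore behavior is an assumption, not pudl_archiver's
actual implementation:

import asyncio

async def run_archiver(archiver, limit: int):
    """Await each coroutine yielded by get_resources, at most `limit` at a time."""
    semaphore = asyncio.Semaphore(limit)

    async def bounded(resource_coro):
        # Each yielded item is an unawaited get_report_resource / get_file_resource call.
        async with semaphore:
            return await resource_coro

    tasks = [
        asyncio.create_task(bounded(coro)) async for coro in archiver.get_resources()
    ]
    return await asyncio.gather(*tasks)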
From 139158d148a5f1a78351aab100ea178f3fcb9253 Mon Sep 17 00:00:00 2001
From: Kathryn Mazaitis
Date: Fri, 14 Feb 2025 15:21:26 -0500
Subject: [PATCH 2/4] Remove implicit limit from archiver depth search.

The old method only removed intermediate (abstract) classes to depth one.
This approach works for arbitrary depth, and the rejected set guards against
revisiting classes in diamond-shaped hierarchies, so it should not introduce
any cycle bugs.
---
 src/pudl_archiver/__init__.py | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/src/pudl_archiver/__init__.py b/src/pudl_archiver/__init__.py
index c0acfa11..2d82fd0b 100644
--- a/src/pudl_archiver/__init__.py
+++ b/src/pudl_archiver/__init__.py
@@ -31,12 +31,19 @@ def all_archivers():
     def all_subclasses(cls):
         """If a subclass has subclasses, include them in the list.
         Remove intermediaries."""
-        subclasses = set(cls.__subclasses__())
-        for c in subclasses.copy():
+        subclasses = set()
+        queue = set(cls.__subclasses__())
+        rejected = set()
+        while queue:
+            c = queue.pop()
+            if c in rejected:
+                continue
             subsubclasses = set(c.__subclasses__())
             if subsubclasses:
-                subclasses.remove(c)
-                subclasses = subclasses.union(subsubclasses)
+                rejected.add(c)
+                queue = queue.union(subsubclasses)
+            else:
+                subclasses.add(c)
         return subclasses
 
     return all_subclasses(AbstractDatasetArchiver)
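
A quick sanity check of the arbitrary-depth traversal, using a toy hierarchy.
The helper re-implements the patched loop so the snippet is self-contained;
the class names are illustrative, not part of the codebase:

def leaves_only(cls):
    # Same queue/rejected walk as the patched all_subclasses.
    subclasses, queue, rejected = set(), set(cls.__subclasses__()), set()
    while queue:
        c = queue.pop()
        if c in rejected:
            continue
        subsubclasses = set(c.__subclasses__())
        if subsubclasses:
            rejected.add(c)  # intermediate class: skip it, walk its children
            queue = queue.union(subsubclasses)
        else:
            subclasses.add(c)
    return subclasses

class Base: ...
class Mid1(Base): ...  # intermediate at depth 1
class Mid2(Mid1): ...  # intermediate at depth 2 -- the old version kept this
class Leaf(Mid2): ...  # the only concrete leaf

assert leaves_only(Base) == {Leaf}  # old depth-one version returned {Mid2}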
From 0fea46ff1b5ddf63089bd1761db7fa7d79cebe02 Mon Sep 17 00:00:00 2001
From: Kathryn Mazaitis
Date: Fri, 14 Feb 2025 15:21:33 -0500
Subject: [PATCH 3/4] Reset retry counter if the server has been working okay
 between timeouts

---
 src/pudl_archiver/utils.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/src/pudl_archiver/utils.py b/src/pudl_archiver/utils.py
index 1815ac94..0cbe072f 100644
--- a/src/pudl_archiver/utils.py
+++ b/src/pudl_archiver/utils.py
@@ -8,6 +8,7 @@
 from hashlib import md5
 from io import BytesIO
 from pathlib import Path
+from time import time
 
 import aiohttp
 from pydantic import AnyUrl, BaseModel
@@ -46,7 +47,11 @@ async def retry_async(
         args = []
     if kwargs is None:
         kwargs = {}
-    for try_count in range(1, retry_count + 1):  # noqa: RET503
+    last_failure_s = time()
+    max_delay_s = retry_base_s * 2**retry_count
+    try_count = 0
+    while try_count < retry_count:  # noqa: RET503
+        try_count += 1
         # try count is 1 indexed for logging clarity
         coro = async_func(*args, **kwargs)
         try:
@@ -54,6 +59,10 @@ async def retry_async(
         except retry_on as e:
             if try_count == retry_count:
                 raise e
+            current_failure_s = time()
+            if (current_failure_s - last_failure_s) > max_delay_s:
+                try_count = 1  # server was healthy for a while; restart the backoff
+            last_failure_s = current_failure_s
             retry_delay_s = retry_base_s * 2 ** (try_count - 1)
             logger.info(
                 f"Error while executing {coro} (try #{try_count}, retry in {retry_delay_s}s): {type(e)} - {e}"
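
A worked example of the reset rule: with the retry_base_s=20 used for Cambium
file downloads in patch 1, and an assumed retry_count of 5 (the actual default
is not shown in this diff), max_delay_s = 20 * 2**5 = 640s -- comfortably longer
than any single backoff sleep, which tops out at 20 * 2**(5 - 1) = 320s. If the
next failure arrives more than 640s after the previous one, the server must have
been answering requests normally in between, so the failure is relabeled try #1
and the backoff restarts at 20s instead of compounding toward retry_count. A
burst of consecutive failures still escalates 20s, 40s, 80s, 160s and raises
after the fifth try, exactly as before.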
p["name"].startswith(f"{self.project_startswith}{self.project_year}") ] assert len(scenario_project) == 1 scenario_project = scenario_project.pop() @@ -77,3 +79,31 @@ async def get_file_resource(self, filename, uuid, file_id) -> ResourceInfo: local_path=download_path, partitions={}, ) + + +class NrelCambium2020Archiver(AbstractNrelCambiumArchiver): + """NREL Cambium archiver for 2020.""" + + name = "nrelcambium2020" + project_year = 2020 + + +class NrelCambium2021Archiver(AbstractNrelCambiumArchiver): + """NREL Cambium archiver for 2021.""" + + name = "nrelcambium2021" + project_year = 2021 + + +class NrelCambium2022Archiver(AbstractNrelCambiumArchiver): + """NREL Cambium archiver for 2022.""" + + name = "nrelcambium2022" + project_year = 2022 + + +class NrelCambium2023Archiver(AbstractNrelCambiumArchiver): + """NREL Cambium archiver for 2023.""" + + name = "nrelcambium2023" + project_year = 2023 diff --git a/src/pudl_archiver/metadata/nrelcambium.py b/src/pudl_archiver/metadata/nrelcambium.py new file mode 100644 index 00000000..37e4a527 --- /dev/null +++ b/src/pudl_archiver/metadata/nrelcambium.py @@ -0,0 +1,40 @@ +"""NREL Cambium -specific metadata helper.""" + +from pudl.metadata.constants import CONTRIBUTORS, KEYWORDS, LICENSES + + +def nrel_cambium_generator(year): + """Generate metadata dictionaries for NREL Cambium. + + NREL Cambium datasets are too large to group together under a "years" partition, but otherwise share metadata. + """ + return { + "title": f"NREL Cambium {year}", + "path": "https://www.nrel.gov/analysis/cambium.html", + "description": ( + f"""Cambium datasets contain modeled hourly data for a range of possible futures of the U.S. electricity sector, with metrics designed to be useful for forward-looking analysis and decision support. + +Cambium is annually updated and expands on the metrics reported in NREL’s Standard Scenarios—another annually released set of projections of how the U.S. electric sector could evolve across a suite of potential futures. + +The {year} Cambium release includes two products: + +The full {year} Cambium datasets; +NREL reports describing the scenarios, defining metrics and methods, describing major changes since the last release, and discussing intended uses and limitations of the dataset.""" + ), + "source_file_dict": { + "source_format": "CSV", + }, + "working_partitions": {}, + "contributors": [ + CONTRIBUTORS["catalyst-cooperative"], + ], + "keywords": sorted( + { + "nrel", + "cambium", + } + | set(KEYWORDS["us_govt"] + KEYWORDS["electricity"]) + ), + "license_raw": LICENSES["cc-by-4.0"], + "license_pudl": LICENSES["cc-by-4.0"], + } diff --git a/src/pudl_archiver/metadata/sources.py b/src/pudl_archiver/metadata/sources.py index 38e603bb..f8eff8da 100644 --- a/src/pudl_archiver/metadata/sources.py +++ b/src/pudl_archiver/metadata/sources.py @@ -4,6 +4,8 @@ from pudl.metadata.constants import CONTRIBUTORS, KEYWORDS, LICENSES +from pudl_archiver.metadata.nrelcambium import nrel_cambium_generator + # To add a new contributor, follow the following format to add an entry to the # ADDL_CONTRIBUTORS dictionary below formatted like this: # "name-shorthand": { @@ -454,4 +456,8 @@ "license_raw": LICENSES["cc-by-4.0"], "license_pudl": LICENSES["cc-by-4.0"], }, + "nrelcambium2020": nrel_cambium_generator(2020), + "nrelcambium2021": nrel_cambium_generator(2021), + "nrelcambium2022": nrel_cambium_generator(2022), + "nrelcambium2023": nrel_cambium_generator(2023), }