From 84625d82f3a11210e136f323054837222eaf2848 Mon Sep 17 00:00:00 2001
From: Kathryn Mazaitis
Date: Fri, 14 Feb 2025 15:21:17 -0500
Subject: [PATCH 1/4] Integrate cambium into new abstract archiver system

---
 src/pudl_archiver/archivers/nrelcambium.py | 79 ++++++++++++++++++++++
 1 file changed, 79 insertions(+)
 create mode 100644 src/pudl_archiver/archivers/nrelcambium.py

diff --git a/src/pudl_archiver/archivers/nrelcambium.py b/src/pudl_archiver/archivers/nrelcambium.py
new file mode 100644
index 00000000..19ebd9eb
--- /dev/null
+++ b/src/pudl_archiver/archivers/nrelcambium.py
@@ -0,0 +1,79 @@
+"""Download NREL Cambium Scenarios data."""
+
+import re
+
+from pudl_archiver.archivers.classes import (
+    ArchiveAwaitable,
+    ResourceInfo,
+    _download_file,
+)
+from pudl_archiver.archivers.nrelss import (
+    API_URL_FILE_DOWNLOAD,
+    API_URL_PROJECTS_LIST,
+    AbstractNrelScenarioArchiver,
+)
+from pudl_archiver.utils import retry_async
+
+
+class NrelCambiumArchiver(AbstractNrelScenarioArchiver):
+    """NREL Cambium archiver."""
+
+    name = "nrelcambium"
+    project_year_pattern = re.compile(r"Cambium (?P<year>\d{4})")
+    project_startswith = "Cambium 2022"
+    report_section = "long_description"
+    file_naming_order = ("scenario", "metric", "time_resolution", "location_type")
+
+    concurrency_limit = 1  # Cambium scenarios are large, so only handle one at a time
+
+    async def get_resources(self) -> ArchiveAwaitable:
+        """Download NREL Cambium resources.
+
+        Basic flow:
+        1. Fetch the list of projects and extract just the one for this archiver.
+        2. Pull out metadata: uuid, year, links to any PDF reports, and data files. PDF report URLs are not provided in a dedicated field in the project response, but are part of an HTML value for the description or citation in the project. Sometimes this field is simply blank, and we need to use a hard-coded exception. The data files don't have good filenames associated with them, so we make one.
+        3. Download each report and file for the project as separate resources.
+        """
+        project_records = await self.get_json(API_URL_PROJECTS_LIST)
+        scenario_project = [
+            p for p in project_records if p["name"].startswith(self.project_startswith)
+        ]
+        assert len(scenario_project) == 1
+        scenario_project = scenario_project.pop()
+        (
+            project_uuid,
+            project_year,
+            report_data,
+            file_ids,
+        ) = await self.collect_project_info(scenario_project)
+        assert project_uuid
+        for filename, url in report_data:
+            yield self.get_report_resource(filename, url)
+        for filename, file_id in file_ids:
+            yield self.get_file_resource(filename, project_uuid, file_id)
+
+    async def get_report_resource(self, filename, url) -> ResourceInfo:
+        """Retrieve and compress a PDF report and return it as ResourceInfo."""
+        self.logger.info(f"Downloading report {filename}")
+        zip_path = self.download_directory / f"{filename}.zip"
+        await self.download_and_zip_file(url, filename, zip_path)
+        return ResourceInfo(
+            local_path=zip_path,
+            partitions={},
+        )
+
+    async def get_file_resource(self, filename, uuid, file_id) -> ResourceInfo:
+        """Retrieve a data file and return it as ResourceInfo."""
+        self.logger.info(f"Downloading file {filename} {file_id} {uuid}")
+        download_path = self.download_directory / filename
+
+        await retry_async(
+            _download_file,
+            [self.session, API_URL_FILE_DOWNLOAD, download_path, True],
+            kwargs={"data": {"project_uuid": uuid, "file_ids": file_id}},
+            retry_base_s=20,
+        )
+        return ResourceInfo(
+            local_path=download_path,
+            partitions={},
+        )
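
Note on the yield pattern above: get_resources is an async generator that yields
unawaited coroutines rather than finished resources, so the caller decides how
many downloads run at once (concurrency_limit = 1 here). A minimal sketch of
that consumption pattern -- run_archiver is a hypothetical stand-in for the real
orchestrator, and its semaphore behavior is an assumption, not pudl_archiver's
actual implementation:

import asyncio

async def run_archiver(archiver, limit: int):
    """Await each coroutine yielded by get_resources, at most `limit` at a time."""
    semaphore = asyncio.Semaphore(limit)

    async def bounded(resource_coro):
        # Each yielded item is an unawaited get_report_resource / get_file_resource call.
        async with semaphore:
            return await resource_coro

    tasks = [
        asyncio.create_task(bounded(coro)) async for coro in archiver.get_resources()
    ]
    return await asyncio.gather(*tasks)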
From 139158d148a5f1a78351aab100ea178f3fcb9253 Mon Sep 17 00:00:00 2001
From: Kathryn Mazaitis
Date: Fri, 14 Feb 2025 15:21:26 -0500
Subject: [PATCH 2/4] Remove implicit limit from archiver depth search.

The old method only removed intermediate (abstract) classes to depth one.
This approach works for arbitrary depth, and the rejected set guards against
revisiting classes in diamond-shaped hierarchies, so it should not introduce
any cycle bugs.
---
 src/pudl_archiver/__init__.py | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/src/pudl_archiver/__init__.py b/src/pudl_archiver/__init__.py
index c0acfa11..2d82fd0b 100644
--- a/src/pudl_archiver/__init__.py
+++ b/src/pudl_archiver/__init__.py
@@ -31,12 +31,19 @@ def all_archivers():
     def all_subclasses(cls):
         """If a subclass has subclasses, include them in the list.
         Remove intermediaries."""
-        subclasses = set(cls.__subclasses__())
-        for c in subclasses.copy():
+        subclasses = set()
+        queue = set(cls.__subclasses__())
+        rejected = set()
+        while queue:
+            c = queue.pop()
+            if c in rejected:
+                continue
             subsubclasses = set(c.__subclasses__())
             if subsubclasses:
-                subclasses.remove(c)
-                subclasses = subclasses.union(subsubclasses)
+                rejected.add(c)
+                queue = queue.union(subsubclasses)
+            else:
+                subclasses.add(c)
         return subclasses
 
     return all_subclasses(AbstractDatasetArchiver)
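
A quick sanity check of the arbitrary-depth traversal, using a toy hierarchy.
The helper re-implements the patched loop so the snippet is self-contained;
the class names are illustrative, not part of the codebase:

def leaves_only(cls):
    # Same queue/rejected walk as the patched all_subclasses.
    subclasses, queue, rejected = set(), set(cls.__subclasses__()), set()
    while queue:
        c = queue.pop()
        if c in rejected:
            continue
        subsubclasses = set(c.__subclasses__())
        if subsubclasses:
            rejected.add(c)  # intermediate class: skip it, walk its children
            queue = queue.union(subsubclasses)
        else:
            subclasses.add(c)
    return subclasses

class Base: ...
class Mid1(Base): ...  # intermediate at depth 1
class Mid2(Mid1): ...  # intermediate at depth 2 -- the old version kept this
class Leaf(Mid2): ...  # the only concrete leaf

assert leaves_only(Base) == {Leaf}  # old depth-one version returned {Mid2}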
From 0fea46ff1b5ddf63089bd1761db7fa7d79cebe02 Mon Sep 17 00:00:00 2001
From: Kathryn Mazaitis
Date: Fri, 14 Feb 2025 15:21:33 -0500
Subject: [PATCH 3/4] Reset retry counter if the server has been working okay
 between timeouts

---
 src/pudl_archiver/utils.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/src/pudl_archiver/utils.py b/src/pudl_archiver/utils.py
index 1815ac94..0cbe072f 100644
--- a/src/pudl_archiver/utils.py
+++ b/src/pudl_archiver/utils.py
@@ -8,6 +8,7 @@
 from hashlib import md5
 from io import BytesIO
 from pathlib import Path
+from time import time
 
 import aiohttp
 from pydantic import AnyUrl, BaseModel
@@ -46,7 +47,11 @@ async def retry_async(
         args = []
     if kwargs is None:
         kwargs = {}
-    for try_count in range(1, retry_count + 1):  # noqa: RET503
+    last_failure_s = time()
+    max_delay_s = retry_base_s * 2**retry_count
+    try_count = 0
+    while try_count < retry_count:  # noqa: RET503
+        try_count += 1
         # try count is 1 indexed for logging clarity
         coro = async_func(*args, **kwargs)
         try:
@@ -54,6 +59,10 @@ async def retry_async(
         except retry_on as e:
             if try_count == retry_count:
                 raise e
+            current_failure_s = time()
+            if (current_failure_s - last_failure_s) > max_delay_s:
+                try_count = 1  # server was healthy for a while; restart the backoff
+            last_failure_s = current_failure_s
             retry_delay_s = retry_base_s * 2 ** (try_count - 1)
             logger.info(
                 f"Error while executing {coro} (try #{try_count}, retry in {retry_delay_s}s): {type(e)} - {e}"
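
A worked example of the reset rule: with the retry_base_s=20 used for Cambium
file downloads in patch 1, and an assumed retry_count of 5 (the actual default
is not shown in this diff), max_delay_s = 20 * 2**5 = 640s -- comfortably longer
than any single backoff sleep, which tops out at 20 * 2**(5 - 1) = 320s. If the
next failure arrives more than 640s after the previous one, the server must have
been answering requests normally in between, so the failure is relabeled try #1
and the backoff restarts at 20s instead of compounding toward retry_count. A
burst of consecutive failures still escalates 20s, 40s, 80s, 160s and raises
after the fifth try, exactly as before.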
p["name"].startswith(f"{self.project_startswith}{self.project_year}") ] assert len(scenario_project) == 1 scenario_project = scenario_project.pop() @@ -77,3 +79,31 @@ async def get_file_resource(self, filename, uuid, file_id) -> ResourceInfo: local_path=download_path, partitions={}, ) + + +class NrelCambium2020Archiver(AbstractNrelCambiumArchiver): + """NREL Cambium archiver for 2020.""" + + name = "nrelcambium2020" + project_year = 2020 + + +class NrelCambium2021Archiver(AbstractNrelCambiumArchiver): + """NREL Cambium archiver for 2021.""" + + name = "nrelcambium2021" + project_year = 2021 + + +class NrelCambium2022Archiver(AbstractNrelCambiumArchiver): + """NREL Cambium archiver for 2022.""" + + name = "nrelcambium2022" + project_year = 2022 + + +class NrelCambium2023Archiver(AbstractNrelCambiumArchiver): + """NREL Cambium archiver for 2023.""" + + name = "nrelcambium2023" + project_year = 2023 diff --git a/src/pudl_archiver/metadata/nrelcambium.py b/src/pudl_archiver/metadata/nrelcambium.py new file mode 100644 index 00000000..37e4a527 --- /dev/null +++ b/src/pudl_archiver/metadata/nrelcambium.py @@ -0,0 +1,40 @@ +"""NREL Cambium -specific metadata helper.""" + +from pudl.metadata.constants import CONTRIBUTORS, KEYWORDS, LICENSES + + +def nrel_cambium_generator(year): + """Generate metadata dictionaries for NREL Cambium. + + NREL Cambium datasets are too large to group together under a "years" partition, but otherwise share metadata. + """ + return { + "title": f"NREL Cambium {year}", + "path": "https://www.nrel.gov/analysis/cambium.html", + "description": ( + f"""Cambium datasets contain modeled hourly data for a range of possible futures of the U.S. electricity sector, with metrics designed to be useful for forward-looking analysis and decision support. + +Cambium is annually updated and expands on the metrics reported in NREL’s Standard Scenarios—another annually released set of projections of how the U.S. electric sector could evolve across a suite of potential futures. + +The {year} Cambium release includes two products: + +The full {year} Cambium datasets; +NREL reports describing the scenarios, defining metrics and methods, describing major changes since the last release, and discussing intended uses and limitations of the dataset.""" + ), + "source_file_dict": { + "source_format": "CSV", + }, + "working_partitions": {}, + "contributors": [ + CONTRIBUTORS["catalyst-cooperative"], + ], + "keywords": sorted( + { + "nrel", + "cambium", + } + | set(KEYWORDS["us_govt"] + KEYWORDS["electricity"]) + ), + "license_raw": LICENSES["cc-by-4.0"], + "license_pudl": LICENSES["cc-by-4.0"], + } diff --git a/src/pudl_archiver/metadata/sources.py b/src/pudl_archiver/metadata/sources.py index 38e603bb..f8eff8da 100644 --- a/src/pudl_archiver/metadata/sources.py +++ b/src/pudl_archiver/metadata/sources.py @@ -4,6 +4,8 @@ from pudl.metadata.constants import CONTRIBUTORS, KEYWORDS, LICENSES +from pudl_archiver.metadata.nrelcambium import nrel_cambium_generator + # To add a new contributor, follow the following format to add an entry to the # ADDL_CONTRIBUTORS dictionary below formatted like this: # "name-shorthand": { @@ -454,4 +456,8 @@ "license_raw": LICENSES["cc-by-4.0"], "license_pudl": LICENSES["cc-by-4.0"], }, + "nrelcambium2020": nrel_cambium_generator(2020), + "nrelcambium2021": nrel_cambium_generator(2021), + "nrelcambium2022": nrel_cambium_generator(2022), + "nrelcambium2023": nrel_cambium_generator(2023), }