reichlab · bsweger · Sep 27, 2024 · Sep 25, 2024 · Sep 25, 2024 · Sep 26, 2024
diff --git a/pyproject.toml b/pyproject.toml
@@ -14,6 +14,7 @@ license = {text = "MIT"}
 
 dependencies = [
     "awscli>=1.32.92",
+    "boto3",
     "click",
     "cloudpathlib",
     "pandas",
@@ -29,6 +30,8 @@ dependencies = [
 [project.optional-dependencies]
 dev = [
     "coverage",
+    "freezegun",
+    "moto",
     "mypy",
     "pytest",
     "ruff",

diff --git a/requirements/requirements-dev.txt b/requirements/requirements-dev.txt
@@ -2,12 +2,20 @@
 #    uv pip compile pyproject.toml --extra dev -o requirements/requirements-dev.txt
 awscli==1.32.116
     # via virus-clade-utils (pyproject.toml)
+boto3==1.34.116
+    # via
+    #   virus-clade-utils (pyproject.toml)
+    #   moto
 botocore==1.34.116
     # via
     #   awscli
+    #   boto3
+    #   moto
     #   s3transfer
 certifi==2024.2.2
     # via requests
+cffi==1.17.1
+    # via cryptography
 charset-normalizer==3.3.2
     # via requests
 click==8.1.7
@@ -20,20 +28,34 @@ colorama==0.4.6
     # via awscli
 coverage==7.5.3
     # via virus-clade-utils (pyproject.toml)
+cryptography==43.0.1
+    # via moto
 docutils==0.16
     # via awscli
+freezegun==1.5.1
+    # via virus-clade-utils (pyproject.toml)
 idna==3.7
     # via requests
 iniconfig==2.0.0
     # via pytest
 jellyfish==1.1.0
     # via us
+jinja2==3.1.4
+    # via moto
 jmespath==1.0.1
-    # via botocore
+    # via
+    #   boto3
+    #   botocore
 markdown-it-py==3.0.0
     # via rich
+markupsafe==2.1.5
+    # via
+    #   jinja2
+    #   werkzeug
 mdurl==0.1.2
     # via markdown-it-py
+moto==5.0.15
+    # via virus-clade-utils (pyproject.toml)
 mypy==1.10.1
     # via virus-clade-utils (pyproject.toml)
 mypy-extensions==1.0.0
@@ -54,20 +76,31 @@ pyarrow==16.1.0
     # via virus-clade-utils (pyproject.toml)
 pyasn1==0.6.0
     # via rsa
+pycparser==2.22
+    # via cffi
 pygments==2.18.0
     # via rich
 pytest==8.2.1
     # via virus-clade-utils (pyproject.toml)
 python-dateutil==2.9.0.post0
     # via
     #   botocore
+    #   freezegun
+    #   moto
     #   pandas
 pytz==2024.1
     # via pandas
 pyyaml==6.0.1
-    # via awscli
+    # via
+    #   awscli
+    #   responses
 requests==2.32.3
-    # via virus-clade-utils (pyproject.toml)
+    # via
+    #   virus-clade-utils (pyproject.toml)
+    #   moto
+    #   responses
+responses==0.25.3
+    # via moto
 rich==13.7.1
     # via
     #   virus-clade-utils (pyproject.toml)
@@ -79,7 +112,9 @@ rsa==4.7.2
 ruff==0.5.0
     # via virus-clade-utils (pyproject.toml)
 s3transfer==0.10.1
-    # via awscli
+    # via
+    #   awscli
+    #   boto3
 six==1.16.0
     # via python-dateutil
 structlog==24.2.0
@@ -94,5 +129,10 @@ urllib3==2.2.1
     # via
     #   botocore
     #   requests
+    #   responses
 us==3.2.0
     # via virus-clade-utils (pyproject.toml)
+werkzeug==3.0.4
+    # via moto
+xmltodict==0.13.0
+    # via moto
diff --git a/requirements/requirements.txt b/requirements/requirements.txt
@@ -2,9 +2,12 @@
 #    uv pip compile pyproject.toml -o requirements/requirements.txt
 awscli==1.32.97
     # via virus-clade-utils (pyproject.toml)
+boto3==1.34.97
+    # via virus-clade-utils (pyproject.toml)
 botocore==1.34.97
     # via
     #   awscli
+    #   boto3
     #   s3transfer
 certifi==2024.2.2
     # via requests
@@ -25,7 +28,9 @@ idna==3.7
 jellyfish==1.1.0
     # via us
 jmespath==1.0.1
-    # via botocore
+    # via
+    #   boto3
+    #   botocore
 markdown-it-py==3.0.0
     # via rich
 mdurl==0.1.2
@@ -63,7 +68,9 @@ rich-click==1.8.0
 rsa==4.7.2
     # via awscli
 s3transfer==0.10.1
-    # via awscli
+    # via
+    #   awscli
+    #   boto3
 six==1.16.0
     # via python-dateutil
 structlog==24.1.0

diff --git a/src/virus_clade_utils/get_clade_list.py b/src/virus_clade_utils/get_clade_list.py
@@ -15,6 +15,7 @@
     get_clade_counts,
     get_covid_genome_metadata,
 )
+from virus_clade_utils.util.session import get_session
 
 logger = structlog.get_logger()
 
@@ -68,7 +69,8 @@ def get_clades(clade_counts: pl.LazyFrame, threshold: float, threshold_weeks: in
 
 # FIXME: provide ability to instantiate Config for the get_clade_list function and get the data_path from there
 def main(
-    genome_metadata_path: AnyPath = Config.nextstrain_latest_genome_metadata,
+    genome_metadata_bucket: str = Config.nextstrain_ncov_bucket,
+    genome_metadata_key: str = Config.nextstrain_genome_metadata_key,
     data_dir: AnyPath = AnyPath(".").home() / "covid_variant",
     threshold: float = 0.01,
     threshold_weeks: int = 3,
@@ -79,8 +81,11 @@ def main(
 
     Parameters
     ----------
-    genome_metadata_path : AnyPath
-        Path to location of the most recent genome metadata file published by Nextstrain
+    genome_metadata_bucket : str
+        Name of the S3 bucket that hosts Nextstrain's open (GenBank) genome
+        metadata files published by the ncov pipeline.
+    genome_metdata_key : str
+        S3 key of the Nextstrain genome metadata file.
     data_dir : AnyPath
         Path to the location where the genome metadata file is saved after download.
     clade_counts : polars.LazyFrame
@@ -97,10 +102,12 @@ def main(
     -------
     list of strings
     """
-
     os.makedirs(data_dir, exist_ok=True)
+    session = get_session()
     genome_metadata_path = download_covid_genome_metadata(
-        genome_metadata_path,
+        session,
+        genome_metadata_bucket,
+        genome_metadata_key,
         data_dir,
     )
     lf_metadata = get_covid_genome_metadata(genome_metadata_path)

diff --git a/src/virus_clade_utils/util/config.py b/src/virus_clade_utils/util/config.py
@@ -7,9 +7,9 @@
 
 @dataclass
 class Config:
-    data_path_root: InitVar[AnyPath]
     sequence_released_date: InitVar[datetime]
     reference_tree_as_of_date: InitVar[datetime]
+    data_path_root: InitVar[str] = AnyPath(".")
     sequence_released_since_date: str = None
     reference_tree_date: str = None
     now = datetime.now()
@@ -18,7 +18,8 @@ class Config:
     ncbi_package_name: str = "ncbi.zip"
     ncbi_sequence_file: AnyPath = None
     ncbi_sequence_metadata_file: AnyPath = None
-    nextstrain_latest_genome_metadata = "https://data.nextstrain.org/files/ncov/open/metadata.tsv.zst"
+    nextstrain_ncov_bucket = "nextstrain-data"
+    nextstrain_genome_metadata_key = "files/ncov/open/metadata.tsv.zst"
     nextclade_base_url: str = "https://nextstrain.org/nextclade/sars-cov-2"
     reference_tree_file: AnyPath = None
     root_sequence_file: AnyPath = None
@@ -28,9 +29,9 @@ class Config:
 
     def __post_init__(
         self,
-        data_path_root: str | None,
         sequence_released_date: datetime.date,
         reference_tree_as_of_date: datetime.date,
+        data_path_root: str | None,
     ):
         if data_path_root:
             self.data_path = AnyPath(data_path_root)

diff --git a/src/virus_clade_utils/util/reference.py b/src/virus_clade_utils/util/reference.py
@@ -1,9 +1,14 @@
 """Functions for retrieving and parsing SARS-CoV-2 phylogenic tree data."""
 
 import subprocess
+from datetime import datetime
 from pathlib import Path
+from typing import Tuple
 
+import boto3
 import structlog
+from botocore import UNSIGNED
+from botocore.exceptions import BotoCoreError, ClientError, NoCredentialsError
 
 logger = structlog.get_logger()
 
@@ -40,3 +45,37 @@ def get_nextclade_dataset(as_of_date: str, data_path_root: str) -> str:
     )
 
     return DATASET_PATH
+
+
+def get_s3_object_url(bucket_name: str, object_key: str, date: datetime) -> Tuple[str, str]:
+    """
+    For a versioned, public S3 bucket and object key, return the version ID
+    of the object as it existed at a specific date (UTC)
+    """
+    try:
+        s3_client = boto3.client("s3", config=boto3.session.Config(signature_version=UNSIGNED))
+
+        paginator = s3_client.get_paginator("list_object_versions")
+        page_iterator = paginator.paginate(Bucket=bucket_name, Prefix=object_key)
+
+        selected_version = None
+        for page in page_iterator:
+            for version in page.get("Versions", []):
+                version_date = version["LastModified"]
+                if version_date <= date:
+                    if selected_version is None or version_date > selected_version["LastModified"]:
+                        selected_version = version
+    except (BotoCoreError, ClientError, NoCredentialsError) as e:
+        logger.error("S3 client error", error=e)
+        raise e
+    except Exception as e:
+        logger.error("Unexpected error", error=e)
+        raise e
+
+    if selected_version is None:
+        raise ValueError(f"No version of {object_key} found before {date}")
-        raise ValueError(f"No version of {object_key} found before {date}")
+        raise ValueError(f"No version of {object_key} found on or before {date}")
-        raise ValueError(f"No version of {object_key} found before {date}")
+        raise ValueError(f"No version of {object_key} found on or before {date}")
+
+    version_id = selected_version["VersionId"]
+    version_url = f"https://{bucket_name}.s3.amazonaws.com/{object_key}?versionId={version_id}"
+
+    return version_id, version_url
diff --git a/src/virus_clade_utils/util/sequence.py b/src/virus_clade_utils/util/sequence.py
@@ -4,11 +4,14 @@
 import lzma
 import time
 import zipfile
+from datetime import datetime, timezone
 from pathlib import Path
 
 import polars as pl
 import structlog
-import us
+import us  # type: ignore
+from requests import Session
+from virus_clade_utils.util.reference import get_s3_object_url
 from virus_clade_utils.util.session import check_response, get_session
 
 logger = structlog.get_logger()
@@ -62,18 +65,26 @@ def get_covid_genome_data(released_since_date: str, base_url: str, filename: str
     logger.info("NCBI API call completed", elapsed=elapsed)
 
 
-def download_covid_genome_metadata(url: str, data_path: Path, use_existing: bool = False) -> Path:
+def download_covid_genome_metadata(
+    session: Session, bucket: str, key: str, data_path: Path, as_of: str | None = None, use_existing: bool = False
+) -> Path:
     """Download the latest GenBank genome metadata data from Nextstrain."""
 
-    session = get_session()
-    filename = data_path / Path(url).name
+    if as_of is None:
+        as_of_datetime = datetime.now().replace(tzinfo=timezone.utc)
+    else:
+        as_of_datetime = datetime.strptime(as_of, "%Y-%m-%d").replace(tzinfo=timezone.utc)
+
+    (s3_version, s3_url) = get_s3_object_url(bucket, key, as_of_datetime)
+    filename = data_path / f"{as_of_datetime.date().strftime("%Y-%m-%d")}-{Path(key).name}"
-    filename = data_path / f"{as_of_datetime.date().strftime("%Y-%m-%d")}-{Path(key).name}"
+    filename = data_path / f"{as_of}-{Path(key).name}"
-    filename = data_path / f"{as_of_datetime.date().strftime("%Y-%m-%d")}-{Path(key).name}"
+    filename = data_path / f"{as_of}-{Path(key).name}"
 
     if use_existing and filename.exists():
         logger.info("using existing genome metadata file", metadata_file=str(filename))
         return filename
 
     start = time.perf_counter()
-    with session.get(url, stream=True) as result:
+    logger.info("starting genome metadata download", source=s3_url, destination=str(filename))
+    with session.get(s3_url, stream=True) as result:
         result.raise_for_status()
         with open(filename, "wb") as f:
             for chunk in result.iter_content(chunk_size=None):

diff --git a/tests/conftest.py b/tests/conftest.py
@@ -0,0 +1,43 @@
+import boto3
+import pytest
+import requests
+from freezegun import freeze_time
+from moto import mock_aws
+
+
+@pytest.fixture
+def mock_session(mocker):
+    """Session mock for testing functions that use requests.Session"""
+    mock_session = mocker.patch.object(requests, "Session", autospec=True)
+    mock_session.return_value.__enter__.return_value = mock_session
+    return mock_session
+
+
+@pytest.fixture
+def s3_setup():
+    """Setup mock S3 bucket with versioned objects."""
+    with mock_aws():
+        bucket_name = "versioned-bucket"
+        object_key = "metadata/object-key/metadata.tsv.zst"
+
+        s3_client = boto3.client("s3", region_name="us-east-1")
+        s3_client.create_bucket(Bucket=bucket_name)
+        s3_client.put_bucket_versioning(Bucket=bucket_name, VersioningConfiguration={"Status": "Enabled"})
+
+        # Upload multiple versions of the object
+        versions = [
+            ("2023-01-01 03:05:01", "object version 1"),
+            ("2023-02-05 14:33:06", "object version 2"),
+            ("2023-03-22 22:55:12", "object version 3"),
+        ]
+
+        for version_date, content in versions:
+            # use freezegun to override system date, which in
+            # turn sets S3 object version LastModified date
+            with freeze_time(version_date):
+                s3_client.put_object(
+                    Bucket=bucket_name,
+                    Key=object_key,
+                    Body=content,
+                )
+        yield s3_client, bucket_name, object_key
diff --git a/tests/unit/test_get_clade_list.py b/tests/unit/test_get_clade_list.py
@@ -34,6 +34,6 @@ def test_clade_list(test_file_path, tmp_path, threshold, weeks, max_clades, expe
     mock = MagicMock(return_value=test_genome_metadata, name="genome_metadata_download_mock")
 
     with patch("virus_clade_utils.get_clade_list.download_covid_genome_metadata", mock):
-        actual_list = main(test_genome_metadata, tmp_path, threshold, weeks, max_clades)
+        actual_list = main("some_bucket", "some_key", tmp_path, threshold, weeks, max_clades)
 
     assert set(expected_list) == set(actual_list)