Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bsweger/metadata download as of #23

Merged
merged 6 commits into from
Sep 27, 2024
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ license = {text = "MIT"}

dependencies = [
"awscli>=1.32.92",
"boto3",
"click",
"cloudpathlib",
"pandas",
Expand All @@ -29,6 +30,8 @@ dependencies = [
[project.optional-dependencies]
dev = [
"coverage",
"freezegun",
"moto",
"mypy",
"pytest",
"ruff",
Expand Down
48 changes: 44 additions & 4 deletions requirements/requirements-dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,20 @@
# uv pip compile pyproject.toml --extra dev -o requirements/requirements-dev.txt
awscli==1.32.116
# via virus-clade-utils (pyproject.toml)
boto3==1.34.116
# via
# virus-clade-utils (pyproject.toml)
# moto
botocore==1.34.116
# via
# awscli
# boto3
# moto
# s3transfer
certifi==2024.2.2
# via requests
cffi==1.17.1
# via cryptography
charset-normalizer==3.3.2
# via requests
click==8.1.7
Expand All @@ -20,20 +28,34 @@ colorama==0.4.6
# via awscli
coverage==7.5.3
# via virus-clade-utils (pyproject.toml)
cryptography==43.0.1
# via moto
docutils==0.16
# via awscli
freezegun==1.5.1
# via virus-clade-utils (pyproject.toml)
idna==3.7
# via requests
iniconfig==2.0.0
# via pytest
jellyfish==1.1.0
# via us
jinja2==3.1.4
# via moto
jmespath==1.0.1
# via botocore
# via
# boto3
# botocore
markdown-it-py==3.0.0
# via rich
markupsafe==2.1.5
# via
# jinja2
# werkzeug
mdurl==0.1.2
# via markdown-it-py
moto==5.0.15
# via virus-clade-utils (pyproject.toml)
mypy==1.10.1
# via virus-clade-utils (pyproject.toml)
mypy-extensions==1.0.0
Expand All @@ -54,20 +76,31 @@ pyarrow==16.1.0
# via virus-clade-utils (pyproject.toml)
pyasn1==0.6.0
# via rsa
pycparser==2.22
# via cffi
pygments==2.18.0
# via rich
pytest==8.2.1
# via virus-clade-utils (pyproject.toml)
python-dateutil==2.9.0.post0
# via
# botocore
# freezegun
# moto
# pandas
pytz==2024.1
# via pandas
pyyaml==6.0.1
# via awscli
# via
# awscli
# responses
requests==2.32.3
# via virus-clade-utils (pyproject.toml)
# via
# virus-clade-utils (pyproject.toml)
# moto
# responses
responses==0.25.3
# via moto
rich==13.7.1
# via
# virus-clade-utils (pyproject.toml)
Expand All @@ -79,7 +112,9 @@ rsa==4.7.2
ruff==0.5.0
# via virus-clade-utils (pyproject.toml)
s3transfer==0.10.1
# via awscli
# via
# awscli
# boto3
six==1.16.0
# via python-dateutil
structlog==24.2.0
Expand All @@ -94,5 +129,10 @@ urllib3==2.2.1
# via
# botocore
# requests
# responses
us==3.2.0
# via virus-clade-utils (pyproject.toml)
werkzeug==3.0.4
# via moto
xmltodict==0.13.0
# via moto
11 changes: 9 additions & 2 deletions requirements/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,12 @@
# uv pip compile pyproject.toml -o requirements/requirements.txt
awscli==1.32.97
# via virus-clade-utils (pyproject.toml)
boto3==1.34.97
# via virus-clade-utils (pyproject.toml)
botocore==1.34.97
# via
# awscli
# boto3
# s3transfer
certifi==2024.2.2
# via requests
Expand All @@ -25,7 +28,9 @@ idna==3.7
jellyfish==1.1.0
# via us
jmespath==1.0.1
# via botocore
# via
# boto3
# botocore
markdown-it-py==3.0.0
# via rich
mdurl==0.1.2
Expand Down Expand Up @@ -63,7 +68,9 @@ rich-click==1.8.0
rsa==4.7.2
# via awscli
s3transfer==0.10.1
# via awscli
# via
# awscli
# boto3
six==1.16.0
# via python-dateutil
structlog==24.1.0
Expand Down
17 changes: 12 additions & 5 deletions src/virus_clade_utils/get_clade_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
get_clade_counts,
get_covid_genome_metadata,
)
from virus_clade_utils.util.session import get_session

logger = structlog.get_logger()

Expand Down Expand Up @@ -68,7 +69,8 @@ def get_clades(clade_counts: pl.LazyFrame, threshold: float, threshold_weeks: in

# FIXME: provide ability to instantiate Config for the get_clade_list function and get the data_path from there
def main(
genome_metadata_path: AnyPath = Config.nextstrain_latest_genome_metadata,
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Rather than download the metadata file from the URL listed on Nextstrain's website, we'll download the file via an S3 https link.

genome_metadata_bucket: str = Config.nextstrain_ncov_bucket,
genome_metadata_key: str = Config.nextstrain_genome_metadata_key,
data_dir: AnyPath = AnyPath(".").home() / "covid_variant",
threshold: float = 0.01,
threshold_weeks: int = 3,
Expand All @@ -79,8 +81,11 @@ def main(

Parameters
----------
genome_metadata_path : AnyPath
Path to location of the most recent genome metadata file published by Nextstrain
genome_metadata_bucket : str
Name of the S3 bucket that hosts Nextstrain's open (GenBank) genome
metadata files published by the ncov pipeline.
genome_metdata_key : str
S3 key of the Nextstrain genome metadata file.
data_dir : AnyPath
Path to the location where the genome metadata file is saved after download.
clade_counts : polars.LazyFrame
Expand All @@ -97,10 +102,12 @@ def main(
-------
list of strings
"""

os.makedirs(data_dir, exist_ok=True)
session = get_session()
genome_metadata_path = download_covid_genome_metadata(
genome_metadata_path,
session,
genome_metadata_bucket,
genome_metadata_key,
data_dir,
)
lf_metadata = get_covid_genome_metadata(genome_metadata_path)
Expand Down
7 changes: 4 additions & 3 deletions src/virus_clade_utils/util/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@

@dataclass
class Config:
data_path_root: InitVar[AnyPath]
sequence_released_date: InitVar[datetime]
reference_tree_as_of_date: InitVar[datetime]
data_path_root: InitVar[str] = AnyPath(".")
sequence_released_since_date: str = None
reference_tree_date: str = None
now = datetime.now()
Expand All @@ -18,7 +18,8 @@ class Config:
ncbi_package_name: str = "ncbi.zip"
ncbi_sequence_file: AnyPath = None
ncbi_sequence_metadata_file: AnyPath = None
nextstrain_latest_genome_metadata = "https://data.nextstrain.org/files/ncov/open/metadata.tsv.zst"
nextstrain_ncov_bucket = "nextstrain-data"
nextstrain_genome_metadata_key = "files/ncov/open/metadata.tsv.zst"
nextclade_base_url: str = "https://nextstrain.org/nextclade/sars-cov-2"
reference_tree_file: AnyPath = None
root_sequence_file: AnyPath = None
Expand All @@ -28,9 +29,9 @@ class Config:

def __post_init__(
self,
data_path_root: str | None,
sequence_released_date: datetime.date,
reference_tree_as_of_date: datetime.date,
data_path_root: str | None,
):
if data_path_root:
self.data_path = AnyPath(data_path_root)
Expand Down
39 changes: 39 additions & 0 deletions src/virus_clade_utils/util/reference.py
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

reference.py is no longer a sensible name for this file, but I chose not to pull that thread as part of this update

Original file line number Diff line number Diff line change
@@ -1,9 +1,14 @@
"""Functions for retrieving and parsing SARS-CoV-2 phylogenic tree data."""

import subprocess
from datetime import datetime
from pathlib import Path
from typing import Tuple

import boto3
import structlog
from botocore import UNSIGNED
from botocore.exceptions import BotoCoreError, ClientError, NoCredentialsError

logger = structlog.get_logger()

Expand Down Expand Up @@ -40,3 +45,37 @@ def get_nextclade_dataset(as_of_date: str, data_path_root: str) -> str:
)

return DATASET_PATH


def get_s3_object_url(bucket_name: str, object_key: str, date: datetime) -> Tuple[str, str]:
"""
For a versioned, public S3 bucket and object key, return the version ID
of the object as it existed at a specific date (UTC)
Comment on lines +52 to +53
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If multiple versions are stored on the same date, will this be the first or last of those?

"""
try:
s3_client = boto3.client("s3", config=boto3.session.Config(signature_version=UNSIGNED))

paginator = s3_client.get_paginator("list_object_versions")
page_iterator = paginator.paginate(Bucket=bucket_name, Prefix=object_key)

selected_version = None
for page in page_iterator:
for version in page.get("Versions", []):
version_date = version["LastModified"]
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

related to question above -- is version_date a date or a datetime?

if version_date <= date:
if selected_version is None or version_date > selected_version["LastModified"]:
selected_version = version
except (BotoCoreError, ClientError, NoCredentialsError) as e:
logger.error("S3 client error", error=e)
raise e
except Exception as e:
logger.error("Unexpected error", error=e)
raise e

if selected_version is None:
raise ValueError(f"No version of {object_key} found before {date}")
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
raise ValueError(f"No version of {object_key} found before {date}")
raise ValueError(f"No version of {object_key} found on or before {date}")

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

actually, i'm not sure about this


version_id = selected_version["VersionId"]
version_url = f"https://{bucket_name}.s3.amazonaws.com/{object_key}?versionId={version_id}"

return version_id, version_url
21 changes: 16 additions & 5 deletions src/virus_clade_utils/util/sequence.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,14 @@
import lzma
import time
import zipfile
from datetime import datetime, timezone
from pathlib import Path

import polars as pl
import structlog
import us
import us # type: ignore
from requests import Session
from virus_clade_utils.util.reference import get_s3_object_url
from virus_clade_utils.util.session import check_response, get_session

logger = structlog.get_logger()
Expand Down Expand Up @@ -62,18 +65,26 @@ def get_covid_genome_data(released_since_date: str, base_url: str, filename: str
logger.info("NCBI API call completed", elapsed=elapsed)


def download_covid_genome_metadata(url: str, data_path: Path, use_existing: bool = False) -> Path:
def download_covid_genome_metadata(
session: Session, bucket: str, key: str, data_path: Path, as_of: str | None = None, use_existing: bool = False
) -> Path:
"""Download the latest GenBank genome metadata data from Nextstrain."""

session = get_session()
filename = data_path / Path(url).name
if as_of is None:
as_of_datetime = datetime.now().replace(tzinfo=timezone.utc)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder if we need to add 1 day (or 23 hours, 59 minutes, 59 seconds?) here to ensure that we get the latest available as of right now. I think this returns midnight (first second of the day) of today, which may be prior to a Nextstrain data run that happened at e.g. 2am UTC?

else:
as_of_datetime = datetime.strptime(as_of, "%Y-%m-%d").replace(tzinfo=timezone.utc)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

similar -- do we want midnight of that day, or just before midnight of the next day?


(s3_version, s3_url) = get_s3_object_url(bucket, key, as_of_datetime)
filename = data_path / f"{as_of_datetime.date().strftime("%Y-%m-%d")}-{Path(key).name}"
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could we just use the provided as_of string here?

Suggested change
filename = data_path / f"{as_of_datetime.date().strftime("%Y-%m-%d")}-{Path(key).name}"
filename = data_path / f"{as_of}-{Path(key).name}"

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good suggestion--I'll hold off, given that we're planning to rejig things and start passing timestamps around!


if use_existing and filename.exists():
logger.info("using existing genome metadata file", metadata_file=str(filename))
return filename

start = time.perf_counter()
with session.get(url, stream=True) as result:
logger.info("starting genome metadata download", source=s3_url, destination=str(filename))
with session.get(s3_url, stream=True) as result:
result.raise_for_status()
with open(filename, "wb") as f:
for chunk in result.iter_content(chunk_size=None):
Expand Down
43 changes: 43 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import boto3
import pytest
import requests
from freezegun import freeze_time
from moto import mock_aws


@pytest.fixture
def mock_session(mocker):
"""Session mock for testing functions that use requests.Session"""
mock_session = mocker.patch.object(requests, "Session", autospec=True)
mock_session.return_value.__enter__.return_value = mock_session
return mock_session


@pytest.fixture
def s3_setup():
"""Setup mock S3 bucket with versioned objects."""
with mock_aws():
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

honestly, this is pretty dazzling!! ✨

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks--I fixed up the dependencies so the tests, you know, actually run!

bucket_name = "versioned-bucket"
object_key = "metadata/object-key/metadata.tsv.zst"

s3_client = boto3.client("s3", region_name="us-east-1")
s3_client.create_bucket(Bucket=bucket_name)
s3_client.put_bucket_versioning(Bucket=bucket_name, VersioningConfiguration={"Status": "Enabled"})

# Upload multiple versions of the object
versions = [
("2023-01-01 03:05:01", "object version 1"),
("2023-02-05 14:33:06", "object version 2"),
("2023-03-22 22:55:12", "object version 3"),
]

for version_date, content in versions:
# use freezegun to override system date, which in
# turn sets S3 object version LastModified date
with freeze_time(version_date):
s3_client.put_object(
Bucket=bucket_name,
Key=object_key,
Body=content,
)
yield s3_client, bucket_name, object_key
2 changes: 1 addition & 1 deletion tests/unit/test_get_clade_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,6 @@ def test_clade_list(test_file_path, tmp_path, threshold, weeks, max_clades, expe
mock = MagicMock(return_value=test_genome_metadata, name="genome_metadata_download_mock")

with patch("virus_clade_utils.get_clade_list.download_covid_genome_metadata", mock):
actual_list = main(test_genome_metadata, tmp_path, threshold, weeks, max_clades)
actual_list = main("some_bucket", "some_key", tmp_path, threshold, weeks, max_clades)

assert set(expected_list) == set(actual_list)
Loading
Loading