-
Notifications
You must be signed in to change notification settings - Fork 0
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Bsweger/metadata download as of #23
Changes from 5 commits
980e468
ae0796b
4b0abcd
a1fe96f
5c99db0
e7c7359
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
@@ -1,9 +1,14 @@ | ||||||
"""Functions for retrieving and parsing SARS-CoV-2 phylogenic tree data.""" | ||||||
|
||||||
import subprocess | ||||||
from datetime import datetime | ||||||
from pathlib import Path | ||||||
from typing import Tuple | ||||||
|
||||||
import boto3 | ||||||
import structlog | ||||||
from botocore import UNSIGNED | ||||||
from botocore.exceptions import BotoCoreError, ClientError, NoCredentialsError | ||||||
|
||||||
logger = structlog.get_logger() | ||||||
|
||||||
|
@@ -40,3 +45,37 @@ def get_nextclade_dataset(as_of_date: str, data_path_root: str) -> str: | |||||
) | ||||||
|
||||||
return DATASET_PATH | ||||||
|
||||||
|
||||||
def get_s3_object_url(bucket_name: str, object_key: str, date: datetime) -> Tuple[str, str]: | ||||||
""" | ||||||
For a versioned, public S3 bucket and object key, return the version ID | ||||||
of the object as it existed at a specific date (UTC) | ||||||
Comment on lines
+52
to
+53
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If multiple versions are stored on the same date, will this be the first or last of those? |
||||||
""" | ||||||
try: | ||||||
s3_client = boto3.client("s3", config=boto3.session.Config(signature_version=UNSIGNED)) | ||||||
|
||||||
paginator = s3_client.get_paginator("list_object_versions") | ||||||
page_iterator = paginator.paginate(Bucket=bucket_name, Prefix=object_key) | ||||||
|
||||||
selected_version = None | ||||||
for page in page_iterator: | ||||||
for version in page.get("Versions", []): | ||||||
version_date = version["LastModified"] | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. related to question above -- is |
||||||
if version_date <= date: | ||||||
if selected_version is None or version_date > selected_version["LastModified"]: | ||||||
selected_version = version | ||||||
except (BotoCoreError, ClientError, NoCredentialsError) as e: | ||||||
logger.error("S3 client error", error=e) | ||||||
raise e | ||||||
except Exception as e: | ||||||
logger.error("Unexpected error", error=e) | ||||||
raise e | ||||||
|
||||||
if selected_version is None: | ||||||
raise ValueError(f"No version of {object_key} found before {date}") | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. actually, i'm not sure about this |
||||||
|
||||||
version_id = selected_version["VersionId"] | ||||||
version_url = f"https://{bucket_name}.s3.amazonaws.com/{object_key}?versionId={version_id}" | ||||||
|
||||||
return version_id, version_url |
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -4,11 +4,14 @@ | |||||
import lzma | ||||||
import time | ||||||
import zipfile | ||||||
from datetime import datetime, timezone | ||||||
from pathlib import Path | ||||||
|
||||||
import polars as pl | ||||||
import structlog | ||||||
import us | ||||||
import us # type: ignore | ||||||
from requests import Session | ||||||
from virus_clade_utils.util.reference import get_s3_object_url | ||||||
from virus_clade_utils.util.session import check_response, get_session | ||||||
|
||||||
logger = structlog.get_logger() | ||||||
|
@@ -62,18 +65,26 @@ def get_covid_genome_data(released_since_date: str, base_url: str, filename: str | |||||
logger.info("NCBI API call completed", elapsed=elapsed) | ||||||
|
||||||
|
||||||
def download_covid_genome_metadata(url: str, data_path: Path, use_existing: bool = False) -> Path: | ||||||
def download_covid_genome_metadata( | ||||||
session: Session, bucket: str, key: str, data_path: Path, as_of: str | None = None, use_existing: bool = False | ||||||
) -> Path: | ||||||
"""Download the latest GenBank genome metadata data from Nextstrain.""" | ||||||
|
||||||
session = get_session() | ||||||
filename = data_path / Path(url).name | ||||||
if as_of is None: | ||||||
as_of_datetime = datetime.now().replace(tzinfo=timezone.utc) | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I wonder if we need to add 1 day (or 23 hours, 59 minutes, 59 seconds?) here to ensure that we get the latest available as of right now. I think this returns midnight (first second of the day) of today, which may be prior to a Nextstrain data run that happened at e.g. 2am UTC? |
||||||
else: | ||||||
as_of_datetime = datetime.strptime(as_of, "%Y-%m-%d").replace(tzinfo=timezone.utc) | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. similar -- do we want midnight of that day, or just before midnight of the next day? |
||||||
|
||||||
(s3_version, s3_url) = get_s3_object_url(bucket, key, as_of_datetime) | ||||||
filename = data_path / f"{as_of_datetime.date().strftime("%Y-%m-%d")}-{Path(key).name}" | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could we just use the provided
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good suggestion--I'll hold off, given that we're planning to rejig things and start passing timestamps around! |
||||||
|
||||||
if use_existing and filename.exists(): | ||||||
logger.info("using existing genome metadata file", metadata_file=str(filename)) | ||||||
return filename | ||||||
|
||||||
start = time.perf_counter() | ||||||
with session.get(url, stream=True) as result: | ||||||
logger.info("starting genome metadata download", source=s3_url, destination=str(filename)) | ||||||
with session.get(s3_url, stream=True) as result: | ||||||
result.raise_for_status() | ||||||
with open(filename, "wb") as f: | ||||||
for chunk in result.iter_content(chunk_size=None): | ||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
import boto3 | ||
import pytest | ||
import requests | ||
from freezegun import freeze_time | ||
from moto import mock_aws | ||
|
||
|
||
@pytest.fixture | ||
def mock_session(mocker): | ||
"""Session mock for testing functions that use requests.Session""" | ||
mock_session = mocker.patch.object(requests, "Session", autospec=True) | ||
mock_session.return_value.__enter__.return_value = mock_session | ||
return mock_session | ||
|
||
|
||
@pytest.fixture | ||
def s3_setup(): | ||
"""Setup mock S3 bucket with versioned objects.""" | ||
with mock_aws(): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. honestly, this is pretty dazzling!! ✨ There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks--I fixed up the dependencies so the tests, you know, actually run! |
||
bucket_name = "versioned-bucket" | ||
object_key = "metadata/object-key/metadata.tsv.zst" | ||
|
||
s3_client = boto3.client("s3", region_name="us-east-1") | ||
s3_client.create_bucket(Bucket=bucket_name) | ||
s3_client.put_bucket_versioning(Bucket=bucket_name, VersioningConfiguration={"Status": "Enabled"}) | ||
|
||
# Upload multiple versions of the object | ||
versions = [ | ||
("2023-01-01 03:05:01", "object version 1"), | ||
("2023-02-05 14:33:06", "object version 2"), | ||
("2023-03-22 22:55:12", "object version 3"), | ||
] | ||
|
||
for version_date, content in versions: | ||
# use freezegun to override system date, which in | ||
# turn sets S3 object version LastModified date | ||
with freeze_time(version_date): | ||
s3_client.put_object( | ||
Bucket=bucket_name, | ||
Key=object_key, | ||
Body=content, | ||
) | ||
yield s3_client, bucket_name, object_key |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Rather than download the metadata file from the URL listed on Nextstrain's website, we'll download the file via an S3 https link.