Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bsweger/get clades list #18

Merged
merged 3 commits into from
Sep 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
/data/*/*.md
/data/*/*.tsv
/data/*/*.zip
/data/*/*.zst
/data/*/*.xz


# Byte-compiled / optimized / DLL files
Expand Down
25 changes: 24 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,12 @@ This package was developed to provide data required for the [COVID-19 Variant No

We are releasing `virus-clade-utils` as a standalone package for use by others who may find the functionality useful.


## Usage

TODO: Actual documentation


## Docker Setup

Use the directions below to run the pipeline in a Docker container.
Expand All @@ -32,7 +36,7 @@ Use the directions below to run the pipeline in a Docker container.
docker build --platform=linux/amd64 -t virus-clade-utils .
```

3. Run the pipeline, passing in required arguments:
3. To run the target data pipeline, pass in the required arguments:

```bash
docker run --platform linux/amd64 \
Expand All @@ -45,6 +49,25 @@ Use the directions below to run the pipeline in a Docker container.

The clade assignments will now be in the local directory that was mounted to the Docker container via the `-v` flag (in this case, a folder called `data` in the current working directory).


### Generating the clade list

[This will evolve; below are some temporary instructions for anyone who wants to try this via Docker]

1. Enter the container's bash shell:

```bash
docker run --platform linux/amd64 -it --entrypoint bash virus-clade-utils
```

2. Once you're in the shell of the container:

```bash
clade_list
```

**Note:** Sometimes this results in a "Killed" message from Docker due to memory constraints (it depends on the host machine, and we'll need to look into this).

### Running the test suite

To run the test suite in the Docker container (built above):
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ dev = [

[project.entry-points."console_scripts"]
assign_clades = "virus_clade_utils.assign_clades:main"
clade_list = "virus_clade_utils.get_clade_list:main"

[build-system]
requires = ["setuptools>=64", "wheel"]
Expand Down
115 changes: 115 additions & 0 deletions src/virus_clade_utils/get_clade_list.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
"""Get a list of SARS-CoV-2 clades."""

import os
import time
from datetime import timedelta

import polars as pl
import structlog
from cloudpathlib import AnyPath

from virus_clade_utils.util.config import Config
from virus_clade_utils.util.sequence import (
download_covid_genome_metadata,
filter_covid_genome_metadata,
get_clade_counts,
get_covid_genome_metadata,
)

logger = structlog.get_logger()


def get_clades(clade_counts: pl.LazyFrame, threshold: float, threshold_weeks: int, max_clades: int) -> list[str]:
    """
    Return a list of clade names to forecast.

    Clades whose weekly share of reported sequences exceeded ``threshold``
    during the past ``threshold_weeks`` weeks are selected. If more than
    ``max_clades`` qualify, the clades with the largest total counts over the
    window are kept (ties broken alphabetically).
    """
    timer_start = time.perf_counter()

    # Anchor the lookback window on the Sunday that started the week
    # threshold_weeks weeks before the data's most recent date (the current,
    # partial week is excluded).
    latest_date = clade_counts.select(pl.max("date")).collect().item()
    window_start = latest_date - timedelta(days=latest_date.weekday() + 7 * threshold_weeks)

    # Aggregate counts into Sunday-based weeks, pooling all locations and
    # restricting to the lookback window.
    weekly_counts = (
        clade_counts.filter(pl.col("date") >= window_start)
        .sort("date")
        .group_by_dynamic("date", every="1w", start_by="sunday", group_by="clade")
        .agg(pl.col("count").sum())
    )

    # Total sequences reported per week, across all clades.
    weekly_totals = weekly_counts.group_by("date").agg(pl.col("count").sum().alias("total_count"))

    # Each clade's share of its week's total.
    weekly_proportions = weekly_counts.join(weekly_totals, on="date").with_columns(
        (pl.col("count") / pl.col("total_count")).alias("proportion")
    )

    # Clades that crossed the threshold in at least one week of the window.
    selected = weekly_proportions.filter(pl.col("proportion") > threshold).select("clade").unique().collect()

    # Too many qualifying clades: rank by total count over the window instead,
    # breaking ties alphabetically, and truncate below.
    if len(selected) > max_clades:
        selected = (
            weekly_proportions.group_by("clade")
            .agg(pl.col("count").sum())
            .sort("count", "clade", descending=[True, False])
            .collect()
        )

    clade_list = selected.get_column("clade").to_list()[:max_clades]

    logger.info("generated clade list", elapsed=time.perf_counter() - timer_start)

    return clade_list


# FIXME: provide ability to instantiate Config for the get_clade_list function and get the data_path from there
def main(
    genome_metadata_path: AnyPath = Config.nextstrain_latest_genome_metadata,
    data_dir: AnyPath = AnyPath(".").home() / "covid_variant",
    threshold: float = 0.01,
    threshold_weeks: int = 3,
    max_clades: int = 9,
) -> list[str]:
    """
    Determine the list of SARS-CoV-2 clades to model.

    The Nextstrain genome metadata file is downloaded to ``data_dir`` so it
    can be processed locally; that download is an internal implementation
    detail and may change.

    Parameters
    ----------
    genome_metadata_path : AnyPath
        Path to the most recent genome metadata file published by Nextstrain.
    data_dir : AnyPath
        Directory where the genome metadata file is saved after download.
    threshold : float
        Clades that account for at least ``threshold`` proportion of reported
        sequences are candidates for inclusion.
    threshold_weeks : int
        The number of weeks to look back when identifying clades.
    max_clades : int
        The maximum number of clades to include in the list.

    Returns
    -------
    list of str
        Names of the clades selected for modeling.
    """
    os.makedirs(data_dir, exist_ok=True)
    genome_metadata_path = download_covid_genome_metadata(
        genome_metadata_path,
        data_dir,
    )
    lf_metadata = get_covid_genome_metadata(genome_metadata_path)
    lf_metadata_filtered = filter_covid_genome_metadata(lf_metadata)
    counts = get_clade_counts(lf_metadata_filtered)
    clade_list = get_clades(counts, threshold, threshold_weeks, max_clades)

    return clade_list


if __name__ == "__main__":
    main()
24 changes: 22 additions & 2 deletions src/virus_clade_utils/util/sequence.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,12 +62,16 @@ def get_covid_genome_data(released_since_date: str, base_url: str, filename: str
logger.info("NCBI API call completed", elapsed=elapsed)


def download_covid_genome_metadata(url: str, data_path: Path) -> Path:
def download_covid_genome_metadata(url: str, data_path: Path, use_existing: bool = False) -> Path:
"""Download the latest GenBank genome metadata data from Nextstrain."""

session = get_session()
filename = data_path / Path(url).name

if use_existing and filename.exists():
rogersbw marked this conversation as resolved.
Show resolved Hide resolved
logger.info("using existing genome metadata file", metadata_file=str(filename))
return filename

start = time.perf_counter()
with session.get(url, stream=True) as result:
result.raise_for_status()
Expand All @@ -85,7 +89,7 @@ def download_covid_genome_metadata(url: str, data_path: Path) -> Path:
def get_covid_genome_metadata(metadata_path: Path, num_rows: int | None = None) -> pl.LazyFrame:
"""Read GenBank genome metadata into a Polars LazyFrame."""

if (compression_type := metadata_path.suffix) == ".zst":
if (compression_type := metadata_path.suffix) in [".tsv", ".zst"]:
metadata = pl.scan_csv(metadata_path, separator="\t", n_rows=num_rows)
elif compression_type == ".xz":
metadata = pl.read_csv(
Expand Down Expand Up @@ -130,6 +134,22 @@ def filter_covid_genome_metadata(metadata: pl.LazyFrame, cols: list = []) -> pl.
return filtered_metadata


def get_clade_counts(filtered_metadata: pl.LazyFrame) -> pl.LazyFrame:
    """Summarize sequence counts by location, date, and clade."""

    relevant_columns = [
        "clade",
        "country",
        "date",
        "location",
        "host",
    ]

    # Count rows per (location, date, clade) combination; other columns are
    # selected only to mirror the expected input schema.
    return (
        filtered_metadata.select(relevant_columns)
        .group_by("location", "date", "clade")
        .agg(pl.len().alias("count"))
    )


def unzip_sequence_package(filename: str, data_path: str):
"""Unzip the downloaded virus genome data package."""
with zipfile.ZipFile(filename, "r") as package_zip:
Expand Down
30 changes: 30 additions & 0 deletions tests/data/test_metadata.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
genbank_accession genbank_accession_rev unwanted_column date host country division clade_nextstrain location another unwanted column
abc abc.1 i ❤️ wombats 2024-09-01 Homo sapiens USA Massachusetts AA.ZZ Vulcan hummus a tune
abc abc.1 i ❤️ wombats 2024-09-01 Homo sapiens USA Massachusetts AA.ZZ Vulcan hummus a tune
def def.1 i ❤️ wombats 2024-09-01 Homo sapiens USA Massachusetts AA.ZZ Earth hummus a tune
ghi ghi.4 i ❤️ wombats 2024-09-01 Homo sapiens USA Utah BB Cardassia hummus a tune
jkl jkl.1 i ❤️ wombats 2024-09-01 Homo sapiens USA Utah CC Bajor hummus a tune
mno mno.1 i ❤️ wombats 2024-09-01 Homo sapiens Canada Alberta DD Vulcan hummus a tune
mno mno.1 i ❤️ wombats 2024-09-01 marmots USA Massachusetts DD Vulcan hummus a tune
mno mno.1 i ❤️ wombats 2024-09-01 Homo sapiens USA Puerto Rico DD Reisa hummus a tune
abc abc.1 i ❤️ wombats 2024-09-08 Homo sapiens USA Massachusetts EE Vulcan hummus a tune
abc abc.1 i ❤️ wombats 2024-09-08 Homo sapiens USA Massachusetts EE Vulcan hummus a tune
def def.1 i ❤️ wombats 2024-09-08 Homo sapiens USA Massachusetts DD Earth hummus a tune
ghi ghi.4 i ❤️ wombats 2024-09-08 Homo sapiens USA Utah AA Cardassia hummus a tune
jkl jkl.1 i ❤️ wombats 2024-09-08 Homo sapiens USA Utah AA.ZZ Bajor hummus a tune
abc abc.1 i ❤️ wombats 2024-09-15 Homo sapiens USA Massachusetts AA Vulcan hummus a tune
abc abc.1 i ❤️ wombats 2024-09-15 Homo sapiens USA Massachusetts AA Vulcan hummus a tune
def def.1 i ❤️ wombats 2024-09-15 Homo sapiens USA Massachusetts AA Earth hummus a tune
ghi ghi.4 i ❤️ wombats 2024-09-15 Homo sapiens USA Utah BB Cardassia hummus a tune
jkl jkl.1 i ❤️ wombats 2024-09-15 Homo sapiens USA Utah CC Bajor hummus a tune
mno mno.1 i ❤️ wombats 2024-09-15 Homo sapiens Canada Mississippi DD Earth hummus a tune
mno mno.1 i ❤️ wombats 2024-09-15 marmots USA Massachusetts DD Cardassia hummus a tune
mno mno.1 i ❤️ wombats 2024-09-15 Homo sapiens USA Puerto Rico DD Bajor hummus a tune
abcd abcd.1 i ❤️ wombats 2024-09-22 Homo sapiens USA Massachusetts FF Vulcan hummus a tune
abc abc.1 i ❤️ wombats 2024-09-22 Homo sapiens USA Massachusetts AA Vulcan hummus a tune
def def.1 i ❤️ wombats 2024-09-22 Homo sapiens USA Massachusetts AA Earth hummus a tune
ghi ghi.4 i ❤️ wombats 2024-09-22 Homo sapiens USA Utah BB Cardassia hummus a tune
jkl jkl.1 i ❤️ wombats 2024-09-22 Homo sapiens USA Utah CC Bajor hummus a tune
mno mno.1 i ❤️ wombats 2024-09-22 Homo sapiens Canada Mississippi FF Earth hummus a tune
mno mno.1 i ❤️ wombats 2024-09-22 marmots USA Massachusetts FF Cardassia hummus a tune
mno mno.1 i ❤️ wombats 2024-09-22 Homo sapiens USA Puerto Rico FF Bajor hummus a tune
39 changes: 39 additions & 0 deletions tests/unit/test_get_clade_list.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
from pathlib import Path
from unittest.mock import MagicMock, patch

import pytest
from virus_clade_utils.get_clade_list import main


@pytest.fixture
def test_file_path() -> Path:
    """
    Return path to the unit test files.
    """
    return Path(__file__).parents[1] / "data"


@pytest.mark.parametrize(
    "threshold, weeks, max_clades, expected_list",
    [
        (0.1, 3, 9, ["AA", "AA.ZZ", "BB", "CC", "DD", "EE", "FF"]),
        (0.3, 3, 9, ["AA", "AA.ZZ", "EE"]),
        (0.1, 2, 9, ["AA.ZZ", "AA", "BB", "CC", "DD", "EE", "FF"]),
        (0.1, 1, 9, ["AA", "BB", "CC", "FF"]),
        (0.3, 1, 9, ["AA"]),
        (0.1, 3, 4, ["AA", "AA.ZZ", "BB", "CC"]),
        (0.3, 3, 2, ["AA", "AA.ZZ"]),
        (0.1, 2, 3, ["AA", "BB", "CC"]),
        (0.1, 1, 3, ["AA", "BB", "CC"]),
        (1, 3, 9, []),
    ],
)
def test_clade_list(test_file_path, tmp_path, threshold, weeks, max_clades, expected_list):
    """main() returns the expected clades for each parameter combination."""
    metadata_file = test_file_path / "test_metadata.tsv"

    # Stub out the Nextstrain download so the test reads the local fixture.
    download_stub = MagicMock(return_value=metadata_file, name="genome_metadata_download_mock")
    with patch("virus_clade_utils.get_clade_list.download_covid_genome_metadata", download_stub):
        clade_list = main(metadata_file, tmp_path, threshold, weeks, max_clades)

    # Order is not part of the contract; compare as sets.
    assert set(clade_list) == set(expected_list)