Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bsweger/get clades list #18

Merged
merged 3 commits into from
Sep 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
/data/*/*.md
/data/*/*.tsv
/data/*/*.zip
/data/*/*.zst
/data/*/*.xz


# Byte-compiled / optimized / DLL files
Expand Down
25 changes: 24 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,12 @@ This package was developed to provide data required for the [COVID-19 Variant No

We are releasing `virus-clade-utils` as a standalone package for use by others who may find the functionality useful.


## Usage

TODO: Actual documentation


## Docker Setup

Use the directions below to run the pipeline in a Docker container.
Expand All @@ -32,7 +36,7 @@ Use the directions below to run the pipeline in a Docker container.
docker build --platform=linux/amd64 -t virus-clade-utils .
```

3. Run the pipeline, passing in required arguments:
3. To run the target data pipeline, pass in the required arguments:

```bash
docker run --platform linux/amd64 \
Expand All @@ -45,6 +49,25 @@ Use the directions below to run the pipeline in a Docker container.

The clade assignments will now be in the local directory that was mounted to the Docker container via the `-v` flag (in this case, a folder called `data` in the current working directory).


### Generating the clade list

[This will evolve; below are some temporary instructions for anyone who wants to try this via Docker]

1. Enter the container's bash shell:

```bash
docker run --platform linux/amd64 -it --entrypoint bash virus-clade-utils
```

2. Once you're in the shell of the container:

```bash
clade_list
```

**Note:** Sometimes this results in a "Killed" message from Docker due to memory constraints (it depends on the host machine, and we'll need to look into this).

### Running the test suite

To run the test suite in the Docker container (built above):
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ dev = [

[project.entry-points."console_scripts"]
assign_clades = "virus_clade_utils.assign_clades:main"
clade_list = "virus_clade_utils.get_clade_list:main"

[build-system]
requires = ["setuptools>=64", "wheel"]
Expand Down
115 changes: 115 additions & 0 deletions src/virus_clade_utils/get_clade_list.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
"""Get a list of SARS-CoV-2 clades."""

import os
import time
from datetime import timedelta

import polars as pl
import structlog
from cloudpathlib import AnyPath

from virus_clade_utils.util.config import Config
from virus_clade_utils.util.sequence import (
download_covid_genome_metadata,
filter_covid_genome_metadata,
get_clade_counts,
get_covid_genome_metadata,
)

logger = structlog.get_logger()


def get_clades(clade_counts: pl.LazyFrame, threshold: float, threshold_weeks: int, max_clades: int) -> list[str]:
    """
    Return a list of clade names to forecast.

    Clades whose weekly share of reported sequences exceeded ``threshold``
    during the past ``threshold_weeks`` weeks are selected. If more than
    ``max_clades`` qualify, the clades with the largest total counts over the
    window are kept (ties broken alphabetically).
    """
    timer_start = time.perf_counter()

    # Anchor the lookback window on the Sunday that started the week
    # threshold_weeks weeks before the data's most recent date (the current,
    # partial week is excluded).
    latest_date = clade_counts.select(pl.max("date")).collect().item()
    window_start = latest_date - timedelta(days=latest_date.weekday() + 7 * threshold_weeks)

    # Aggregate counts into Sunday-based weeks, pooling all locations and
    # restricting to the lookback window.
    weekly_counts = (
        clade_counts.filter(pl.col("date") >= window_start)
        .sort("date")
        .group_by_dynamic("date", every="1w", start_by="sunday", group_by="clade")
        .agg(pl.col("count").sum())
    )

    # Total sequences reported per week, across all clades.
    weekly_totals = weekly_counts.group_by("date").agg(pl.col("count").sum().alias("total_count"))

    # Each clade's share of its week's total.
    weekly_proportions = weekly_counts.join(weekly_totals, on="date").with_columns(
        (pl.col("count") / pl.col("total_count")).alias("proportion")
    )

    # Clades that crossed the threshold in at least one week of the window.
    selected = weekly_proportions.filter(pl.col("proportion") > threshold).select("clade").unique().collect()

    # Too many qualifying clades: rank by total count over the window instead,
    # breaking ties alphabetically, and truncate below.
    if len(selected) > max_clades:
        selected = (
            weekly_proportions.group_by("clade")
            .agg(pl.col("count").sum())
            .sort("count", "clade", descending=[True, False])
            .collect()
        )

    clade_list = selected.get_column("clade").to_list()[:max_clades]

    logger.info("generated clade list", elapsed=time.perf_counter() - timer_start)

    return clade_list


# FIXME: provide ability to instantiate Config for the get_clade_list function and get the data_path from there
def main(
    genome_metadata_path: AnyPath = Config.nextstrain_latest_genome_metadata,
    data_dir: AnyPath = AnyPath(".").home() / "covid_variant",
    threshold: float = 0.01,
    threshold_weeks: int = 3,
    max_clades: int = 9,
) -> list[str]:
    """
    Determine the list of SARS-CoV-2 clades to model.

    The Nextstrain genome metadata file is downloaded to ``data_dir`` so it
    can be processed locally; that download is an internal implementation
    detail and may change.

    Parameters
    ----------
    genome_metadata_path : AnyPath
        Path to the most recent genome metadata file published by Nextstrain.
    data_dir : AnyPath
        Directory where the genome metadata file is saved after download.
    threshold : float
        Clades that account for at least ``threshold`` proportion of reported
        sequences are candidates for inclusion.
    threshold_weeks : int
        The number of weeks to look back when identifying clades.
    max_clades : int
        The maximum number of clades to include in the list.

    Returns
    -------
    list of str
        Names of the clades selected for modeling.
    """
    os.makedirs(data_dir, exist_ok=True)
    genome_metadata_path = download_covid_genome_metadata(
        genome_metadata_path,
        data_dir,
    )
    lf_metadata = get_covid_genome_metadata(genome_metadata_path)
    lf_metadata_filtered = filter_covid_genome_metadata(lf_metadata)
    counts = get_clade_counts(lf_metadata_filtered)
    clade_list = get_clades(counts, threshold, threshold_weeks, max_clades)

    return clade_list


if __name__ == "__main__":
    main()
24 changes: 22 additions & 2 deletions src/virus_clade_utils/util/sequence.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,12 +62,16 @@ def get_covid_genome_data(released_since_date: str, base_url: str, filename: str
logger.info("NCBI API call completed", elapsed=elapsed)


def download_covid_genome_metadata(url: str, data_path: Path) -> Path:
def download_covid_genome_metadata(url: str, data_path: Path, use_existing: bool = False) -> Path:
"""Download the latest GenBank genome metadata data from Nextstrain."""

session = get_session()
filename = data_path / Path(url).name

if use_existing and filename.exists():
rogersbw marked this conversation as resolved.
Show resolved Hide resolved
logger.info("using existing genome metadata file", metadata_file=str(filename))
return filename

start = time.perf_counter()
with session.get(url, stream=True) as result:
result.raise_for_status()
Expand All @@ -85,7 +89,7 @@ def download_covid_genome_metadata(url: str, data_path: Path) -> Path:
def get_covid_genome_metadata(metadata_path: Path, num_rows: int | None = None) -> pl.LazyFrame:
"""Read GenBank genome metadata into a Polars LazyFrame."""

if (compression_type := metadata_path.suffix) == ".zst":
if (compression_type := metadata_path.suffix) in [".tsv", ".zst"]:
metadata = pl.scan_csv(metadata_path, separator="\t", n_rows=num_rows)
elif compression_type == ".xz":
metadata = pl.read_csv(
Expand Down Expand Up @@ -130,6 +134,22 @@ def filter_covid_genome_metadata(metadata: pl.LazyFrame, cols: list = []) -> pl.
return filtered_metadata


def get_clade_counts(filtered_metadata: pl.LazyFrame) -> pl.LazyFrame:
    """Summarize sequence counts by location, date, and clade."""

    relevant_columns = [
        "clade",
        "country",
        "date",
        "location",
        "host",
    ]

    # Count rows per (location, date, clade) combination; other columns are
    # selected only to mirror the expected input schema.
    return (
        filtered_metadata.select(relevant_columns)
        .group_by("location", "date", "clade")
        .agg(pl.len().alias("count"))
    )


def unzip_sequence_package(filename: str, data_path: str):
"""Unzip the downloaded virus genome data package."""
with zipfile.ZipFile(filename, "r") as package_zip:
Expand Down
30 changes: 30 additions & 0 deletions tests/data/test_metadata.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
genbank_accession genbank_accession_rev unwanted_column date host country division clade_nextstrain location another unwanted column
abc abc.1 i ❤️ wombats 2024-09-01 Homo sapiens USA Massachusetts AA.ZZ Vulcan hummus a tune
abc abc.1 i ❤️ wombats 2024-09-01 Homo sapiens USA Massachusetts AA.ZZ Vulcan hummus a tune
def def.1 i ❤️ wombats 2024-09-01 Homo sapiens USA Massachusetts AA.ZZ Earth hummus a tune
ghi ghi.4 i ❤️ wombats 2024-09-01 Homo sapiens USA Utah BB Cardassia hummus a tune
jkl jkl.1 i ❤️ wombats 2024-09-01 Homo sapiens USA Utah CC Bajor hummus a tune
mno mno.1 i ❤️ wombats 2024-09-01 Homo sapiens Canada Alberta DD Vulcan hummus a tune
mno mno.1 i ❤️ wombats 2024-09-01 marmots USA Massachusetts DD Vulcan hummus a tune
mno mno.1 i ❤️ wombats 2024-09-01 Homo sapiens USA Puerto Rico DD Reisa hummus a tune
abc abc.1 i ❤️ wombats 2024-09-08 Homo sapiens USA Massachusetts EE Vulcan hummus a tune
abc abc.1 i ❤️ wombats 2024-09-08 Homo sapiens USA Massachusetts EE Vulcan hummus a tune
def def.1 i ❤️ wombats 2024-09-08 Homo sapiens USA Massachusetts DD Earth hummus a tune
ghi ghi.4 i ❤️ wombats 2024-09-08 Homo sapiens USA Utah AA Cardassia hummus a tune
jkl jkl.1 i ❤️ wombats 2024-09-08 Homo sapiens USA Utah AA.ZZ Bajor hummus a tune
abc abc.1 i ❤️ wombats 2024-09-15 Homo sapiens USA Massachusetts AA Vulcan hummus a tune
abc abc.1 i ❤️ wombats 2024-09-15 Homo sapiens USA Massachusetts AA Vulcan hummus a tune
def def.1 i ❤️ wombats 2024-09-15 Homo sapiens USA Massachusetts AA Earth hummus a tune
ghi ghi.4 i ❤️ wombats 2024-09-15 Homo sapiens USA Utah BB Cardassia hummus a tune
jkl jkl.1 i ❤️ wombats 2024-09-15 Homo sapiens USA Utah CC Bajor hummus a tune
mno mno.1 i ❤️ wombats 2024-09-15 Homo sapiens Canada Mississippi DD Earth hummus a tune
mno mno.1 i ❤️ wombats 2024-09-15 marmots USA Massachusetts DD Cardassia hummus a tune
mno mno.1 i ❤️ wombats 2024-09-15 Homo sapiens USA Puerto Rico DD Bajor hummus a tune
abcd abcd.1 i ❤️ wombats 2024-09-22 Homo sapiens USA Massachusetts FF Vulcan hummus a tune
abc abc.1 i ❤️ wombats 2024-09-22 Homo sapiens USA Massachusetts AA Vulcan hummus a tune
def def.1 i ❤️ wombats 2024-09-22 Homo sapiens USA Massachusetts AA Earth hummus a tune
ghi ghi.4 i ❤️ wombats 2024-09-22 Homo sapiens USA Utah BB Cardassia hummus a tune
jkl jkl.1 i ❤️ wombats 2024-09-22 Homo sapiens USA Utah CC Bajor hummus a tune
mno mno.1 i ❤️ wombats 2024-09-22 Homo sapiens Canada Mississippi FF Earth hummus a tune
mno mno.1 i ❤️ wombats 2024-09-22 marmots USA Massachusetts FF Cardassia hummus a tune
mno mno.1 i ❤️ wombats 2024-09-22 Homo sapiens USA Puerto Rico FF Bajor hummus a tune
39 changes: 39 additions & 0 deletions tests/unit/test_get_clade_list.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
from pathlib import Path
from unittest.mock import MagicMock, patch

import pytest
from virus_clade_utils.get_clade_list import main


@pytest.fixture
def test_file_path() -> Path:
    """
    Return path to the unit test files.
    """
    return Path(__file__).parents[1] / "data"


@pytest.mark.parametrize(
    "threshold, weeks, max_clades, expected_list",
    [
        (0.1, 3, 9, ["AA", "AA.ZZ", "BB", "CC", "DD", "EE", "FF"]),
        (0.3, 3, 9, ["AA", "AA.ZZ", "EE"]),
        (0.1, 2, 9, ["AA.ZZ", "AA", "BB", "CC", "DD", "EE", "FF"]),
        (0.1, 1, 9, ["AA", "BB", "CC", "FF"]),
        (0.3, 1, 9, ["AA"]),
        (0.1, 3, 4, ["AA", "AA.ZZ", "BB", "CC"]),
        (0.3, 3, 2, ["AA", "AA.ZZ"]),
        (0.1, 2, 3, ["AA", "BB", "CC"]),
        (0.1, 1, 3, ["AA", "BB", "CC"]),
        (1, 3, 9, []),
    ],
)
def test_clade_list(test_file_path, tmp_path, threshold, weeks, max_clades, expected_list):
    """main() returns the expected clades for each parameter combination."""
    metadata_file = test_file_path / "test_metadata.tsv"

    # Stub out the Nextstrain download so the test reads the local fixture.
    download_stub = MagicMock(return_value=metadata_file, name="genome_metadata_download_mock")
    with patch("virus_clade_utils.get_clade_list.download_covid_genome_metadata", download_stub):
        clade_list = main(metadata_file, tmp_path, threshold, weeks, max_clades)

    # Order is not part of the contract; compare as sets.
    assert set(clade_list) == set(expected_list)