Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bsweger/align with nextstrain #10

Merged
merged 6 commits into from
Aug 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
Expand Down
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
# Pulling the nextclade image to grab the CLI binary is kind of overkill, but it's
# an easy way to control the version we're using.
# TODO: fix up the reference tree/root sequence so we can use a newer version
FROM nextstrain/nextclade:3.3.1
FROM nextstrain/nextclade:3.8.2

FROM python:3.12-slim-bookworm
COPY --from=0 /usr/bin/nextclade /opt/src/virus_clade_utils/bin/
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ dependencies = [
"pandas",
"polars>=0.20.23",
"pyarrow",
"requests",
"requests>=2.32.0",
"rich",
"rich-click",
"structlog",
Expand Down
2 changes: 1 addition & 1 deletion requirements/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ pytz==2024.1
# via pandas
pyyaml==6.0.1
# via awscli
requests==2.31.0
requests==2.32.3
# via virus-clade-utils (pyproject.toml)
rich==13.7.1
# via
Expand Down
47 changes: 15 additions & 32 deletions src/virus_clade_utils/assign_clades.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import datetime
import json
import os
import subprocess

Expand All @@ -8,7 +7,7 @@
import structlog

from virus_clade_utils.util.config import Config
from virus_clade_utils.util.reference import get_reference_data
from virus_clade_utils.util.reference import get_nextclade_dataset
from virus_clade_utils.util.sequence import (
get_covid_genome_data,
parse_sequence_assignments,
Expand Down Expand Up @@ -67,43 +66,23 @@ def get_sequence_metadata(config: Config):
logger.info("extracted sequence metadata", metadata_file=config.ncbi_sequence_metadata_file)


def save_reference_info(config: Config):
"""Download a reference tree and save it to a file."""

reference = get_reference_data(config.nextclade_base_url, config.reference_tree_date)

with open(config.reference_tree_file, "w") as f:
json.dump(reference, f)

with open(config.root_sequence_file, "w") as f:
f.write(reference["root_sequence"])

logger.info(
"Reference data saved",
tree_path=str(config.reference_tree_file),
root_sequence_path=str(config.root_sequence_file),
)


def assign_clades(config: Config):
def assign_clades(config: Config, nextclade_dataset_path: str):
"""Assign downloaded genbank sequences to a clade."""

logger.info("Assigning sequences to clades using reference tree")

subprocess.run(
[
"nextclade",
"run",
"--input-tree",
f"{config.reference_tree_file}",
"--input-ref",
f"{config.root_sequence_file}",
f"{config.ncbi_sequence_file}",
"--input-dataset",
nextclade_dataset_path,
"--output-csv",
f"{config.assignment_no_metadata_file}",
f"{config.ncbi_sequence_file}",
]
)

logger.info("Assigned sequences to clades via Nextclade CLI", output_file=config.assignment_no_metadata_file)


def merge_metadata(config: Config) -> pl.DataFrame:
"""Merge sequence metadata with clade assignments."""
Expand All @@ -115,7 +94,7 @@ def merge_metadata(config: Config) -> pl.DataFrame:
# duplicate Accession values?
assert df_metadata["Accession"].n_unique() == df_metadata.shape[0]

df_assignments = pl.read_csv(config.assignment_no_metadata_file, separator=";")
df_assignments = pl.read_csv(config.assignment_no_metadata_file, separator=";", infer_schema_length=5000)
df_assignments = parse_sequence_assignments(df_assignments)

joined = df_metadata.join(df_assignments, left_on="Accession", right_on="seq", how="left")
Expand All @@ -139,6 +118,11 @@ def merge_metadata(config: Config) -> pl.DataFrame:
missing_clade_assignments=missing_clade_assignments,
)

# TBD: include only the columns we need
# https://github.com/reichlab/virus-clade-utils/issues/11
if len(config.assignment_file_columns) > 0:
joined = joined.select(config.assignment_file_columns)

return joined


Expand Down Expand Up @@ -172,11 +156,10 @@ def main(sequence_released_since_date: datetime.date, reference_tree_date: datet
logger.info("Starting pipeline", reference_tree_date=reference_tree_date, run_time=config.run_time)

os.makedirs(config.data_path, exist_ok=True)
nextclade_dataset_path = get_nextclade_dataset(config.reference_tree_date, config.data_path)
get_sequences(config)
get_sequence_metadata(config)
save_reference_info(config)
assign_clades(config)

assign_clades(config, nextclade_dataset_path)
merged_data = merge_metadata(config)
merged_data.write_csv(config.assignment_file)

Expand Down
17 changes: 16 additions & 1 deletion src/virus_clade_utils/util/config.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from dataclasses import InitVar, asdict, dataclass
from dataclasses import InitVar, asdict, dataclass, field
from datetime import datetime
from pprint import pprint

Expand All @@ -23,6 +23,7 @@ class Config:
root_sequence_file: AnyPath = None
assignment_no_metadata_file: AnyPath = None
assignment_file: AnyPath = None
assignment_file_columns: list[str] = field(default_factory=list)

def __post_init__(
self,
Expand All @@ -44,6 +45,20 @@ def __post_init__(
self.data_path / f"{self.sequence_released_since_date}_clade_assignments_no_metadata.csv"
)
self.assignment_file = self.data_path / f"{self.sequence_released_since_date}_clade_assignments.csv"
self.assignment_file_columns = [
"Accession",
"Source database",
"Release date",
"Update date",
"Isolate Collection date",
"clade",
"clade_nextstrain",
"Nextclade_pango",
"partiallyAliased",
"clade_who",
"clade_display",
"Virus Pangolin Classification",
]

def __repr__(self):
return str(pprint(asdict(self)))
66 changes: 31 additions & 35 deletions src/virus_clade_utils/util/reference.py
Original file line number Diff line number Diff line change
@@ -1,46 +1,42 @@
"""Functions for retrieving and parsing SARS-CoV-2 phylogenic tree data."""

import requests
import subprocess
from pathlib import Path

import structlog
from virus_clade_utils.util.session import check_response, get_session

logger = structlog.get_logger()


def get_reference_data(base_url: str, as_of_date: str) -> dict:
"""Return a reference tree as of a given date in YYYY-MM-DD format."""
headers = {
"Accept": "application/vnd.nextstrain.dataset.main+json",
}
session = get_session()
session.headers.update(headers)

response = requests.get(f"{base_url}@{as_of_date}", headers=headers)
check_response(response)
reference_data = response.json()
def get_nextclade_dataset(as_of_date: str, data_path_root: str) -> str:
"""
Return the Nextclade dataset relevant to a specified as_of_date. The dataset is
in .zip format and contains two components required for assignming virus
genome sequences to clades: a tree and the reference sequence of the virus.
"""

# Until Nextstrain provides this information, we're hard-coding a
# a specific version of the nextclade dataset here.
as_of_date = "not yet implemented"
DATASET_VERSION = "2024-07-17--12-57-03Z"
DATASET_PATH = Path(f"{data_path_root}/nextclade_dataset_{DATASET_VERSION}.zip")

subprocess.run(
[
"nextclade",
"dataset",
"get",
"--name",
"sars-cov-2",
"--tag",
DATASET_VERSION,
"--output-zip",
str(DATASET_PATH),
]
)

logger.info(
"Reference data retrieved",
tree_updated=reference_data["meta"].get("updated"),
"Nextclade reference dataset retrieved", as_of_date=as_of_date, version=DATASET_VERSION, output_zip=DATASET_PATH
)

reference = {
"tree": reference_data["tree"],
"meta": reference_data["meta"],
}

try:
# response schema: https://raw.githubusercontent.com/nextstrain/augur/HEAD/augur/data/schema-export-v2.json
# root sequence schema: https://raw.githubusercontent.com/nextstrain/augur/HEAD/augur/data/schema-export-root-sequence.json
# this code adds a fasta-compliant header to the root sequence returned by the API
fasta_root_header = (
">NC_045512.2 Severe acute respiratory syndrome" " coronavirus 2 isolate Wuhan-Hu-1, complete genome"
)
root_sequence = reference_data["root_sequence"]["nuc"]
reference["root_sequence"] = f"{fasta_root_header}\n{root_sequence}"
except KeyError:
# Older versions of the dataset don't include a root_sequence.
logger.error("Aborting pipeline: no root sequence found in reference data.", as_of_date=as_of_date)
raise SystemExit(f"\nAborting pipeline: no root sequence found for date {as_of_date}")

return reference
return DATASET_PATH
59 changes: 8 additions & 51 deletions tests/unit/util/test_reference.py
Original file line number Diff line number Diff line change
@@ -1,56 +1,13 @@
from unittest import mock

import pytest
from requests import Response
from virus_clade_utils.util.reference import get_reference_data
from virus_clade_utils.util.reference import get_nextclade_dataset


@pytest.fixture
def get_nextclade_response():
def _get_nextclade_response():
return {
"tree": "cladesandstuff",
"meta": {"updated": "2021-09-01"},
"root_sequence": {"nuc": "fastasequence"},
}
@mock.patch("subprocess.run")
def test_get_nextclade_dataset(tmp_path):
dataset_path = get_nextclade_dataset("2021-09-01", tmp_path)

return _get_nextclade_response


@pytest.fixture
def get_nextclade_response_no_root():
def _get_nextclade_response_no_root():
return {
"tree": "cladesandstuff",
"meta": {"updated": "2021-09-01"},
}

return _get_nextclade_response_no_root


@mock.patch("requests.get")
def test_get_reference_data(mock_get, get_nextclade_response):
mock_response = Response()
mock_response.status_code = 200
mock_response.json = get_nextclade_response
mock_get.return_value = mock_response

reference = get_reference_data("www.fakenextclade.com", "2021-09-01")

assert reference["tree"] == "cladesandstuff"
assert reference["meta"]["updated"] == "2021-09-01"
assert "fastasequence" in reference["root_sequence"]


@mock.patch("requests.get")
def test_missing_root_sequence(mock_get, get_nextclade_response_no_root, capsys):
mock_response = Response()
mock_response.status_code = 200
mock_response.json = get_nextclade_response_no_root
mock_get.return_value = mock_response

with pytest.raises(SystemExit):
get_reference_data("www.fakenextclade.com", "2021-09-01")
out, err = capsys.readouterr()
assert "no root sequence" in out
assert "2021-09-01" in out
# the dataset_path being returned should contain the correct nextclade
# datasetset version, as determined by the as_of_date being passed
# (returned version is temporarily hard-coded until Nextstrain provides the info we need)
assert "2024-07-17--12-57-03Z" in str(dataset_path)