diff --git a/CHANGELOG.md b/CHANGELOG.md index 062971847..add0e5f38 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,21 @@ # Changelog +## [v2.0.0-alpha.7](https://github.com/NPLinker/nplinker/tree/v2.0.0-alpha.7) (2024-11-28) + +[Full Changelog](https://github.com/NPLinker/nplinker/compare/v2.0.0-alpha.6...v2.0.0-alpha.7) + +**Closed issues:** + +- Incorrect precursor m/z when loading MGF file from GNPS [\#282](https://github.com/NPLinker/nplinker/issues/282) +- Use bigscape version in loaders [\#271](https://github.com/NPLinker/nplinker/issues/271) + +**Merged pull requests:** + +- remove default config file to make all settings explicit [\#287](https://github.com/NPLinker/nplinker/pull/287) ([CunliangGeng](https://github.com/CunliangGeng)) +- add support of mibig v4.0 [\#286](https://github.com/NPLinker/nplinker/pull/286) ([CunliangGeng](https://github.com/CunliangGeng)) +- fix the resolving of genbank and jgi IDs [\#285](https://github.com/NPLinker/nplinker/pull/285) ([CunliangGeng](https://github.com/CunliangGeng)) +- Precursor m/z value fix [\#283](https://github.com/NPLinker/nplinker/pull/283) ([liannette](https://github.com/liannette)) + ## [v2.0.0-alpha.6](https://github.com/NPLinker/nplinker/tree/v2.0.0-alpha.6) (2024-09-17) [Full Changelog](https://github.com/NPLinker/nplinker/compare/v2.0.0-alpha.5...v2.0.0-alpha.6) diff --git a/CITATION.cff b/CITATION.cff index f4d42db61..0029adb18 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -45,7 +45,7 @@ authors: - given-names: Marnix family-names: Medema -version: "2.0.0-alpha.6" +version: "2.0.0-alpha.7" repository-code: "https://github.com/NPLinker/nplinker" keywords: - Genome diff --git a/README.md b/README.md index 6afc95870..7580e160d 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,8 @@ | Citation data consistency | [![cffconvert](https://github.com/NPLinker/nplinker/actions/workflows/cffconvert.yml/badge.svg)](https://github.com/NPLinker/nplinker/actions/workflows/cffconvert.yml) | -# Natural 
Products Linker (NPLinker) +![NPLinker Logo](./docs/images/NPLinker_standard_black.svg) + NPLinker is a python framework for data mining microbial natural products by integrating genomics and metabolomics data. Original paper: [Ranking microbial metabolomic and genomic links in the NPLinker framework using complementary scoring functions](https://doi.org/10.1371/journal.pcbi.1008920). diff --git a/docs/concepts/config_file.md b/docs/concepts/config_file.md index d7e8f4315..e2f74ffc6 100644 --- a/docs/concepts/config_file.md +++ b/docs/concepts/config_file.md @@ -4,13 +4,9 @@ --8<-- "src/nplinker/data/nplinker.toml" ``` +## Example Configuration -## Default Configurations -The default configurations are automatically used by NPLinker if you don't set them in your config file. - -```toml ---8<-- "src/nplinker/nplinker_default.toml" -``` +For a full example of a configuration file, see [here](../quickstart.md#3-prepare-config-file). ## Config loader diff --git a/docs/images/NPLinker_icon_black.svg b/docs/images/NPLinker_icon_black.svg new file mode 100644 index 000000000..b1808e1d2 --- /dev/null +++ b/docs/images/NPLinker_icon_black.svg @@ -0,0 +1,25 @@ + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/images/NPLinker_icon_white.svg b/docs/images/NPLinker_icon_white.svg new file mode 100644 index 000000000..3b2e95308 --- /dev/null +++ b/docs/images/NPLinker_icon_white.svg @@ -0,0 +1,25 @@ + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/images/NPLinker_standard_black.svg b/docs/images/NPLinker_standard_black.svg new file mode 100644 index 000000000..38cec3178 --- /dev/null +++ b/docs/images/NPLinker_standard_black.svg @@ -0,0 +1,71 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/images/NPLinker_standard_white.svg b/docs/images/NPLinker_standard_white.svg new file mode 100644 index 000000000..2da5590f0 
--- /dev/null +++ b/docs/images/NPLinker_standard_white.svg @@ -0,0 +1,71 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/index.md b/docs/index.md index f443c12cc..681f61b13 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,4 +1,5 @@ -# NPLinker +# + NPLinker is a python framework for data mining microbial natural products by integrating genomics and metabolomics data. diff --git a/docs/quickstart.md b/docs/quickstart.md index 1d918c83a..de215f038 100644 --- a/docs/quickstart.md +++ b/docs/quickstart.md @@ -166,16 +166,27 @@ is recommended to put it in the working directory created in step 2. The details of all settings can be found at this page [Config File](./concepts/config_file.md). -To keep it simple, [default settings](./concepts/config_file.md#default-configurations) will be used -automatically by NPLinker if you don't set them in your `nplinker.toml` config file. - -What you need to do is to set the `root_dir` and `mode` in the `nplinker.toml` file. +Here are some example values for the `nplinker.toml` file: === "`local` mode" ```toml title="nplinker.toml" root_dir = "absolute/path/to/working/directory" # (1)! mode = "local" - # and other settings you want to override the default settings + + [log] + level = "DEBUG" + use_console = true + + [mibig] + to_use = true + version = "3.1" + + [bigscape] + version = 1 + cutoff = "0.30" + + [scoring] + methods = ["metcalf"] ``` 1. Replace `absolute/path/to/working/directory` with the **absolute** path to the working directory @@ -187,7 +198,22 @@ What you need to do is to set the `root_dir` and `mode` in the `nplinker.toml` f root_dir = "absolute/path/to/working/directory" # (1)! mode = "podp" podp_id = "podp_id" # (2)! 
- # and other settings you want to override the default settings + + [log] + level = "DEBUG" + use_console = true + + [mibig] + to_use = true + version = "3.1" + + [bigscape] + version = 2 + cutoff = "0.30" + parameters = "--mibig_version 3.1 --include_singletons --gcf_cutoffs 0.30" + + [scoring] + methods = ["metcalf"] ``` 1. Replace `absolute/path/to/working/directory` with the **absolute** path to the working directory diff --git a/docs/scripts/extra.js b/docs/scripts/extra.js new file mode 100644 index 000000000..f0b72b388 --- /dev/null +++ b/docs/scripts/extra.js @@ -0,0 +1,21 @@ +document.addEventListener("DOMContentLoaded", function () { + const img = document.querySelector(".theme-toggle-image"); + + if (!img) return; // Exit if no image is found + + // Function to update the image based on the current theme + function updateImage() { + const theme = document.body.getAttribute("data-md-color-scheme"); + img.src = theme === "slate" ? "images/NPLinker_standard_white.svg" : "images/NPLinker_standard_black.svg"; + } + + // Initial update + updateImage(); + + // Observe changes to the `data-md-color-scheme` attribute + const observer = new MutationObserver(updateImage); + observer.observe(document.body, { + attributes: true, + attributeFilter: ["data-md-color-scheme"], // Monitor changes to the theme attribute + }); +}); \ No newline at end of file diff --git a/mkdocs.yml b/mkdocs.yml index a3aa6e8f7..5ba372a81 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -35,7 +35,8 @@ theme: logo: 'material/library-outline' previous: fontawesome/solid/angle-left next: fontawesome/solid/angle-right - favicon: 'favicon.png' + favicon: images/NPLinker_icon_black.svg + logo: images/NPLinker_icon_white.svg repo_name: nplinker/nplinker repo_url: https://github.com/nplinker/nplinker @@ -47,6 +48,9 @@ extra: extra_css: - css/extra.css +extra_javascript: + - scripts/extra.js + # https://www.mkdocs.org/user-guide/configuration/#validation validation: omitted_files: warn diff --git 
a/pyproject.toml b/pyproject.toml index c627f6ca9..74d050a05 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ build-backend = "setuptools.build_meta" [project] name = "nplinker" -version = "2.0.0-alpha.6" +version = "2.0.0-alpha.7" description = "Natural Products Linker" readme = "README.md" requires-python = ">=3.9" diff --git a/src/nplinker/__init__.py b/src/nplinker/__init__.py index e7efe61e9..c80be90cc 100644 --- a/src/nplinker/__init__.py +++ b/src/nplinker/__init__.py @@ -7,7 +7,7 @@ __author__ = "Cunliang Geng" __email__ = "c.geng@esciencecenter.nl" -__version__ = "2.0.0-alpha.6" +__version__ = "2.0.0-alpha.7" __all__ = ["NPLinker", "setup_logging"] diff --git a/src/nplinker/config.py b/src/nplinker/config.py index d1ade8bae..a9491799f 100644 --- a/src/nplinker/config.py +++ b/src/nplinker/config.py @@ -1,6 +1,5 @@ from __future__ import annotations from os import PathLike -from pathlib import Path from dynaconf import Dynaconf from dynaconf import Validator from nplinker.utils import transform_to_full_path @@ -25,11 +24,8 @@ def load_config(config_file: str | PathLike) -> Dynaconf: if not config_file.exists(): raise FileNotFoundError(f"Config file '{config_file}' not found") - # Locate the default config file - default_config_file = Path(__file__).resolve().parent / "nplinker_default.toml" - # Load config files - config = Dynaconf(settings_files=[config_file], preload=[default_config_file]) + config = Dynaconf(settings_files=[config_file]) # Validate configs config.validators.register(*CONFIG_VALIDATORS) @@ -61,7 +57,7 @@ def load_config(config_file: str | PathLike) -> Dynaconf: is_in=["NOTSET", "DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], ), Validator("log.file", is_type_of=str), - Validator("log.use_console", is_type_of=bool), + Validator("log.use_console", required=True, is_type_of=bool), # Mibig Validator("mibig.to_use", required=True, is_type_of=bool), Validator( @@ -71,9 +67,9 @@ def load_config(config_file: str | PathLike) -> 
Dynaconf: when=Validator("mibig.to_use", eq=True), ), # BigScape - Validator("bigscape.parameters", required=True, is_type_of=str), + Validator("bigscape.parameters", is_type_of=str), Validator("bigscape.cutoff", required=True, is_type_of=str), - Validator("bigscape.version", required=True, is_type_of=int), + Validator("bigscape.version", required=True, is_type_of=int, is_in=[1, 2]), # Scoring ## `scoring.methods` must be a list of strings and must contain at least one of the ## supported scoring methods. diff --git a/src/nplinker/data/nplinker.toml b/src/nplinker/data/nplinker.toml index 6a9f0d8da..f069de474 100644 --- a/src/nplinker/data/nplinker.toml +++ b/src/nplinker/data/nplinker.toml @@ -2,76 +2,87 @@ # NPLinker configuration file ############################# -# The root directory of the NPLinker project. You need to create it first. -# The value is required and must be a full path. root_dir = "" +# [REQUIRED] The value is required and must be a full path. +# The root directory of the NPLinker project. You need to create it first. + +mode = "podp" +# [REQUIRED] Available values are "podp" and "local". # The mode for preparing dataset. -# The available modes are "podp" and "local". # "podp" mode is for using the PODP platform (https://pairedomicsdata.bioinformatics.nl/) to prepare the dataset. -# "local" mode is for preparing the dataset locally. So uers do not need to upload their data to the PODP platform. -# The value is required. -mode = "podp" -# The PODP project identifier. -# The value is required if the mode is "podp". +# "local" mode is for preparing the dataset locally. So users do not need to upload their data to the PODP platform. + podp_id = "" +# [REQUIRED-UNDER-CONDITIONS] The value is required if the mode is "podp". +# The PODP project identifier. 
+# Example: The identifier is "4b29ddc3-26d0-40d7-80c5-44fb6631dbf9.4" for the project +# https://pairedomicsdata.bioinformatics.nl/projects/4b29ddc3-26d0-40d7-80c5-44fb6631dbf9.4 [log] -# Log level. The available levels are same as the levels in python package `logging`: -# "DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL". -# The default value is "INFO". +# Settings for logging. + level = "INFO" +# [REQUIRED] Available values are "DEBUG", "INFO", "WARNING", "ERROR", and "CRITICAL". +# Log level. + +use_console = true +# [REQUIRED] Available values are "true" and "false". +# Whether to write log messages to console. + +file = "path/to/logfile" +# [OPTIONAL] # The log file to append log messages. -# The value is optional. # If not set or use empty string, log messages will not be written to a file. # The file will be created if it does not exist. Log messages will be appended to the file if it exists. -file = "path/to/logfile" -# Whether to write log meesages to console. -# The default value is true. -use_console = true [mibig] -# Whether to use mibig metadta (json). -# The default value is true. +# Settings for MIBiG. + to_use = true -# The version of mibig metadata. -# Make sure using the same version of mibig in bigscape. -# The default value is "3.1" +# [REQUIRED] Available values are `true` and `false`. +# Whether to use MIBiG annotations/metadata data for the analysis. + version = "3.1" +# [REQUIRED-UNDER-CONDITIONS] The version must be the same as the version of MIBiG used in BiG-SCAPE. +# The version of MIBiG data to use. +# Check all available versions at https://mibig.secondarymetabolites.org/download. [bigscape] -# The parameters to use for running BiG-SCAPE. -# Version of BiG-SCAPE to run. Make sure to change the parameters property below as well -# when changing versions. +# Settings for BiG-SCAPE. + version = 1 -# Required BiG-SCAPE parameters. +# [REQUIRED] Available values are 1 and 2. 1 for version 1.x series and 2 for version 2.x series. 
+# The version of BiG-SCAPE to use. + +cutoff = "0.30" +# [REQUIRED] The value must be a string. +# Which cutoff to use for the analysis. +# There might be multiple cutoffs in the BiG-SCAPE output and this value must be one of them. + +parameters = "version1_parameters_or_version2_parameters" +# [REQUIRED-UNDER-CONDITIONS] It's required when you want to run BiG-SCAPE in NPLinker. +# Parameters for running BiG-SCAPE. # -------------- # For version 1: # ------------- -# Required parameters are: `--mix`, `--include_singletons` and `--cutoffs`. NPLinker needs them to run the analysis properly. -# Do NOT set these parameters: `--inputdir`, `--outputdir`, `--pfam_dir`. NPLinker will automatically configure them. -# If parameter `--mibig` is set, make sure to set the config `mibig.to_use` to true and `mibig.version` to the version of mibig in BiG-SCAPE. -# The default value is "--mibig --clans-off --mix --include_singletons --cutoffs 0.30". +# The parameters MUST contain `--mix`, `--include_singletons` and `--cutoffs`. NPLinker needs them to run the analysis properly. +# The parameters must NOT contain `--inputdir`, `--outputdir`, `--pfam_dir`. NPLinker will automatically configure them. +# An example value could be: "--mibig --clans-off --mix --include_singletons --cutoffs 0.30". # -------------- # For version 2: # -------------- -# Note that BiG-SCAPE v2 has subcommands. NPLinker requires the `cluster` subcommand and its parameters. +# BiG-SCAPE v2 has subcommands. NPLinker requires the `cluster` subcommand and its parameters. # Required parameters of `cluster` subcommand are: `--mibig_version`, `--include_singletons` and `--gcf_cutoffs`. # DO NOT set these parameters: `--pfam_path`, `--inputdir`, `--outputdir`. NPLinker will automatically configure them. # BiG-SCPAPE v2 also runs a `--mix` analysis by default, so you don't need to set this parameter here. 
-# Example parameters for BiG-SCAPE v2: "--mibig_version 3.1 --include_singletons --gcf_cutoffs 0.30" -parameters = "--mibig --clans-off --mix --include_singletons --cutoffs 0.30" -# Which bigscape cutoff to use for NPLinker analysis. -# There might be multiple cutoffs in bigscape output. -# Note that this value must be a string. -# The default value is "0.30". -cutoff = "0.30" +# An example value could be: "--mibig_version 3.1 --include_singletons --gcf_cutoffs 0.30" [scoring] -# Scoring methods. -# Valid values are "metcalf" and "rosetta". -# The default value is "metcalf". +# Settings for scoring. methods = ["metcalf"] +# [REQUIRED] Available values are "metcalf" and "rosetta". +# Scoring methods to use for the analysis. \ No newline at end of file diff --git a/src/nplinker/genomics/antismash/podp_antismash_downloader.py b/src/nplinker/genomics/antismash/podp_antismash_downloader.py index 3973d2208..efbbffacd 100644 --- a/src/nplinker/genomics/antismash/podp_antismash_downloader.py +++ b/src/nplinker/genomics/antismash/podp_antismash_downloader.py @@ -2,7 +2,6 @@ import json import logging import re -import time import warnings from collections.abc import Mapping from collections.abc import Sequence @@ -10,8 +9,6 @@ from pathlib import Path import httpx from bs4 import BeautifulSoup -from bs4 import NavigableString -from bs4 import Tag from jsonschema import validate from nplinker.defaults import GENOME_STATUS_FILENAME from nplinker.genomics.antismash import download_and_extract_antismash_data @@ -20,7 +17,6 @@ logger = logging.getLogger(__name__) -NCBI_LOOKUP_URL = "https://www.ncbi.nlm.nih.gov/assembly/?term={}" JGI_GENOME_LOOKUP_URL = ( "https://img.jgi.doe.gov/cgi-bin/m/main.cgi?section=TaxonDetail&page=taxonDetail&taxon_oid={}" ) @@ -251,90 +247,49 @@ def get_best_available_genome_id(genome_id_data: Mapping[str, str]) -> str | Non return best_id -def _ncbi_genbank_search(genbank_id: str, retry_times: int = 3) -> Tag | NavigableString | None: - url = 
NCBI_LOOKUP_URL.format(genbank_id) - retry = 1 - while retry <= retry_times: - logger.info(f"Looking up GenBank data for {genbank_id} at {url}") - resp = httpx.get(url, follow_redirects=True) - if resp.status_code == httpx.codes.OK: - # the page should contain a
element with class "assembly_summary_new". retrieving - # the page seems to fail occasionally in the middle of lengthy sequences of genome - # lookups, so there might be some throttling going on. this will automatically retry - # the lookup if the expected content isn't found the first time - soup = BeautifulSoup(resp.content, "html.parser") - # find the
element with class "assembly_summary_new" - dl_element = soup.find("dl", {"class": "assembly_summary_new"}) - if dl_element is not None: - return dl_element - retry = retry + 1 - time.sleep(5) - - logger.warning(f"Failed to resolve NCBI genome ID {genbank_id} at URL {url} (after retrying)") - return None - - def _resolve_genbank_accession(genbank_id: str) -> str: - """Try to get RefSeq id through given GenBank id. + """Try to get RefSeq assembly id through given GenBank assembly id. + + Note that GenBank assembly accession starts with "GCA_" and RefSeq assembly + accession starts with "GCF_". For more info, see + https://www.ncbi.nlm.nih.gov/datasets/docs/v2/troubleshooting/faq Args: - genbank_id: ID for GenBank accession. + genbank_id: ID for GenBank assembly accession. Raises: - Exception: "Unknown HTML format" if the search of genbank does not give any results. - Exception: "Expected HTML elements not found" if no match with RefSeq assembly accession is found. + httpx.ReadTimeout: If the request times out. Returns: - RefSeq ID if the search is successful, otherwise None. + RefSeq assembly ID if the search is successful, otherwise an empty string. """ - logger.info(f"Attempting to resolve Genbank accession {genbank_id} to RefSeq accession") - # genbank id => genbank seq => refseq - - # The GenBank accession can have several formats: - # 1: BAFR00000000.1 - # 2: NZ_BAGG00000000.1 - # 3: NC_016887.1 - # Case 1 is the default. 
- if "_" in genbank_id: - # case 2 - if len(genbank_id.split("_")[-1].split(".")[0]) == 12: - genbank_id = genbank_id.split("_")[-1] - # case 3 - else: - genbank_id = genbank_id.lower() - - # get rid of any extraneous whitespace - genbank_id = genbank_id.strip() - logger.info(f'Parsed GenBank ID to "{genbank_id}"') - - # run a search using the GenBank accession ID + logger.info( + f"Attempting to resolve Genbank assembly accession {genbank_id} to RefSeq accession" + ) + # NCBI Datasets API https://www.ncbi.nlm.nih.gov/datasets/docs/v2/api/ + # Note that there is an API rate limit of 5 requests per second without using an API key + # For more info, see https://www.ncbi.nlm.nih.gov/datasets/docs/v2/troubleshooting/faq/ + + # API for getting revision history of a genome assembly + # For schema, see https://www.ncbi.nlm.nih.gov/datasets/docs/v2/api/rest-api/#get-/genome/accession/-accession-/revision_history + url = f"https://api.ncbi.nlm.nih.gov/datasets/v2/genome/accession/{genbank_id}/revision_history" + + refseq_id = "" try: - dl_element = _ncbi_genbank_search(genbank_id) - if dl_element is None or isinstance(dl_element, NavigableString): - raise Exception("Unknown HTML format") - - refseq_idx = -1 - for field_idx, field in enumerate(dl_element.children): - # this is the element immediately preceding the one with - # the actual RefSeq ID we want - if field.getText().strip() == "RefSeq assembly accession:": - refseq_idx = field_idx + 1 - - # this should be True when we've reached the right element - if field_idx == refseq_idx: - refseq_id = field.getText() - # if it has any spaces, take everything up to first one (some have annotations afterwards) - if refseq_id.find(" ") != -1: - refseq_id = refseq_id[: refseq_id.find(" ")] - - return str(refseq_id) - - if refseq_idx == -1: - raise Exception("Expected HTML elements not found") - except Exception as e: - logger.warning(f"Failed resolving GenBank accession {genbank_id}, error {e}") + resp = httpx.get( + url, 
headers={"User-Agent": USER_AGENT}, timeout=10.0, follow_redirects=True + ) + if resp.status_code == httpx.codes.OK: + data = resp.json() + latest_entry = max( + (entry for entry in data["assembly_revisions"] if "refseq_accession" in entry), + key=lambda x: x["release_date"], + ) + refseq_id = latest_entry["refseq_accession"] + except httpx.ReadTimeout: + logger.warning("Timed out waiting for result of GenBank assembly lookup") - return "" + return refseq_id def _resolve_jgi_accession(jgi_id: str) -> str: @@ -344,7 +299,7 @@ def _resolve_jgi_accession(jgi_id: str) -> str: jgi_id: JGI_Genome_ID for GenBank accession. Returns: - RefSeq ID if search is successful, otherwise None. + RefSeq ID if search is successful, otherwise an empty string. """ url = JGI_GENOME_LOOKUP_URL.format(jgi_id) logger.info(f"Attempting to resolve JGI_Genome_ID {jgi_id} to GenBank accession via {url}") @@ -358,12 +313,17 @@ def _resolve_jgi_accession(jgi_id: str) -> str: return "" soup = BeautifulSoup(resp.content, "html.parser") - # find the table entry giving the NCBI assembly accession ID - link = soup.find("a", href=re.compile("https://www.ncbi.nlm.nih.gov/nuccore/.*")) + # Find the table entry giving the "NCBI Assembly Accession" ID + link = soup.find("a", href=re.compile("https://www.ncbi.nlm.nih.gov/datasets/genome/.*")) if link is None: return "" - return _resolve_genbank_accession(link.text) + assembly_id = link.text + # check if the assembly ID is already a RefSeq ID + if assembly_id.startswith("GCF_"): + return assembly_id + else: + return _resolve_genbank_accession(assembly_id) def _resolve_refseq_id(genome_id_data: Mapping[str, str]) -> str: diff --git a/src/nplinker/genomics/mibig/mibig_downloader.py b/src/nplinker/genomics/mibig/mibig_downloader.py index 9b7d1d530..f063a7cb7 100644 --- a/src/nplinker/genomics/mibig/mibig_downloader.py +++ b/src/nplinker/genomics/mibig/mibig_downloader.py @@ -21,6 +21,7 @@ "2.0": "843ce4677db6d11422f0e6d94dd03e81", "3.0": 
"7c38b90f939086c03392d99a913baef9", "3.1": "643d1349722a9437d8dcf558dac5f815", + "4.0": "70d1e7d573652ba62548b1fcfbdbf844", } @@ -31,6 +32,8 @@ def download_and_extract_mibig_metadata( ): """Download and extract MIBiG metadata json files. + The MIBiG metadata json files are available at https://mibig.secondarymetabolites.org/download. + Note that it does not matter whether the metadata json files are in nested folders or not in the archive, all json files will be extracted to the same location, i.e. `extract_path`. The nested folders will be removed if they exist. So the `extract_path` will have only json files. @@ -39,7 +42,7 @@ def download_and_extract_mibig_metadata( download_root: Path to the directory in which to place the downloaded archive. extract_path: Path to an empty directory where the json files will be extracted. The directory must be empty if it exists. If it doesn't exist, the directory will be created. - version: _description_. Defaults to "3.1". + version: MIBiG version. Defaults to "3.1". Examples: >>> download_and_extract_mibig_metadata("/data/download", "/data/mibig_metadata") @@ -58,7 +61,7 @@ def download_and_extract_mibig_metadata( raise ValueError(f'Nonempty directory: "{extract_path}"') # download and extract - md5 = _MD5_MIBIG_METADATA[version] + md5 = _MD5_MIBIG_METADATA.get(version, None) download_and_extract_archive( url=MIBIG_METADATA_URL.format(version=version), download_root=download_root, diff --git a/src/nplinker/genomics/mibig/mibig_metadata.py b/src/nplinker/genomics/mibig/mibig_metadata.py index 84c2ae336..a8e06fa16 100644 --- a/src/nplinker/genomics/mibig/mibig_metadata.py +++ b/src/nplinker/genomics/mibig/mibig_metadata.py @@ -9,6 +9,8 @@ class MibigMetadata: MIBiG is a specification of BGC metadata and use JSON schema to represent BGC metadata. More details see: https://mibig.secondarymetabolites.org/download. + + This class supports MIBiG version 1.0 to 4.0. 
""" def __init__(self, file: str | PathLike) -> None: @@ -37,22 +39,37 @@ def mibig_accession(self) -> str: def biosyn_class(self) -> tuple[str]: """Get the value of metadata item 'biosyn_class'. - The 'biosyn_class' is biosynthetic class(es), namely the type of - natural product or secondary metabolite. + The 'biosyn_class' is biosynthetic class(es) defined by MIBiG. - MIBiG defines 6 major biosynthetic classes for natural products, + Before version 4.0 of MIBiG, it defines 6 major biosynthetic classes, including `NRP`, `Polyketide`, `RiPP`, `Terpene`, `Saccharide` - and `Alkaloid`. Note that natural products created by the other - biosynthetic mechanisms fall under the category `Other`. For more details - see [the paper](https://doi.org/10.1186/s40793-018-0318-y). + and `Alkaloid`. + + Starting from version 4.0, MIBiG defines 5 major biosynthetic classes, + including `PKS`, `NRPS`, `Ribosomal`, `Terpene` and `Saccharide`. + + The mapping between the old and new classes is as follows: + + - `NRP` -> `NRPS` + - `Polyketide` -> `PKS` + - `RiPP` -> `Ribosomal` + - `Terpene` -> `Terpene` + - `Saccharide` -> `Saccharide` + - `Alkaloid` -> `Other` + + Note that natural products that do not fit into any of the above + biosynthetic classes fall under the category `Other`. 
""" return self._biosyn_class def _parse_metadata(self) -> None: """Parse metadata to get 'mibig_accession' and 'biosyn_class' values.""" - if "general_params" in self.metadata: + if "general_params" in self.metadata: # version ≤1.4 self._mibig_accession = self.metadata["general_params"]["mibig_accession"] self._biosyn_class = tuple(self.metadata["general_params"]["biosyn_class"]) - else: # version≥2.0 + elif "cluster" in self.metadata: # version ≥2.0 and <4.0 self._mibig_accession = self.metadata["cluster"]["mibig_accession"] self._biosyn_class = tuple(self.metadata["cluster"]["biosyn_class"]) + elif "version" in self.metadata: # version≥4.0 + self._mibig_accession = self.metadata["accession"] + self._biosyn_class = tuple(i["class"] for i in self.metadata["biosynthesis"]["classes"]) diff --git a/src/nplinker/loader.py b/src/nplinker/loader.py index 8d2bcd189..7c0e5eaab 100644 --- a/src/nplinker/loader.py +++ b/src/nplinker/loader.py @@ -206,24 +206,20 @@ def _load_genomics(self): all_bgcs_with_strain = antismash_bgcs_with_strain + self.mibig_bgcs # Step 4: load all GCF objects - bigscape_cluster_file = ( - self.config.root_dir - / defaults.BIGSCAPE_DIRNAME - / f"mix_clustering_c{self.config.bigscape.cutoff}.tsv" - ) - bigscape_db_file = self.config.root_dir / defaults.BIGSCAPE_DIRNAME / "data_sqlite.db" - - # switch depending on found file. 
prefer V1 if both are found - if bigscape_cluster_file.exists(): + if self.config.bigscape.version == 1: + bigscape_cluster_file = ( + self.config.root_dir + / defaults.BIGSCAPE_DIRNAME + / f"mix_clustering_c{self.config.bigscape.cutoff}.tsv" + ) loader = BigscapeGCFLoader(bigscape_cluster_file) logger.info(f"Loading BigSCAPE cluster file {bigscape_cluster_file}") - elif bigscape_db_file.exists(): + elif self.config.bigscape.version == 2: + bigscape_db_file = self.config.root_dir / defaults.BIGSCAPE_DIRNAME / "data_sqlite.db" loader = BigscapeV2GCFLoader(bigscape_db_file) logger.info(f"Loading BigSCAPE database file {bigscape_db_file}") else: - raise FileNotFoundError( - f"Neither BigSCAPE cluster file {bigscape_cluster_file} nor database file {bigscape_db_file} were found." - ) + raise ValueError(f"Unsupported BigScape version: {self.config.bigscape.version}") raw_gcfs = loader.get_gcfs() diff --git a/src/nplinker/metabolomics/gnps/gnps_spectrum_loader.py b/src/nplinker/metabolomics/gnps/gnps_spectrum_loader.py index 9d7eff8bd..13d259686 100644 --- a/src/nplinker/metabolomics/gnps/gnps_spectrum_loader.py +++ b/src/nplinker/metabolomics/gnps/gnps_spectrum_loader.py @@ -85,10 +85,10 @@ def _load(self) -> None: # Load the spectrum spectrum_id: str = spec["params"]["scans"] - # calculate precursor m/z from precursor mass and charge - precursor_mass = spec["params"]["pepmass"][0] - precursor_charge = self._get_precursor_charge(spec["params"]["charge"]) - precursor_mz: float = precursor_mass / abs(precursor_charge) + # The pepmass in an mgf file is actually the m/z and not the peptide mass + # See: https://www.matrixscience.com/help/obsolete_data_file_formats.html + precursor_mz: float = spec["params"]["pepmass"][0] + precursor_charge: int = spec["params"]["charge"][0] rt = spec["params"].get("rtinseconds", 0) spectrum = Spectrum( @@ -96,25 +96,8 @@ def _load(self) -> None: mz=list(spec["m/z array"]), intensity=list(spec["intensity array"]), precursor_mz=precursor_mz, + 
precursor_charge=precursor_charge, rt=rt, metadata=spec["params"], ) self._spectra.append(spectrum) - - def _get_precursor_charge(self, charges: list[int]) -> int: - """Get the precursor charge from the charge list. - - Args: - charges: list of charge values. - - Returns: - the precursor charge. - """ - charge = charges[0] - if charge == 0: - logger.warning( - f"Invalid precursor charge value 0. " - f"Assuming charge is 1 for spectrum '{self._file}'." - ) - charge = 1 - return charge diff --git a/src/nplinker/metabolomics/spectrum.py b/src/nplinker/metabolomics/spectrum.py index 6928a4f1c..af4a401c8 100644 --- a/src/nplinker/metabolomics/spectrum.py +++ b/src/nplinker/metabolomics/spectrum.py @@ -18,10 +18,10 @@ class Spectrum: id: the spectrum ID. mz: the list of m/z values. intensity: the list of intensity values. - precursor_mz: the m/z value of the precursor. + precursor_mz: the m/z value of the precursor ion. + precursor_charge: the charge of the precursor ion. rt: the retention time in seconds. - metadata: the metadata of the spectrum, i.e. the header information in the MGF - file. + metadata: the metadata of the spectrum, i.e. the header information in the MGF file. gnps_annotations: the GNPS annotations of the spectrum. gnps_id: the GNPS ID of the spectrum. strains: the strains that this spectrum belongs to. @@ -35,6 +35,7 @@ def __init__( mz: list[float], intensity: list[float], precursor_mz: float, + precursor_charge: int, rt: float = 0, metadata: dict | None = None, ) -> None: @@ -45,6 +46,7 @@ def __init__( mz: the list of m/z values. intensity: the list of intensity values. precursor_mz: the precursor m/z. + precursor_charge: the charge of the precursor ion. rt: the retention time in seconds. Defaults to 0. metadata: the metadata of the spectrum, i.e. the header information in the MGF file. 
@@ -53,6 +55,7 @@ def __init__( self.mz = mz self.intensity = intensity self.precursor_mz = precursor_mz + self.precursor_charge = precursor_charge self.rt = rt self.metadata = metadata or {} @@ -79,7 +82,15 @@ def __reduce__(self) -> tuple: """Reduce function for pickling.""" return ( self.__class__, - (self.id, self.mz, self.intensity, self.precursor_mz, self.rt, self.metadata), + ( + self.id, + self.mz, + self.intensity, + self.precursor_mz, + self.precursor_charge, + self.rt, + self.metadata, + ), self.__dict__, ) diff --git a/src/nplinker/nplinker_default.toml b/src/nplinker/nplinker_default.toml deleted file mode 100644 index 390800366..000000000 --- a/src/nplinker/nplinker_default.toml +++ /dev/null @@ -1,17 +0,0 @@ -# NPLinker default configurations - -[log] -level = "INFO" -use_console = true - -[mibig] -to_use = true -version = "3.1" - -[bigscape] -version = 1 -parameters = "--mibig --clans-off --mix --include_singletons --cutoffs 0.30" -cutoff = "0.30" - -[scoring] -methods = ["metcalf"] diff --git a/tests/integration/data/nplinker_local_mode.toml b/tests/integration/data/nplinker_local_mode.toml index 174ee852e..f578c8716 100644 --- a/tests/integration/data/nplinker_local_mode.toml +++ b/tests/integration/data/nplinker_local_mode.toml @@ -1,7 +1,17 @@ -dynaconf_merge = true # merge with the default settings, provided by the Dynaconf library - root_dir = "@format {env[NPLINKER_ROOT_DIR]}" mode = "local" [log] level = "DEBUG" +use_console = true + +[mibig] +to_use = true +version = "3.1" + +[bigscape] +version = 1 +cutoff = "0.30" + +[scoring] +methods = ["metcalf"] \ No newline at end of file diff --git a/tests/unit/data/mibig/BGC0000001_v4.0.json b/tests/unit/data/mibig/BGC0000001_v4.0.json new file mode 100644 index 000000000..958f613a8 --- /dev/null +++ b/tests/unit/data/mibig/BGC0000001_v4.0.json @@ -0,0 +1,603 @@ +{ + "accession": "BGC0000001", + "version": 5, + "changelog": { + "releases": [ + { + "version": "1", + "entries": [ + { + 
"contributors": [ + "M2JBIQNJAZIP5YVYS7CZLALR" + ], + "reviewers": [ + "AAAAAAAAAAAAAAAAAAAAAAAA" + ], + "date": "2015-06-12", + "comment": "Submitted" + } + ], + "date": "2015-06-12" + }, + { + "version": "2", + "entries": [ + { + "contributors": [ + "AAAAAAAAAAAAAAAAAAAAAAAA" + ], + "reviewers": [ + "AAAAAAAAAAAAAAAAAAAAAAAA" + ], + "date": "2019-10-16", + "comment": "Migrated from v1.4" + }, + { + "contributors": [ + "AAAAAAAAAAAAAAAAAAAAAAAA" + ], + "reviewers": [ + "AAAAAAAAAAAAAAAAAAAAAAAA" + ], + "date": "2019-10-16", + "comment": "Updated compound(s) information (MIBiG Annotathon)" + }, + { + "contributors": [ + "AAAAAAAAAAAAAAAAAAAAAAAA" + ], + "reviewers": [ + "AAAAAAAAAAAAAAAAAAAAAAAA" + ], + "date": "2019-10-16", + "comment": "Updated compound(s) information (NPAtlas curation)" + } + ], + "date": "2019-10-16" + }, + { + "version": "3", + "entries": [ + { + "contributors": [ + "AAAAAAAAAAAAAAAAAAAAAAAA" + ], + "reviewers": [ + "AAAAAAAAAAAAAAAAAAAAAAAA" + ], + "date": "2022-09-15", + "comment": "Removed ketoreductase stereochemistry annotation from modules without ketoreductases" + }, + { + "contributors": [ + "AAAAAAAAAAAAAAAAAAAAAAAA" + ], + "reviewers": [ + "AAAAAAAAAAAAAAAAAAAAAAAA" + ], + "date": "2022-09-15", + "comment": "Sorted modules by module number" + } + ], + "date": "2022-09-15" + }, + { + "version": "4", + "entries": [ + { + "contributors": [ + "AAAAAAAAAAAAAAAAAAAAAAAA" + ], + "reviewers": [ + "AAAAAAAAAAAAAAAAAAAAAAAA" + ], + "date": "2022-10-07", + "comment": "Update chemical activity to schema version 2.11" + } + ], + "date": "2022-10-07" + }, + { + "version": "5", + "entries": [ + { + "contributors": [ + "IQDGAEIXNOAWG3AVLVJH6HBZ" + ], + "reviewers": [ + "AAAAAAAAAAAAAAAAAAAAAAAA" + ], + "date": "2024-04-23", + "comment": "MIBiG v4 annotathon" + } + ], + "date": "2024-11-15" + } + ] + }, + "quality": "questionable", + "status": "active", + "completeness": "complete", + "loci": [ + { + "accession": "JF752342.1", + "location": { + 
"from": 0, + "to": 0 + }, + "evidence": [ + { + "method": "Knock-out studies" + } + ] + } + ], + "biosynthesis": { + "classes": [ + { + "class": "PKS", + "subclass": "Type I", + "cyclases": [] + } + ], + "modules": [ + { + "type": "pks-modular", + "name": "1", + "genes": [ + "AEK75502.1" + ], + "active": true, + "carriers": [ + { + "type": "carrier", + "gene": "AEK75502.1", + "location": { + "from": -1, + "to": -1 + }, + "subtype": "ACP", + "beta_branching": false + } + ], + "at_domain": { + "type": "acyltransferase", + "gene": "AEK75502.1", + "location": { + "from": -1, + "to": -1 + }, + "substrates": [], + "evidence": [] + }, + "ks_domain": { + "type": "ketosynthase", + "gene": "AEK75502.1", + "location": { + "from": -1, + "to": -1 + } + } + }, + { + "type": "pks-modular", + "name": "2", + "genes": [ + "AEK75502.1" + ], + "active": true, + "carriers": [ + { + "type": "carrier", + "gene": "AEK75502.1", + "location": { + "from": -1, + "to": -1 + }, + "subtype": "ACP", + "beta_branching": false + } + ], + "modification_domains": [ + { + "type": "ketoreductase", + "gene": "AEK75502.1", + "location": { + "from": -1, + "to": -1 + } + }, + { + "type": "dehydratase", + "gene": "AEK75502.1", + "location": { + "from": -1, + "to": -1 + } + } + ], + "at_domain": { + "type": "acyltransferase", + "gene": "AEK75502.1", + "location": { + "from": -1, + "to": -1 + }, + "substrates": [], + "evidence": [] + }, + "ks_domain": { + "type": "ketosynthase", + "gene": "AEK75502.1", + "location": { + "from": -1, + "to": -1 + } + } + }, + { + "type": "pks-modular", + "name": "3", + "genes": [ + "AEK75502.1" + ], + "active": true, + "carriers": [ + { + "type": "carrier", + "gene": "AEK75502.1", + "location": { + "from": -1, + "to": -1 + }, + "subtype": "ACP", + "beta_branching": false + } + ], + "modification_domains": [ + { + "type": "ketoreductase", + "gene": "AEK75502.1", + "location": { + "from": -1, + "to": -1 + } + }, + { + "type": "dehydratase", + "gene": "AEK75502.1", + "location": { 
+ "from": -1, + "to": -1 + } + } + ], + "at_domain": { + "type": "acyltransferase", + "gene": "AEK75502.1", + "location": { + "from": -1, + "to": -1 + }, + "substrates": [], + "evidence": [] + }, + "ks_domain": { + "type": "ketosynthase", + "gene": "AEK75502.1", + "location": { + "from": -1, + "to": -1 + } + } + }, + { + "type": "pks-modular", + "name": "4", + "genes": [ + "AEK75502.1" + ], + "active": true, + "carriers": [ + { + "type": "carrier", + "gene": "AEK75502.1", + "location": { + "from": -1, + "to": -1 + }, + "subtype": "ACP", + "beta_branching": false + } + ], + "modification_domains": [ + { + "type": "ketoreductase", + "gene": "AEK75502.1", + "location": { + "from": -1, + "to": -1 + } + }, + { + "type": "dehydratase", + "gene": "AEK75502.1", + "location": { + "from": -1, + "to": -1 + } + } + ], + "at_domain": { + "type": "acyltransferase", + "gene": "AEK75502.1", + "location": { + "from": -1, + "to": -1 + }, + "substrates": [], + "evidence": [] + }, + "ks_domain": { + "type": "ketosynthase", + "gene": "AEK75502.1", + "location": { + "from": -1, + "to": -1 + } + } + }, + { + "type": "pks-modular", + "name": "5", + "genes": [ + "AEK75503.1" + ], + "active": true, + "carriers": [ + { + "type": "carrier", + "gene": "AEK75503.1", + "location": { + "from": -1, + "to": -1 + }, + "subtype": "ACP", + "beta_branching": false + } + ], + "at_domain": { + "type": "acyltransferase", + "gene": "AEK75503.1", + "location": { + "from": -1, + "to": -1 + }, + "substrates": [], + "evidence": [] + }, + "ks_domain": { + "type": "ketosynthase", + "gene": "AEK75503.1", + "location": { + "from": -1, + "to": -1 + } + } + }, + { + "type": "pks-modular", + "name": "6", + "genes": [ + "AEK75503.1" + ], + "active": true, + "carriers": [ + { + "type": "carrier", + "gene": "AEK75503.1", + "location": { + "from": -1, + "to": -1 + }, + "subtype": "ACP", + "beta_branching": false + } + ], + "modification_domains": [ + { + "type": "ketoreductase", + "gene": "AEK75503.1", + "location": { + 
"from": -1, + "to": -1 + } + }, + { + "type": "dehydratase", + "gene": "AEK75503.1", + "location": { + "from": -1, + "to": -1 + } + }, + { + "type": "enoylreductase", + "gene": "AEK75503.1", + "location": { + "from": -1, + "to": -1 + } + } + ], + "at_domain": { + "type": "acyltransferase", + "gene": "AEK75503.1", + "location": { + "from": -1, + "to": -1 + }, + "substrates": [], + "evidence": [] + }, + "ks_domain": { + "type": "ketosynthase", + "gene": "AEK75503.1", + "location": { + "from": -1, + "to": -1 + } + } + }, + { + "type": "pks-modular", + "name": "7", + "genes": [ + "AEK75504.1" + ], + "active": true, + "carriers": [ + { + "type": "carrier", + "gene": "AEK75504.1", + "location": { + "from": -1, + "to": -1 + }, + "subtype": "ACP", + "beta_branching": false + } + ], + "at_domain": { + "type": "acyltransferase", + "gene": "AEK75504.1", + "location": { + "from": -1, + "to": -1 + }, + "substrates": [], + "evidence": [] + }, + "ks_domain": { + "type": "ketosynthase", + "gene": "AEK75504.1", + "location": { + "from": -1, + "to": -1 + } + } + } + ] + }, + "compounds": [ + { + "name": "abyssomicin C", + "evidence": [], + "bioactivities": [ + { + "name": "antibacterial", + "observed": true, + "references": [] + }, + { + "name": "cytotoxic", + "observed": true, + "references": [] + } + ], + "structure": "CC1C[C@]23OC(=O)C4=C2OC1C(O)C3\\C=C/C(=O)[C@@H](C)C[C@@H](C)C4=O", + "databaseIds": [ + "npatlas:NPA01961", + "pubchem:71455791", + "chembl:CHEMBL1651097" + ], + "moieties": [ + "Tetronate" + ], + "mass": 346.141638424, + "formula": "C19H22O6" + }, + { + "name": "atrop-abyssomicin C", + "evidence": [], + "bioactivities": [ + { + "name": "antibacterial", + "observed": true, + "references": [] + }, + { + "name": "cytotoxic", + "observed": true, + "references": [] + } + ], + "structure": "CC1CC23OC(=O)C4=C2OC1C(O)C3\\C=C/C(=O)C(C)CC(C)C4=O", + "databaseIds": [ + "npatlas:NPA018239", + "chemspider:19955692" + ], + "moieties": [ + "Tetronate" + ], + "mass": 
346.141638424, + "formula": "C19H22O6" + } + ], + "taxonomy": { + "name": "Verrucosispora maris AB-18-032", + "ncbiTaxId": 263358 + }, + "genes": { + "to_add": [ + { + "id": "abyU", + "location": { + "exons": [ + { + "from": 7760, + "to": 8182 + } + ], + "strand": -1 + }, + "translation": "MTERLETRPQALLIKVPTEIVVKVVDDVDVAAPAVGQVGKFDDELYDEAGAQIGTSSGNFRIEYVRPTDGGLLTYYQEDITLSDGVIHAEGWADFNDVRTSKWVFYPATGVSGRYLGLTGFRQWRMTGVRKSAEARILLGE" + } + ], + "annotations": [ + { + "id": "AEK75497.1", + "name": "abyA1", + "product": "3-oxoacyl-ACP synthase III" + }, + { + "id": "AEK75498.1", + "name": "abyA2", + "product": "phosphatase and glyceryl transferase" + }, + { + "id": "AEK75499.1", + "name": "abyA3", + "product": "acyl-carrier protein" + }, + { + "id": "AEK75500.1", + "name": "abyA4", + "product": "pyruvate/2-oxoglutarate dehydrogenase" + }, + { + "id": "abyU", + "name": "abyU", + "product": "Diels-Alderase" + }, + { + "id": "AEK75512.1", + "name": "abyV", + "product": "cytochrome P450" + } + ] + }, + "legacy_references": [ + "pubmed:21656887" + ] +} \ No newline at end of file diff --git a/tests/unit/data/nplinker_local_mode.toml b/tests/unit/data/nplinker_local_mode.toml index 174ee852e..f578c8716 100644 --- a/tests/unit/data/nplinker_local_mode.toml +++ b/tests/unit/data/nplinker_local_mode.toml @@ -1,7 +1,17 @@ -dynaconf_merge = true # merge with the default settings, provided by the Dynaconf library - root_dir = "@format {env[NPLINKER_ROOT_DIR]}" mode = "local" [log] level = "DEBUG" +use_console = true + +[mibig] +to_use = true +version = "3.1" + +[bigscape] +version = 1 +cutoff = "0.30" + +[scoring] +methods = ["metcalf"] \ No newline at end of file diff --git a/tests/unit/data/strain_mappings.json b/tests/unit/data/strain_mappings.json deleted file mode 100644 index 49d2e0557..000000000 --- a/tests/unit/data/strain_mappings.json +++ /dev/null @@ -1,447 +0,0 @@ -{ - "strain_mappings": [ - { - "strain_id": "Salinispora arenicola CNB527", - "strain_alias": [ - 
"42b.mzXML" - ] - }, - { - "strain_id": "Salinispora arenicola CNT005", - "strain_alias": [ - "NZ_AZWU01000037", - "NZ_KI911492", - "NZ_AZWU01000047", - "GCF_000514895", - "NZ_AZWU01000018", - "NZ_AZWU01000036", - "NZ_AZWU01000007", - "GCF_000514895.1", - "NZ_AZWU01000017", - "NZ_AZWU01000052", - "NZ_AZWU01000030", - "NZ_KI911493", - "NZ_AZWU01000054", - "NZ_AZWU01000031", - "NZ_KI911490", - "NZ_AZWU01000064", - "NZ_AZWU01000029", - "NZ_AZWU01000023", - "NZ_AZWU01000020", - "NZ_AZWU01000039", - "NZ_AZWU01000019", - "12c.mzXML", - "NZ_AZWU01000022", - "NZ_KI911494", - "NZ_AZWU01000045", - "12b.mzXML", - "NZ_AZWU01000021", - "NZ_AZWU01000038", - "NZ_AZWU01000046" - ] - }, - { - "strain_id": "Salinispora arenicola CNS205", - "strain_alias": [ - "NC_009953", - "13a.mzXML", - "GCF_000018265", - "13b.mzXML", - "GCF_000018265.1" - ] - }, - { - "strain_id": "Salinispora arenicola CNH646", - "strain_alias": [ - "NZ_AZWH01000027", - "NZ_AZWH01000004", - "NZ_AZWH01000050", - "NZ_AZWH01000017", - "NZ_AZWH01000001", - "NZ_AZWH01000023", - "NZ_AZWH01000010", - "GCF_000514635", - "NZ_AZWH01000005", - "NZ_AZWH01000022", - "NZ_AZWH01000015", - "GCF_000514635.1", - "15a.mzXML", - "NZ_AZWH01000048", - "15b.mzXML", - "NZ_AZWH01000019", - "NZ_AZWH01000008", - "NZ_AZWH01000009", - "NZ_AZWH01000024", - "NZ_AZWH01000035", - "NZ_AZWH01000040", - "NZ_AZWH01000016", - "NZ_AZWH01000014" - ] - }, - { - "strain_id": "Salinispora arenicola CNQ748", - "strain_alias": [ - "NZ_AZWY01000070", - "NZ_AZWY01000017", - "NZ_AZWY01000050", - "NZ_AZWY01000016", - "NZ_KI911506", - "NZ_AZWY01000007", - "GCF_000514975.1", - "NZ_AZWY01000010", - "GCF_000514975", - "NZ_AZWY01000055", - "NZ_AZWY01000025", - "NZ_AZWY01000034", - "NZ_AZWY01000041", - "16b.mzXML", - "NZ_AZWY01000063", - "NZ_AZWY01000045", - "NZ_AZWY01000048", - "NZ_AZWY01000058", - "NZ_AZWY01000020", - "NZ_AZWY01000032", - "NZ_AZWY01000006", - "NZ_AZWY01000021", - "NZ_AZWY01000040", - "NZ_AZWY01000011", - "NZ_AZWY01000023", - "NZ_AZWY01000009", - 
"NZ_AZWY01000053", - "NZ_AZWY01000008", - "NZ_AZWY01000005", - "16a.mzXML", - "NZ_AZWY01000019" - ] - }, - { - "strain_id": "Salinispora arenicola CNT849", - "strain_alias": [ - "GCF_000373825.1", - "NZ_KB892511", - "NZ_KB892473", - "NZ_KB892499", - "NZ_KB892514", - "NZ_KB892480", - "17a.mzXML", - "GCF_000373825", - "NZ_KB892479", - "NZ_KB892482", - "NZ_KB892490", - "NZ_KB892476", - "NZ_KB892481", - "NZ_KB892500", - "NZ_KB892485", - "NZ_KB892505", - "NZ_KB892497", - "NZ_KB892501", - "NZ_KB892477", - "NZ_KB892494", - "NZ_KB892475", - "NZ_KB892483", - "17b.mzXML", - "NZ_KB892478", - "NZ_KB892474", - "NZ_KB892507" - ] - }, - { - "strain_id": "Salinispora arenicola CNP193", - "strain_alias": [ - "18a.mzXML", - "18b.mzXML" - ] - }, - { - "strain_id": "Salinispora arenicola CNX508", - "strain_alias": [ - "NZ_AZWT01000002", - "NZ_AZWT01000016", - "NZ_AZWT01000034", - "NZ_AZWT01000032", - "NZ_AZWT01000018", - "GCF_000514875", - "NZ_AZWT01000008", - "NZ_AZWT01000036", - "NZ_AZWT01000004", - "NZ_AZWT01000027", - "NZ_AZWT01000057", - "NZ_AZWT01000001", - "NZ_AZWT01000015", - "NZ_AZWT01000003", - "NZ_AZWT01000009", - "NZ_AZWT01000059", - "NZ_AZWT01000021", - "20a.mzXML", - "NZ_AZWT01000005", - "GCF_000514875.1", - "NZ_AZWT01000010", - "NZ_AZWT01000035", - "NZ_AZWT01000080", - "NZ_AZWT01000047", - "NZ_AZWT01000020" - ] - }, - { - "strain_id": "Salinispora pacifica CNR114", - "strain_alias": [ - "9b.mzXML", - "NZ_AZWO01000060", - "NZ_AZWO01000029", - "GCF_000514775.1", - "NZ_AZWO01000016", - "NZ_AZWO01000034", - "NZ_AZWO01000025", - "NZ_AZWO01000036", - "NZ_AZWO01000028", - "NZ_AZWO01000053", - "NZ_AZWO01000038", - "NZ_AZWO01000072", - "NZ_AZWO01000011", - "NZ_AZWO01000056", - "NZ_AZWO01000007", - "NZ_AZWO01000061", - "GCF_000514775", - "NZ_KI911467", - "NZ_AZWO01000019", - "NZ_AZWO01000008", - "NZ_AZWO01000004", - "NZ_AZWO01000048", - "9a.mzXML", - "NZ_AZWO01000014", - "NZ_KI911468", - "NZ_AZWO01000027" - ] - }, - { - "strain_id": "Salinispora pacifica CNR894", - 
"strain_alias": [ - "41a.mzXML" - ] - }, - { - "strain_id": "Salinispora pacifica CNR942", - "strain_alias": [ - "NZ_KB894861", - "NZ_KB894873", - "NZ_KB894848", - "NZ_KB894883", - "NZ_KB894871", - "NZ_KB894851", - "NZ_KB894875", - "NZ_KB894863", - "GCF_000374665.1", - "34c.mzXML", - "GCF_000374665", - "34b.mzXML", - "NZ_KB894878", - "NZ_KB894866", - "NZ_KB894859", - "NZ_KB894850", - "NZ_KB894860", - "NZ_KB894857", - "NZ_KB894856", - "NZ_KB894854", - "NZ_KB894865" - ] - }, - { - "strain_id": "Salinispora pacifica CNS055", - "strain_alias": [ - "NZ_KB894967", - "NZ_KB894982", - "11a.mzXML", - "NZ_KB894966", - "NZ_KB895001", - "NZ_KB894969", - "NZ_KB894963", - "NZ_KB894993", - "NZ_KB894962", - "NZ_KB894970", - "NZ_KB895000", - "NZ_KB894991", - "NZ_KB894971", - "NZ_KB894979", - "NZ_KB894997", - "11c.mzXML", - "GCF_000374685", - "NZ_KB894980", - "NZ_KB895006", - "NZ_KB894992", - "GCF_000374685.1" - ] - }, - { - "strain_id": "Salinispora pacifica CNS237", - "strain_alias": [ - "NZ_AUGH01000017", - "8a.mzXML", - "GCF_000424905.1", - "NZ_KE384268", - "NZ_AUGH01000011", - "NZ_AUGH01000032", - "NZ_AUGH01000028", - "NZ_KE384271", - "8b.mzXML", - "NZ_AUGH01000045", - "NZ_AUGH01000035", - "NZ_KE384269", - "NZ_AUGH01000027", - "NZ_AUGH01000015", - "NZ_KE384272", - "GCF_000424905", - "NZ_AUGH01000030", - "NZ_AUGH01000019" - ] - }, - { - "strain_id": "Salinispora pacifica CNS863", - "strain_alias": [ - "22a.mzXML", - "22b.mzXML" - ] - }, - { - "strain_id": "Salinispora pacifica CNT003", - "strain_alias": [ - "26c.mzXML", - "26a.mzXML" - ] - }, - { - "strain_id": "Salinispora pacifica CNT029", - "strain_alias": [ - "NZ_AZWB01000024", - "GCF_000514515.1", - "NZ_AZWB01000013", - "NZ_AZWB01000012", - "NZ_AZWB01000006", - "NZ_AZWB01000016", - "NZ_KI911413", - "NZ_AZWB01000005", - "NZ_KI911416", - "NZ_KI911414", - "NZ_KI911412", - "27a.mzXML", - "GCF_000514515", - "27b.mzXML" - ] - }, - { - "strain_id": "Salinispora pacifica CNT138", - "strain_alias": [ - "19a.mzXML", - "19c.mzXML" - ] 
- }, - { - "strain_id": "Salinispora pacifica CNT148", - "strain_alias": [ - "25c.mzXML", - "25b.mzXML" - ] - }, - { - "strain_id": "Salinispora pacifica CNT150", - "strain_alias": [ - "23a.mzXML" - ] - }, - { - "strain_id": "Salinispora pacifica CNT851", - "strain_alias": [ - "35b.mzXML", - "35a.mzXML" - ] - }, - { - "strain_id": "Salinispora pacifica CNT855", - "strain_alias": [ - "NZ_AZWS01000028", - "NZ_AZWS01000004", - "29a.mzXML", - "NZ_KI911483", - "NZ_AZWS01000040", - "NZ_AZWS01000038", - "NZ_AZWS01000048", - "NZ_AZWS01000025", - "GCF_000514855", - "NZ_AZWS01000012", - "NZ_AZWS01000024", - "NZ_AZWS01000036", - "NZ_AZWS01000051", - "GCF_000514855.1", - "NZ_AZWS01000008", - "NZ_AZWS01000001", - "NZ_AZWS01000014", - "NZ_AZWS01000053", - "NZ_AZWS01000016", - "NZ_AZWS01000037", - "NZ_AZWS01000015", - "NZ_AZWS01000062", - "NZ_AZWS01000013", - "NZ_AZWS01000034" - ] - }, - { - "strain_id": "Salinispora pacifica CNY202", - "strain_alias": [ - "7a.mzXML", - "7b.mzXML" - ] - }, - { - "strain_id": "Salinispora pacifica CNY330", - "strain_alias": [ - "10b.mzXML" - ] - }, - { - "strain_id": "Salinispora tropica CNB536", - "strain_alias": [ - "2a.mzXML", - "2b.mzXML" - ] - }, - { - "strain_id": "Salinispora tropica CNY012", - "strain_alias": [ - "5a.mzXML", - "5b.mzXML" - ] - }, - { - "strain_id": "Salinispora tropica CNS197", - "strain_alias": [ - "NZ_AZWK01000006", - "NZ_AZWK01000009", - "36a.mzXML", - "NZ_AZWK01000061", - "NZ_KI911452", - "NZ_AZWK01000027", - "NZ_AZWK01000038", - "GCF_000514695.1", - "NZ_AZWK01000002", - "NZ_KI911456", - "NZ_AZWK01000021", - "GCF_000514695", - "36b.mzXML", - "NZ_AZWK01000022", - "NZ_AZWK01000003", - "NZ_AZWK01000019", - "NZ_KI911458" - ] - }, - { - "strain_id": "Salinispora tropica CNB440", - "strain_alias": [ - "38b.mzXML", - "38a.mzXML", - "GCF_000016425", - "NC_009380", - "GCF_000016425.1" - ] - } - ], - "version": "1.0" -} diff --git a/tests/unit/genomics/test_mibig_downloader.py b/tests/unit/genomics/test_mibig_downloader.py index 
8dfdb2646..a83e30e2e 100644 --- a/tests/unit/genomics/test_mibig_downloader.py +++ b/tests/unit/genomics/test_mibig_downloader.py @@ -3,26 +3,21 @@ class TestDownloadAndExtractMibigMetadata: - def test_default(self, tmp_path): + @pytest.mark.parametrize( + "version, expected", + [ + ["1.4", "mibig_json_1.4.tar.gz"], + ["3.1", "mibig_json_3.1.tar.gz"], + ["4.0", "mibig_json_4.0.tar.gz"], + ], + ) + def test_version(self, tmp_path, version, expected): download_path = tmp_path / "download" extract_path = tmp_path / "metadata" download_path.mkdir() extract_path.mkdir() - mibig.download_and_extract_mibig_metadata(download_path, extract_path) - archive = download_path / "mibig_json_3.1.tar.gz" - metadata = extract_path / "BGC0000002.json" - assert archive.exists() - assert archive.is_file() - assert metadata.exists() - assert metadata.is_file() - - def test_version(self, tmp_path): - download_path = tmp_path / "download" - extract_path = tmp_path / "metadata" - download_path.mkdir() - extract_path.mkdir() - mibig.download_and_extract_mibig_metadata(download_path, extract_path, version="1.4") - archive = download_path / "mibig_json_1.4.tar.gz" + mibig.download_and_extract_mibig_metadata(download_path, extract_path, version=version) + archive = download_path / expected metadata = extract_path / "BGC0000002.json" assert archive.exists() assert archive.is_file() diff --git a/tests/unit/genomics/test_mibig_metadata.py b/tests/unit/genomics/test_mibig_metadata.py index 738db8d31..f837078ac 100644 --- a/tests/unit/genomics/test_mibig_metadata.py +++ b/tests/unit/genomics/test_mibig_metadata.py @@ -3,23 +3,13 @@ from .. 
import DATA_DIR -@pytest.mark.parametrize("version", ["v1.4", "v3.1"]) -class TestMibigMetadata: - @pytest.fixture - def json_file(self, version): - json_file = DATA_DIR / "mibig" / f"BGC0000001_{version}.json" - yield str(json_file) - - @pytest.fixture - def metadata(self, json_file): - yield MibigMetadata(json_file) - - def test_init(self, metadata, json_file): - assert metadata.file == json_file - assert isinstance(metadata.metadata, dict) - - def test_mibig_accession(self, metadata): - assert metadata.mibig_accession == "BGC0000001" - - def test_biosyn_class(self, metadata): - assert metadata.biosyn_class == ("Polyketide",) +@pytest.mark.parametrize( + "version, expected", [["v1.4", "Polyketide"], ["v3.1", "Polyketide"], ["v4.0", "PKS"]] +) +def test_versions(version, expected): + json_file = DATA_DIR / "mibig" / f"BGC0000001_{version}.json" + metadata = MibigMetadata(str(json_file)) + assert metadata.file == str(json_file) + assert isinstance(metadata.metadata, dict) + assert metadata.mibig_accession == "BGC0000001" + assert metadata.biosyn_class == (expected,) diff --git a/tests/unit/metabolomics/test_molecular_family.py b/tests/unit/metabolomics/test_molecular_family.py index eb6fcd263..185871015 100644 --- a/tests/unit/metabolomics/test_molecular_family.py +++ b/tests/unit/metabolomics/test_molecular_family.py @@ -8,7 +8,7 @@ @pytest.fixture() def spectrum1(): """Return a Spectrum object.""" - spec = Spectrum(id="spec001", mz=[1.0], intensity=[1.0], precursor_mz=100.0) + spec = Spectrum(id="spec001", mz=[1.0], intensity=[1.0], precursor_mz=100.0, precursor_charge=1) spec.strains = StrainCollection() spec.strains.add(Strain("strain001")) yield spec @@ -17,7 +17,7 @@ def spectrum1(): @pytest.fixture() def spectrum2(): """Return a Spectrum object.""" - spec = Spectrum(id="spec002", mz=[1.0], intensity=[1.0], precursor_mz=100.0) + spec = Spectrum(id="spec002", mz=[1.0], intensity=[1.0], precursor_mz=100.0, precursor_charge=1) spec.strains = StrainCollection() 
spec.strains.add(Strain("strain002")) yield spec diff --git a/tests/unit/metabolomics/test_spectrum.py b/tests/unit/metabolomics/test_spectrum.py index 75f74990f..e643e1b72 100644 --- a/tests/unit/metabolomics/test_spectrum.py +++ b/tests/unit/metabolomics/test_spectrum.py @@ -14,12 +14,13 @@ ) def test_init(rt, metadata, expected_metadata): """Test the initialization of the Spectrum class.""" - spec = Spectrum("spec1", [100, 200], [0.1, 0.2], 150, rt, metadata) + spec = Spectrum("spec1", [100, 200], [0.1, 0.2], 150, 1, rt, metadata) assert spec.id == "spec1" assert spec.mz == [100, 200] assert spec.intensity == [0.1, 0.2] assert spec.precursor_mz == 150 + assert spec.precursor_charge == 1 assert spec.rt == rt assert spec.metadata == expected_metadata @@ -32,16 +33,16 @@ def test_init(rt, metadata, expected_metadata): def test_str_repr(): """Test the __str__ and __repr__ methods.""" - spec = Spectrum("spec1", [100, 200], [0.1, 0.2], 150) + spec = Spectrum("spec1", [100, 200], [0.1, 0.2], 150, 1) assert str(spec) == "Spectrum(id=spec1, #strains=0)" assert repr(spec) == "Spectrum(id=spec1, #strains=0)" def test_eq(): """Test the __eq__ method.""" - spec1 = Spectrum("spec1", [100, 200], [0.1, 0.2], 150, 0, {"info": "test"}) - spec2 = Spectrum("spec1", [100, 200], [0.1, 0.2], 150, 0, {"info": "test"}) - spec3 = Spectrum("spec2", [100, 200], [0.1, 0.2], 150, 0, {"info": "test"}) + spec1 = Spectrum("spec1", [100, 200], [0.1, 0.2], 150, 1, 0, {"info": "test"}) + spec2 = Spectrum("spec1", [100, 200], [0.1, 0.2], 150, 1, 0, {"info": "test"}) + spec3 = Spectrum("spec2", [100, 200], [0.1, 0.2], 150, 1, 0, {"info": "test"}) assert spec1 == spec2 assert spec1 != spec3 @@ -49,19 +50,19 @@ def test_eq(): def test_hash(): """Test the __hash__ method.""" - spec = Spectrum("spec1", [100, 200], [0.1, 0.2], 150) + spec = Spectrum("spec1", [100, 200], [0.1, 0.2], 150, 1) assert hash(spec) == hash(("spec1", 150)) def test_peaks(): """Test the peaks attribute.""" - spec = 
Spectrum("spec1", [100, 200], [0.1, 0.2], 150) + spec = Spectrum("spec1", [100, 200], [0.1, 0.2], 150, 1) assert np.array_equal(spec.peaks, np.array([[100, 0.1], [200, 0.2]])) def test_has_strain(): """Test the has_strain method.""" - spec = Spectrum("spec1", [100, 200], [0.1, 0.2], 150) + spec = Spectrum("spec1", [100, 200], [0.1, 0.2], 150, 1) strain1 = Strain("strain1") strain2 = Strain("strain2") diff --git a/tests/unit/metabolomics/test_utils.py b/tests/unit/metabolomics/test_utils.py index aa9bd2e4e..038531039 100644 --- a/tests/unit/metabolomics/test_utils.py +++ b/tests/unit/metabolomics/test_utils.py @@ -17,9 +17,9 @@ def spectra(): """Fixture for a list of Spectrum objects.""" # The order of the spectra is important for the tests. return [ - Spectrum("spec0", [100, 200], [0.1, 0.2], 150), - Spectrum("spec1", [100, 200], [0.1, 0.2], 150), - Spectrum("spec2", [100, 200], [0.1, 0.2], 150), + Spectrum("spec0", [100, 200], [0.1, 0.2], 150, 1), + Spectrum("spec1", [100, 200], [0.1, 0.2], 150, 1), + Spectrum("spec2", [100, 200], [0.1, 0.2], 150, 1), ] diff --git a/tests/unit/scoring/conftest.py b/tests/unit/scoring/conftest.py index 009a92523..c7b7ea518 100644 --- a/tests/unit/scoring/conftest.py +++ b/tests/unit/scoring/conftest.py @@ -37,11 +37,11 @@ def gcfs(strains_list) -> tuple[GCF, GCF, GCF]: @fixture(scope="session") def spectra(strains_list) -> tuple[Spectrum, Spectrum, Spectrum]: - spectrum1 = Spectrum("spectrum1", [1], [1], 10.0) + spectrum1 = Spectrum("spectrum1", [1], [1], 10.0, 1) spectrum1.strains.add(strains_list[0]) - spectrum2 = Spectrum("spectrum2", [1], [1], 10.0) + spectrum2 = Spectrum("spectrum2", [1], [1], 10.0, 1) spectrum2.strains.add(strains_list[1]) - spectrum3 = Spectrum("spectrum3", [1], [1], 10.0) + spectrum3 = Spectrum("spectrum3", [1], [1], 10.0, 1) spectrum3.strains.add(strains_list[0]) spectrum3.strains.add(strains_list[1]) return spectrum1, spectrum2, spectrum3 diff --git a/tests/unit/test_config.py b/tests/unit/test_config.py 
index ce2f24b79..23d5fa8ec 100644 --- a/tests/unit/test_config.py +++ b/tests/unit/test_config.py @@ -4,7 +4,7 @@ def test_config(tmp_path): - """Test loading the default config file.""" + """Test loading config file.""" os.environ["NPLINKER_ROOT_DIR"] = str(tmp_path) # Create a temporary root dir for NPLinker config = load_config(CONFIG_FILE_LOCAL_MODE) @@ -13,17 +13,12 @@ def test_config(tmp_path): assert config["log.level"] == "DEBUG" assert config.get("log.level") == "DEBUG" - # The following are default values from nplinker_default.toml assert config.get("log.file") is None assert config.log.use_console is True assert config.mibig.to_use is True assert config.mibig.version == "3.1" - assert ( - config.bigscape.parameters - == "--mibig --clans-off --mix --include_singletons --cutoffs 0.30" - ) assert config.bigscape.cutoff == "0.30" assert config.bigscape.version == 1