From 1948b8d1a6ad1ba02c99dc183f4c63103f46938a Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Thu, 7 Mar 2024 11:52:33 +0100 Subject: [PATCH] Restructure codebase This PR changes the locations of code files (or code) to make the structure look more logical. Changes: - move runbigscape.py to bigscape folder - move podp antismash downloader to antismash folder - rename folder pairedomics to strain - move strain related modules to strain folder - move utils functions to related modules - remove unused and empty class GNPSLoader --- src/nplinker/arranger.py | 8 +- src/nplinker/genomics/__init__.py | 10 - src/nplinker/genomics/antismash/__init__.py | 12 +- .../antismash}/podp_antismash_downloader.py | 0 src/nplinker/genomics/bigscape/__init__.py | 3 +- .../bigscape}/runbigscape.py | 2 +- src/nplinker/genomics/gcf.py | 2 +- src/nplinker/genomics/utils.py | 130 ++++++- src/nplinker/loader.py | 16 +- src/nplinker/metabolomics/__init__.py | 6 - src/nplinker/metabolomics/abc.py | 12 +- src/nplinker/metabolomics/gnps/gnps_loader.py | 3 - .../gnps/gnps_molecular_family_loader.py | 2 +- src/nplinker/metabolomics/molecular_family.py | 4 +- src/nplinker/metabolomics/spectrum.py | 2 +- src/nplinker/metabolomics/utils.py | 102 +++++- src/nplinker/pairedomics/__init__.py | 29 -- .../pairedomics/strain_mappings_generator.py | 336 ------------------ src/nplinker/scoring/linking/data_links.py | 2 +- src/nplinker/strain/__init__.py | 8 + src/nplinker/{ => strain}/strain.py | 2 +- .../{ => strain}/strain_collection.py | 2 +- src/nplinker/strain/utils.py | 133 +++++++ src/nplinker/strain_loader.py | 35 -- .../test_antismash_downloader.py | 0 .../{antismash => }/test_antismash_loader.py | 4 +- tests/genomics/test_gcf.py | 2 +- tests/genomics/test_mibig_loader.py | 2 +- .../test_podp_antismash_downloader.py | 4 +- tests/genomics/test_utils.py | 144 +++++++- tests/metabolomics/test_molecular_family.py | 2 +- tests/metabolomics/test_spectrum.py | 2 +- tests/metabolomics/test_utils.py | 75 +++- tests/pairedomics/__init__.py | 0 .../test_strain_mappings_generator.py | 268 -------------- tests/scoring/conftest.py | 2 +- .../antismash => strain}/__init__.py | 0 tests/{ => strain}/test_strain.py | 0 tests/{ => strain}/test_strain_collection.py | 2 +- tests/strain/test_utils.py | 90 +++++ tests/test_gnps_loader.py | 6 - tests/test_strain_loader.py | 23 -- 42 files changed, 723 insertions(+), 764 deletions(-) rename src/nplinker/{pairedomics => genomics/antismash}/podp_antismash_downloader.py (100%) rename src/nplinker/{pairedomics => genomics/bigscape}/runbigscape.py (97%) delete mode 100644 src/nplinker/metabolomics/gnps/gnps_loader.py delete mode 100644 src/nplinker/pairedomics/__init__.py delete mode 100644 src/nplinker/pairedomics/strain_mappings_generator.py create mode 100644 src/nplinker/strain/__init__.py rename src/nplinker/{ => strain}/strain.py (98%) rename src/nplinker/{ => strain}/strain_collection.py (99%) create mode 100644 src/nplinker/strain/utils.py delete mode 100644 src/nplinker/strain_loader.py rename tests/genomics/{antismash => }/test_antismash_downloader.py (100%) rename tests/genomics/{antismash => }/test_antismash_loader.py (97%) rename tests/{pairedomics => genomics}/test_podp_antismash_downloader.py (99%) delete mode 100644 tests/pairedomics/__init__.py delete mode 100644 tests/pairedomics/test_strain_mappings_generator.py rename tests/{genomics/antismash => strain}/__init__.py (100%) rename tests/{ => strain}/test_strain.py (100%) rename tests/{ => strain}/test_strain_collection.py (99%) create mode 100644 tests/strain/test_utils.py delete mode 100644 tests/test_gnps_loader.py delete mode 100644 tests/test_strain_loader.py diff --git a/src/nplinker/arranger.py b/src/nplinker/arranger.py index ec61048a..e963e586 100644 --- a/src/nplinker/arranger.py +++ b/src/nplinker/arranger.py @@ -6,19 +6,19 @@ from jsonschema import validate import nplinker.globals as globals from nplinker.config import config -from nplinker.genomics import generate_mappings_genome_id_bgc_id +from nplinker.genomics.antismash import podp_download_and_extract_antismash_data +from nplinker.genomics.bigscape.runbigscape import run_bigscape from nplinker.genomics.mibig import download_and_extract_mibig_metadata +from nplinker.genomics.utils import generate_mappings_genome_id_bgc_id from nplinker.globals import GENOME_BGC_MAPPINGS_FILENAME from nplinker.globals import GENOME_STATUS_FILENAME from nplinker.globals import STRAIN_MAPPINGS_FILENAME from nplinker.metabolomics.gnps import GNPSDownloader from nplinker.metabolomics.gnps import GNPSExtractor -from nplinker.pairedomics import podp_download_and_extract_antismash_data -from nplinker.pairedomics.runbigscape import run_bigscape -from nplinker.pairedomics.strain_mappings_generator import podp_generate_strain_mappings from nplinker.schemas import STRAIN_MAPPINGS_SCHEMA from nplinker.schemas import USER_STRAINS_SCHEMA from nplinker.schemas import validate_podp_json +from nplinker.strain.utils import podp_generate_strain_mappings from nplinker.utils import download_url from nplinker.utils import list_dirs from nplinker.utils import list_files diff --git a/src/nplinker/genomics/__init__.py b/src/nplinker/genomics/__init__.py index c89bb4c8..710dad50 100644 --- a/src/nplinker/genomics/__init__.py +++ b/src/nplinker/genomics/__init__.py @@ -1,21 +1,11 @@ import logging -from .abc import BGCLoaderBase from .bgc import BGC from .gcf import GCF -from .utils import add_bgc_to_gcf -from .utils import add_strain_to_bgc -from .utils import generate_mappings_genome_id_bgc_id -from .utils import get_mibig_from_gcf logging.getLogger(__name__).addHandler(logging.NullHandler()) __all__ = [ - "BGCLoaderBase", "BGC", "GCF", - "add_bgc_to_gcf", - "add_strain_to_bgc", - "generate_mappings_genome_id_bgc_id", - "get_mibig_from_gcf", ] diff --git a/src/nplinker/genomics/antismash/__init__.py b/src/nplinker/genomics/antismash/__init__.py index ac386f27..0d813194 100644 --- a/src/nplinker/genomics/antismash/__init__.py +++ b/src/nplinker/genomics/antismash/__init__.py @@ -2,8 +2,18 @@ from .antismash_downloader import download_and_extract_antismash_data from .antismash_loader import AntismashBGCLoader from .antismash_loader import parse_bgc_genbank +from .podp_antismash_downloader import GenomeStatus +from .podp_antismash_downloader import get_best_available_genome_id +from .podp_antismash_downloader import podp_download_and_extract_antismash_data logging.getLogger(__name__).addHandler(logging.NullHandler()) -__all__ = ["AntismashBGCLoader", "parse_bgc_genbank", "download_and_extract_antismash_data"] +__all__ = [ + "download_and_extract_antismash_data", + "AntismashBGCLoader", + "parse_bgc_genbank", + "GenomeStatus", + "get_best_available_genome_id", + "podp_download_and_extract_antismash_data", +] diff --git a/src/nplinker/pairedomics/podp_antismash_downloader.py b/src/nplinker/genomics/antismash/podp_antismash_downloader.py similarity index 100% rename from src/nplinker/pairedomics/podp_antismash_downloader.py rename to src/nplinker/genomics/antismash/podp_antismash_downloader.py diff --git a/src/nplinker/genomics/bigscape/__init__.py b/src/nplinker/genomics/bigscape/__init__.py index dd289d02..2e92197c 100644 --- a/src/nplinker/genomics/bigscape/__init__.py +++ b/src/nplinker/genomics/bigscape/__init__.py @@ -1,7 +1,8 @@ import logging from .bigscape_loader import BigscapeGCFLoader +from .runbigscape import run_bigscape logging.getLogger(__name__).addHandler(logging.NullHandler()) -__all__ = ["BigscapeGCFLoader"] +__all__ = ["BigscapeGCFLoader", "run_bigscape"] diff --git a/src/nplinker/pairedomics/runbigscape.py b/src/nplinker/genomics/bigscape/runbigscape.py similarity index 97% rename from src/nplinker/pairedomics/runbigscape.py rename to src/nplinker/genomics/bigscape/runbigscape.py index b0bb0859..9b2f96e8 100644 --- a/src/nplinker/pairedomics/runbigscape.py +++ b/src/nplinker/genomics/bigscape/runbigscape.py @@ -2,7 +2,7 @@ import subprocess import sys from os import PathLike -from ..logconfig import LogConfig +from ...logconfig import LogConfig logger = LogConfig.getLogger(__name__) diff --git a/src/nplinker/genomics/gcf.py b/src/nplinker/genomics/gcf.py index 8e99fee7..cc88981c 100644 --- a/src/nplinker/genomics/gcf.py +++ b/src/nplinker/genomics/gcf.py @@ -1,7 +1,7 @@ from __future__ import annotations from typing import TYPE_CHECKING from nplinker.logconfig import LogConfig -from nplinker.strain_collection import StrainCollection +from nplinker.strain import StrainCollection if TYPE_CHECKING: diff --git a/src/nplinker/genomics/utils.py b/src/nplinker/genomics/utils.py index 81138c50..93cf1efa 100644 --- a/src/nplinker/genomics/utils.py +++ b/src/nplinker/genomics/utils.py @@ -6,9 +6,12 @@ from nplinker.globals import GENOME_BGC_MAPPINGS_FILENAME from nplinker.logconfig import LogConfig from nplinker.schemas import GENOME_BGC_MAPPINGS_SCHEMA -from nplinker.strain_collection import StrainCollection +from nplinker.schemas import validate_podp_json +from nplinker.strain import StrainCollection from nplinker.utils import list_dirs from nplinker.utils import list_files +from ..genomics.antismash.podp_antismash_downloader import GenomeStatus +from ..genomics.antismash.podp_antismash_downloader import get_best_available_genome_id from .bgc import BGC from .gcf import GCF @@ -179,3 +182,128 @@ def get_mibig_from_gcf(gcfs: list[GCF]) -> tuple[list[BGC], StrainCollection]: if bgc.strain is not None: mibig_strains_in_use.add(bgc.strain) return mibig_bgcs_in_use, mibig_strains_in_use + + +# ------------------------------------------------------------------------------ +# Functions to extract mappings for genomics side: +# strain_id <-> original_geonme_id <-> resolved_genome_id <-> bgc_id +# ------------------------------------------------------------------------------ +def extract_mappings_strain_id_original_genome_id( + podp_project_json_file: str | PathLike +) -> dict[str, set[str]]: + """Extract mappings "strain id <-> original genome id". + + Args: + podp_project_json_file: The path to the PODP project + JSON file. + + Returns: + Key is strain id and value is a set of original genome ids. + + Notes: + The `podp_project_json_file` is the project JSON file downloaded from + PODP platform. For example, for project MSV000079284, its json file is + https://pairedomicsdata.bioinformatics.nl/api/projects/4b29ddc3-26d0-40d7-80c5-44fb6631dbf9.4. + """ + mappings_dict = {} + with open(podp_project_json_file, "r") as f: + json_data = json.load(f) + + validate_podp_json(json_data) + + for record in json_data["genomes"]: + strain_id = record["genome_label"] + genome_id = get_best_available_genome_id(record["genome_ID"]) + if genome_id is None: + logger.warning("Failed to extract genome ID from genome with label %s", strain_id) + continue + if strain_id in mappings_dict: + mappings_dict[strain_id].add(genome_id) + else: + mappings_dict[strain_id] = {genome_id} + return mappings_dict + + +def extract_mappings_original_genome_id_resolved_genome_id( + genome_status_json_file: str | PathLike +) -> dict[str, str]: + """Extract mappings "original_genome_id <-> resolved_genome_id". + + Args: + genome_status_json_file: The path to the genome status + JSON file. + + Returns: + Key is original genome id and value is resolved genome id. + + Notes: + The `genome_status_json_file` is usually generated by the + `podp_download_and_extract_antismash_data` function with + a default file name defined in `nplinker.globals.GENOME_STATUS_FILENAME`. + """ + gs_mappings_dict = GenomeStatus.read_json(genome_status_json_file) + return {gs.original_id: gs.resolved_refseq_id for gs in gs_mappings_dict.values()} + + +def extract_mappings_resolved_genome_id_bgc_id( + genome_bgc_mappings_file: str | PathLike +) -> dict[str, set[str]]: + """Extract mappings "resolved_genome_id <-> bgc_id". + + Args: + genome_bgc_mappings_file: The path to the genome BGC + mappings JSON file. + + Returns: + Key is resolved genome id and value is a set of BGC ids. + + Notes: + The `genome_bgc_mappings_file` is usually generated by the + `generate_mappings_genome_id_bgc_id` function with a default file name + defined in `nplinker.globals.GENOME_BGC_MAPPINGS_FILENAME`. + """ + with open(genome_bgc_mappings_file, "r") as f: + json_data = json.load(f) + + # validate the JSON data + validate(json_data, GENOME_BGC_MAPPINGS_SCHEMA) + + return {mapping["genome_ID"]: set(mapping["BGC_ID"]) for mapping in json_data["mappings"]} + + +def get_mappings_strain_id_bgc_id( + mappings_strain_id_original_genome_id: dict[str, set[str]], + mappings_original_genome_id_resolved_genome_id: dict[str, str], + mappings_resolved_genome_id_bgc_id: dict[str, set[str]], +) -> dict[str, set[str]]: + """Get mappings "strain_id <-> bgc_id". + + Args: + mappings_strain_id_original_genome_id: Mappings + "strain_id <-> original_genome_id". + mappings_original_genome_id_resolved_genome_id: Mappings + "original_genome_id <-> resolved_genome_id". + mappings_resolved_genome_id_bgc_id: Mappings + "resolved_genome_id <-> bgc_id". + + Returns: + Key is strain id and value is a set of BGC ids. + + See Also: + `extract_mappings_strain_id_original_genome_id`: Extract mappings + "strain_id <-> original_genome_id". + `extract_mappings_original_genome_id_resolved_genome_id`: Extract mappings + "original_genome_id <-> resolved_genome_id". + `extract_mappings_resolved_genome_id_bgc_id`: Extract mappings + "resolved_genome_id <-> bgc_id". + """ + mappings_dict = {} + for strain_id, original_genome_ids in mappings_strain_id_original_genome_id.items(): + bgc_ids = set() + for original_genome_id in original_genome_ids: + resolved_genome_id = mappings_original_genome_id_resolved_genome_id[original_genome_id] + if (bgc_id := mappings_resolved_genome_id_bgc_id.get(resolved_genome_id)) is not None: + bgc_ids.update(bgc_id) + if bgc_ids: + mappings_dict[strain_id] = bgc_ids + return mappings_dict diff --git a/src/nplinker/loader.py b/src/nplinker/loader.py index cfadc0d7..fb313217 100644 --- a/src/nplinker/loader.py +++ b/src/nplinker/loader.py @@ -2,12 +2,12 @@ from deprecated import deprecated from nplinker import globals from nplinker.config import config -from nplinker.genomics import add_bgc_to_gcf -from nplinker.genomics import add_strain_to_bgc -from nplinker.genomics import get_mibig_from_gcf from nplinker.genomics.antismash import AntismashBGCLoader from nplinker.genomics.bigscape import BigscapeGCFLoader from nplinker.genomics.mibig import MibigLoader +from nplinker.genomics.utils import add_bgc_to_gcf +from nplinker.genomics.utils import add_strain_to_bgc +from nplinker.genomics.utils import get_mibig_from_gcf from nplinker.globals import GNPS_ANNOTATIONS_FILENAME from nplinker.globals import GNPS_DEFAULT_PATH from nplinker.globals import GNPS_MOLECULAR_FAMILY_FILENAME @@ -15,14 +15,14 @@ from nplinker.globals import STRAIN_MAPPINGS_FILENAME from nplinker.globals import STRAINS_SELECTED_FILENAME from nplinker.logconfig import LogConfig -from nplinker.metabolomics import add_annotation_to_spectrum -from nplinker.metabolomics import add_spectrum_to_mf -from nplinker.metabolomics import add_strains_to_spectrum from nplinker.metabolomics.gnps import GNPSAnnotationLoader from nplinker.metabolomics.gnps import GNPSMolecularFamilyLoader from nplinker.metabolomics.gnps import GNPSSpectrumLoader -from nplinker.strain_collection import StrainCollection -from nplinker.strain_loader import load_user_strains +from nplinker.metabolomics.utils import add_annotation_to_spectrum +from nplinker.metabolomics.utils import add_spectrum_to_mf +from nplinker.metabolomics.utils import add_strains_to_spectrum +from nplinker.strain import StrainCollection +from nplinker.strain.utils import load_user_strains try: diff --git a/src/nplinker/metabolomics/__init__.py b/src/nplinker/metabolomics/__init__.py index 8c723f5e..e7dc79c1 100644 --- a/src/nplinker/metabolomics/__init__.py +++ b/src/nplinker/metabolomics/__init__.py @@ -1,9 +1,6 @@ import logging from .molecular_family import MolecularFamily from .spectrum import Spectrum -from .utils import add_annotation_to_spectrum -from .utils import add_spectrum_to_mf -from .utils import add_strains_to_spectrum logging.getLogger(__name__).addHandler(logging.NullHandler()) @@ -12,7 +9,4 @@ __all__ = [ "MolecularFamily", "Spectrum", - "add_annotation_to_spectrum", - "add_spectrum_to_mf", - "add_strains_to_spectrum", ] diff --git a/src/nplinker/metabolomics/abc.py b/src/nplinker/metabolomics/abc.py index efc1c286..c55e209a 100644 --- a/src/nplinker/metabolomics/abc.py +++ b/src/nplinker/metabolomics/abc.py @@ -1,20 +1,24 @@ from abc import ABC from abc import abstractmethod from collections.abc import Sequence -from nplinker.metabolomics import MolecularFamily -from nplinker.metabolomics import Spectrum +from typing import TYPE_CHECKING + + +if TYPE_CHECKING: + from .molecular_family import MolecularFamily + from .spectrum import Spectrum class SpectrumLoaderBase(ABC): @property @abstractmethod - def spectra(self) -> Sequence[Spectrum]: + def spectra(self) -> Sequence["Spectrum"]: ... class MolecularFamilyLoaderBase(ABC): @abstractmethod - def get_mfs(self, keep_singleton: bool) -> Sequence[MolecularFamily]: + def get_mfs(self, keep_singleton: bool) -> Sequence["MolecularFamily"]: """Get MolecularFamily objects. Args: diff --git a/src/nplinker/metabolomics/gnps/gnps_loader.py b/src/nplinker/metabolomics/gnps/gnps_loader.py deleted file mode 100644 index 608ae5bc..00000000 --- a/src/nplinker/metabolomics/gnps/gnps_loader.py +++ /dev/null @@ -1,3 +0,0 @@ -class GNPSLoader: - def __init__(self): - pass diff --git a/src/nplinker/metabolomics/gnps/gnps_molecular_family_loader.py b/src/nplinker/metabolomics/gnps/gnps_molecular_family_loader.py index c787271b..ff5f44d3 100644 --- a/src/nplinker/metabolomics/gnps/gnps_molecular_family_loader.py +++ b/src/nplinker/metabolomics/gnps/gnps_molecular_family_loader.py @@ -1,8 +1,8 @@ import csv from os import PathLike -from nplinker.metabolomics import MolecularFamily from nplinker.metabolomics.abc import MolecularFamilyLoaderBase from nplinker.utils import is_file_format +from ..molecular_family import MolecularFamily class GNPSMolecularFamilyLoader(MolecularFamilyLoaderBase): diff --git a/src/nplinker/metabolomics/molecular_family.py b/src/nplinker/metabolomics/molecular_family.py index 29457382..29f75696 100644 --- a/src/nplinker/metabolomics/molecular_family.py +++ b/src/nplinker/metabolomics/molecular_family.py @@ -1,7 +1,7 @@ from __future__ import annotations from typing import TYPE_CHECKING -from nplinker.strain import Strain -from nplinker.strain_collection import StrainCollection +from ..strain.strain import Strain +from ..strain.strain_collection import StrainCollection if TYPE_CHECKING: diff --git a/src/nplinker/metabolomics/spectrum.py b/src/nplinker/metabolomics/spectrum.py index e39bb96f..4727a9f5 100644 --- a/src/nplinker/metabolomics/spectrum.py +++ b/src/nplinker/metabolomics/spectrum.py @@ -3,7 +3,7 @@ from typing import TYPE_CHECKING import numpy as np from nplinker.strain import Strain -from nplinker.strain_collection import StrainCollection +from nplinker.strain import StrainCollection if TYPE_CHECKING: diff --git a/src/nplinker/metabolomics/utils.py b/src/nplinker/metabolomics/utils.py index aa341d55..1f758a76 100644 --- a/src/nplinker/metabolomics/utils.py +++ b/src/nplinker/metabolomics/utils.py @@ -1,5 +1,10 @@ +import json +from os import PathLike +from pathlib import Path from nplinker.logconfig import LogConfig -from nplinker.strain_collection import StrainCollection +from nplinker.schemas import validate_podp_json +from nplinker.strain import StrainCollection +from .gnps.gnps_file_mapping_loader import GNPSFileMappingLoader from .molecular_family import MolecularFamily from .spectrum import Spectrum @@ -113,3 +118,98 @@ def add_spectrum_to_mf( f"{len(mf_missing_spec)} MolecularFamily objects have missing Spectrum objects." ) return mf_with_spec, mf_without_spec, mf_missing_spec + + +# ------------------------------------------------------------------------------ +# Functions to extract mappings for metabolomics side: +# strain_id <-> MS_filename <-> spectrum_id +# ------------------------------------------------------------------------------ +def extract_mappings_strain_id_ms_filename( + podp_project_json_file: str | PathLike +) -> dict[str, set[str]]: + """Extract mappings "strain_id <-> MS_filename". + + Args: + podp_project_json_file: The path to the PODP project + JSON file. + + Returns: + Key is strain id and value is a set of MS filenames. + + Notes: + The `podp_project_json_file` is the project JSON file downloaded from + PODP platform. For example, for project MSV000079284, its json file is + https://pairedomicsdata.bioinformatics.nl/api/projects/4b29ddc3-26d0-40d7-80c5-44fb6631dbf9.4. + """ + mappings_dict = {} + with open(podp_project_json_file, "r") as f: + json_data = json.load(f) + + validate_podp_json(json_data) + + # Extract mappings strain id <-> metabolomics filename + for record in json_data["genome_metabolome_links"]: + strain_id = record["genome_label"] + # get the actual filename of the mzXML URL + filename = Path(record["metabolomics_file"]).name + if strain_id in mappings_dict: + mappings_dict[strain_id].add(filename) + else: + mappings_dict[strain_id] = {filename} + return mappings_dict + + +def extract_mappings_ms_filename_spectrum_id( + gnps_file_mappings_file: str | PathLike +) -> dict[str, set[str]]: + """Extract mappings "MS_filename <-> spectrum_id". + + Args: + gnps_file_mappings_file: The path to the GNPS file mappings file (csv or + tsv). + + Returns: + Key is MS filename and value is a set of spectrum ids. + + Notes: + The `gnps_file_mappings_file` is generated by GNPS molecular networking. It's downloaded + from GNPS website to a file with a default name defined in `GNPS_FILE_MAPPINGS_FILENAME`. + + See Also: + GNPSFileMappingLoader: A class to load GNPS file mappings file. + """ + loader = GNPSFileMappingLoader(gnps_file_mappings_file) + return loader.mapping_reversed + + +def get_mappings_strain_id_spectrum_id( + mappings_strain_id_ms_filename: dict[str, set[str]], + mappings_ms_filename_spectrum_id: dict[str, set[str]], +) -> dict[str, set[str]]: + """Get mappings "strain_id <-> spectrum_id". + + Args: + mappings_strain_id_ms_filename: Mappings + "strain_id <-> MS_filename". + mappings_ms_filename_spectrum_id: Mappings + "MS_filename <-> spectrum_id". + + Returns: + Key is strain id and value is a set of spectrum ids. + + + See Also: + `extract_mappings_strain_id_ms_filename`: Extract mappings + "strain_id <-> MS_filename". + `extract_mappings_ms_filename_spectrum_id`: Extract mappings + "MS_filename <-> spectrum_id". + """ + mappings_dict = {} + for strain_id, ms_filenames in mappings_strain_id_ms_filename.items(): + spectrum_ids = set() + for ms_filename in ms_filenames: + if (sid := mappings_ms_filename_spectrum_id.get(ms_filename)) is not None: + spectrum_ids.update(sid) + if spectrum_ids: + mappings_dict[strain_id] = spectrum_ids + return mappings_dict diff --git a/src/nplinker/pairedomics/__init__.py b/src/nplinker/pairedomics/__init__.py deleted file mode 100644 index 03641982..00000000 --- a/src/nplinker/pairedomics/__init__.py +++ /dev/null @@ -1,29 +0,0 @@ -import logging -from .podp_antismash_downloader import GenomeStatus -from .podp_antismash_downloader import get_best_available_genome_id -from .podp_antismash_downloader import podp_download_and_extract_antismash_data -from .strain_mappings_generator import extract_mappings_ms_filename_spectrum_id -from .strain_mappings_generator import extract_mappings_original_genome_id_resolved_genome_id -from .strain_mappings_generator import extract_mappings_resolved_genome_id_bgc_id -from .strain_mappings_generator import extract_mappings_strain_id_ms_filename -from .strain_mappings_generator import extract_mappings_strain_id_original_genome_id -from .strain_mappings_generator import get_mappings_strain_id_bgc_id -from .strain_mappings_generator import get_mappings_strain_id_spectrum_id -from .strain_mappings_generator import podp_generate_strain_mappings - - -logging.getLogger(__name__).addHandler(logging.NullHandler()) - -__all__ = [ - "GenomeStatus", - "get_best_available_genome_id", - "podp_download_and_extract_antismash_data", - "podp_generate_strain_mappings", - "extract_mappings_strain_id_original_genome_id", - "extract_mappings_original_genome_id_resolved_genome_id", - "extract_mappings_resolved_genome_id_bgc_id", - "get_mappings_strain_id_bgc_id", - "extract_mappings_strain_id_ms_filename", - "extract_mappings_ms_filename_spectrum_id", - "get_mappings_strain_id_spectrum_id", -] diff --git a/src/nplinker/pairedomics/strain_mappings_generator.py b/src/nplinker/pairedomics/strain_mappings_generator.py deleted file mode 100644 index 2d074e58..00000000 --- a/src/nplinker/pairedomics/strain_mappings_generator.py +++ /dev/null @@ -1,336 +0,0 @@ -import json -import logging -from os import PathLike -from pathlib import Path -from jsonschema import validate -from nplinker.metabolomics.gnps import GNPSFileMappingLoader -from nplinker.schemas import GENOME_BGC_MAPPINGS_SCHEMA -from nplinker.schemas import validate_podp_json -from nplinker.strain import Strain -from nplinker.strain_collection import StrainCollection -from .podp_antismash_downloader import GenomeStatus -from .podp_antismash_downloader import get_best_available_genome_id - - -logger = logging.getLogger(__name__) - -__all__ = [ - "podp_generate_strain_mappings", - "extract_mappings_strain_id_original_genome_id", - "extract_mappings_original_genome_id_resolved_genome_id", - "extract_mappings_resolved_genome_id_bgc_id", - "get_mappings_strain_id_bgc_id", - "extract_mappings_strain_id_ms_filename", - "extract_mappings_ms_filename_spectrum_id", - "get_mappings_strain_id_spectrum_id", -] - - -def podp_generate_strain_mappings( - podp_project_json_file: str | PathLike, - genome_status_json_file: str | PathLike, - genome_bgc_mappings_file: str | PathLike, - gnps_file_mappings_file: str | PathLike, - output_json_file: str | PathLike, -) -> StrainCollection: - """Generate strain mappings JSON file for PODP pipeline. - - To get the strain mappings, we need to combine the following mappings: - - strain_id <-> original_genome_id <-> resolved_genome_id <-> bgc_id - - strain_id <-> MS_filename <-> spectrum_id - - These mappings are extracted from the following files: - - "strain_id <-> original_genome_id" is extracted from `podp_project_json_file`. - - "original_genome_id <-> resolved_genome_id" is extracted from `genome_status_json_file`. - - "resolved_genome_id <-> bgc_id" is extracted from `genome_bgc_mappings_file`. - - "strain_id <-> MS_filename" is extracted from `podp_project_json_file`. - - "MS_filename <-> spectrum_id" is extracted from `gnps_file_mappings_file`. - - Args: - podp_project_json_file: The path to the PODP project - JSON file. - genome_status_json_file: The path to the genome status - JSON file. - genome_bgc_mappings_file: The path to the genome BGC - mappings JSON file. - gnps_file_mappings_file: The path to the GNPS file - mappings file (csv or tsv). - output_json_file: The path to the output JSON file. - - Returns: - The strain mappings stored in a StrainCollection object. - - See Also: - `extract_mappings_strain_id_original_genome_id`: Extract mappings - "strain_id <-> original_genome_id". - `extract_mappings_original_genome_id_resolved_genome_id`: Extract mappings - "original_genome_id <-> resolved_genome_id". - `extract_mappings_resolved_genome_id_bgc_id`: Extract mappings - "resolved_genome_id <-> bgc_id". - `get_mappings_strain_id_bgc_id`: Get mappings "strain_id <-> bgc_id". - `extract_mappings_strain_id_ms_filename`: Extract mappings - "strain_id <-> MS_filename". - `extract_mappings_ms_filename_spectrum_id`: Extract mappings - "MS_filename <-> spectrum_id". - `get_mappings_strain_id_spectrum_id`: Get mappings "strain_id <-> spectrum_id". - """ - # Get mappings strain_id <-> original_geonme_id <-> resolved_genome_id <-> bgc_id - mappings_strain_id_bgc_id = get_mappings_strain_id_bgc_id( - extract_mappings_strain_id_original_genome_id(podp_project_json_file), - extract_mappings_original_genome_id_resolved_genome_id(genome_status_json_file), - extract_mappings_resolved_genome_id_bgc_id(genome_bgc_mappings_file), - ) - - # Get mappings strain_id <-> MS_filename <-> spectrum_id - mappings_strain_id_spectrum_id = get_mappings_strain_id_spectrum_id( - extract_mappings_strain_id_ms_filename(podp_project_json_file), - extract_mappings_ms_filename_spectrum_id(gnps_file_mappings_file), - ) - - # Get mappings strain_id <-> bgc_id / spectrum_id - mappings = mappings_strain_id_bgc_id.copy() - for strain_id, spectrum_ids in mappings_strain_id_spectrum_id.items(): - if strain_id in mappings: - mappings[strain_id].update(spectrum_ids) - else: - mappings[strain_id] = spectrum_ids.copy() - - # Create StrainCollection - sc = StrainCollection() - for strain_id, bgc_ids in mappings.items(): - if not sc.has_name(strain_id): - strain = Strain(strain_id) - for bgc_id in bgc_ids: - strain.add_alias(bgc_id) - sc.add(strain) - else: - # strain_list has only one element - strain_list = sc.lookup(strain_id) - for bgc_id in bgc_ids: - strain_list[0].add_alias(bgc_id) - - # Write strain mappings JSON file - sc.to_json(output_json_file) - logger.info("Generated strain mappings JSON file: %s", output_json_file) - - return sc - - -# ------------------------------------------------------------------------------ -# Functions to extract mappings for genomics side: -# strain_id <-> original_geonme_id <-> resolved_genome_id <-> bgc_id -# ------------------------------------------------------------------------------ -def extract_mappings_strain_id_original_genome_id( - podp_project_json_file: str | PathLike -) -> dict[str, set[str]]: - """Extract mappings "strain id <-> original genome id". - - Args: - podp_project_json_file: The path to the PODP project - JSON file. - - Returns: - Key is strain id and value is a set of original genome ids. - - Notes: - The `podp_project_json_file` is the project JSON file downloaded from - PODP platform. For example, for project MSV000079284, its json file is - https://pairedomicsdata.bioinformatics.nl/api/projects/4b29ddc3-26d0-40d7-80c5-44fb6631dbf9.4. - """ - mappings_dict = {} - with open(podp_project_json_file, "r") as f: - json_data = json.load(f) - - validate_podp_json(json_data) - - for record in json_data["genomes"]: - strain_id = record["genome_label"] - genome_id = get_best_available_genome_id(record["genome_ID"]) - if genome_id is None: - logger.warning("Failed to extract genome ID from genome with label %s", strain_id) - continue - if strain_id in mappings_dict: - mappings_dict[strain_id].add(genome_id) - else: - mappings_dict[strain_id] = {genome_id} - return mappings_dict - - -def extract_mappings_original_genome_id_resolved_genome_id( - genome_status_json_file: str | PathLike -) -> dict[str, str]: - """Extract mappings "original_genome_id <-> resolved_genome_id". - - Args: - genome_status_json_file: The path to the genome status - JSON file. - - Returns: - Key is original genome id and value is resolved genome id. - - Notes: - The `genome_status_json_file` is usually generated by the - `podp_download_and_extract_antismash_data` function with - a default file name defined in `nplinker.globals.GENOME_STATUS_FILENAME`. - """ - gs_mappings_dict = GenomeStatus.read_json(genome_status_json_file) - return {gs.original_id: gs.resolved_refseq_id for gs in gs_mappings_dict.values()} - - -def extract_mappings_resolved_genome_id_bgc_id( - genome_bgc_mappings_file: str | PathLike -) -> dict[str, set[str]]: - """Extract mappings "resolved_genome_id <-> bgc_id". - - Args: - genome_bgc_mappings_file: The path to the genome BGC - mappings JSON file. - - Returns: - Key is resolved genome id and value is a set of BGC ids. - - Notes: - The `genome_bgc_mappings_file` is usually generated by the - `generate_mappings_genome_id_bgc_id` function with a default file name - defined in `nplinker.globals.GENOME_BGC_MAPPINGS_FILENAME`. - """ - with open(genome_bgc_mappings_file, "r") as f: - json_data = json.load(f) - - # validate the JSON data - validate(json_data, GENOME_BGC_MAPPINGS_SCHEMA) - - return {mapping["genome_ID"]: set(mapping["BGC_ID"]) for mapping in json_data["mappings"]} - - -def get_mappings_strain_id_bgc_id( - mappings_strain_id_original_genome_id: dict[str, set[str]], - mappings_original_genome_id_resolved_genome_id: dict[str, str], - mappings_resolved_genome_id_bgc_id: dict[str, set[str]], -) -> dict[str, set[str]]: - """Get mappings "strain_id <-> bgc_id". - - Args: - mappings_strain_id_original_genome_id: Mappings - "strain_id <-> original_genome_id". - mappings_original_genome_id_resolved_genome_id: Mappings - "original_genome_id <-> resolved_genome_id". - mappings_resolved_genome_id_bgc_id: Mappings - "resolved_genome_id <-> bgc_id". - - Returns: - Key is strain id and value is a set of BGC ids. - - See Also: - `extract_mappings_strain_id_original_genome_id`: Extract mappings - "strain_id <-> original_genome_id". - `extract_mappings_original_genome_id_resolved_genome_id`: Extract mappings - "original_genome_id <-> resolved_genome_id". - `extract_mappings_resolved_genome_id_bgc_id`: Extract mappings - "resolved_genome_id <-> bgc_id". - """ - mappings_dict = {} - for strain_id, original_genome_ids in mappings_strain_id_original_genome_id.items(): - bgc_ids = set() - for original_genome_id in original_genome_ids: - resolved_genome_id = mappings_original_genome_id_resolved_genome_id[original_genome_id] - if (bgc_id := mappings_resolved_genome_id_bgc_id.get(resolved_genome_id)) is not None: - bgc_ids.update(bgc_id) - if bgc_ids: - mappings_dict[strain_id] = bgc_ids - return mappings_dict - - -# ------------------------------------------------------------------------------ -# Functions to extract mappings for metabolomics side: -# strain_id <-> MS_filename <-> spectrum_id -# ------------------------------------------------------------------------------ -def extract_mappings_strain_id_ms_filename( - podp_project_json_file: str | PathLike -) -> dict[str, set[str]]: - """Extract mappings "strain_id <-> MS_filename". - - Args: - podp_project_json_file: The path to the PODP project - JSON file. - - Returns: - Key is strain id and value is a set of MS filenames. - - Notes: - The `podp_project_json_file` is the project JSON file downloaded from - PODP platform. For example, for project MSV000079284, its json file is - https://pairedomicsdata.bioinformatics.nl/api/projects/4b29ddc3-26d0-40d7-80c5-44fb6631dbf9.4. - """ - mappings_dict = {} - with open(podp_project_json_file, "r") as f: - json_data = json.load(f) - - validate_podp_json(json_data) - - # Extract mappings strain id <-> metabolomics filename - for record in json_data["genome_metabolome_links"]: - strain_id = record["genome_label"] - # get the actual filename of the mzXML URL - filename = Path(record["metabolomics_file"]).name - if strain_id in mappings_dict: - mappings_dict[strain_id].add(filename) - else: - mappings_dict[strain_id] = {filename} - return mappings_dict - - -def extract_mappings_ms_filename_spectrum_id( - gnps_file_mappings_file: str | PathLike -) -> dict[str, set[str]]: - """Extract mappings "MS_filename <-> spectrum_id". - - Args: - gnps_file_mappings_file: The path to the GNPS file mappings file (csv or - tsv). - - Returns: - Key is MS filename and value is a set of spectrum ids. - - Notes: - The `gnps_file_mappings_file` is generated by GNPS molecular networking. It's downloaded - from GNPS website to a file with a default name defined in `GNPS_FILE_MAPPINGS_FILENAME`. - - See Also: - GNPSFileMappingLoader: A class to load GNPS file mappings file. - """ - loader = GNPSFileMappingLoader(gnps_file_mappings_file) - return loader.mapping_reversed - - -def get_mappings_strain_id_spectrum_id( - mappings_strain_id_ms_filename: dict[str, set[str]], - mappings_ms_filename_spectrum_id: dict[str, set[str]], -) -> dict[str, set[str]]: - """Get mappings "strain_id <-> spectrum_id". - - Args: - mappings_strain_id_ms_filename: Mappings - "strain_id <-> MS_filename". - mappings_ms_filename_spectrum_id: Mappings - "MS_filename <-> spectrum_id". - - Returns: - Key is strain id and value is a set of spectrum ids. - - - See Also: - `extract_mappings_strain_id_ms_filename`: Extract mappings - "strain_id <-> MS_filename". - `extract_mappings_ms_filename_spectrum_id`: Extract mappings - "MS_filename <-> spectrum_id". - """ - mappings_dict = {} - for strain_id, ms_filenames in mappings_strain_id_ms_filename.items(): - spectrum_ids = set() - for ms_filename in ms_filenames: - if (sid := mappings_ms_filename_spectrum_id.get(ms_filename)) is not None: - spectrum_ids.update(sid) - if spectrum_ids: - mappings_dict[strain_id] = spectrum_ids - return mappings_dict diff --git a/src/nplinker/scoring/linking/data_links.py b/src/nplinker/scoring/linking/data_links.py index 3c3da12b..c1726461 100644 --- a/src/nplinker/scoring/linking/data_links.py +++ b/src/nplinker/scoring/linking/data_links.py @@ -13,7 +13,7 @@ if TYPE_CHECKING: from nplinker.strain import Strain - from nplinker.strain_collection import StrainCollection + from nplinker.strain import StrainCollection logger = LogConfig.getLogger(__name__) diff --git a/src/nplinker/strain/__init__.py b/src/nplinker/strain/__init__.py new file mode 100644 index 00000000..d4ad376d --- /dev/null +++ b/src/nplinker/strain/__init__.py @@ -0,0 +1,8 @@ +import logging +from .strain import Strain +from .strain_collection import StrainCollection + + +logging.getLogger(__name__).addHandler(logging.NullHandler()) + +__all__ = ["Strain", "StrainCollection"] diff --git a/src/nplinker/strain.py b/src/nplinker/strain/strain.py similarity index 98% rename from src/nplinker/strain.py rename to src/nplinker/strain/strain.py index 9a2e30d3..dbc77ea2 100644 --- a/src/nplinker/strain.py +++ b/src/nplinker/strain/strain.py @@ -1,5 +1,5 @@ from __future__ import annotations -from .logconfig import LogConfig +from nplinker.logconfig import LogConfig logger = LogConfig.getLogger(__name__) diff --git a/src/nplinker/strain_collection.py b/src/nplinker/strain/strain_collection.py similarity index 99% rename from src/nplinker/strain_collection.py rename to src/nplinker/strain/strain_collection.py index bb77c1dc..2b697882 100644 --- a/src/nplinker/strain_collection.py +++ b/src/nplinker/strain/strain_collection.py @@ -3,8 +3,8 @@ from os import PathLike from typing import Iterator from jsonschema import validate +from nplinker.logconfig import LogConfig from nplinker.schemas import STRAIN_MAPPINGS_SCHEMA -from .logconfig import LogConfig from .strain import Strain diff --git a/src/nplinker/strain/utils.py b/src/nplinker/strain/utils.py new file mode 100644 index 00000000..282441e7 --- /dev/null +++ b/src/nplinker/strain/utils.py @@ -0,0 +1,133 @@ +import json +from os import PathLike +from jsonschema import validate +from nplinker.logconfig import LogConfig +from nplinker.schemas import USER_STRAINS_SCHEMA +from ..genomics.utils import extract_mappings_original_genome_id_resolved_genome_id +from ..genomics.utils import extract_mappings_resolved_genome_id_bgc_id +from ..genomics.utils import extract_mappings_strain_id_original_genome_id +from ..genomics.utils import get_mappings_strain_id_bgc_id +from ..metabolomics.utils import extract_mappings_ms_filename_spectrum_id +from ..metabolomics.utils import extract_mappings_strain_id_ms_filename +from ..metabolomics.utils import get_mappings_strain_id_spectrum_id +from .strain import Strain +from .strain_collection import StrainCollection + + +logger = LogConfig.getLogger(__name__) + + +def load_user_strains(json_file: str | PathLike) -> set[Strain]: + """Load user specified strains from a JSON file. + + The JSON file must follow the schema defined in "nplinker/schemas/user_strains.json". + An example content of the JSON file: + {"strain_ids": ["strain1", "strain2"]} + + Args: + json_file: Path to the JSON file containing user specified strains. + + Returns: + set[Strain]: A set of user specified strains. + """ + with open(json_file, "r") as f: + json_data = json.load(f) + + # validate json data + validate(instance=json_data, schema=USER_STRAINS_SCHEMA) + + strains = set() + for strain_id in json_data["strain_ids"]: + strains.add(Strain(strain_id)) + + return strains + + +def podp_generate_strain_mappings( + podp_project_json_file: str | PathLike, + genome_status_json_file: str | PathLike, + genome_bgc_mappings_file: str | PathLike, + gnps_file_mappings_file: str | PathLike, + output_json_file: str | PathLike, +) -> StrainCollection: + """Generate strain mappings JSON file for PODP pipeline. + + To get the strain mappings, we need to combine the following mappings: + - strain_id <-> original_genome_id <-> resolved_genome_id <-> bgc_id + - strain_id <-> MS_filename <-> spectrum_id + + These mappings are extracted from the following files: + - "strain_id <-> original_genome_id" is extracted from `podp_project_json_file`. + - "original_genome_id <-> resolved_genome_id" is extracted from `genome_status_json_file`. + - "resolved_genome_id <-> bgc_id" is extracted from `genome_bgc_mappings_file`. + - "strain_id <-> MS_filename" is extracted from `podp_project_json_file`. + - "MS_filename <-> spectrum_id" is extracted from `gnps_file_mappings_file`. + + Args: + podp_project_json_file: The path to the PODP project + JSON file. + genome_status_json_file: The path to the genome status + JSON file. + genome_bgc_mappings_file: The path to the genome BGC + mappings JSON file. + gnps_file_mappings_file: The path to the GNPS file + mappings file (csv or tsv). + output_json_file: The path to the output JSON file. + + Returns: + The strain mappings stored in a StrainCollection object. + + See Also: + `extract_mappings_strain_id_original_genome_id`: Extract mappings + "strain_id <-> original_genome_id". + `extract_mappings_original_genome_id_resolved_genome_id`: Extract mappings + "original_genome_id <-> resolved_genome_id". + `extract_mappings_resolved_genome_id_bgc_id`: Extract mappings + "resolved_genome_id <-> bgc_id". + `get_mappings_strain_id_bgc_id`: Get mappings "strain_id <-> bgc_id". + `extract_mappings_strain_id_ms_filename`: Extract mappings + "strain_id <-> MS_filename". + `extract_mappings_ms_filename_spectrum_id`: Extract mappings + "MS_filename <-> spectrum_id". + `get_mappings_strain_id_spectrum_id`: Get mappings "strain_id <-> spectrum_id". + """ + # Get mappings strain_id <-> original_geonme_id <-> resolved_genome_id <-> bgc_id + mappings_strain_id_bgc_id = get_mappings_strain_id_bgc_id( + extract_mappings_strain_id_original_genome_id(podp_project_json_file), + extract_mappings_original_genome_id_resolved_genome_id(genome_status_json_file), + extract_mappings_resolved_genome_id_bgc_id(genome_bgc_mappings_file), + ) + + # Get mappings strain_id <-> MS_filename <-> spectrum_id + mappings_strain_id_spectrum_id = get_mappings_strain_id_spectrum_id( + extract_mappings_strain_id_ms_filename(podp_project_json_file), + extract_mappings_ms_filename_spectrum_id(gnps_file_mappings_file), + ) + + # Get mappings strain_id <-> bgc_id / spectrum_id + mappings = mappings_strain_id_bgc_id.copy() + for strain_id, spectrum_ids in mappings_strain_id_spectrum_id.items(): + if strain_id in mappings: + mappings[strain_id].update(spectrum_ids) + else: + mappings[strain_id] = spectrum_ids.copy() + + # Create StrainCollection + sc = StrainCollection() + for strain_id, bgc_ids in mappings.items(): + if not sc.has_name(strain_id): + strain = Strain(strain_id) + for bgc_id in bgc_ids: + strain.add_alias(bgc_id) + sc.add(strain) + else: + # strain_list has only one element + strain_list = sc.lookup(strain_id) + for bgc_id in bgc_ids: + strain_list[0].add_alias(bgc_id) + + # Write strain mappings JSON file + sc.to_json(output_json_file) + logger.info("Generated strain mappings JSON file: %s", output_json_file) + + return sc diff --git a/src/nplinker/strain_loader.py b/src/nplinker/strain_loader.py deleted file mode 100644 index b78a559b..00000000 --- a/src/nplinker/strain_loader.py +++ /dev/null @@ -1,35 +0,0 @@ -import json -from os import PathLike -from jsonschema import validate -from nplinker.logconfig import LogConfig -from nplinker.schemas import USER_STRAINS_SCHEMA -from .strain import Strain - - -logger = LogConfig.getLogger(__name__) - - -def load_user_strains(json_file: str | PathLike) -> set[Strain]: - """Load user specified strains from a JSON file. - - The JSON file must follow the schema defined in "nplinker/schemas/user_strains.json". - An example content of the JSON file: - {"strain_ids": ["strain1", "strain2"]} - - Args: - json_file: Path to the JSON file containing user specified strains. - - Returns: - set[Strain]: A set of user specified strains. - """ - with open(json_file, "r") as f: - json_data = json.load(f) - - # validate json data - validate(instance=json_data, schema=USER_STRAINS_SCHEMA) - - strains = set() - for strain_id in json_data["strain_ids"]: - strains.add(Strain(strain_id)) - - return strains diff --git a/tests/genomics/antismash/test_antismash_downloader.py b/tests/genomics/test_antismash_downloader.py similarity index 100% rename from tests/genomics/antismash/test_antismash_downloader.py rename to tests/genomics/test_antismash_downloader.py diff --git a/tests/genomics/antismash/test_antismash_loader.py b/tests/genomics/test_antismash_loader.py similarity index 97% rename from tests/genomics/antismash/test_antismash_loader.py rename to tests/genomics/test_antismash_loader.py index be6b7408..2cee2b13 100644 --- a/tests/genomics/antismash/test_antismash_loader.py +++ b/tests/genomics/test_antismash_loader.py @@ -1,9 +1,9 @@ import pytest from nplinker.genomics import BGC -from nplinker.genomics import BGCLoaderBase +from nplinker.genomics.abc import BGCLoaderBase from nplinker.genomics.antismash import AntismashBGCLoader from nplinker.genomics.antismash import parse_bgc_genbank -from ... import DATA_DIR +from .. import DATA_DIR class TestAntismashBGCLoader: diff --git a/tests/genomics/test_gcf.py b/tests/genomics/test_gcf.py index 5df2d5ce..ded23e27 100644 --- a/tests/genomics/test_gcf.py +++ b/tests/genomics/test_gcf.py @@ -2,7 +2,7 @@ from nplinker.genomics import BGC from nplinker.genomics import GCF from nplinker.strain import Strain -from nplinker.strain_collection import StrainCollection +from nplinker.strain import StrainCollection @pytest.fixture() diff --git a/tests/genomics/test_mibig_loader.py b/tests/genomics/test_mibig_loader.py index 0ed8fa9b..0cb59103 100644 --- a/tests/genomics/test_mibig_loader.py +++ b/tests/genomics/test_mibig_loader.py @@ -1,7 +1,7 @@ import os.path import pytest from nplinker.genomics import BGC -from nplinker.genomics import BGCLoaderBase +from nplinker.genomics.abc import BGCLoaderBase from nplinker.genomics.mibig import MibigLoader from nplinker.genomics.mibig import download_and_extract_mibig_metadata from nplinker.genomics.mibig import parse_bgc_metadata_json diff --git a/tests/pairedomics/test_podp_antismash_downloader.py b/tests/genomics/test_podp_antismash_downloader.py similarity index 99% rename from tests/pairedomics/test_podp_antismash_downloader.py rename to tests/genomics/test_podp_antismash_downloader.py index ab83b097..4d2259e7 100644 --- a/tests/pairedomics/test_podp_antismash_downloader.py +++ b/tests/genomics/test_podp_antismash_downloader.py @@ -1,9 +1,9 @@ import json from pathlib import Path import pytest +from nplinker.genomics.antismash import GenomeStatus +from nplinker.genomics.antismash import podp_download_and_extract_antismash_data from nplinker.globals import GENOME_STATUS_FILENAME -from nplinker.pairedomics import GenomeStatus -from nplinker.pairedomics import podp_download_and_extract_antismash_data from nplinker.utils import list_files diff --git a/tests/genomics/test_utils.py b/tests/genomics/test_utils.py index ab41cdb1..6e1ec742 100644 --- a/tests/genomics/test_utils.py +++ b/tests/genomics/test_utils.py @@ -3,13 +3,17 @@ import pytest from nplinker.genomics import BGC from nplinker.genomics import GCF -from nplinker.genomics import add_bgc_to_gcf -from nplinker.genomics import add_strain_to_bgc -from nplinker.genomics import generate_mappings_genome_id_bgc_id -from nplinker.genomics import get_mibig_from_gcf +from nplinker.genomics.utils import add_bgc_to_gcf +from nplinker.genomics.utils import add_strain_to_bgc +from nplinker.genomics.utils import extract_mappings_original_genome_id_resolved_genome_id +from nplinker.genomics.utils import extract_mappings_resolved_genome_id_bgc_id +from nplinker.genomics.utils import extract_mappings_strain_id_original_genome_id +from nplinker.genomics.utils import generate_mappings_genome_id_bgc_id +from nplinker.genomics.utils import get_mappings_strain_id_bgc_id +from nplinker.genomics.utils import get_mibig_from_gcf from nplinker.globals import GENOME_BGC_MAPPINGS_FILENAME from nplinker.strain import Strain -from nplinker.strain_collection import StrainCollection +from nplinker.strain import StrainCollection from .. import DATA_DIR @@ -159,3 +163,133 @@ def test_get_mibig_from_gcf(): assert len(mibig_strains_in_use) == 2 assert bgc3 not in mibig_bgcs_in_use assert bgc3.strain not in mibig_strains_in_use + + +def test_extract_mappings_strain_id_original_genome_id(tmp_path): + test_data = { + "genomes": [ + {"genome_label": "strain1", "genome_ID": {"RefSeq_accession": "id1"}}, + {"genome_label": "strain1", "genome_ID": {"RefSeq_accession": "id2"}}, + {"genome_label": "strain2", "genome_ID": {"RefSeq_accession": "id3"}}, + ], + "metabolomics": {"project": {"molecular_network": "01234567890123456789012345678901"}}, + "genome_metabolome_links": [ + {"metabolomics_file": "ftp://example.org/001.mzXML", "genome_label": "strain1"}, + ], + "version": "3", + } + test_file = tmp_path / "test_data.json" + with open(test_file, "w") as f: + json.dump(test_data, f) + + expected_result = { + "strain1": {"id1", "id2"}, + "strain2": {"id3"}, + } + assert extract_mappings_strain_id_original_genome_id(test_file) == expected_result + + +def test_extract_mappings_original_genome_id_resolved_genome_id(tmp_path): + test_data = { + "genome_status": [ + { + "original_id": "id1", + "resolved_refseq_id": "refseq1", + "resolve_attempted": True, + "bgc_path": "", + }, + { + "original_id": "id2", + "resolved_refseq_id": "refseq2", + "resolve_attempted": True, + "bgc_path": "", + }, + { + "original_id": "id3", + "resolved_refseq_id": "refseq3", + "resolve_attempted": True, + "bgc_path": "", + }, + ], + "version": "1.0", + } + test_file = tmp_path / "test_data.json" + with open(test_file, "w") as f: + json.dump(test_data, f) + + expected_result = {"id1": "refseq1", "id2": "refseq2", "id3": "refseq3"} + + assert extract_mappings_original_genome_id_resolved_genome_id(test_file) == expected_result + + +def test_extract_mappings_resolved_genome_id_bgc_id(tmp_path): + test_data = { + "mappings": [ + {"genome_ID": "id1", "BGC_ID": ["bgc1", "bgc2"]}, + {"genome_ID": "id2", "BGC_ID": ["bgc3"]}, + ], + "version": "1.0", + } + test_file = tmp_path / "test_data.json" + with open(test_file, "w") as f: + json.dump(test_data, f) + expected_result = {"id1": {"bgc1", "bgc2"}, "id2": {"bgc3"}} + assert extract_mappings_resolved_genome_id_bgc_id(test_file) == expected_result + + +def test_get_mappings_strain_id_bgc_id(): + # Test case 1: Test with empty mappings + mappings_strain_id_original_genome_id = {} + mappings_original_genome_id_resolved_genome_id = {} + mappings_resolved_genome_id_bgc_id = {} + expected_result = {} + assert ( + get_mappings_strain_id_bgc_id( + mappings_strain_id_original_genome_id, + mappings_original_genome_id_resolved_genome_id, + mappings_resolved_genome_id_bgc_id, + ) + == expected_result + ) + + # Test case 2: Test with one strain and one genome + mappings_strain_id_original_genome_id = {"strain1": {"genome1"}} + mappings_original_genome_id_resolved_genome_id = {"genome1": "resolved_genome1"} + mappings_resolved_genome_id_bgc_id = {"resolved_genome1": {"bgc1"}} + expected_result = {"strain1": {"bgc1"}} + assert ( + get_mappings_strain_id_bgc_id( + mappings_strain_id_original_genome_id, + mappings_original_genome_id_resolved_genome_id, + mappings_resolved_genome_id_bgc_id, + ) + == expected_result + ) + + # Test case 3: Test with multiple strains and genomes + mappings_strain_id_original_genome_id = { + "strain1": {"genome1", "genome2"}, + "strain2": {"genome3"}, + "strain3": {"genome4"}, + } + mappings_original_genome_id_resolved_genome_id = { + "genome1": "resolved_genome1", + "genome2": "resolved_genome1", + "genome3": "resolved_genome2", + "genome4": "", + } + mappings_resolved_genome_id_bgc_id = { + "resolved_genome1": { + "bgc1", + }, + "resolved_genome2": {"bgc2", "bgc3"}, + } + expected_result = {"strain1": {"bgc1"}, "strain2": {"bgc2", "bgc3"}} + assert ( + get_mappings_strain_id_bgc_id( + mappings_strain_id_original_genome_id, + mappings_original_genome_id_resolved_genome_id, + mappings_resolved_genome_id_bgc_id, + ) + == expected_result + ) diff --git a/tests/metabolomics/test_molecular_family.py b/tests/metabolomics/test_molecular_family.py index e5731320..7713e189 100644 --- a/tests/metabolomics/test_molecular_family.py +++ b/tests/metabolomics/test_molecular_family.py @@ -2,7 +2,7 @@ from nplinker.metabolomics import MolecularFamily from nplinker.metabolomics import Spectrum from nplinker.strain import Strain -from nplinker.strain_collection import StrainCollection +from nplinker.strain import StrainCollection @pytest.fixture() diff --git a/tests/metabolomics/test_spectrum.py b/tests/metabolomics/test_spectrum.py index 689f9c25..3c4d22ed 100644 --- a/tests/metabolomics/test_spectrum.py +++ b/tests/metabolomics/test_spectrum.py @@ -2,7 +2,7 @@ import pytest from nplinker.metabolomics import Spectrum from nplinker.strain import Strain -from nplinker.strain_collection import StrainCollection +from nplinker.strain import StrainCollection @pytest.mark.parametrize( diff --git a/tests/metabolomics/test_utils.py b/tests/metabolomics/test_utils.py index 84a11aa9..aa9bd2e4 100644 --- a/tests/metabolomics/test_utils.py +++ b/tests/metabolomics/test_utils.py @@ -1,11 +1,15 @@ +import json import pytest from nplinker.metabolomics import MolecularFamily from nplinker.metabolomics import Spectrum -from nplinker.metabolomics import add_annotation_to_spectrum -from nplinker.metabolomics import add_spectrum_to_mf -from nplinker.metabolomics import add_strains_to_spectrum +from nplinker.metabolomics.utils import add_annotation_to_spectrum +from nplinker.metabolomics.utils import add_spectrum_to_mf +from nplinker.metabolomics.utils import add_strains_to_spectrum +from nplinker.metabolomics.utils import extract_mappings_ms_filename_spectrum_id +from nplinker.metabolomics.utils import extract_mappings_strain_id_ms_filename +from nplinker.metabolomics.utils import get_mappings_strain_id_spectrum_id from nplinker.strain import Strain -from nplinker.strain_collection import StrainCollection +from nplinker.strain import StrainCollection @pytest.fixture @@ -81,3 +85,66 @@ def test_add_spectrum_to_mf(spectra): assert mf0.spectra == {spectra[0], spectra[1]} assert mf1.spectra == {spectra[2]} assert mf2.spectra == set() + + +def test_extract_mappings_strain_id_ms_filename(tmp_path): + test_data = { + "genome_metabolome_links": [ + {"genome_label": "strain1", "metabolomics_file": "http://example.com/file1.mzXML"}, + {"genome_label": "strain1", "metabolomics_file": "http://example.com/file2.mzXML"}, + {"genome_label": "strain2", "metabolomics_file": "http://example.com/file3.mzXML"}, + {"genome_label": "strain3", "metabolomics_file": "http://example.com/file4.mzXML"}, + ], + "genomes": [ + {"genome_label": "strain1", "genome_ID": {"RefSeq_accession": "id1"}}, + ], + "metabolomics": {"project": {"molecular_network": "01234567890123456789012345678901"}}, + "version": "3", + } + test_file = tmp_path / "test_data.json" + with open(test_file, "w") as f: + json.dump(test_data, f) + expected_result = { + "strain1": {"file1.mzXML", "file2.mzXML"}, + "strain2": {"file3.mzXML"}, + "strain3": {"file4.mzXML"}, + } + + assert extract_mappings_strain_id_ms_filename(test_file) == expected_result + + +def test_extract_mappings_ms_filename_spectrum_id(tmp_path): + test_data = "cluster index\tAllFiles\nspec1\tfile1.mzXML:123###\nspec2\tfile2.mzXML:123###\nspec3\tfile2.mzXML:123###file3.mzXML:123###\n" + test_file = tmp_path / "test_data.tsv" + with open(test_file, "w") as f: + f.write(test_data) + expected_result = { + "file1.mzXML": {"spec1"}, + "file2.mzXML": {"spec2", "spec3"}, + "file3.mzXML": {"spec3"}, + } + + assert extract_mappings_ms_filename_spectrum_id(test_file) == expected_result + + +def test_get_mappings_strain_id_spectrum_id(): + mappings_strain_id_ms_filename = { + "strain1": {"file1.mzXML", "file2.mzXML"}, + "strain2": {"file3.mzXML"}, + "strain3": {"file4.mzXML"}, + } + mappings_ms_filename_spectrum_id = { + "file1.mzXML": {"spec1"}, + "file2.mzXML": {"spec2", "spec3"}, + "file3.mzXML": {"spec3"}, + } + + expected_mappings_dict = { + "strain1": {"spec1", "spec2", "spec3"}, + "strain2": {"spec3"}, + } + actual_mappings_dict = get_mappings_strain_id_spectrum_id( + mappings_strain_id_ms_filename, mappings_ms_filename_spectrum_id + ) + + assert actual_mappings_dict == expected_mappings_dict diff --git a/tests/pairedomics/__init__.py b/tests/pairedomics/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/pairedomics/test_strain_mappings_generator.py b/tests/pairedomics/test_strain_mappings_generator.py deleted file mode 100644 index e6515b1a..00000000 --- a/tests/pairedomics/test_strain_mappings_generator.py +++ /dev/null @@ -1,268 +0,0 @@ -import json -from nplinker.pairedomics import extract_mappings_ms_filename_spectrum_id -from nplinker.pairedomics import extract_mappings_original_genome_id_resolved_genome_id -from nplinker.pairedomics import extract_mappings_resolved_genome_id_bgc_id -from nplinker.pairedomics import extract_mappings_strain_id_ms_filename -from nplinker.pairedomics import extract_mappings_strain_id_original_genome_id -from nplinker.pairedomics import get_mappings_strain_id_bgc_id -from nplinker.pairedomics import get_mappings_strain_id_spectrum_id -from nplinker.pairedomics import podp_generate_strain_mappings -from nplinker.strain import Strain -from nplinker.strain_collection import StrainCollection - - -def test_podp_generate_strain_mappings(monkeypatch, tmp_path): - # mock functions called by the tested function - mappings_strain_bgc = { - "strain1": {"bgc1", "bgc2"}, - "strain2": {"bgc3"}, - } - mappings_strain_spectrum = {"strain1": {"spec1", "spec2"}, "strain2": {"spec3"}} - - monkeypatch.setattr( - "nplinker.pairedomics.strain_mappings_generator.extract_mappings_strain_id_original_genome_id", - lambda *args: {}, - ) # any return value is fine - monkeypatch.setattr( - "nplinker.pairedomics.strain_mappings_generator.extract_mappings_original_genome_id_resolved_genome_id", - lambda *args: {}, - ) - monkeypatch.setattr( - "nplinker.pairedomics.strain_mappings_generator.extract_mappings_resolved_genome_id_bgc_id", - lambda *args: {}, - ) - monkeypatch.setattr( - "nplinker.pairedomics.strain_mappings_generator.get_mappings_strain_id_bgc_id", - lambda *args: mappings_strain_bgc, - ) - - monkeypatch.setattr( - "nplinker.pairedomics.strain_mappings_generator.extract_mappings_strain_id_ms_filename", - lambda *args: {}, - ) - monkeypatch.setattr( - "nplinker.pairedomics.strain_mappings_generator.extract_mappings_ms_filename_spectrum_id", - lambda *args: {}, - ) - monkeypatch.setattr( - "nplinker.pairedomics.strain_mappings_generator.get_mappings_strain_id_spectrum_id", - lambda *args: mappings_strain_spectrum, - ) - - # Create the expected - expected_dict = {"strain1": {"bgc1", "bgc2", "spec1", "spec2"}, "strain2": {"bgc3", "spec3"}} - expected_sc = StrainCollection() - for strain_id, ids in expected_dict.items(): - strain = Strain(strain_id) - for iid in ids: - strain.add_alias(iid) - expected_sc.add(strain) - - # Call function to generate strain mappings - output_file = tmp_path / "output.json" - result = podp_generate_strain_mappings( - "dummy_podp_project_file", - "dummy_genome_status_file", - "dummy_genome_bgc_mappings_file", - "dummy_gnps_file_mapping_file", - output_file, - ) - # check returned value - assert isinstance(result, StrainCollection) - assert result == expected_sc - # check output file - sc = StrainCollection.read_json(output_file) - assert sc == expected_sc - - -def test_extract_mappings_strain_id_original_genome_id(tmp_path): - test_data = { - "genomes": [ - {"genome_label": "strain1", "genome_ID": {"RefSeq_accession": "id1"}}, - {"genome_label": "strain1", "genome_ID": {"RefSeq_accession": "id2"}}, - {"genome_label": "strain2", "genome_ID": {"RefSeq_accession": "id3"}}, - ], - "metabolomics": {"project": {"molecular_network": "01234567890123456789012345678901"}}, - "genome_metabolome_links": [ - {"metabolomics_file": "ftp://example.org/001.mzXML", "genome_label": "strain1"}, - ], - "version": "3", - } - test_file = tmp_path / "test_data.json" - with open(test_file, "w") as f: - json.dump(test_data, f) - - expected_result = { - "strain1": {"id1", "id2"}, - "strain2": {"id3"}, - } - assert extract_mappings_strain_id_original_genome_id(test_file) == expected_result - - -def test_extract_mappings_original_genome_id_resolved_genome_id(tmp_path): - test_data = { - "genome_status": [ - { - "original_id": "id1", - "resolved_refseq_id": "refseq1", - "resolve_attempted": True, - "bgc_path": "", - }, - { - "original_id": "id2", - "resolved_refseq_id": "refseq2", - "resolve_attempted": True, - "bgc_path": "", - }, - { - "original_id": "id3", - "resolved_refseq_id": "refseq3", - "resolve_attempted": True, - "bgc_path": "", - }, - ], - "version": "1.0", - } - test_file = tmp_path / "test_data.json" - with open(test_file, "w") as f: - json.dump(test_data, f) - - expected_result = {"id1": "refseq1", "id2": "refseq2", "id3": "refseq3"} - - assert extract_mappings_original_genome_id_resolved_genome_id(test_file) == expected_result - - -def test_extract_mappings_resolved_genome_id_bgc_id(tmp_path): - test_data = { - "mappings": [ - {"genome_ID": "id1", "BGC_ID": ["bgc1", "bgc2"]}, - {"genome_ID": "id2", "BGC_ID": ["bgc3"]}, - ], - "version": "1.0", - } - test_file = tmp_path / "test_data.json" - with open(test_file, "w") as f: - json.dump(test_data, f) - expected_result = {"id1": {"bgc1", "bgc2"}, "id2": {"bgc3"}} - assert extract_mappings_resolved_genome_id_bgc_id(test_file) == expected_result - - -def test_get_mappings_strain_id_bgc_id(): - # Test case 1: Test with empty mappings - mappings_strain_id_original_genome_id = {} - mappings_original_genome_id_resolved_genome_id = {} - mappings_resolved_genome_id_bgc_id = {} - expected_result = {} - assert ( - get_mappings_strain_id_bgc_id( - mappings_strain_id_original_genome_id, - mappings_original_genome_id_resolved_genome_id, - mappings_resolved_genome_id_bgc_id, - ) - == expected_result - ) - - # Test case 2: Test with one strain and one genome - mappings_strain_id_original_genome_id = {"strain1": {"genome1"}} - mappings_original_genome_id_resolved_genome_id = {"genome1": "resolved_genome1"} - mappings_resolved_genome_id_bgc_id = {"resolved_genome1": {"bgc1"}} - expected_result = {"strain1": {"bgc1"}} - assert ( - get_mappings_strain_id_bgc_id( - mappings_strain_id_original_genome_id, - mappings_original_genome_id_resolved_genome_id, - mappings_resolved_genome_id_bgc_id, - ) - == expected_result - ) - - # Test case 3: Test with multiple strains and genomes - mappings_strain_id_original_genome_id = { - "strain1": {"genome1", "genome2"}, - "strain2": {"genome3"}, - "strain3": {"genome4"}, - } - mappings_original_genome_id_resolved_genome_id = { - "genome1": "resolved_genome1", - "genome2": "resolved_genome1", - "genome3": "resolved_genome2", - "genome4": "", - } - mappings_resolved_genome_id_bgc_id = { - "resolved_genome1": { - "bgc1", - }, - "resolved_genome2": {"bgc2", "bgc3"}, - } - expected_result = {"strain1": {"bgc1"}, "strain2": {"bgc2", "bgc3"}} - assert ( - get_mappings_strain_id_bgc_id( - mappings_strain_id_original_genome_id, - mappings_original_genome_id_resolved_genome_id, - mappings_resolved_genome_id_bgc_id, - ) - == expected_result - ) - - -def test_extract_mappings_strain_id_ms_filename(tmp_path): - test_data = { - "genome_metabolome_links": [ - {"genome_label": "strain1", "metabolomics_file": "http://example.com/file1.mzXML"}, - {"genome_label": "strain1", "metabolomics_file": "http://example.com/file2.mzXML"}, - {"genome_label": "strain2", "metabolomics_file": "http://example.com/file3.mzXML"}, - {"genome_label": "strain3", "metabolomics_file": "http://example.com/file4.mzXML"}, - ], - "genomes": [ - {"genome_label": "strain1", "genome_ID": {"RefSeq_accession": "id1"}}, - ], - "metabolomics": {"project": {"molecular_network": "01234567890123456789012345678901"}}, - "version": "3", - } - test_file = tmp_path / "test_data.json" - with open(test_file, "w") as f: - json.dump(test_data, f) - expected_result = { - "strain1": {"file1.mzXML", "file2.mzXML"}, - "strain2": {"file3.mzXML"}, - "strain3": {"file4.mzXML"}, - } - - assert extract_mappings_strain_id_ms_filename(test_file) == expected_result - - -def test_extract_mappings_ms_filename_spectrum_id(tmp_path): - test_data = "cluster index\tAllFiles\nspec1\tfile1.mzXML:123###\nspec2\tfile2.mzXML:123###\nspec3\tfile2.mzXML:123###file3.mzXML:123###\n" - test_file = tmp_path / "test_data.tsv" - with open(test_file, "w") as f: - f.write(test_data) - expected_result = { - "file1.mzXML": {"spec1"}, - "file2.mzXML": {"spec2", "spec3"}, - "file3.mzXML": {"spec3"}, - } - - assert extract_mappings_ms_filename_spectrum_id(test_file) == expected_result - - -def test_get_mappings_strain_id_spectrum_id(): - mappings_strain_id_ms_filename = { - "strain1": {"file1.mzXML", "file2.mzXML"}, - "strain2": {"file3.mzXML"}, - "strain3": {"file4.mzXML"}, - } - mappings_ms_filename_spectrum_id = { - "file1.mzXML": {"spec1"}, - "file2.mzXML": {"spec2", "spec3"}, - "file3.mzXML": {"spec3"}, - } - - expected_mappings_dict = { - "strain1": {"spec1", "spec2", "spec3"}, - "strain2": {"spec3"}, - } - actual_mappings_dict = get_mappings_strain_id_spectrum_id( - mappings_strain_id_ms_filename, mappings_ms_filename_spectrum_id - ) - - assert actual_mappings_dict == expected_mappings_dict diff --git a/tests/scoring/conftest.py b/tests/scoring/conftest.py index 1f279c49..67a6a774 100644 --- a/tests/scoring/conftest.py +++ b/tests/scoring/conftest.py @@ -7,7 +7,7 @@ from nplinker.scoring.linking import DataLinks from nplinker.scoring.linking import LinkFinder from nplinker.strain import Strain -from nplinker.strain_collection import StrainCollection +from nplinker.strain import StrainCollection @fixture(scope="session") diff --git a/tests/genomics/antismash/__init__.py b/tests/strain/__init__.py similarity index 100% rename from tests/genomics/antismash/__init__.py rename to tests/strain/__init__.py diff --git a/tests/test_strain.py b/tests/strain/test_strain.py similarity index 100% rename from tests/test_strain.py rename to tests/strain/test_strain.py diff --git a/tests/test_strain_collection.py b/tests/strain/test_strain_collection.py similarity index 99% rename from tests/test_strain_collection.py rename to tests/strain/test_strain_collection.py index c84be529..50b844c4 100644 --- a/tests/test_strain_collection.py +++ b/tests/strain/test_strain_collection.py @@ -1,7 +1,7 @@ import json import pytest from nplinker.strain import Strain -from nplinker.strain_collection import StrainCollection +from nplinker.strain import StrainCollection @pytest.fixture diff --git a/tests/strain/test_utils.py b/tests/strain/test_utils.py new file mode 100644 index 00000000..acfae43d --- /dev/null +++ b/tests/strain/test_utils.py @@ -0,0 +1,90 @@ +import json +import pytest +from nplinker.strain import Strain +from nplinker.strain import StrainCollection +from nplinker.strain.utils import load_user_strains +from nplinker.strain.utils import podp_generate_strain_mappings + + +@pytest.fixture +def user_strains_file(tmp_path): + """Create a JSON file containing user specified strains.""" + data = { + "strain_ids": ["strain1", "strain2", "strain3"], + } + file_path = tmp_path / "user_strains.json" + with open(file_path, "w") as f: + json.dump(data, f) + return file_path + + +def test_load_user_strains(user_strains_file): + """Test load_user_strains function.""" + actual = load_user_strains(user_strains_file) + expected = {Strain("strain1"), Strain("strain2"), Strain("strain3")} + assert actual == expected + + +def test_podp_generate_strain_mappings(monkeypatch, tmp_path): + # mock functions called by the tested function + mappings_strain_bgc = { + "strain1": {"bgc1", "bgc2"}, + "strain2": {"bgc3"}, + } + mappings_strain_spectrum = {"strain1": {"spec1", "spec2"}, "strain2": {"spec3"}} + + # monkeypatch requires the mocked function is in the same scope of the tested function + monkeypatch.setattr( + "nplinker.strain.utils.extract_mappings_strain_id_original_genome_id", + lambda *args: {}, + ) # any return value is fine + monkeypatch.setattr( + "nplinker.strain.utils.extract_mappings_original_genome_id_resolved_genome_id", + lambda *args: {}, + ) + monkeypatch.setattr( + "nplinker.strain.utils.extract_mappings_resolved_genome_id_bgc_id", + lambda *args: {}, + ) + monkeypatch.setattr( + "nplinker.strain.utils.get_mappings_strain_id_bgc_id", + lambda *args: mappings_strain_bgc, + ) + + monkeypatch.setattr( + "nplinker.strain.utils.extract_mappings_strain_id_ms_filename", + lambda *args: {}, + ) + monkeypatch.setattr( + "nplinker.strain.utils.extract_mappings_ms_filename_spectrum_id", + lambda *args: {}, + ) + monkeypatch.setattr( + "nplinker.strain.utils.get_mappings_strain_id_spectrum_id", + lambda *args: mappings_strain_spectrum, + ) + + # Create the expected + expected_dict = {"strain1": {"bgc1", "bgc2", "spec1", "spec2"}, "strain2": {"bgc3", "spec3"}} + expected_sc = StrainCollection() + for strain_id, ids in expected_dict.items(): + strain = Strain(strain_id) + for iid in ids: + strain.add_alias(iid) + expected_sc.add(strain) + + # Call function to generate strain mappings + output_file = tmp_path / "output.json" + result = podp_generate_strain_mappings( + "dummy_podp_project_file", + "dummy_genome_status_file", + "dummy_genome_bgc_mappings_file", + "dummy_gnps_file_mapping_file", + output_file, + ) + # check returned value + assert isinstance(result, StrainCollection) + assert result == expected_sc + # check output file + sc = StrainCollection.read_json(output_file) + assert sc == expected_sc diff --git a/tests/test_gnps_loader.py b/tests/test_gnps_loader.py deleted file mode 100644 index 5ebf5ca1..00000000 --- a/tests/test_gnps_loader.py +++ /dev/null @@ -1,6 +0,0 @@ -from nplinker.metabolomics.gnps.gnps_loader import GNPSLoader - - -def test_default(): - sut = GNPSLoader() - assert sut is not None diff --git a/tests/test_strain_loader.py b/tests/test_strain_loader.py deleted file mode 100644 index f3042790..00000000 --- a/tests/test_strain_loader.py +++ /dev/null @@ -1,23 +0,0 @@ -import json -import pytest -from nplinker.strain import Strain -from nplinker.strain_loader import load_user_strains - - -@pytest.fixture -def user_strains_file(tmp_path): - """Create a JSON file containing user specified strains.""" - data = { - "strain_ids": ["strain1", "strain2", "strain3"], - } - file_path = tmp_path / "user_strains.json" - with open(file_path, "w") as f: - json.dump(data, f) - return file_path - - -def test_load_user_strains(user_strains_file): - """Test load_user_strains function.""" - actual = load_user_strains(user_strains_file) - expected = {Strain("strain1"), Strain("strain2"), Strain("strain3")} - assert actual == expected