Skip to content

Commit

Permalink
Restructure codebase
Browse files Browse the repository at this point in the history
This PR relocates code files (and, in some cases, individual pieces of code) to make the codebase structure more logical.

Changes:
- move runbigscape.py to bigscape folder
- move podp antismash downloader to antismash folder
- rename folder pairedomics to strain
- move strain related modules to strain folder
- move utils functions to related modules
- remove unused and empty class GNPSLoader
  • Loading branch information
CunliangGeng authored Mar 7, 2024
1 parent d79b15c commit 1948b8d
Show file tree
Hide file tree
Showing 42 changed files with 723 additions and 764 deletions.
8 changes: 4 additions & 4 deletions src/nplinker/arranger.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,19 +6,19 @@
from jsonschema import validate
import nplinker.globals as globals
from nplinker.config import config
from nplinker.genomics import generate_mappings_genome_id_bgc_id
from nplinker.genomics.antismash import podp_download_and_extract_antismash_data
from nplinker.genomics.bigscape.runbigscape import run_bigscape
from nplinker.genomics.mibig import download_and_extract_mibig_metadata
from nplinker.genomics.utils import generate_mappings_genome_id_bgc_id
from nplinker.globals import GENOME_BGC_MAPPINGS_FILENAME
from nplinker.globals import GENOME_STATUS_FILENAME
from nplinker.globals import STRAIN_MAPPINGS_FILENAME
from nplinker.metabolomics.gnps import GNPSDownloader
from nplinker.metabolomics.gnps import GNPSExtractor
from nplinker.pairedomics import podp_download_and_extract_antismash_data
from nplinker.pairedomics.runbigscape import run_bigscape
from nplinker.pairedomics.strain_mappings_generator import podp_generate_strain_mappings
from nplinker.schemas import STRAIN_MAPPINGS_SCHEMA
from nplinker.schemas import USER_STRAINS_SCHEMA
from nplinker.schemas import validate_podp_json
from nplinker.strain.utils import podp_generate_strain_mappings
from nplinker.utils import download_url
from nplinker.utils import list_dirs
from nplinker.utils import list_files
Expand Down
10 changes: 0 additions & 10 deletions src/nplinker/genomics/__init__.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,11 @@
import logging
from .abc import BGCLoaderBase
from .bgc import BGC
from .gcf import GCF
from .utils import add_bgc_to_gcf
from .utils import add_strain_to_bgc
from .utils import generate_mappings_genome_id_bgc_id
from .utils import get_mibig_from_gcf


logging.getLogger(__name__).addHandler(logging.NullHandler())

__all__ = [
"BGCLoaderBase",
"BGC",
"GCF",
"add_bgc_to_gcf",
"add_strain_to_bgc",
"generate_mappings_genome_id_bgc_id",
"get_mibig_from_gcf",
]
12 changes: 11 additions & 1 deletion src/nplinker/genomics/antismash/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,18 @@
from .antismash_downloader import download_and_extract_antismash_data
from .antismash_loader import AntismashBGCLoader
from .antismash_loader import parse_bgc_genbank
from .podp_antismash_downloader import GenomeStatus
from .podp_antismash_downloader import get_best_available_genome_id
from .podp_antismash_downloader import podp_download_and_extract_antismash_data


logging.getLogger(__name__).addHandler(logging.NullHandler())

__all__ = ["AntismashBGCLoader", "parse_bgc_genbank", "download_and_extract_antismash_data"]
__all__ = [
"download_and_extract_antismash_data",
"AntismashBGCLoader",
"parse_bgc_genbank",
"GenomeStatus",
"get_best_available_genome_id",
"podp_download_and_extract_antismash_data",
]
3 changes: 2 additions & 1 deletion src/nplinker/genomics/bigscape/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import logging
from .bigscape_loader import BigscapeGCFLoader
from .runbigscape import run_bigscape


logging.getLogger(__name__).addHandler(logging.NullHandler())

__all__ = ["BigscapeGCFLoader"]
__all__ = ["BigscapeGCFLoader", "run_bigscape"]
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import subprocess
import sys
from os import PathLike
from ..logconfig import LogConfig
from ...logconfig import LogConfig


logger = LogConfig.getLogger(__name__)
Expand Down
2 changes: 1 addition & 1 deletion src/nplinker/genomics/gcf.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from __future__ import annotations
from typing import TYPE_CHECKING
from nplinker.logconfig import LogConfig
from nplinker.strain_collection import StrainCollection
from nplinker.strain import StrainCollection


if TYPE_CHECKING:
Expand Down
130 changes: 129 additions & 1 deletion src/nplinker/genomics/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,12 @@
from nplinker.globals import GENOME_BGC_MAPPINGS_FILENAME
from nplinker.logconfig import LogConfig
from nplinker.schemas import GENOME_BGC_MAPPINGS_SCHEMA
from nplinker.strain_collection import StrainCollection
from nplinker.schemas import validate_podp_json
from nplinker.strain import StrainCollection
from nplinker.utils import list_dirs
from nplinker.utils import list_files
from ..genomics.antismash.podp_antismash_downloader import GenomeStatus
from ..genomics.antismash.podp_antismash_downloader import get_best_available_genome_id
from .bgc import BGC
from .gcf import GCF

Expand Down Expand Up @@ -179,3 +182,128 @@ def get_mibig_from_gcf(gcfs: list[GCF]) -> tuple[list[BGC], StrainCollection]:
if bgc.strain is not None:
mibig_strains_in_use.add(bgc.strain)
return mibig_bgcs_in_use, mibig_strains_in_use


# ------------------------------------------------------------------------------
# Functions to extract mappings for genomics side:
# strain_id <-> original_genome_id <-> resolved_genome_id <-> bgc_id
# ------------------------------------------------------------------------------
def extract_mappings_strain_id_original_genome_id(
    podp_project_json_file: str | PathLike,
) -> dict[str, set[str]]:
    """Build the "strain id <-> original genome id" mappings from a PODP project file.

    Args:
        podp_project_json_file: Path to the PODP project JSON file.

    Returns:
        A dict keyed by strain id (taken from each genome's `genome_label`);
        each value is the set of original genome ids found for that strain.

    Notes:
        The `podp_project_json_file` is the project JSON file downloaded from
        the PODP platform. For example, for project MSV000079284, its JSON file
        is https://pairedomicsdata.bioinformatics.nl/api/projects/4b29ddc3-26d0-40d7-80c5-44fb6631dbf9.4.
    """
    with open(podp_project_json_file, "r") as json_file:
        project = json.load(json_file)

    validate_podp_json(project)

    strain_to_genome_ids = {}
    for genome in project["genomes"]:
        label = genome["genome_label"]
        best_id = get_best_available_genome_id(genome["genome_ID"])
        if best_id is None:
            # No usable genome id for this record: report it and move on.
            logger.warning("Failed to extract genome ID from genome with label %s", label)
            continue
        # Several records may share a label, so genome ids accumulate per strain.
        strain_to_genome_ids.setdefault(label, set()).add(best_id)
    return strain_to_genome_ids


def extract_mappings_original_genome_id_resolved_genome_id(
    genome_status_json_file: str | PathLike,
) -> dict[str, str]:
    """Build the "original_genome_id <-> resolved_genome_id" mappings.

    Args:
        genome_status_json_file: Path to the genome status JSON file.

    Returns:
        A dict keyed by original genome id whose value is the resolved genome
        id (the `resolved_refseq_id` of the matching genome status entry).

    Notes:
        The `genome_status_json_file` is usually generated by the
        `podp_download_and_extract_antismash_data` function with
        a default file name defined in `nplinker.globals.GENOME_STATUS_FILENAME`.
    """
    statuses = GenomeStatus.read_json(genome_status_json_file)

    resolved_by_original = {}
    for status in statuses.values():
        original_id = status.original_id
        resolved_by_original[original_id] = status.resolved_refseq_id
    return resolved_by_original


def extract_mappings_resolved_genome_id_bgc_id(
    genome_bgc_mappings_file: str | PathLike,
) -> dict[str, set[str]]:
    """Build the "resolved_genome_id <-> bgc_id" mappings.

    Args:
        genome_bgc_mappings_file: Path to the genome BGC mappings JSON file.

    Returns:
        A dict keyed by resolved genome id; each value is the set of BGC ids
        listed for that genome.

    Notes:
        The `genome_bgc_mappings_file` is usually generated by the
        `generate_mappings_genome_id_bgc_id` function with a default file name
        defined in `nplinker.globals.GENOME_BGC_MAPPINGS_FILENAME`.
    """
    with open(genome_bgc_mappings_file, "r") as json_file:
        content = json.load(json_file)

    # Reject files that do not follow the expected schema before reading them.
    validate(content, GENOME_BGC_MAPPINGS_SCHEMA)

    genome_to_bgc_ids = {}
    for entry in content["mappings"]:
        genome_id = entry["genome_ID"]
        genome_to_bgc_ids[genome_id] = set(entry["BGC_ID"])
    return genome_to_bgc_ids


def get_mappings_strain_id_bgc_id(
    mappings_strain_id_original_genome_id: dict[str, set[str]],
    mappings_original_genome_id_resolved_genome_id: dict[str, str],
    mappings_resolved_genome_id_bgc_id: dict[str, set[str]],
) -> dict[str, set[str]]:
    """Chain the three genomics mappings into "strain_id <-> bgc_id".

    Args:
        mappings_strain_id_original_genome_id: Mappings
            "strain_id <-> original_genome_id".
        mappings_original_genome_id_resolved_genome_id: Mappings
            "original_genome_id <-> resolved_genome_id".
        mappings_resolved_genome_id_bgc_id: Mappings
            "resolved_genome_id <-> bgc_id".

    Returns:
        A dict keyed by strain id whose value is the set of BGC ids reachable
        through that strain's genomes. Strains that end up with no BGC ids
        are left out.

    Raises:
        KeyError: If an original genome id is missing from
            `mappings_original_genome_id_resolved_genome_id`.

    See Also:
        `extract_mappings_strain_id_original_genome_id`: Extract mappings
            "strain_id <-> original_genome_id".
        `extract_mappings_original_genome_id_resolved_genome_id`: Extract mappings
            "original_genome_id <-> resolved_genome_id".
        `extract_mappings_resolved_genome_id_bgc_id`: Extract mappings
            "resolved_genome_id <-> bgc_id".
    """
    strain_to_bgc_ids = {}
    for strain, genome_ids in mappings_strain_id_original_genome_id.items():
        collected = set()
        for genome_id in genome_ids:
            resolved = mappings_original_genome_id_resolved_genome_id[genome_id]
            found = mappings_resolved_genome_id_bgc_id.get(resolved)
            if found is None:
                # The resolved genome has no BGC entry; nothing to add.
                continue
            collected.update(found)
        if collected:
            strain_to_bgc_ids[strain] = collected
    return strain_to_bgc_ids
16 changes: 8 additions & 8 deletions src/nplinker/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,27 +2,27 @@
from deprecated import deprecated
from nplinker import globals
from nplinker.config import config
from nplinker.genomics import add_bgc_to_gcf
from nplinker.genomics import add_strain_to_bgc
from nplinker.genomics import get_mibig_from_gcf
from nplinker.genomics.antismash import AntismashBGCLoader
from nplinker.genomics.bigscape import BigscapeGCFLoader
from nplinker.genomics.mibig import MibigLoader
from nplinker.genomics.utils import add_bgc_to_gcf
from nplinker.genomics.utils import add_strain_to_bgc
from nplinker.genomics.utils import get_mibig_from_gcf
from nplinker.globals import GNPS_ANNOTATIONS_FILENAME
from nplinker.globals import GNPS_DEFAULT_PATH
from nplinker.globals import GNPS_MOLECULAR_FAMILY_FILENAME
from nplinker.globals import GNPS_SPECTRA_FILENAME
from nplinker.globals import STRAIN_MAPPINGS_FILENAME
from nplinker.globals import STRAINS_SELECTED_FILENAME
from nplinker.logconfig import LogConfig
from nplinker.metabolomics import add_annotation_to_spectrum
from nplinker.metabolomics import add_spectrum_to_mf
from nplinker.metabolomics import add_strains_to_spectrum
from nplinker.metabolomics.gnps import GNPSAnnotationLoader
from nplinker.metabolomics.gnps import GNPSMolecularFamilyLoader
from nplinker.metabolomics.gnps import GNPSSpectrumLoader
from nplinker.strain_collection import StrainCollection
from nplinker.strain_loader import load_user_strains
from nplinker.metabolomics.utils import add_annotation_to_spectrum
from nplinker.metabolomics.utils import add_spectrum_to_mf
from nplinker.metabolomics.utils import add_strains_to_spectrum
from nplinker.strain import StrainCollection
from nplinker.strain.utils import load_user_strains


try:
Expand Down
6 changes: 0 additions & 6 deletions src/nplinker/metabolomics/__init__.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,6 @@
import logging
from .molecular_family import MolecularFamily
from .spectrum import Spectrum
from .utils import add_annotation_to_spectrum
from .utils import add_spectrum_to_mf
from .utils import add_strains_to_spectrum


logging.getLogger(__name__).addHandler(logging.NullHandler())
Expand All @@ -12,7 +9,4 @@
__all__ = [
"MolecularFamily",
"Spectrum",
"add_annotation_to_spectrum",
"add_spectrum_to_mf",
"add_strains_to_spectrum",
]
12 changes: 8 additions & 4 deletions src/nplinker/metabolomics/abc.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,24 @@
from abc import ABC
from abc import abstractmethod
from collections.abc import Sequence
from nplinker.metabolomics import MolecularFamily
from nplinker.metabolomics import Spectrum
from typing import TYPE_CHECKING


if TYPE_CHECKING:
from .molecular_family import MolecularFamily
from .spectrum import Spectrum


class SpectrumLoaderBase(ABC):
@property
@abstractmethod
def spectra(self) -> Sequence[Spectrum]:
def spectra(self) -> Sequence["Spectrum"]:
...


class MolecularFamilyLoaderBase(ABC):
@abstractmethod
def get_mfs(self, keep_singleton: bool) -> Sequence[MolecularFamily]:
def get_mfs(self, keep_singleton: bool) -> Sequence["MolecularFamily"]:
"""Get MolecularFamily objects.
Args:
Expand Down
3 changes: 0 additions & 3 deletions src/nplinker/metabolomics/gnps/gnps_loader.py

This file was deleted.

Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import csv
from os import PathLike
from nplinker.metabolomics import MolecularFamily
from nplinker.metabolomics.abc import MolecularFamilyLoaderBase
from nplinker.utils import is_file_format
from ..molecular_family import MolecularFamily


class GNPSMolecularFamilyLoader(MolecularFamilyLoaderBase):
Expand Down
4 changes: 2 additions & 2 deletions src/nplinker/metabolomics/molecular_family.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from __future__ import annotations
from typing import TYPE_CHECKING
from nplinker.strain import Strain
from nplinker.strain_collection import StrainCollection
from ..strain.strain import Strain
from ..strain.strain_collection import StrainCollection


if TYPE_CHECKING:
Expand Down
2 changes: 1 addition & 1 deletion src/nplinker/metabolomics/spectrum.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from typing import TYPE_CHECKING
import numpy as np
from nplinker.strain import Strain
from nplinker.strain_collection import StrainCollection
from nplinker.strain import StrainCollection


if TYPE_CHECKING:
Expand Down
Loading

0 comments on commit 1948b8d

Please sign in to comment.