diff --git a/src/nplinker/genomics/__init__.py b/src/nplinker/genomics/__init__.py index 119bd8f2..c89bb4c8 100644 --- a/src/nplinker/genomics/__init__.py +++ b/src/nplinker/genomics/__init__.py @@ -5,6 +5,7 @@ from .utils import add_bgc_to_gcf from .utils import add_strain_to_bgc from .utils import generate_mappings_genome_id_bgc_id +from .utils import get_mibig_from_gcf logging.getLogger(__name__).addHandler(logging.NullHandler()) @@ -13,7 +14,8 @@ "BGCLoaderBase", "BGC", "GCF", - "generate_mappings_genome_id_bgc_id", "add_bgc_to_gcf", "add_strain_to_bgc", + "generate_mappings_genome_id_bgc_id", + "get_mibig_from_gcf", ] diff --git a/src/nplinker/genomics/mibig/mibig_loader.py b/src/nplinker/genomics/mibig/mibig_loader.py index abe65bb2..cd3c6238 100644 --- a/src/nplinker/genomics/mibig/mibig_loader.py +++ b/src/nplinker/genomics/mibig/mibig_loader.py @@ -17,6 +17,9 @@ def __init__(self, data_dir: str): MIBiG metadata file (json) contains annotations/metadata information for each BGC. See https://mibig.secondarymetabolites.org/download. + The MIBiG accession is used as BGC id and strain name. The loaded BGC + objects have a Strain object as their strain attribute (i.e. `BGC.strain`). + Args: data_dir(str): Path to the directory of MIBiG metadata json files """ @@ -25,16 +28,6 @@ def __init__(self, data_dir: str): self._metadata_dict = self._parse_metadatas() self._bgcs = self._parse_bgcs() - def get_strain_bgc_mapping(self) -> dict[str, str]: - """Get the mapping from strain to BGC. - - Note that for MIBiG BGC, same value is used for strain name and BGC id. - - Returns: - dict[str, str]: key is strain name, value is BGC id. - """ - return {bid: bid for bid in self._file_dict} - def get_files(self) -> dict[str, str]: """Get the path of all MIBiG metadata json files. @@ -46,8 +39,7 @@ def get_files(self) -> dict[str, str]: @staticmethod def parse_data_dir(data_dir: str) -> dict[str, str]: - """Parse metadata directory and return pathes to all metadata json - files. 
+ """Parse metadata directory and return paths to all metadata json files. Args: data_dir(str): path to the directory of MIBiG metadata json files @@ -88,6 +80,10 @@ def _parse_metadatas(self) -> dict[str, MibigMetadata]: def get_bgcs(self) -> list[BGC]: """Get BGC objects. + The BGC objects use the MIBiG accession as id and have a Strain object as + their strain attribute (i.e. `BGC.strain`), where the name of the Strain + object is also the MIBiG accession. + Returns: list[str, BGC]: a list of :class:`nplinker.genomics.BGC` objects """ @@ -105,6 +101,9 @@ def _parse_bgcs(self) -> list[BGC]: def parse_bgc_metadata_json(file: str) -> BGC: """Parse MIBiG metadata file and return BGC object. + Note that the MIBiG accession is used as the BGC id and strain name. The BGC + object has a Strain object as its strain attribute. + Args: file(str): Path to the MIBiG metadata json file diff --git a/src/nplinker/genomics/utils.py b/src/nplinker/genomics/utils.py index 5187bca8..4849eea8 100644 --- a/src/nplinker/genomics/utils.py +++ b/src/nplinker/genomics/utils.py @@ -158,3 +158,25 @@ def add_bgc_to_gcf( f"{len(gcf_missing_bgc)} GCF objects have missing BGC objects." ) return gcf_with_bgc, gcf_without_bgc, gcf_missing_bgc + + +def get_mibig_from_gcf(gcfs: list[GCF]) -> tuple[list[BGC], StrainCollection]: + """Get MIBiG BGCs and strains from GCF objects. + + Args: + gcfs(list[GCF]): A list of GCF objects. + + Returns: + tuple[list[BGC], StrainCollection]: The first is a list of MIBiG BGC + objects used in the GCFs; the second is a StrainCollection object + that contains only the Strain objects of those MIBiG BGCs. 
+ """ + mibig_bgcs_in_use = [] + mibig_strains_in_use = StrainCollection() + for gcf in gcfs: + for bgc in gcf.bgcs: + if bgc.is_mibig(): + mibig_bgcs_in_use.append(bgc) + if bgc.strain is not None: + mibig_strains_in_use.add(bgc.strain) + return mibig_bgcs_in_use, mibig_strains_in_use diff --git a/src/nplinker/loader.py b/src/nplinker/loader.py index 4ceda3ac..ccdedb39 100644 --- a/src/nplinker/loader.py +++ b/src/nplinker/loader.py @@ -8,6 +8,7 @@ from nplinker.genomics import add_bgc_to_gcf from nplinker.genomics import add_strain_to_bgc from nplinker.genomics import generate_mappings_genome_id_bgc_id +from nplinker.genomics import get_mibig_from_gcf from nplinker.genomics.antismash import AntismashBGCLoader from nplinker.genomics.bigscape import BigscapeGCFLoader from nplinker.genomics.mibig import MibigLoader @@ -26,7 +27,6 @@ from nplinker.pairedomics.downloader import PODPDownloader from nplinker.pairedomics.runbigscape import run_bigscape from nplinker.pairedomics.strain_mappings_generator import podp_generate_strain_mappings -from nplinker.strain import Strain from nplinker.strain_collection import StrainCollection from nplinker.strain_loader import load_user_strains @@ -124,7 +124,7 @@ def __init__(self, config_data): ) self.bgcs, self.gcfs, self.spectra, self.molfams = [], [], [], [] self.mibig_bgcs = [] - self._mibig_strain_bgc_mapping = {} + self.mibig_strains_in_use = StrainCollection() self.product_types = [] self.strains = StrainCollection() self.webapp_scoring_cutoff = self._config_webapp.get( @@ -185,10 +185,6 @@ def generate_strain_mappings(self): ) def load(self): - if self._use_mibig: - if not self._load_mibig(): - return False - if not self._load_strain_mappings(): return False @@ -198,6 +194,9 @@ def load(self): if not self._load_genomics(): return False + # set self.strains with all strains from input plus mibig strains in use + self.strains = self.strains + self.mibig_strains_in_use + if len(self.strains) == 0: raise Exception(f"Failed to 
find *ANY* strains, missing {STRAIN_MAPPINGS_FILENAME}?") @@ -356,12 +355,6 @@ def _validate_paths(self): if not os.path.exists(str(f)): logger.warning('Optional file/directory "%s" does not exist', f) - def _load_mibig(self): - mibig_bgc_loader = MibigLoader(self.mibig_json_dir) - self.mibig_bgcs = mibig_bgc_loader.get_bgcs() - self._mibig_strain_bgc_mapping = mibig_bgc_loader.get_strain_bgc_mapping() - return True - def _load_strain_mappings(self): # 1. load strain mappings sc = StrainCollection.read_json(self.strain_mappings_file) @@ -377,14 +370,7 @@ def _load_strain_mappings(self): logger.info(f"Loaded {len(user_strains)} user specified strains.") self.strains.filter(user_strains) - # 3. load MiBIG strain mappings - if self._mibig_strain_bgc_mapping: - for k, v in self._mibig_strain_bgc_mapping.items(): - strain = Strain(k) - strain.add_alias(v) - self.strains.add(strain) logger.info("Loaded {} Strain objects in total".format(len(self.strains))) - return True def _load_metabolomics(self): @@ -427,7 +413,7 @@ def _load_genomics(self): The attribute of `self.bgcs` is set to the loaded BGC objects that have the Strain object added (i.e. `BGC.strain` updated). If a BGC object does not have the Strain object, it is - not added to `self.bgcs`. + not added to `self.bgcs`. For MIBiG BGC objects, only those in use are added to `self.bgcs`. The attribute of `self.gcfs` is set to the loaded GCF objects that have the Strain objects added (i.e. `GCF._strains` updated). 
This means only BGC objects with updated Strain objects @@ -435,27 +421,39 @@ def _load_genomics(self): """ logger.debug("\nLoading genomics data starts...") - # Step 1: load all BGC objects + # Step 1: load antismash BGC objects & add strain info logger.debug("Parsing AntiSMASH directory...") antismash_bgcs = AntismashBGCLoader(self.antismash_dir).get_bgcs() - raw_bgcs = antismash_bgcs + self.mibig_bgcs + antismash_bgcs_with_strain, _ = add_strain_to_bgc(self.strains, antismash_bgcs) - # Step 2: load all GCF objects + # Step 2: load mibig BGC objects (having strain info) + if self._use_mibig: + self.mibig_bgcs = MibigLoader(self.mibig_json_dir).get_bgcs() + + # Step 3: get all BGC objects with strain info + all_bgcs_with_strain = antismash_bgcs_with_strain + self.mibig_bgcs + + # Step 4: load all GCF objects # TODO: create a config for "bigscape_cluster_file" and discard "bigscape_dir" and "bigscape_cutoff"? bigscape_cluster_file = ( Path(self.bigscape_dir) / "mix" / f"mix_clustering_c0.{self._bigscape_cutoff:02d}.tsv" ) raw_gcfs = BigscapeGCFLoader(bigscape_cluster_file).get_gcfs() - # Step 3: add Strain object to BGC - bgc_with_strain, _ = add_strain_to_bgc(self.strains, raw_bgcs) + # Step 5: add BGC objects to GCF + all_gcfs_with_bgc, _, _ = add_bgc_to_gcf(all_bgcs_with_strain, raw_gcfs) - # Step 4: add BGC objects to GCF - gcf_with_bgc, _, _ = add_bgc_to_gcf(bgc_with_strain, raw_gcfs) + # Step 6: get mibig bgcs and strains in use from GCFs + mibig_strains_in_use = StrainCollection() + if self._use_mibig: + mibig_bgcs_in_use, mibig_strains_in_use = get_mibig_from_gcf(all_gcfs_with_bgc) + else: + mibig_bgcs_in_use = [] - # Step 5: set attributes of self.bgcs and self.gcfs with valid objects - self.bgcs = bgc_with_strain - self.gcfs = gcf_with_bgc + # Step 7: set attributes with valid objects + self.bgcs = antismash_bgcs_with_strain + mibig_bgcs_in_use + self.gcfs = all_gcfs_with_bgc + self.mibig_strains_in_use = mibig_strains_in_use logger.debug("Loading 
genomics data completed\n") return True diff --git a/tests/genomics/test_mibig_loader.py b/tests/genomics/test_mibig_loader.py index bd1d536f..0ed8fa9b 100644 --- a/tests/genomics/test_mibig_loader.py +++ b/tests/genomics/test_mibig_loader.py @@ -30,13 +30,6 @@ def test_abc(self, loader): def test_init(self, loader, data_dir): assert loader.data_dir == data_dir - def test_get_strain_bgc_mapping(self, loader): - mapping = loader.get_strain_bgc_mapping() - assert isinstance(mapping, dict) - assert len(mapping) == 2502 - for bid in mapping: - assert bid == mapping[bid] - def test_get_files(self, loader): files = loader.get_files() assert isinstance(files, dict) diff --git a/tests/genomics/test_utils.py b/tests/genomics/test_utils.py index 5e1cf0f3..ab41cdb1 100644 --- a/tests/genomics/test_utils.py +++ b/tests/genomics/test_utils.py @@ -6,6 +6,7 @@ from nplinker.genomics import add_bgc_to_gcf from nplinker.genomics import add_strain_to_bgc from nplinker.genomics import generate_mappings_genome_id_bgc_id +from nplinker.genomics import get_mibig_from_gcf from nplinker.globals import GENOME_BGC_MAPPINGS_FILENAME from nplinker.strain import Strain from nplinker.strain_collection import StrainCollection @@ -135,3 +136,26 @@ def test_add_bgc_to_gcf(bgcs): assert gcf_with_bgc[0].bgcs == {bgcs[0], bgcs[1]} assert gcf_with_bgc[1].bgcs == {bgcs[2]} assert gcf_without_bgc[0].bgcs == set() + + +def test_get_mibig_from_gcf(): + """Test get_mibig_from_gcf function.""" + bgc1 = BGC("BGC_01", "NPR") + bgc1.strain = Strain("BGC_01") + bgc2 = BGC("BGC_02", "Alkaloid") + bgc2.strain = Strain("BGC_02") + bgc3 = BGC("antismash_c", "Polyketide") + bgc3.strain = Strain("strain_01") + gcf1 = GCF("1") + gcf1.add_bgc(bgc1) + gcf2 = GCF("2") + gcf2.add_bgc(bgc2) + gcf2.add_bgc(bgc3) + gcfs = [gcf1, gcf2] + + mibig_bgcs_in_use, mibig_strains_in_use = get_mibig_from_gcf(gcfs) + + assert len(mibig_bgcs_in_use) == 2 + assert len(mibig_strains_in_use) == 2 + assert bgc3 not in mibig_bgcs_in_use + 
assert bgc3.strain not in mibig_strains_in_use