Update the logic for loading MIBiG data

Loading MIBiG data is optional and is controlled by the setting `DatasetLoader._use_mibig`. MIBiG data is genomics information and can be loaded into BGC objects and also Strain objects. This PR refactors the logic for loading MIBiG data according to the workflow below:

![image](https://github.com/NPLinker/nplinker/assets/9798985/63a878a6-9736-47bd-97ab-6d109e1d4f77)

Major changes:
- Move the MIBiG loading code from `self.load` to `self._load_genomics`.
- MIBiG usually has >2k BGCs, most of which will not be used in NPLinker. So it is better to add only the MIBiG BGCs in use to the attribute `self.bgcs`, and only the MIBiG strains in use to the attribute `self.strains`.
- Remove the method `get_strain_bgc_mapping` from `MibigLoader`.
- Add the util function `get_mibig_from_gcf`.
NPLinker · Jan 24, 2024 · e94e151 · e94e151
1 parent 4de30e3
commit e94e151
Show file tree

Hide file tree

Showing 6 changed files with 88 additions and 50 deletions.
diff --git a/src/nplinker/genomics/__init__.py b/src/nplinker/genomics/__init__.py
@@ -5,6 +5,7 @@
 from .utils import add_bgc_to_gcf
 from .utils import add_strain_to_bgc
 from .utils import generate_mappings_genome_id_bgc_id
+from .utils import get_mibig_from_gcf
 
 
 logging.getLogger(__name__).addHandler(logging.NullHandler())
@@ -13,7 +14,8 @@
     "BGCLoaderBase",
     "BGC",
     "GCF",
-    "generate_mappings_genome_id_bgc_id",
     "add_bgc_to_gcf",
     "add_strain_to_bgc",
+    "generate_mappings_genome_id_bgc_id",
+    "get_mibig_from_gcf",
 ]
diff --git a/src/nplinker/genomics/mibig/mibig_loader.py b/src/nplinker/genomics/mibig/mibig_loader.py
@@ -17,6 +17,9 @@ def __init__(self, data_dir: str):
         MIBiG metadata file (json) contains annotations/metadata information
         for each BGC. See https://mibig.secondarymetabolites.org/download.
 
+        The MIBiG accession is used as both the BGC id and the strain name. Each
+        loaded BGC object has a Strain object as its strain attribute (i.e. `BGC.strain`).
+
         Args:
             data_dir(str): Path to the directory of MIBiG metadata json files
         """
@@ -25,16 +28,6 @@ def __init__(self, data_dir: str):
         self._metadata_dict = self._parse_metadatas()
         self._bgcs = self._parse_bgcs()
 
-    def get_strain_bgc_mapping(self) -> dict[str, str]:
-        """Get the mapping from strain to BGC.
-
-        Note that for MIBiG BGC, same value is used for strain name and BGC id.
-
-        Returns:
-            dict[str, str]: key is strain name, value is BGC id.
-        """
-        return {bid: bid for bid in self._file_dict}
-
     def get_files(self) -> dict[str, str]:
         """Get the path of all MIBiG metadata json files.
 
@@ -46,8 +39,7 @@ def get_files(self) -> dict[str, str]:
 
     @staticmethod
     def parse_data_dir(data_dir: str) -> dict[str, str]:
-        """Parse metadata directory and return pathes to all metadata json
-            files.
+        """Parse metadata directory and return paths to all metadata json files.
 
         Args:
             data_dir(str): path to the directory of MIBiG metadata json files
@@ -88,6 +80,10 @@ def _parse_metadatas(self) -> dict[str, MibigMetadata]:
     def get_bgcs(self) -> list[BGC]:
         """Get BGC objects.
 
+        The BGC objects use the MIBiG accession as their id and have a Strain
+        object as their strain attribute (i.e. `BGC.strain`); the name of that
+        Strain object is also the MIBiG accession.
+
         Returns:
             list[str, BGC]: a list of :class:`nplinker.genomics.BGC` objects
         """
@@ -105,6 +101,9 @@ def _parse_bgcs(self) -> list[BGC]:
 def parse_bgc_metadata_json(file: str) -> BGC:
     """Parse MIBiG metadata file and return BGC object.
 
+    Note that the MIBiG accession is used as both the BGC id and the strain name.
+    The BGC object has a Strain object as its strain attribute.
+
     Args:
         file(str): Path to the MIBiG metadata json file
 

diff --git a/src/nplinker/genomics/utils.py b/src/nplinker/genomics/utils.py
@@ -158,3 +158,25 @@ def add_bgc_to_gcf(
         f"{len(gcf_missing_bgc)} GCF objects have missing BGC objects."
     )
     return gcf_with_bgc, gcf_without_bgc, gcf_missing_bgc
+
+
+def get_mibig_from_gcf(gcfs: list[GCF]) -> tuple[list[BGC], StrainCollection]:
+    """Get MIBiG BGCs and strains from GCF objects.
+
+    Args:
+        gcfs(list[GCF]): A list of GCF objects.
+
+    Returns:
+        tuple[list[BGC], StrainCollection]: The first is a list of MIBiG BGC
+            objects used in the GCFs; the second is a StrainCollection object
+            that contains the Strain objects of those MIBiG BGCs.
+    """
+    mibig_bgcs_in_use = []
+    mibig_strains_in_use = StrainCollection()
+    for gcf in gcfs:
+        for bgc in gcf.bgcs:
+            if bgc.is_mibig():
+                mibig_bgcs_in_use.append(bgc)
+                if bgc.strain is not None:
+                    mibig_strains_in_use.add(bgc.strain)
+    return mibig_bgcs_in_use, mibig_strains_in_use
diff --git a/src/nplinker/loader.py b/src/nplinker/loader.py
@@ -8,6 +8,7 @@
 from nplinker.genomics import add_bgc_to_gcf
 from nplinker.genomics import add_strain_to_bgc
 from nplinker.genomics import generate_mappings_genome_id_bgc_id
+from nplinker.genomics import get_mibig_from_gcf
 from nplinker.genomics.antismash import AntismashBGCLoader
 from nplinker.genomics.bigscape import BigscapeGCFLoader
 from nplinker.genomics.mibig import MibigLoader
@@ -26,7 +27,6 @@
 from nplinker.pairedomics.downloader import PODPDownloader
 from nplinker.pairedomics.runbigscape import run_bigscape
 from nplinker.pairedomics.strain_mappings_generator import podp_generate_strain_mappings
-from nplinker.strain import Strain
 from nplinker.strain_collection import StrainCollection
 from nplinker.strain_loader import load_user_strains
 
@@ -124,7 +124,7 @@ def __init__(self, config_data):
         )
         self.bgcs, self.gcfs, self.spectra, self.molfams = [], [], [], []
         self.mibig_bgcs = []
-        self._mibig_strain_bgc_mapping = {}
+        self.mibig_strains_in_use = StrainCollection()
         self.product_types = []
         self.strains = StrainCollection()
         self.webapp_scoring_cutoff = self._config_webapp.get(
@@ -185,10 +185,6 @@ def generate_strain_mappings(self):
         )
 
     def load(self):
-        if self._use_mibig:
-            if not self._load_mibig():
-                return False
-
         if not self._load_strain_mappings():
             return False
 
@@ -198,6 +194,9 @@ def load(self):
         if not self._load_genomics():
             return False
 
+        # set self.strains with all strains from input plus mibig strains in use
+        self.strains = self.strains + self.mibig_strains_in_use
+
         if len(self.strains) == 0:
             raise Exception(f"Failed to find *ANY* strains, missing {STRAIN_MAPPINGS_FILENAME}?")
 
@@ -356,12 +355,6 @@ def _validate_paths(self):
             if not os.path.exists(str(f)):
                 logger.warning('Optional file/directory "%s" does not exist', f)
 
-    def _load_mibig(self):
-        mibig_bgc_loader = MibigLoader(self.mibig_json_dir)
-        self.mibig_bgcs = mibig_bgc_loader.get_bgcs()
-        self._mibig_strain_bgc_mapping = mibig_bgc_loader.get_strain_bgc_mapping()
-        return True
-
     def _load_strain_mappings(self):
         # 1. load strain mappings
         sc = StrainCollection.read_json(self.strain_mappings_file)
@@ -377,14 +370,7 @@ def _load_strain_mappings(self):
             logger.info(f"Loaded {len(user_strains)} user specified strains.")
             self.strains.filter(user_strains)
 
-        # 3. load MiBIG strain mappings
-        if self._mibig_strain_bgc_mapping:
-            for k, v in self._mibig_strain_bgc_mapping.items():
-                strain = Strain(k)
-                strain.add_alias(v)
-                self.strains.add(strain)
         logger.info("Loaded {} Strain objects in total".format(len(self.strains)))
-
         return True
 
     def _load_metabolomics(self):
@@ -427,35 +413,47 @@ def _load_genomics(self):
 
         The attribute of `self.bgcs` is set to the loaded BGC objects that have the Strain object
         added (i.e. `BGC.strain` updated). If a BGC object does not have the Strain object, it is
-        not added to `self.bgcs`.
+        not added to `self.bgcs`. For MIBiG BGC objects, only those in use are added to `self.bgcs`.
 
         The attribute of `self.gcfs` is set to the loaded GCF objects that have the Strain objects
         added (i.e. `GCF._strains` updated). This means only BGC objects with updated Strain objects
         (i.e. `self.bgcs`) can be added to GCF objects.
         """
         logger.debug("\nLoading genomics data starts...")
 
-        # Step 1: load all BGC objects
+        # Step 1: load antismash BGC objects & add strain info
         logger.debug("Parsing AntiSMASH directory...")
         antismash_bgcs = AntismashBGCLoader(self.antismash_dir).get_bgcs()
-        raw_bgcs = antismash_bgcs + self.mibig_bgcs
+        antismash_bgcs_with_strain, _ = add_strain_to_bgc(self.strains, antismash_bgcs)
 
-        # Step 2: load all GCF objects
+        # Step 2: load mibig BGC objects (having strain info)
+        if self._use_mibig:
+            self.mibig_bgcs = MibigLoader(self.mibig_json_dir).get_bgcs()
+
+        # Step 3: get all BGC objects with strain info
+        all_bgcs_with_strain = antismash_bgcs_with_strain + self.mibig_bgcs
+
+        # Step 4: load all GCF objects
         # TODO: create a config for "bigscape_cluster_file" and discard "bigscape_dir" and "bigscape_cutoff"?
         bigscape_cluster_file = (
             Path(self.bigscape_dir) / "mix" / f"mix_clustering_c0.{self._bigscape_cutoff:02d}.tsv"
         )
         raw_gcfs = BigscapeGCFLoader(bigscape_cluster_file).get_gcfs()
 
-        # Step 3: add Strain object to BGC
-        bgc_with_strain, _ = add_strain_to_bgc(self.strains, raw_bgcs)
+        # Step 5: add BGC objects to GCF
+        all_gcfs_with_bgc, _, _ = add_bgc_to_gcf(all_bgcs_with_strain, raw_gcfs)
 
-        # Step 4: add BGC objects to GCF
-        gcf_with_bgc, _, _ = add_bgc_to_gcf(bgc_with_strain, raw_gcfs)
+        # Step 6: get mibig bgcs and strains in use from GCFs
+        mibig_strains_in_use = StrainCollection()
+        if self._use_mibig:
+            mibig_bgcs_in_use, mibig_strains_in_use = get_mibig_from_gcf(all_gcfs_with_bgc)
+        else:
+            mibig_bgcs_in_use = []
 
-        # Step 5: set attributes of self.bgcs and self.gcfs with valid objects
-        self.bgcs = bgc_with_strain
-        self.gcfs = gcf_with_bgc
+        # Step 7: set attributes with valid objects
+        self.bgcs = antismash_bgcs_with_strain + mibig_bgcs_in_use
+        self.gcfs = all_gcfs_with_bgc
+        self.mibig_strains_in_use = mibig_strains_in_use
 
         logger.debug("Loading genomics data completed\n")
         return True

diff --git a/tests/genomics/test_mibig_loader.py b/tests/genomics/test_mibig_loader.py
@@ -30,13 +30,6 @@ def test_abc(self, loader):
     def test_init(self, loader, data_dir):
         assert loader.data_dir == data_dir
 
-    def test_get_strain_bgc_mapping(self, loader):
-        mapping = loader.get_strain_bgc_mapping()
-        assert isinstance(mapping, dict)
-        assert len(mapping) == 2502
-        for bid in mapping:
-            assert bid == mapping[bid]
-
     def test_get_files(self, loader):
         files = loader.get_files()
         assert isinstance(files, dict)

diff --git a/tests/genomics/test_utils.py b/tests/genomics/test_utils.py
@@ -6,6 +6,7 @@
 from nplinker.genomics import add_bgc_to_gcf
 from nplinker.genomics import add_strain_to_bgc
 from nplinker.genomics import generate_mappings_genome_id_bgc_id
+from nplinker.genomics import get_mibig_from_gcf
 from nplinker.globals import GENOME_BGC_MAPPINGS_FILENAME
 from nplinker.strain import Strain
 from nplinker.strain_collection import StrainCollection
@@ -135,3 +136,26 @@ def test_add_bgc_to_gcf(bgcs):
     assert gcf_with_bgc[0].bgcs == {bgcs[0], bgcs[1]}
     assert gcf_with_bgc[1].bgcs == {bgcs[2]}
     assert gcf_without_bgc[0].bgcs == set()
+
+
+def test_get_mibig_from_gcf():
+    """Test get_mibig_from_gcf function."""
+    bgc1 = BGC("BGC_01", "NPR")
+    bgc1.strain = Strain("BGC_01")
+    bgc2 = BGC("BGC_02", "Alkaloid")
+    bgc2.strain = Strain("BGC_02")
+    bgc3 = BGC("antismash_c", "Polyketide")
+    bgc3.strain = Strain("strain_01")
+    gcf1 = GCF("1")
+    gcf1.add_bgc(bgc1)
+    gcf2 = GCF("2")
+    gcf2.add_bgc(bgc2)
+    gcf2.add_bgc(bgc3)
+    gcfs = [gcf1, gcf2]
+
+    mibig_bgcs_in_use, mibig_strains_in_use = get_mibig_from_gcf(gcfs)
+
+    assert len(mibig_bgcs_in_use) == 2
+    assert len(mibig_strains_in_use) == 2
+    assert bgc3 not in mibig_bgcs_in_use
+    assert bgc3.strain not in mibig_strains_in_use