Skip to content

Commit

Permalink
Update the logic for loading MIBiG data
Browse files Browse the repository at this point in the history
Loading MIBiG data is optional and is controlled by the setting `DatasetLoader._use_mibig`.  
MIBiG data is genomics information that can be loaded into BGC objects and also into Strain objects. This PR refactors the logic for loading MIBiG data according to the workflow below:

![image](https://github.com/NPLinker/nplinker/assets/9798985/63a878a6-9736-47bd-97ab-6d109e1d4f77)


Major changes:
- move the MIBiG loading code from `self.load` to `self._load_genomics`
- MIBiG usually has more than 2,000 BGCs, most of which will not be used in NPLinker. So it is better to add only the MIBiG BGCs in use to the attribute `self.bgcs`, and only the MIBiG strains in use to the attribute `self.strains`.
- remove method `get_strain_bgc_mapping` from MibigLoader
- add util function `get_mibig_from_gcf`
  • Loading branch information
CunliangGeng authored Jan 24, 2024
1 parent 4de30e3 commit e94e151
Show file tree
Hide file tree
Showing 6 changed files with 88 additions and 50 deletions.
4 changes: 3 additions & 1 deletion src/nplinker/genomics/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from .utils import add_bgc_to_gcf
from .utils import add_strain_to_bgc
from .utils import generate_mappings_genome_id_bgc_id
from .utils import get_mibig_from_gcf


logging.getLogger(__name__).addHandler(logging.NullHandler())
Expand All @@ -13,7 +14,8 @@
"BGCLoaderBase",
"BGC",
"GCF",
"generate_mappings_genome_id_bgc_id",
"add_bgc_to_gcf",
"add_strain_to_bgc",
"generate_mappings_genome_id_bgc_id",
"get_mibig_from_gcf",
]
23 changes: 11 additions & 12 deletions src/nplinker/genomics/mibig/mibig_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@ def __init__(self, data_dir: str):
MIBiG metadata file (json) contains annotations/metadata information
for each BGC. See https://mibig.secondarymetabolites.org/download.
The MiBIG accession is used as BGC id and strain name. The loaded BGC
objects have Strain object as their strain attribute (i.e. `BGC.strain`).
Args:
data_dir(str): Path to the directory of MIBiG metadata json files
"""
Expand All @@ -25,16 +28,6 @@ def __init__(self, data_dir: str):
self._metadata_dict = self._parse_metadatas()
self._bgcs = self._parse_bgcs()

def get_strain_bgc_mapping(self) -> dict[str, str]:
    """Build the mapping from strain name to BGC id.

    For a MIBiG BGC the same value is used as both the strain name and the
    BGC id, so every key of the returned mapping maps to itself. The keys
    are taken from `self._file_dict`.

    Returns:
        dict[str, str]: key is strain name, value is BGC id.
    """
    mapping = {}
    for bgc_id in self._file_dict:
        # Strain name and BGC id are the same value for MIBiG entries.
        mapping[bgc_id] = bgc_id
    return mapping

def get_files(self) -> dict[str, str]:
"""Get the path of all MIBiG metadata json files.
Expand All @@ -46,8 +39,7 @@ def get_files(self) -> dict[str, str]:

@staticmethod
def parse_data_dir(data_dir: str) -> dict[str, str]:
"""Parse metadata directory and return pathes to all metadata json
files.
"""Parse metadata directory and return paths to all metadata json files.
Args:
data_dir(str): path to the directory of MIBiG metadata json files
Expand Down Expand Up @@ -88,6 +80,10 @@ def _parse_metadatas(self) -> dict[str, MibigMetadata]:
def get_bgcs(self) -> list[BGC]:
"""Get BGC objects.
The BGC objects use MiBIG accession as id and have Strain object as
their strain attribute (i.e. `BGC.strain`), where the name of the Strain
object is also MiBIG accession.
Returns:
list[str, BGC]: a list of :class:`nplinker.genomics.BGC` objects
"""
Expand All @@ -105,6 +101,9 @@ def _parse_bgcs(self) -> list[BGC]:
def parse_bgc_metadata_json(file: str) -> BGC:
"""Parse MIBiG metadata file and return BGC object.
Note that the MiBIG accession is used as the BGC id and strain name. The BGC
object has Strain object as its strain attribute.
Args:
file(str): Path to the MIBiG metadata json file
Expand Down
22 changes: 22 additions & 0 deletions src/nplinker/genomics/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,3 +158,25 @@ def add_bgc_to_gcf(
f"{len(gcf_missing_bgc)} GCF objects have missing BGC objects."
)
return gcf_with_bgc, gcf_without_bgc, gcf_missing_bgc


def get_mibig_from_gcf(gcfs: list[GCF]) -> tuple[list[BGC], StrainCollection]:
    """Get MIBiG BGCs and their strains from GCF objects.

    Every BGC of every given GCF is inspected, and those for which
    `BGC.is_mibig()` returns true are collected. A MIBiG BGC that belongs to
    more than one GCF is reported only once.

    Args:
        gcfs(list[GCF]): A list of GCF objects.

    Returns:
        tuple[list[BGC], StrainCollection]: The first is a list of the unique
            MIBiG BGC objects used in the GCFs; the second is a
            StrainCollection object that contains the Strain objects of
            those MIBiG BGCs. A MIBiG BGC whose `strain` is None contributes
            no strain.
    """
    mibig_bgcs_in_use = []
    mibig_strains_in_use = StrainCollection()
    # Tracks BGCs already collected so that a BGC shared by several GCFs
    # does not end up in the result list more than once.
    seen_bgcs = set()
    for gcf in gcfs:
        for bgc in gcf.bgcs:
            if not bgc.is_mibig() or bgc in seen_bgcs:
                continue
            seen_bgcs.add(bgc)
            mibig_bgcs_in_use.append(bgc)
            if bgc.strain is not None:
                mibig_strains_in_use.add(bgc.strain)
    return mibig_bgcs_in_use, mibig_strains_in_use
58 changes: 28 additions & 30 deletions src/nplinker/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from nplinker.genomics import add_bgc_to_gcf
from nplinker.genomics import add_strain_to_bgc
from nplinker.genomics import generate_mappings_genome_id_bgc_id
from nplinker.genomics import get_mibig_from_gcf
from nplinker.genomics.antismash import AntismashBGCLoader
from nplinker.genomics.bigscape import BigscapeGCFLoader
from nplinker.genomics.mibig import MibigLoader
Expand All @@ -26,7 +27,6 @@
from nplinker.pairedomics.downloader import PODPDownloader
from nplinker.pairedomics.runbigscape import run_bigscape
from nplinker.pairedomics.strain_mappings_generator import podp_generate_strain_mappings
from nplinker.strain import Strain
from nplinker.strain_collection import StrainCollection
from nplinker.strain_loader import load_user_strains

Expand Down Expand Up @@ -124,7 +124,7 @@ def __init__(self, config_data):
)
self.bgcs, self.gcfs, self.spectra, self.molfams = [], [], [], []
self.mibig_bgcs = []
self._mibig_strain_bgc_mapping = {}
self.mibig_strains_in_use = StrainCollection()
self.product_types = []
self.strains = StrainCollection()
self.webapp_scoring_cutoff = self._config_webapp.get(
Expand Down Expand Up @@ -185,10 +185,6 @@ def generate_strain_mappings(self):
)

def load(self):
if self._use_mibig:
if not self._load_mibig():
return False

if not self._load_strain_mappings():
return False

Expand All @@ -198,6 +194,9 @@ def load(self):
if not self._load_genomics():
return False

# set self.strains with all strains from input plus mibig strains in use
self.strains = self.strains + self.mibig_strains_in_use

if len(self.strains) == 0:
raise Exception(f"Failed to find *ANY* strains, missing {STRAIN_MAPPINGS_FILENAME}?")

Expand Down Expand Up @@ -356,12 +355,6 @@ def _validate_paths(self):
if not os.path.exists(str(f)):
logger.warning('Optional file/directory "%s" does not exist', f)

def _load_mibig(self):
    """Load MIBiG BGC objects and the MIBiG strain-to-BGC mapping.

    A `MibigLoader` is created for `self.mibig_json_dir`; its BGC objects
    are stored on `self.mibig_bgcs` and its strain-to-BGC mapping on
    `self._mibig_strain_bgc_mapping`.

    Returns:
        bool: Always True.
    """
    loader = MibigLoader(self.mibig_json_dir)
    self.mibig_bgcs = loader.get_bgcs()
    self._mibig_strain_bgc_mapping = loader.get_strain_bgc_mapping()
    return True

def _load_strain_mappings(self):
# 1. load strain mappings
sc = StrainCollection.read_json(self.strain_mappings_file)
Expand All @@ -377,14 +370,7 @@ def _load_strain_mappings(self):
logger.info(f"Loaded {len(user_strains)} user specified strains.")
self.strains.filter(user_strains)

# 3. load MiBIG strain mappings
if self._mibig_strain_bgc_mapping:
for k, v in self._mibig_strain_bgc_mapping.items():
strain = Strain(k)
strain.add_alias(v)
self.strains.add(strain)
logger.info("Loaded {} Strain objects in total".format(len(self.strains)))

return True

def _load_metabolomics(self):
Expand Down Expand Up @@ -427,35 +413,47 @@ def _load_genomics(self):
The attribute of `self.bgcs` is set to the loaded BGC objects that have the Strain object
added (i.e. `BGC.strain` updated). If a BGC object does not have the Strain object, it is
not added to `self.bgcs`.
not added to `self.bgcs`. For MIBiG BGC objects, only those in use are added to `self.bgcs`.
The attribute of `self.gcfs` is set to the loaded GCF objects that have the Strain objects
added (i.e. `GCF._strains` updated). This means only BGC objects with updated Strain objects
(i.e. `self.bgcs`) can be added to GCF objects.
"""
logger.debug("\nLoading genomics data starts...")

# Step 1: load all BGC objects
# Step 1: load antismash BGC objects & add strain info
logger.debug("Parsing AntiSMASH directory...")
antismash_bgcs = AntismashBGCLoader(self.antismash_dir).get_bgcs()
raw_bgcs = antismash_bgcs + self.mibig_bgcs
antismash_bgcs_with_strain, _ = add_strain_to_bgc(self.strains, antismash_bgcs)

# Step 2: load all GCF objects
# Step 2: load mibig BGC objects (having strain info)
if self._use_mibig:
self.mibig_bgcs = MibigLoader(self.mibig_json_dir).get_bgcs()

# Step 3: get all BGC objects with strain info
all_bgcs_with_strain = antismash_bgcs_with_strain + self.mibig_bgcs

# Step 4: load all GCF objects
# TODO: create a config for "bigscape_cluster_file" and discard "bigscape_dir" and "bigscape_cutoff"?
bigscape_cluster_file = (
Path(self.bigscape_dir) / "mix" / f"mix_clustering_c0.{self._bigscape_cutoff:02d}.tsv"
)
raw_gcfs = BigscapeGCFLoader(bigscape_cluster_file).get_gcfs()

# Step 3: add Strain object to BGC
bgc_with_strain, _ = add_strain_to_bgc(self.strains, raw_bgcs)
# Step 5: add BGC objects to GCF
all_gcfs_with_bgc, _, _ = add_bgc_to_gcf(all_bgcs_with_strain, raw_gcfs)

# Step 4: add BGC objects to GCF
gcf_with_bgc, _, _ = add_bgc_to_gcf(bgc_with_strain, raw_gcfs)
# Step 6: get mibig bgcs and strains in use from GCFs
mibig_strains_in_use = StrainCollection()
if self._use_mibig:
mibig_bgcs_in_use, mibig_strains_in_use = get_mibig_from_gcf(all_gcfs_with_bgc)
else:
mibig_bgcs_in_use = []

# Step 5: set attributes of self.bgcs and self.gcfs with valid objects
self.bgcs = bgc_with_strain
self.gcfs = gcf_with_bgc
# Step 7: set attributes with valid objects
self.bgcs = antismash_bgcs_with_strain + mibig_bgcs_in_use
self.gcfs = all_gcfs_with_bgc
self.mibig_strains_in_use = mibig_strains_in_use

logger.debug("Loading genomics data completed\n")
return True
Expand Down
7 changes: 0 additions & 7 deletions tests/genomics/test_mibig_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,13 +30,6 @@ def test_abc(self, loader):
def test_init(self, loader, data_dir):
assert loader.data_dir == data_dir

def test_get_strain_bgc_mapping(self, loader):
    """The mapping is a dict of 2502 entries in which each key maps to itself."""
    strain_to_bgc = loader.get_strain_bgc_mapping()
    assert isinstance(strain_to_bgc, dict)
    assert len(strain_to_bgc) == 2502
    # Strain name and BGC id are expected to be the same value.
    assert all(strain_name == bgc_id for strain_name, bgc_id in strain_to_bgc.items())

def test_get_files(self, loader):
files = loader.get_files()
assert isinstance(files, dict)
Expand Down
24 changes: 24 additions & 0 deletions tests/genomics/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from nplinker.genomics import add_bgc_to_gcf
from nplinker.genomics import add_strain_to_bgc
from nplinker.genomics import generate_mappings_genome_id_bgc_id
from nplinker.genomics import get_mibig_from_gcf
from nplinker.globals import GENOME_BGC_MAPPINGS_FILENAME
from nplinker.strain import Strain
from nplinker.strain_collection import StrainCollection
Expand Down Expand Up @@ -135,3 +136,26 @@ def test_add_bgc_to_gcf(bgcs):
assert gcf_with_bgc[0].bgcs == {bgcs[0], bgcs[1]}
assert gcf_with_bgc[1].bgcs == {bgcs[2]}
assert gcf_without_bgc[0].bgcs == set()


def test_get_mibig_from_gcf():
    """Check that get_mibig_from_gcf returns two BGCs/strains and excludes the antismash one."""
    # Two BGCs named like MIBiG entries; each strain name equals the BGC id.
    mibig_a = BGC("BGC_01", "NPR")
    mibig_a.strain = Strain("BGC_01")
    mibig_b = BGC("BGC_02", "Alkaloid")
    mibig_b.strain = Strain("BGC_02")

    # One BGC that must not be reported by the function under test.
    antismash_bgc = BGC("antismash_c", "Polyketide")
    antismash_bgc.strain = Strain("strain_01")

    first_gcf = GCF("1")
    first_gcf.add_bgc(mibig_a)
    second_gcf = GCF("2")
    for member in (mibig_b, antismash_bgc):
        second_gcf.add_bgc(member)

    bgcs_found, strains_found = get_mibig_from_gcf([first_gcf, second_gcf])

    assert len(bgcs_found) == 2
    assert len(strains_found) == 2
    assert antismash_bgc not in bgcs_found
    assert antismash_bgc.strain not in strains_found

0 comments on commit e94e151

Please sign in to comment.