From 61bf359bf734129b1aedd172cefd248ecea85f17 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Wed, 25 Oct 2023 16:30:16 +0200 Subject: [PATCH 1/4] update logic for checking mibig-only --- src/nplinker/genomics/gcf.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/nplinker/genomics/gcf.py b/src/nplinker/genomics/gcf.py index 0af3e407..b52c1677 100644 --- a/src/nplinker/genomics/gcf.py +++ b/src/nplinker/genomics/gcf.py @@ -94,9 +94,9 @@ def has_strain(self, strain: Strain) -> bool: return strain in self.strains def has_mibig_only(self) -> bool: - """Check if the GCF's children are only BGC objects from MIBiG. + """Check if the GCF's children are only MIBiG BGCs. Returns: - bool: True if `GCF.bgcs` are only MIBiG BGC objects + bool: True if `GCF.bgc_ids` are only MIBiG BGC ids. """ - return all(map(lambda bgc: bgc.is_mibig(), self.bgcs)) + return all(map(lambda id: id.startswith("BGC"), self.bgc_ids)) From ba8e29ad29a49d0d9de73a48853181584d84b130 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Wed, 25 Oct 2023 16:31:13 +0200 Subject: [PATCH 2/4] add parameter `keep_mibig_only` to `GCFLoaderBase` --- src/nplinker/genomics/abc.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/nplinker/genomics/abc.py b/src/nplinker/genomics/abc.py index 458be5f7..ab81b109 100644 --- a/src/nplinker/genomics/abc.py +++ b/src/nplinker/genomics/abc.py @@ -34,13 +34,16 @@ def get_bgcs(self) -> dict[str, BGC]: """ - class GCFLoaderBase(ABC): @abstractmethod - def get_gcfs(self) -> Sequence[GCF]: + def get_gcfs(self, keep_mibig_only) -> Sequence[GCF]: """Get GCF objects + Args: + keep_mibig_only(bool): True to keep GCFs that contain only MIBiG + BGCs. 
+ Returns: Sequence[GCF]: a list of :class:`~nplinker.genomic.GCF` objects """ From ae739715b427c1d0739387e24512d54eda205ddf Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Wed, 25 Oct 2023 16:39:35 +0200 Subject: [PATCH 3/4] add `keep_mibig_only` to `BigscapeGCFLoader.get_gcfs` method - add parameter `keep_mibig_only` - update and add unit tests --- src/nplinker/genomics/bigscape/bigscape_loader.py | 14 ++++++++++++-- tests/genomics/test_bigscape_loader.py | 10 ++++++++-- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/src/nplinker/genomics/bigscape/bigscape_loader.py b/src/nplinker/genomics/bigscape/bigscape_loader.py index 825d32dd..df7aa37a 100644 --- a/src/nplinker/genomics/bigscape/bigscape_loader.py +++ b/src/nplinker/genomics/bigscape/bigscape_loader.py @@ -25,8 +25,18 @@ def __init__(self, cluster_file: str | PathLike, /) -> None: self._gcf_dict = self._parse_gcf(self.cluster_file) self._gcf_list = list(self._gcf_dict.values()) - def get_gcfs(self) -> list[GCF]: - """Get all GCF objects.""" + def get_gcfs(self, keep_mibig_only=False) -> list[GCF]: + """Get all GCF objects. + + Args: + keep_mibig_only(bool): True to keep GCFs that contain only MIBiG + BGCs. + + Returns: + list[GCF]: a list of GCF objects. 
+ """ + if not keep_mibig_only: + return [gcf for gcf in self._gcf_list if not gcf.has_mibig_only()] return self._gcf_list @staticmethod diff --git a/tests/genomics/test_bigscape_loader.py b/tests/genomics/test_bigscape_loader.py index 89d9079e..d2e2bf3a 100644 --- a/tests/genomics/test_bigscape_loader.py +++ b/tests/genomics/test_bigscape_loader.py @@ -22,13 +22,19 @@ def test_init(self, loader): "mix_clustering_c0.30.tsv") def test_get_gcfs(self, loader): - gcfs = loader.get_gcfs() + gcfs = loader.get_gcfs(keep_mibig_only=True) assert isinstance(gcfs, list) assert len(gcfs) == 114 assert isinstance(gcfs[0], GCF) + def test_get_gcfs_without_mibig_only(self, loader): + gcfs = loader.get_gcfs(keep_mibig_only=False) + assert isinstance(gcfs, list) + assert len(gcfs) == 113 + assert isinstance(gcfs[0], GCF) + def test_parse_gcf(self, loader): - gcf_dict = BigscapeGCFLoader._parse_gcf(loader.cluster_file) # noqa + gcf_dict = BigscapeGCFLoader._parse_gcf(loader.cluster_file) # noqa assert isinstance(gcf_dict, dict) assert len(gcf_dict) == 114 gcf = gcf_dict["135"] From 6dd2513db1f554560bc0d9701f0c38f1edf516fa Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Wed, 25 Oct 2023 16:56:20 +0200 Subject: [PATCH 4/4] change return value to a list for `_parse_gcf` method This change will simplify the `__init__` method. --- src/nplinker/genomics/bigscape/bigscape_loader.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/nplinker/genomics/bigscape/bigscape_loader.py b/src/nplinker/genomics/bigscape/bigscape_loader.py index df7aa37a..eb2eab9a 100644 --- a/src/nplinker/genomics/bigscape/bigscape_loader.py +++ b/src/nplinker/genomics/bigscape/bigscape_loader.py @@ -22,8 +22,7 @@ def __init__(self, cluster_file: str | PathLike, /) -> None: cluster_file(str): path to the BiG-SCAPE cluster file. 
""" self.cluster_file = str(cluster_file) - self._gcf_dict = self._parse_gcf(self.cluster_file) - self._gcf_list = list(self._gcf_dict.values()) + self._gcf_list = self._parse_gcf(self.cluster_file) def get_gcfs(self, keep_mibig_only=False) -> list[GCF]: """Get all GCF objects. @@ -40,9 +39,9 @@ def get_gcfs(self, keep_mibig_only=False) -> list[GCF]: return self._gcf_list @staticmethod - def _parse_gcf(cluster_file: str) -> dict[str, GCF]: + def _parse_gcf(cluster_file: str) -> list[GCF]: """Parse BiG-SCAPE cluster file to return GCF objects.""" - gcf_dict = {} + gcf_dict: dict[str, GCF] = {} with open(cluster_file, "rt", encoding="utf-8") as f: reader = csv.reader(f, delimiter='\t') next(reader) # skip headers @@ -51,7 +50,7 @@ def _parse_gcf(cluster_file: str) -> dict[str, GCF]: if family_id not in gcf_dict: gcf_dict[family_id] = GCF(family_id) gcf_dict[family_id].bgc_ids.add(bgc_id) - return gcf_dict + return list(gcf_dict.values()) # register as virtual class to prevent metaclass conflicts