Skip to content

Commit

Permalink
Merge pull request #171 from NPLinker/add_gcf_filter
Browse files Browse the repository at this point in the history
allow GCF loader to filter mibig-only GCF objects
  • Loading branch information
CunliangGeng authored Oct 31, 2023
2 parents 4d230e9 + 6dd2513 commit f577e59
Show file tree
Hide file tree
Showing 4 changed files with 32 additions and 14 deletions.
7 changes: 5 additions & 2 deletions src/nplinker/genomics/abc.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,13 +34,16 @@ def get_bgcs(self) -> dict[str, BGC]:
"""



class GCFLoaderBase(ABC):

@abstractmethod
def get_gcfs(self) -> Sequence[GCF]:
def get_gcfs(self, keep_mibig_only) -> Sequence[GCF]:
"""Get GCF objects
Args:
keep_mibig_only(bool): True to keep GCFs that contain only MIBiG
BGCs.
Returns:
Sequence[GCF]: a list of :class:`~nplinker.genomics.GCF` objects
"""
23 changes: 16 additions & 7 deletions src/nplinker/genomics/bigscape/bigscape_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,17 +22,26 @@ def __init__(self, cluster_file: str | PathLike, /) -> None:
cluster_file(str): path to the BiG-SCAPE cluster file.
"""
self.cluster_file = str(cluster_file)
self._gcf_dict = self._parse_gcf(self.cluster_file)
self._gcf_list = list(self._gcf_dict.values())
self._gcf_list = self._parse_gcf(self.cluster_file)

def get_gcfs(self) -> list[GCF]:
"""Get all GCF objects."""
def get_gcfs(self, keep_mibig_only=False) -> list[GCF]:
"""Get GCF objects, by default excluding those that contain only MIBiG BGCs.
Args:
keep_mibig_only(bool): True to keep GCFs that contain only MIBiG
BGCs; False (the default) to exclude them from the result.
Returns:
list[GCF]: a list of GCF objects.
"""
if not keep_mibig_only:
return [gcf for gcf in self._gcf_list if not gcf.has_mibig_only()]
return self._gcf_list

@staticmethod
def _parse_gcf(cluster_file: str) -> dict[str, GCF]:
def _parse_gcf(cluster_file: str) -> list[GCF]:
"""Parse BiG-SCAPE cluster file to return GCF objects."""
gcf_dict = {}
gcf_dict: dict[str, GCF] = {}
with open(cluster_file, "rt", encoding="utf-8") as f:
reader = csv.reader(f, delimiter='\t')
next(reader) # skip headers
Expand All @@ -41,7 +50,7 @@ def _parse_gcf(cluster_file: str) -> dict[str, GCF]:
if family_id not in gcf_dict:
gcf_dict[family_id] = GCF(family_id)
gcf_dict[family_id].bgc_ids.add(bgc_id)
return gcf_dict
return list(gcf_dict.values())


# register as virtual class to prevent metaclass conflicts
Expand Down
6 changes: 3 additions & 3 deletions src/nplinker/genomics/gcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,9 +94,9 @@ def has_strain(self, strain: Strain) -> bool:
return strain in self.strains

def has_mibig_only(self) -> bool:
"""Check if the GCF's children are only BGC objects from MIBiG.
"""Check if the GCF's children are only MIBiG BGCs.
Returns:
bool: True if `GCF.bgcs` are only MIBiG BGC objects
bool: True if `GCF.bgc_ids` are only MIBiG BGC ids.
"""
return all(map(lambda bgc: bgc.is_mibig(), self.bgcs))
return all(map(lambda id: id.startswith("BGC"), self.bgc_ids))
10 changes: 8 additions & 2 deletions tests/genomics/test_bigscape_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,19 @@ def test_init(self, loader):
"mix_clustering_c0.30.tsv")

def test_get_gcfs(self, loader):
gcfs = loader.get_gcfs()
gcfs = loader.get_gcfs(keep_mibig_only=True)
assert isinstance(gcfs, list)
assert len(gcfs) == 114
assert isinstance(gcfs[0], GCF)

def test_get_gcfs_without_mibig_only(self, loader):
gcfs = loader.get_gcfs(keep_mibig_only=False)
assert isinstance(gcfs, list)
assert len(gcfs) == 113
assert isinstance(gcfs[0], GCF)

def test_parse_gcf(self, loader):
gcf_dict = BigscapeGCFLoader._parse_gcf(loader.cluster_file) # noqa
gcf_dict = BigscapeGCFLoader._parse_gcf(loader.cluster_file) # noqa
assert isinstance(gcf_dict, dict)
assert len(gcf_dict) == 114
gcf = gcf_dict["135"]
Expand Down

0 comments on commit f577e59

Please sign in to comment.