Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

allow GCF loader to filter mibig-only GCF objects #171

Merged
merged 4 commits into from
Oct 31, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions src/nplinker/genomics/abc.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,13 +34,16 @@ def get_bgcs(self) -> dict[str, BGC]:
"""



class GCFLoaderBase(ABC):

@abstractmethod
def get_gcfs(self) -> Sequence[GCF]:
def get_gcfs(self, keep_mibig_only) -> Sequence[GCF]:
"""Get GCF objects

Args:
keep_mibig_only(bool): True to keep GCFs that contain only MIBiG
BGCs.

Returns:
Sequence[GCF]: a list of :class:`~nplinker.genomics.GCF` objects
"""
23 changes: 16 additions & 7 deletions src/nplinker/genomics/bigscape/bigscape_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,17 +22,26 @@ def __init__(self, cluster_file: str | PathLike, /) -> None:
cluster_file(str): path to the BiG-SCAPE cluster file.
"""
self.cluster_file = str(cluster_file)
self._gcf_dict = self._parse_gcf(self.cluster_file)
self._gcf_list = list(self._gcf_dict.values())
self._gcf_list = self._parse_gcf(self.cluster_file)

def get_gcfs(self) -> list[GCF]:
"""Get all GCF objects."""
def get_gcfs(self, keep_mibig_only=False) -> list[GCF]:
"""Get GCF objects, excluding MIBiG-only GCFs unless requested.

Args:
keep_mibig_only(bool): True to keep GCFs that contain only MIBiG
BGCs; False (the default) to exclude them from the returned list.

Returns:
list[GCF]: a list of GCF objects.
"""
if not keep_mibig_only:
return [gcf for gcf in self._gcf_list if not gcf.has_mibig_only()]
return self._gcf_list

@staticmethod
def _parse_gcf(cluster_file: str) -> dict[str, GCF]:
def _parse_gcf(cluster_file: str) -> list[GCF]:
"""Parse BiG-SCAPE cluster file to return GCF objects."""
gcf_dict = {}
gcf_dict: dict[str, GCF] = {}
with open(cluster_file, "rt", encoding="utf-8") as f:
reader = csv.reader(f, delimiter='\t')
next(reader) # skip headers
Expand All @@ -41,7 +50,7 @@ def _parse_gcf(cluster_file: str) -> dict[str, GCF]:
if family_id not in gcf_dict:
gcf_dict[family_id] = GCF(family_id)
gcf_dict[family_id].bgc_ids.add(bgc_id)
return gcf_dict
return list(gcf_dict.values())


# register as virtual class to prevent metaclass conflicts
Expand Down
6 changes: 3 additions & 3 deletions src/nplinker/genomics/gcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,9 +94,9 @@ def has_strain(self, strain: Strain) -> bool:
return strain in self.strains

def has_mibig_only(self) -> bool:
"""Check if the GCF's children are only BGC objects from MIBiG.
"""Check if the GCF's children are only MIBiG BGCs.

Returns:
bool: True if `GCF.bgcs` are only MIBiG BGC objects
bool: True if `GCF.bgc_ids` are only MIBiG BGC ids.
"""
return all(map(lambda bgc: bgc.is_mibig(), self.bgcs))
return all(map(lambda id: id.startswith("BGC"), self.bgc_ids))
10 changes: 8 additions & 2 deletions tests/genomics/test_bigscape_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,19 @@ def test_init(self, loader):
"mix_clustering_c0.30.tsv")

def test_get_gcfs(self, loader):
gcfs = loader.get_gcfs()
gcfs = loader.get_gcfs(keep_mibig_only=True)
assert isinstance(gcfs, list)
assert len(gcfs) == 114
assert isinstance(gcfs[0], GCF)

def test_get_gcfs_without_mibig_only(self, loader):
gcfs = loader.get_gcfs(keep_mibig_only=False)
assert isinstance(gcfs, list)
assert len(gcfs) == 113
assert isinstance(gcfs[0], GCF)

def test_parse_gcf(self, loader):
gcf_dict = BigscapeGCFLoader._parse_gcf(loader.cluster_file) # noqa
gcf_dict = BigscapeGCFLoader._parse_gcf(loader.cluster_file) # noqa
assert isinstance(gcf_dict, dict)
assert len(gcf_dict) == 114
gcf = gcf_dict["135"]
Expand Down
Loading