Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor: create antismash downloader module and move there inherent code #119

Closed
wants to merge 20 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
8f64998
creat antismash downloader module
gcroci2 Feb 28, 2023
ec2b048
fix tests
gcroci2 Feb 28, 2023
d49f5fc
Merge branch 'dev' into 98_add_antismash_downloader_gcroci2
gcroci2 Feb 28, 2023
26e685f
Update src/nplinker/genomics/antismash/antismash_downloader.py
gcroci2 Mar 3, 2023
da904c6
add a new function for downloading and extracting antismash data
gcroci2 Mar 3, 2023
8be7e44
Merge branch '98_add_antismash_downloader_gcroci2' of github.com:NPLi…
gcroci2 Mar 3, 2023
7b07a7b
create podp_antismash_downloader module
gcroci2 Mar 7, 2023
4879181
properly define download_and_extract_antismash_metadata function
gcroci2 Mar 7, 2023
f64a17c
Merge branch 'dev' into 98_add_antismash_downloader_gcroci2
gcroci2 Mar 8, 2023
392a736
add internal funcs and doc string
gcroci2 Mar 8, 2023
850b746
add tests and create antismash test folder
gcroci2 Mar 9, 2023
e186ba8
format properly extract_path
gcroci2 Mar 9, 2023
61e17fe
run linting and formatting for modified files using yapf
gcroci2 Mar 9, 2023
601957d
fix prospector errors
gcroci2 Mar 9, 2023
e9fbe99
make refseq_assembly_id class variable for tests
gcroci2 Mar 9, 2023
9a22c06
reorder imports
gcroci2 Mar 9, 2023
decf286
add minor static typing
gcroci2 Mar 9, 2023
e55fb33
xMerge branch 'dev' into 98_add_antismash_downloader_gcroci2
gcroci2 Mar 9, 2023
0c5b6bd
Revert "Merge branch 'dev' into 98_add_antismash_downloader_gcroci2"
gcroci2 Mar 15, 2023
c66a10a
Revert "Revert "Merge branch 'dev' into 98_add_antismash_downloader_g…
gcroci2 Mar 15, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 4 additions & 19 deletions src/nplinker/genomics/__init__.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,10 @@
import logging

from .abc import BGCLoaderBase
logging.getLogger(__name__).addHandler(logging.NullHandler())

from .bgc import BGC
from .gcf import GCF
from .genomics import filter_mibig_only_gcf
from .genomics import get_bgcs_from_gcfs
from .genomics import get_strains_from_bgcs
from .genomics import load_gcfs
from .genomics import map_bgc_to_gcf
from .genomics import map_strain_to_bgc

logging.getLogger(__name__).addHandler(logging.NullHandler())
from .abc import BGCLoaderBase

__all__ = [
"BGCLoaderBase",
"BGC",
"GCF",
"filter_mibig_only_gcf",
"get_bgcs_from_gcfs",
"get_strains_from_bgcs",
"load_gcfs",
"map_bgc_to_gcf",
"map_strain_to_bgc"
]
__all__ = ["bgc", "gcf", "load_gcfs", "BGCLoaderBase"]
14 changes: 0 additions & 14 deletions src/nplinker/genomics/abc.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
from abc import ABC
from abc import abstractmethod
from collections.abc import Sequence
from .bgc import BGC
from .gcf import GCF


class BGCLoaderBase(ABC):
Expand Down Expand Up @@ -32,15 +30,3 @@ def get_bgcs(self) -> dict[str, BGC]:
dict[str, BGC]: key is BGC name and value is
:class:`~nplinker.genomic.BGC` objects
"""



class GCFLoaderBase(ABC):
    """Abstract interface for loaders that provide GCF objects."""

    @abstractmethod
    def get_gcfs(self) -> Sequence[GCF]:
        """Get GCF objects.

        Returns:
            Sequence[GCF]: a sequence of :class:`~nplinker.genomic.GCF` objects
        """
4 changes: 3 additions & 1 deletion src/nplinker/genomics/antismash/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import logging
from .antismash_downloader import download_and_extract_antismash_metadata
from .antismash_loader import AntismashBGCLoader
from .antismash_loader import parse_bgc_genbank


logging.getLogger(__name__).addHandler(logging.NullHandler())

__all__ = ["AntismashBGCLoader", "parse_bgc_genbank"]
__all__ = ["AntismashBGCLoader", "parse_bgc_genbank", "download_and_extract_antismash_metadata"]
85 changes: 85 additions & 0 deletions src/nplinker/genomics/antismash/antismash_downloader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
import os
import shutil
from nplinker.logconfig import LogConfig
from nplinker.utils import download_and_extract_archive
from nplinker.utils import list_dirs
from nplinker.utils import list_files


logger = LogConfig.getLogger(__name__)

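# URL templates for the antiSMASH database. The page URL takes the assembly id;
# the download URL takes the assembly id and the archive file name.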
ANTISMASH_DB_PAGE_URL = 'https://antismash-db.secondarymetabolites.org/output/{}/'
ANTISMASH_DB_DOWNLOAD_URL = 'https://antismash-db.secondarymetabolites.org/output/{}/{}'

# The antiSMASH DB v2 URLs point to the older database version; they are kept
# so that the old version remains available.
ANTISMASH_DBV2_PAGE_URL = 'https://antismash-dbv2.secondarymetabolites.org/output/{}/'
gcroci2 marked this conversation as resolved.
Show resolved Hide resolved
ANTISMASH_DBV2_DOWNLOAD_URL = 'https://antismash-dbv2.secondarymetabolites.org/output/{}/{}'


def _check_roots(download_root: str, extract_root: str) -> None:
    """Ensure the download and extract directories are different.

    The two paths are compared after being made absolute and normalised, so
    different spellings of the same directory (trailing separator, ``./``
    prefix, relative vs. absolute form) are recognised as identical.

    Args:
        download_root(str): Path to the directory the archive is downloaded to.
        extract_root(str): Path to the directory the archive is extracted to.

    Raises:
        ValueError: if both paths refer to the same directory.
    """
    # A plain string comparison would let "/data/x" and "/data/x/" through.
    if os.path.abspath(download_root) == os.path.abspath(extract_root):
        raise ValueError(
            "Identical path of download directory and extract directory")


def _check_extract_path(extract_path: str):
    """Make sure ``extract_path`` is an empty directory, creating it if needed.

    Args:
        extract_path(str): Path to the directory the archive is extracted to.

    Raises:
        ValueError: if ``extract_path`` already exists and contains entries.
    """
    if not os.path.exists(extract_path):
        # Nothing there yet: create the directory (and any missing parents).
        os.makedirs(extract_path, exist_ok=True)
        return

    if os.listdir(extract_path):
        raise ValueError(f'Nonempty directory: "{extract_path}"')


def download_and_extract_antismash_metadata(refseq_assembly_id: str,
                                            download_root: str,
                                            extract_root: str) -> None:
    """Download and extract antiSMASH files for a specified refseq_assembly_id.

    The archive ``<refseq_assembly_id>.zip`` is requested from the antiSMASH
    database first and, if that attempt fails, from the antiSMASH DB v2.
    After extraction, all subdirectories and all files other than ``.json``
    and ``.gbk`` files are deleted from the extraction directory.

    Args:
        refseq_assembly_id(str): Assembled genomic RefSeq (reference sequence) id.
            If the id is versioned (e.g., "GCF_004339725.1") please be sure to
            specify the version as well.
        download_root(str): Path to the directory to place downloaded archive in.
        extract_root(str): Path to the directory data files will be extracted to.
            Note that an antismash/ directory is created in the specified
            extract_root if it doesn't already exist.
            The files will be extracted to <extract_root>/antismash/<refseq_assembly_id>/ dir.

    Raises:
        ValueError: if download_root and extract_root dirs are the same.
        ValueError: if <extract_root>/antismash/<refseq_assembly_id> dir is not empty.
        Exception: if the archive cannot be downloaded or extracted from any
            of the antiSMASH databases; the error of the last attempt is
            re-raised.

    Examples:
        >>> download_and_extract_antismash_metadata("GCF_004339725.1", "/data/download", "/data/extracted")
    """
    extract_path = os.path.join(extract_root, "antismash", refseq_assembly_id)

    _check_roots(download_root, extract_root)
    _check_extract_path(extract_path)

    archive_name = refseq_assembly_id + '.zip'
    last_error = None
    for base_url in (ANTISMASH_DB_DOWNLOAD_URL, ANTISMASH_DBV2_DOWNLOAD_URL):
        url = base_url.format(refseq_assembly_id, archive_name)
        try:
            download_and_extract_archive(url, download_root, extract_path,
                                         archive_name)
        except Exception as error:  # pylint: disable=broad-except
            # The exception types raised by the download helper are not
            # visible from this module, hence the broad catch. Nothing is
            # swallowed: the last error is re-raised when every source fails.
            # NOTE(review): confirm the helper does not reuse a partially
            # downloaded archive left in download_root by a failed attempt.
            logger.warning('Failed to download or extract %s: %s', url, error)
            last_error = error
            continue
        logger.info(
            'Genome data successfully extracted for %s', refseq_assembly_id)
        break
    else:
        # No source succeeded; surface the most recent failure to the caller.
        raise last_error

    # delete subdirs
    logger.info('Deleting unnecessary extracted subdirs and files')
    for subdir_path in list_dirs(extract_path):
        shutil.rmtree(subdir_path)

    # Only the .json and .gbk files are kept; everything else is removed.
    files_to_keep = list_files(extract_path, suffix=(".json", ".gbk"))
    for file in list_files(extract_path):
        if file not in files_to_keep:
            os.remove(file)
    logger.info(
        'download_and_extract_antismash_metadata process for %s is over', refseq_assembly_id
    )
3 changes: 1 addition & 2 deletions src/nplinker/genomics/antismash/antismash_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,8 +126,7 @@ def parse_bgc_genbank(file: str) -> BGC:
product_prediction = features.get("product")
if product_prediction is None:
raise ValueError(
"Not found product prediction in antiSMASH Genbank file {}".format(
file))
f"Not found product prediction in antiSMASH Genbank file {file}")

# init BGC
bgc = BGC(bgc_id=fname, product_prediction=product_prediction)
Expand Down
6 changes: 1 addition & 5 deletions src/nplinker/genomics/bgc.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,11 +105,7 @@ def add_parent(self, gcf: GCF) -> None:
Args:
gcf(GCF): gene cluster family
"""
gcf.add_bgc(self)

def detach_parent(self, gcf: GCF) -> None:
    """Remove a parent GCF.

    Delegates to ``gcf.detach_bgc(self)``.

    Args:
        gcf(GCF): gene cluster family to detach this BGC from
    """
    gcf.detach_bgc(self)
self.parents.add(gcf)

@property
def strain(self) -> Strain | None:
Expand Down
7 changes: 0 additions & 7 deletions src/nplinker/genomics/bigscape/__init__.py

This file was deleted.

48 changes: 0 additions & 48 deletions src/nplinker/genomics/bigscape/bigscape_loader.py

This file was deleted.

25 changes: 2 additions & 23 deletions src/nplinker/genomics/gcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,12 +33,11 @@ def __init__(self, gcf_id: str) -> None:
"""
self.gcf_id = gcf_id
self._bgcs: set[BGC] = set()
self.strains: StrainCollection = StrainCollection()
self.bigscape_class: str | None = None
# CG TODO: remove attribute id, see issue 103
# https://github.com/NPLinker/nplinker/issues/103
self.id: int | None = None
self.bgc_ids: set[str] = set()
self.strains: StrainCollection = StrainCollection()

def __str__(self):
    """Return a short summary: GCF id, number of BGCs and number of strains."""
    bgc_count = len(self.bgcs)
    strain_count = len(self.strains)
    return f"GCF(id={self.gcf_id}, #bgcs={bgc_count}, #strains={strain_count})."
Expand All @@ -59,25 +58,13 @@ def bgcs(self) -> set[BGC]:

def add_bgc(self, bgc: BGC) -> None:
"""Add a BGC object to the GCF."""
bgc.parents.add(self)
self._bgcs.add(bgc)
self.bgc_ids.add(bgc.bgc_id)
bgc.add_parent(self)
if bgc.strain is not None:
self.strains.add(bgc.strain)
else:
logger.warning("No strain specified for the BGC %s", bgc.bgc_id)

def detach_bgc(self, bgc: BGC) -> None:
    """Remove a child BGC object.

    The BGC is unlinked from this GCF (parent link, BGC set and BGC id set).
    Its strain is removed from ``self.strains`` only when no remaining BGC
    has an equal strain.
    """
    bgc.parents.remove(self)
    self._bgcs.remove(bgc)
    self.bgc_ids.remove(bgc.bgc_id)

    strain = bgc.strain
    if strain is None:
        return

    # Keep the strain while another remaining BGC still refers to it.
    if not any(other.strain == strain for other in self._bgcs):
        self.strains.remove(strain)

def has_strain(self, strain: str | Strain) -> bool:
"""Check if the given strain exists.

Expand All @@ -88,11 +75,3 @@ def has_strain(self, strain: str | Strain) -> bool:
bool: True when the given strain exist.
"""
return strain in self.strains

def has_mibig_only(self) -> bool:
    """Tell whether every child BGC of this GCF comes from MIBiG.

    Returns:
        bool: True if ``is_mibig()`` is true for every BGC in `GCF.bgcs`
    """
    return all(bgc.is_mibig() for bgc in self.bgcs)
Loading