
refactor: create antismash downloader module and move there inherent code #119

Closed · wants to merge 20 commits · Changes from 1 commit

Commits (20)
8f64998
creat antismash downloader module
gcroci2 Feb 28, 2023
ec2b048
fix tests
gcroci2 Feb 28, 2023
d49f5fc
Merge branch 'dev' into 98_add_antismash_downloader_gcroci2
gcroci2 Feb 28, 2023
26e685f
Update src/nplinker/genomics/antismash/antismash_downloader.py
gcroci2 Mar 3, 2023
da904c6
add a new function for downloading and extracting antismash data
gcroci2 Mar 3, 2023
8be7e44
Merge branch '98_add_antismash_downloader_gcroci2' of github.com:NPLi…
gcroci2 Mar 3, 2023
7b07a7b
create podp_antismash_downloader module
gcroci2 Mar 7, 2023
4879181
properly define download_and_extract_antismash_metadata function
gcroci2 Mar 7, 2023
f64a17c
Merge branch 'dev' into 98_add_antismash_downloader_gcroci2
gcroci2 Mar 8, 2023
392a736
add internal funcs and doc string
gcroci2 Mar 8, 2023
850b746
add tests and create antismash test folder
gcroci2 Mar 9, 2023
e186ba8
format properly extract_path
gcroci2 Mar 9, 2023
61e17fe
run linting and formatting for modified files using yapf
gcroci2 Mar 9, 2023
601957d
fix prospector errors
gcroci2 Mar 9, 2023
e9fbe99
make refseq_assembly_id class variable for tests
gcroci2 Mar 9, 2023
9a22c06
reorder imports
gcroci2 Mar 9, 2023
decf286
add minor static typing
gcroci2 Mar 9, 2023
e55fb33
Merge branch 'dev' into 98_add_antismash_downloader_gcroci2
gcroci2 Mar 9, 2023
0c5b6bd
Revert "Merge branch 'dev' into 98_add_antismash_downloader_gcroci2"
gcroci2 Mar 15, 2023
c66a10a
Revert "Revert "Merge branch 'dev' into 98_add_antismash_downloader_g…
gcroci2 Mar 15, 2023
85 changes: 83 additions & 2 deletions src/nplinker/genomics/antismash/antismash_downloader.py
@@ -4,14 +4,15 @@
import time
import zipfile
import httpx
from deprecated import deprecated
from bs4 import BeautifulSoup
from progress.bar import Bar
from nplinker.logconfig import LogConfig


logger = LogConfig.getLogger(__name__)


# URL templates used to download antiSMASH data
ANTISMASH_DB_PAGE_URL = 'https://antismash-db.secondarymetabolites.org/output/{}/'
ANTISMASH_DB_DOWNLOAD_URL = 'https://antismash-db.secondarymetabolites.org/output/{}/{}'
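For orientation, a rough illustration of how these two URL templates would be filled in; the genome accession and the zip filename pattern below are assumptions made for the example, not values taken from this diff:

# hypothetical accession, for illustration only
accession = 'GCF_000514775.1'
page_url = ANTISMASH_DB_PAGE_URL.format(accession)
# the '<accession>.zip' filename is an assumed pattern for the second placeholder
zip_url = ANTISMASH_DB_DOWNLOAD_URL.format(accession, accession + '.zip')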

@@ -322,7 +323,87 @@ def _extract_antismash_zip(antismash_obj, project_file_cache):

    return True

def download_antismash_data(genome_records, project_download_cache, project_file_cache):
def download_and_extract_antismash_data(item_id, download_root, extract_path):
    genome_status = {}

    # this file records genome IDs and local filenames to avoid having to repeat HTTP requests
    # each time the app is loaded (this can take a lot of time if there are dozens of genomes)
    genome_status_file = os.path.join(download_root, 'genome_status.txt')

    # genome lookup status info
    if os.path.exists(genome_status_file):
        with open(genome_status_file) as f:
            for line in csv.reader(f):
                asobj = GenomeStatus.from_csv(*line)
                genome_status[asobj.original_id] = asobj

    # use this to check whether the lookup has already been attempted and,
    # if so, whether the file is cached locally
    if item_id not in genome_status:
        genome_status[item_id] = GenomeStatus(item_id, None)

    genome_obj = genome_status[item_id]

    logger.info('Checking for antismash data for genome ID={}'.format(item_id))
    # first check if the file is cached locally
    if os.path.exists(genome_obj.filename):
        # file already downloaded
        logger.info('Genome ID {} already downloaded to {}'.format(item_id, genome_obj.filename))
    elif genome_obj.attempted:
        # lookup attempted previously but failed
        logger.info('Genome ID {} skipped due to previous failure'.format(item_id))
    else:
        # no existing file and no lookup attempted yet, so start the process
        # of trying to retrieve the data

        # look up the ID
        logger.info('Beginning lookup process for genome ID {}'.format(item_id))

        genome_obj.resolved_id = item_id  # TO CHECK (Cunliang): not sure this is what we want; in the general case,
        # I don't think we have different possible IDs (as in the PODP JSON file's nested genome_ID dicts),
        # so it may make sense to set genome_obj.resolved_id equal to item_id here and to do the lookup
        # (through _resolve_genome_id_data, previously done at this point) outside this function, only in the
        # PODP case (a sketch of that split follows the diff). If so, the GenomeStatus attribute logic for
        # original_id and resolved_id needs revisiting, since the two would then be the same thing, and so does
        # the code below, which assumes original_id may eventually differ.
        genome_obj.attempted = True

        if genome_obj.resolved_id is None:
            # give up on this one
            logger.warning(f'Failed lookup for genome ID {item_id}')
            with open(genome_status_file, 'a+') as f:
                f.write(genome_obj.to_csv() + '\n')

        # if we got a refseq ID, now try to download the data from antismash
        if _download_antismash_zip(genome_obj, download_root):
            logger.info('Genome data successfully downloaded for {}'.format(item_id))
        else:
            logger.warning('Failed to download antiSMASH data for genome ID {} ({})'.format(
                genome_obj.resolved_id, genome_obj.original_id))

        with open(genome_status_file, 'a+', newline='\n') as f:
            f.write(genome_obj.to_csv() + '\n')

        _extract_antismash_zip(genome_obj, extract_path)

    with open(genome_status_file, 'w', newline='\n') as f:
        for obj in genome_status.values():
            f.write(obj.to_csv() + '\n')

@deprecated(version="1.3.3", reason="Use download_and_extract_antismash_data function instead.")
def download_antismash_data(genome_records, project_download_cache, project_file_cache):
    genome_status = {}

    # this file records genome IDs and local filenames to avoid having to repeat HTTP requests
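To make the refactor concrete, here is a minimal usage sketch of the new function based on the signature shown in this diff; the import path follows the file location above, while the genome ID and directories are placeholder values, and the exact behavior may differ in later commits of this PR:

from nplinker.genomics.antismash.antismash_downloader import download_and_extract_antismash_data

# placeholder values, for illustration only
item_id = 'GCF_000514775.1'                  # genome/assembly ID understood by the antiSMASH database
download_root = 'data/antismash_downloads'   # where the zip files and genome_status.txt are kept
extract_path = 'data/antismash_extracted'    # where the zip contents are unpacked

download_and_extract_antismash_data(item_id, download_root, extract_path)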
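The TO CHECK comment in the new function discusses moving the PODP-specific ID resolution out of the generic downloader; a hedged sketch of that split, reusing the _resolve_genome_id_data helper the comment refers to (the caller and the JSON key shown here are hypothetical):

# hypothetical caller, e.g. in the podp_antismash_downloader module
genome_id_data = genome_record['genome_ID']             # nested dict from the PODP JSON file (assumed key)
resolved_id = _resolve_genome_id_data(genome_id_data)   # PODP-only lookup, done here instead of in the downloader
if resolved_id is not None:
    # in the generic case, original_id and resolved_id would then be the same thing
    download_and_extract_antismash_data(resolved_id, download_root, extract_path)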