From 8f6499878b73af99f5fb8e6e4b53ab0d8699c469 Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Tue, 28 Feb 2023 15:56:05 +0100 Subject: [PATCH 01/16] creat antismash downloader module --- .../antismash/antismash_downloader.py | 420 +++++++++++++++++ src/nplinker/pairedomics/downloader.py | 421 +----------------- 2 files changed, 422 insertions(+), 419 deletions(-) create mode 100644 src/nplinker/genomics/antismash/antismash_downloader.py diff --git a/src/nplinker/genomics/antismash/antismash_downloader.py b/src/nplinker/genomics/antismash/antismash_downloader.py new file mode 100644 index 00000000..e5d66577 --- /dev/null +++ b/src/nplinker/genomics/antismash/antismash_downloader.py @@ -0,0 +1,420 @@ +import csv +import os +import re +import time +import zipfile +import httpx +from bs4 import BeautifulSoup +from progress.bar import Bar +from nplinker.logconfig import LogConfig + + +logger = LogConfig.getLogger(__name__) + + +ANTISMASH_DB_PAGE_URL = 'https://antismash-db.secondarymetabolites.org/output/{}/' +ANTISMASH_DB_DOWNLOAD_URL = 'https://antismash-db.secondarymetabolites.org/output/{}/{}' + +ANTISMASH_DBV2_PAGE_URL = 'https://antismash-dbv2.secondarymetabolites.org/output/{}/' +ANTISMASH_DBV2_DOWNLOAD_URL = 'https://antismash-dbv2.secondarymetabolites.org/output/{}/{}' + +NCBI_LOOKUP_URL_NEW = 'https://www.ncbi.nlm.nih.gov/assembly/?term={}' + +JGI_GENOME_LOOKUP_URL = 'https://img.jgi.doe.gov/cgi-bin/m/main.cgi?section=TaxonDetail&page=taxonDetail&taxon_oid={}' + +USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0' + +class GenomeStatus: + + def __init__(self, original_id, resolved_id, attempted=False, filename=""): + self.original_id = ';'.join(original_id.split(',')) + self.resolved_id = None if resolved_id == 'None' else resolved_id + self.attempted = True if attempted == 'True' else False + self.filename = filename + + @classmethod + def from_csv(cls, original_id, resolved_id, attempted, filename): + return cls(original_id, resolved_id, attempted, filename) + + def to_csv(self): + return ','.join([ + str(self.original_id), + str(self.resolved_id), + str(self.attempted), self.filename + ]) + +def _get_best_available_genome_id(genome_id_data): + if 'RefSeq_accession' in genome_id_data: + return genome_id_data['RefSeq_accession'] + elif 'GenBank_accession' in genome_id_data: + return genome_id_data['GenBank_accession'] + elif 'JGI_Genome_ID' in genome_id_data: + return genome_id_data['JGI_Genome_ID'] + + logger.warning('No known genome ID field in genome data: {}'.format( + genome_id_data)) + return None + +def _ncbi_genbank_search(genbank_id, retry_time=5.0): + url = NCBI_LOOKUP_URL_NEW.format(genbank_id) + logger.debug('Looking up GenBank data for {} at {}'.format( + genbank_id, url)) + resp = httpx.get(url, follow_redirects=True) + + if resp.status_code == httpx.codes.OK: + # the page should contain a
<dl> element with class "assembly_summary_new". retrieving
+        # the page seems to fail occasionally in the middle of lengthy sequences of genome
+        # lookups, so there might be some throttling going on. this will automatically retry
+        # the lookup if the expected content isn't found the first time
+        soup = BeautifulSoup(resp.content, 'html.parser')
+        # find the <dl> element with class "assembly_summary_new"
+        dl_element = soup.find('dl', {'class': 'assembly_summary_new'})
+        if dl_element is not None:
+            return dl_element
+
+    logger.debug(
+        'NCBI lookup failed, status code {}. Trying again in {} seconds'.
+        format(resp.status_code, retry_time))
+    time.sleep(retry_time)
+    logger.debug('Looking up GenBank data for {} at {}'.format(
+        genbank_id, url))
+    resp = httpx.get(url, follow_redirects=True)
+    if resp.status_code == httpx.codes.OK:
+        soup = BeautifulSoup(resp.content, 'html.parser')
+        # find the <dl>
element with class "assembly_summary_new" + dl_element = soup.find('dl', {'class': 'assembly_summary_new'}) + if dl_element is not None: + return dl_element + + logger.warning( + 'Failed to resolve NCBI genome ID {} at URL {} (after retrying)'. + format(genbank_id, url)) + return None + +def _resolve_genbank_accession(genbank_id): + logger.info( + 'Attempting to resolve Genbank accession {} to RefSeq accession'. + format(genbank_id)) + # genbank id => genbank seq => refseq + + # The GenBank accession can have several formats: + # 1: BAFR00000000.1 + # 2: NZ_BAGG00000000.1 + # 3: NC_016887.1 + # Case 1 is the default. + if '_' in genbank_id: + # case 2 + if len(genbank_id.split('_')[-1].split('.')[0]) == 12: + genbank_id = genbank_id.split('_')[-1] + # case 3 + else: + genbank_id = genbank_id.lower() + + # get rid of any extraneous whitespace + genbank_id = genbank_id.strip() + logger.debug(f'Parsed GenBank ID to "{genbank_id}"') + + # run a search using the GenBank accession ID + try: + dl_element = _ncbi_genbank_search(genbank_id) + if dl_element is None: + raise Exception('Unknown HTML format') + + refseq_idx = -1 + for field_idx, field in enumerate(dl_element.children): + # this is the element immediately preceding the one with + # the actual RefSeq ID we want + if field.getText().strip() == 'RefSeq assembly accession:': + refseq_idx = field_idx + 1 + + # this should be True when we've reached the right element + if field_idx == refseq_idx: + refseq_id = field.getText() + # if it has any spaces, take everything up to first one (some have annotations afterwards) + if refseq_id.find(' ') != -1: + refseq_id = refseq_id[:refseq_id.find(' ')] + + return refseq_id + + if refseq_idx == -1: + raise Exception('Expected HTML elements not found') + except Exception as e: + logger.warning( + 'Failed resolving GenBank accession {}, error {}'.format( + genbank_id, e)) + + return None + +def _resolve_jgi_accession(jgi_id): + url = JGI_GENOME_LOOKUP_URL.format(jgi_id) + logger.info( + 'Attempting to resolve JGI_Genome_ID {} to GenBank accession via {}' + .format(jgi_id, url)) + # no User-Agent header produces a 403 Forbidden error on this site... 
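+    # sending a common desktop browser User-Agent (USER_AGENT above) avoids that 403,
+    # and the explicit 10 second timeout stops a slow JGI response from stalling the
+    # whole download run; a ReadTimeout is caught below and treated as a failed lookup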
+ try: + resp = httpx.get(url, + headers={'User-Agent': USER_AGENT}, + timeout=10.0, + follow_redirects=True) + except httpx.ReadTimeout: + logger.warning( + 'Timed out waiting for result of JGI_Genome_ID lookup') + return None + + soup = BeautifulSoup(resp.content, 'html.parser') + # find the table entry giving the NCBI assembly accession ID + link = soup.find( + 'a', href=re.compile('https://www.ncbi.nlm.nih.gov/nuccore/.*')) + if link is None: + return None + + return _resolve_genbank_accession(link.text) + +def _resolve_genome_id_data(genome_id_data): + if 'RefSeq_accession' in genome_id_data: + # best case, can use this directly + return genome_id_data['RefSeq_accession'] + elif 'GenBank_accession' in genome_id_data: + # resolve via NCBI + return _resolve_genbank_accession( + genome_id_data['GenBank_accession']) + elif 'JGI_Genome_ID' in genome_id_data: + # resolve via JGI => NCBI + return _resolve_jgi_accession(genome_id_data['JGI_Genome_ID']) + + logger.warning( + f'Unable to resolve genome_ID: {genome_id_data}') + return None + +def _get_antismash_db_page(genome_obj): + # want to try up to 4 different links here, v1 and v2 databases, each + # with and without the .1 suffix on the accesssion ID + + accesssions = [genome_obj.resolved_id, genome_obj.resolved_id + '.1'] + for base_url in [ANTISMASH_DB_PAGE_URL, ANTISMASH_DBV2_PAGE_URL]: + for accession in accesssions: + url = base_url.format(accession) + link = None + + logger.info('antismash DB lookup for {} => {}'.format( + accession, url)) + try: + resp = httpx.get(url, follow_redirects=True) + soup = BeautifulSoup(resp.content, 'html.parser') + # retrieve .zip file download link + link = soup.find( + 'a', {'href': lambda url: url.endswith('.zip')}) + except Exception as e: + logger.debug(f'antiSMASH DB page load failed: {e}') + + if link is not None: + logger.info( + 'antiSMASH lookup succeeded! 
Filename is {}'.format( + link['href'])) + # save with the .1 suffix if that worked + genome_obj.resolved_id = accession + return link['href'] + + return None + +def _get_antismash_zip_data(accession_id, filename, local_path): + for base_url in [ + ANTISMASH_DB_DOWNLOAD_URL, ANTISMASH_DBV2_DOWNLOAD_URL + ]: + zipfile_url = base_url.format(accession_id, filename) + with open(local_path, 'wb') as f: + total_bytes = 0 + try: + with httpx.stream('GET', zipfile_url) as r: + if r.status_code == 404: + logger.debug('antiSMASH download URL was a 404') + continue + + logger.info('Downloading from antiSMASH: {}'.format( + zipfile_url)) + filesize = int(r.headers['content-length']) + bar = Bar(filename, + max=filesize, + suffix='%(percent)d%%') + for data in r.iter_bytes(): + f.write(data) + total_bytes += len(data) + bar.next(len(data)) + bar.finish() + except Exception as e: + logger.warning( + f'antiSMASH zip download failed: {e}') + continue + + return True + + return False + +def _download_antismash_zip(antismash_obj, project_download_cache): + # save zip files to avoid having to repeat above lookup every time + local_path = os.path.join(project_download_cache, + f'{antismash_obj.resolved_id}.zip') + logger.debug( + f'Checking for existing antismash zip at {local_path}') + + cached = False + # if the file exists locally + if os.path.exists(local_path): + logger.info(f'Found cached file at {local_path}') + try: + # check if it's a valid zip file, if so treat it as cached + _ = zipfile.ZipFile(local_path) + cached = True + antismash_obj.filename = local_path + except zipfile.BadZipFile as bzf: + # otherwise delete and redownload + logger.info( + 'Invalid antismash zipfile found ({}). Will download again' + .format(bzf)) + os.unlink(local_path) + antismash_obj.filename = "" + + if not cached: + filename = _get_antismash_db_page(antismash_obj) + if filename is None: + return False + + _get_antismash_zip_data(antismash_obj.resolved_id, filename, + local_path) + antismash_obj.filename = local_path + + return True + +def _extract_antismash_zip(antismash_obj, project_file_cache): + if antismash_obj.filename is None or len(antismash_obj.filename) == 0: + return False + + output_path = os.path.join(project_file_cache, 'antismash', + antismash_obj.resolved_id) + exists_already = os.path.exists(output_path) and os.path.exists( + os.path.join(output_path, 'completed')) + + logger.debug( + 'Extracting antismash data to {}, exists_already = {}'.format( + output_path, exists_already)) + if exists_already: + return True + + # create a subfolder for each set of genome data (the zip files used to be + # constructed with path info but that seems to have changed recently) + if not os.path.exists(output_path): + os.makedirs(output_path, exist_ok=True) + + antismash_zip = zipfile.ZipFile(antismash_obj.filename) + kc_prefix1 = f'{antismash_obj.resolved_id}/knownclusterblast' + kc_prefix2 = 'knownclusterblast' + for zip_member in antismash_zip.namelist(): + # TODO other files here? 
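+        # only BGC GenBank/JSON records and knownclusterblast .txt results
+        # (excluding the mibig_hits files) are extracted; everything else in
+        # the archive is skipped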
+ if zip_member.endswith('.gbk') or zip_member.endswith('.json'): + antismash_zip.extract(zip_member, path=output_path) + elif zip_member.startswith(kc_prefix1) or zip_member.startswith( + kc_prefix2): + if zip_member.endswith( + '.txt') and 'mibig_hits' not in zip_member: + antismash_zip.extract(zip_member, path=output_path) + + open(os.path.join(output_path, 'completed'), 'w').close() + + return True + +def download_antismash_data(genome_records, project_download_cache): + genome_status = {} + + # this file records genome IDs and local filenames to avoid having to repeat HTTP requests + # each time the app is loaded (this can take a lot of time if there are dozens of genomes) + genome_status_file = os.path.join(project_download_cache, + 'genome_status.txt') + + # genome lookup status info + if os.path.exists(genome_status_file): + with open(genome_status_file) as f: + for line in csv.reader(f): + asobj = GenomeStatus.from_csv(*line) + genome_status[asobj.original_id] = asobj + + for i, genome_record in enumerate(genome_records): + # get the best available ID from the dict + best_id = _get_best_available_genome_id( + genome_record['genome_ID']) + if best_id is None: + logger.warning( + 'Ignoring genome record "{}" due to missing genome ID field' + .format(genome_record)) + continue + + # use this to check if the lookup has already been attempted and if + # so if the file is cached locally + if best_id not in genome_status: + genome_status[best_id] = GenomeStatus(best_id, None) + + genome_obj = genome_status[best_id] + + logger.info( + 'Checking for antismash data {}/{}, current genome ID={}'. + format(i + 1, len(genome_records), best_id)) + # first check if file is cached locally + if os.path.exists(genome_obj.filename): + # file already downloaded + logger.info('Genome ID {} already downloaded to {}'.format( + best_id, genome_obj.filename)) + genome_record['resolved_id'] = genome_obj.resolved_id + elif genome_obj.attempted: + # lookup attempted previously but failed + logger.info( + 'Genome ID {} skipped due to previous failure'.format( + best_id)) + genome_record['resolved_id'] = genome_obj.resolved_id + else: + # if no existing file and no lookup attempted, can start process of + # trying to retrieve the data + + # lookup the ID + logger.info('Beginning lookup process for genome ID {}'.format( + best_id)) + + genome_obj.resolved_id = _resolve_genome_id_data( + genome_record['genome_ID']) + genome_obj.attempted = True + + if genome_obj.resolved_id is None: + # give up on this one + logger.warning( + f'Failed lookup for genome ID {best_id}') + with open(genome_status_file, 'a+') as f: + f.write(genome_obj.to_csv() + '\n') + continue + + # if we got a refseq ID, now try to download the data from antismash + if _download_antismash_zip(genome_obj): + logger.info( + 'Genome data successfully downloaded for {}'.format( + best_id)) + genome_record['resolved_id'] = genome_obj.resolved_id + else: + logger.warning( + 'Failed to download antiSMASH data for genome ID {} ({})' + .format(genome_obj.resolved_id, + genome_obj.original_id)) + + with open(genome_status_file, 'a+', newline='\n') as f: + f.write(genome_obj.to_csv() + '\n') + + _extract_antismash_zip(genome_obj) + + missing = len( + [x for x in genome_status.values() if len(x.filename) == 0]) + logger.info( + 'Dataset has {} missing sets of antiSMASH data (from a total of {})' + .format(missing, len(genome_records))) + + with open(genome_status_file, 'w', newline='\n') as f: + for obj in genome_status.values(): + f.write(obj.to_csv() + 
'\n') + + if missing == len(genome_records): + logger.warning('Failed to successfully retrieve ANY genome data!') diff --git a/src/nplinker/pairedomics/downloader.py b/src/nplinker/pairedomics/downloader.py index 04914d94..addf306b 100644 --- a/src/nplinker/pairedomics/downloader.py +++ b/src/nplinker/pairedomics/downloader.py @@ -12,21 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. -import csv -import io import json import os -from pathlib import Path -import re import sys import shutil -import tarfile -import time import zipfile from deprecated import deprecated import httpx -from bs4 import BeautifulSoup -from progress.bar import Bar from progress.spinner import Spinner from nplinker.logconfig import LogConfig from nplinker.metabolomics.gnps.gnps_downloader import GNPSDownloader @@ -34,6 +26,7 @@ from nplinker.strains import Strain from nplinker.strain_collection import StrainCollection from nplinker.genomics.mibig import download_and_extract_mibig_metadata +from nplinker.genomics.antismash import download_antismash_data logger = LogConfig.getLogger(__name__) @@ -45,43 +38,11 @@ PAIREDOMICS_PROJECT_URL = 'https://pairedomicsdata.bioinformatics.nl/api/projects/{}' GNPS_DATA_DOWNLOAD_URL = 'https://gnps.ucsd.edu/ProteoSAFe/DownloadResult?task={}&view=download_clustered_spectra' -ANTISMASH_DB_PAGE_URL = 'https://antismash-db.secondarymetabolites.org/output/{}/' -ANTISMASH_DB_DOWNLOAD_URL = 'https://antismash-db.secondarymetabolites.org/output/{}/{}' - -ANTISMASH_DBV2_PAGE_URL = 'https://antismash-dbv2.secondarymetabolites.org/output/{}/' -ANTISMASH_DBV2_DOWNLOAD_URL = 'https://antismash-dbv2.secondarymetabolites.org/output/{}/{}' - -NCBI_LOOKUP_URL_NEW = 'https://www.ncbi.nlm.nih.gov/assembly/?term={}' - -JGI_GENOME_LOOKUP_URL = 'https://img.jgi.doe.gov/cgi-bin/m/main.cgi?section=TaxonDetail&page=taxonDetail&taxon_oid={}' - MIBIG_METADATA_URL = 'https://dl.secondarymetabolites.org/mibig/mibig_json_{}.tar.gz' MIBIG_BGC_METADATA_URL = 'https://mibig.secondarymetabolites.org/repository/{}/annotations.json' # MIBIG_BGC_GENBANK_URL = 'https://mibig.secondarymetabolites.org/repository/{}/{}.gbk' # MIBIG_BGC_JSON_URL = 'https://mibig.secondarymetabolites.org/repository/{}/{}.json' -USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0' - - -class GenomeStatus: - - def __init__(self, original_id, resolved_id, attempted=False, filename=""): - self.original_id = ';'.join(original_id.split(',')) - self.resolved_id = None if resolved_id == 'None' else resolved_id - self.attempted = True if attempted == 'True' else False - self.filename = filename - - @classmethod - def from_csv(cls, original_id, resolved_id, attempted, filename): - return cls(original_id, resolved_id, attempted, filename) - - def to_csv(self): - return ','.join([ - str(self.original_id), - str(self.resolved_id), - str(self.attempted), self.filename - ]) - class Downloader(): # TODO: move to independent config file ---C.Geng @@ -195,7 +156,7 @@ def get(self, do_bigscape, extra_bigscape_parameters, use_mibig, self._download_metabolomics_zipfile(self.gnps_task_id) - self._download_genomics_data(self.project_json['genomes']) + download_antismash_data(self.project_json['genomes']) # CG: it extracts strain names and later will be used for strains self._parse_genome_labels(self.project_json['genome_metabolome_links'], @@ -232,248 +193,6 @@ def _run_bigscape(self, do_bigscape, extra_bigscape_parameters): 'Failed to run BiG-SCAPE on 
antismash data, error was "{}"'. format(e)) - def _ncbi_genbank_search(self, genbank_id, retry_time=5.0): - url = NCBI_LOOKUP_URL_NEW.format(genbank_id) - logger.debug('Looking up GenBank data for {} at {}'.format( - genbank_id, url)) - resp = httpx.get(url, follow_redirects=True) - - if resp.status_code == httpx.codes.OK: - # the page should contain a
<dl> element with class "assembly_summary_new". retrieving
-            # the page seems to fail occasionally in the middle of lengthy sequences of genome
-            # lookups, so there might be some throttling going on. this will automatically retry
-            # the lookup if the expected content isn't found the first time
-            soup = BeautifulSoup(resp.content, 'html.parser')
-            # find the <dl> element with class "assembly_summary_new"
-            dl_element = soup.find('dl', {'class': 'assembly_summary_new'})
-            if dl_element is not None:
-                return dl_element
-
-        logger.debug(
-            'NCBI lookup failed, status code {}. Trying again in {} seconds'.
-            format(resp.status_code, retry_time))
-        time.sleep(retry_time)
-        logger.debug('Looking up GenBank data for {} at {}'.format(
-            genbank_id, url))
-        resp = httpx.get(url, follow_redirects=True)
-        if resp.status_code == httpx.codes.OK:
-            soup = BeautifulSoup(resp.content, 'html.parser')
-            # find the <dl>
element with class "assembly_summary_new" - dl_element = soup.find('dl', {'class': 'assembly_summary_new'}) - if dl_element is not None: - return dl_element - - logger.warning( - 'Failed to resolve NCBI genome ID {} at URL {} (after retrying)'. - format(genbank_id, url)) - return None - - def _resolve_genbank_accession(self, genbank_id): - logger.info( - 'Attempting to resolve Genbank accession {} to RefSeq accession'. - format(genbank_id)) - # genbank id => genbank seq => refseq - - # The GenBank accession can have several formats: - # 1: BAFR00000000.1 - # 2: NZ_BAGG00000000.1 - # 3: NC_016887.1 - # Case 1 is the default. - if '_' in genbank_id: - # case 2 - if len(genbank_id.split('_')[-1].split('.')[0]) == 12: - genbank_id = genbank_id.split('_')[-1] - # case 3 - else: - genbank_id = genbank_id.lower() - - # get rid of any extraneous whitespace - genbank_id = genbank_id.strip() - logger.debug(f'Parsed GenBank ID to "{genbank_id}"') - - # run a search using the GenBank accession ID - try: - dl_element = self._ncbi_genbank_search(genbank_id) - if dl_element is None: - raise Exception('Unknown HTML format') - - refseq_idx = -1 - for field_idx, field in enumerate(dl_element.children): - # this is the element immediately preceding the one with - # the actual RefSeq ID we want - if field.getText().strip() == 'RefSeq assembly accession:': - refseq_idx = field_idx + 1 - - # this should be True when we've reached the right element - if field_idx == refseq_idx: - refseq_id = field.getText() - # if it has any spaces, take everything up to first one (some have annotations afterwards) - if refseq_id.find(' ') != -1: - refseq_id = refseq_id[:refseq_id.find(' ')] - - return refseq_id - - if refseq_idx == -1: - raise Exception('Expected HTML elements not found') - except Exception as e: - logger.warning( - 'Failed resolving GenBank accession {}, error {}'.format( - genbank_id, e)) - - return None - - def _resolve_jgi_accession(self, jgi_id): - url = JGI_GENOME_LOOKUP_URL.format(jgi_id) - logger.info( - 'Attempting to resolve JGI_Genome_ID {} to GenBank accession via {}' - .format(jgi_id, url)) - # no User-Agent header produces a 403 Forbidden error on this site... 
- try: - resp = httpx.get(url, - headers={'User-Agent': USER_AGENT}, - timeout=10.0, - follow_redirects=True) - except httpx.ReadTimeout: - logger.warning( - 'Timed out waiting for result of JGI_Genome_ID lookup') - return None - - soup = BeautifulSoup(resp.content, 'html.parser') - # find the table entry giving the NCBI assembly accession ID - link = soup.find( - 'a', href=re.compile('https://www.ncbi.nlm.nih.gov/nuccore/.*')) - if link is None: - return None - - return self._resolve_genbank_accession(link.text) - - def _get_best_available_genome_id(self, genome_id_data): - if 'RefSeq_accession' in genome_id_data: - return genome_id_data['RefSeq_accession'] - elif 'GenBank_accession' in genome_id_data: - return genome_id_data['GenBank_accession'] - elif 'JGI_Genome_ID' in genome_id_data: - return genome_id_data['JGI_Genome_ID'] - - logger.warning('No known genome ID field in genome data: {}'.format( - genome_id_data)) - return None - - def _resolve_genome_id_data(self, genome_id_data): - if 'RefSeq_accession' in genome_id_data: - # best case, can use this directly - return genome_id_data['RefSeq_accession'] - elif 'GenBank_accession' in genome_id_data: - # resolve via NCBI - return self._resolve_genbank_accession( - genome_id_data['GenBank_accession']) - elif 'JGI_Genome_ID' in genome_id_data: - # resolve via JGI => NCBI - return self._resolve_jgi_accession(genome_id_data['JGI_Genome_ID']) - - logger.warning( - f'Unable to resolve genome_ID: {genome_id_data}') - return None - - def _download_genomics_data(self, genome_records): - genome_status = {} - - # this file records genome IDs and local filenames to avoid having to repeat HTTP requests - # each time the app is loaded (this can take a lot of time if there are dozens of genomes) - genome_status_file = os.path.join(self.project_download_cache, - 'genome_status.txt') - - # genome lookup status info - if os.path.exists(genome_status_file): - with open(genome_status_file) as f: - for line in csv.reader(f): - asobj = GenomeStatus.from_csv(*line) - genome_status[asobj.original_id] = asobj - - for i, genome_record in enumerate(genome_records): - label = genome_record['genome_label'] - - # get the best available ID from the dict - best_id = self._get_best_available_genome_id( - genome_record['genome_ID']) - if best_id is None: - logger.warning( - 'Ignoring genome record "{}" due to missing genome ID field' - .format(genome_record)) - continue - - # use this to check if the lookup has already been attempted and if - # so if the file is cached locally - if best_id not in genome_status: - genome_status[best_id] = GenomeStatus(best_id, None) - - genome_obj = genome_status[best_id] - - logger.info( - 'Checking for antismash data {}/{}, current genome ID={}'. 
- format(i + 1, len(genome_records), best_id)) - # first check if file is cached locally - if os.path.exists(genome_obj.filename): - # file already downloaded - logger.info('Genome ID {} already downloaded to {}'.format( - best_id, genome_obj.filename)) - genome_record['resolved_id'] = genome_obj.resolved_id - elif genome_obj.attempted: - # lookup attempted previously but failed - logger.info( - 'Genome ID {} skipped due to previous failure'.format( - best_id)) - genome_record['resolved_id'] = genome_obj.resolved_id - else: - # if no existing file and no lookup attempted, can start process of - # trying to retrieve the data - - # lookup the ID - logger.info('Beginning lookup process for genome ID {}'.format( - best_id)) - - genome_obj.resolved_id = self._resolve_genome_id_data( - genome_record['genome_ID']) - genome_obj.attempted = True - - if genome_obj.resolved_id is None: - # give up on this one - logger.warning( - f'Failed lookup for genome ID {best_id}') - with open(genome_status_file, 'a+') as f: - f.write(genome_obj.to_csv() + '\n') - continue - - # if we got a refseq ID, now try to download the data from antismash - if self._download_antismash_zip(genome_obj): - logger.info( - 'Genome data successfully downloaded for {}'.format( - best_id)) - genome_record['resolved_id'] = genome_obj.resolved_id - else: - logger.warning( - 'Failed to download antiSMASH data for genome ID {} ({})' - .format(genome_obj.resolved_id, - genome_obj.original_id)) - - with open(genome_status_file, 'a+', newline='\n') as f: - f.write(genome_obj.to_csv() + '\n') - - self._extract_antismash_zip(genome_obj) - - missing = len( - [x for x in genome_status.values() if len(x.filename) == 0]) - logger.info( - 'Dataset has {} missing sets of antiSMASH data (from a total of {})' - .format(missing, len(genome_records))) - - with open(genome_status_file, 'w', newline='\n') as f: - for obj in genome_status.values(): - f.write(obj.to_csv() + '\n') - - if missing == len(genome_records): - logger.warning('Failed to successfully retrieve ANY genome data!') - def _download_mibig_json(self, version): output_path = os.path.join(self.project_file_cache, 'mibig_json') @@ -490,142 +209,6 @@ def _download_mibig_json(self, version): return True - def _get_antismash_db_page(self, genome_obj): - # want to try up to 4 different links here, v1 and v2 databases, each - # with and without the .1 suffix on the accesssion ID - - accesssions = [genome_obj.resolved_id, genome_obj.resolved_id + '.1'] - for base_url in [ANTISMASH_DB_PAGE_URL, ANTISMASH_DBV2_PAGE_URL]: - for accession in accesssions: - url = base_url.format(accession) - link = None - - logger.info('antismash DB lookup for {} => {}'.format( - accession, url)) - try: - resp = httpx.get(url, follow_redirects=True) - soup = BeautifulSoup(resp.content, 'html.parser') - # retrieve .zip file download link - link = soup.find( - 'a', {'href': lambda url: url.endswith('.zip')}) - except Exception as e: - logger.debug(f'antiSMASH DB page load failed: {e}') - - if link is not None: - logger.info( - 'antiSMASH lookup succeeded! 
Filename is {}'.format( - link['href'])) - # save with the .1 suffix if that worked - genome_obj.resolved_id = accession - return link['href'] - - return None - - def _get_antismash_zip_data(self, accession_id, filename, local_path): - for base_url in [ - ANTISMASH_DB_DOWNLOAD_URL, ANTISMASH_DBV2_DOWNLOAD_URL - ]: - zipfile_url = base_url.format(accession_id, filename) - with open(local_path, 'wb') as f: - total_bytes = 0 - try: - with httpx.stream('GET', zipfile_url) as r: - if r.status_code == 404: - logger.debug('antiSMASH download URL was a 404') - continue - - logger.info('Downloading from antiSMASH: {}'.format( - zipfile_url)) - filesize = int(r.headers['content-length']) - bar = Bar(filename, - max=filesize, - suffix='%(percent)d%%') - for data in r.iter_bytes(): - f.write(data) - total_bytes += len(data) - bar.next(len(data)) - bar.finish() - except Exception as e: - logger.warning( - f'antiSMASH zip download failed: {e}') - continue - - return True - - return False - - def _download_antismash_zip(self, antismash_obj): - # save zip files to avoid having to repeat above lookup every time - local_path = os.path.join(self.project_download_cache, - f'{antismash_obj.resolved_id}.zip') - logger.debug( - f'Checking for existing antismash zip at {local_path}') - - cached = False - # if the file exists locally - if os.path.exists(local_path): - logger.info(f'Found cached file at {local_path}') - try: - # check if it's a valid zip file, if so treat it as cached - _ = zipfile.ZipFile(local_path) - cached = True - antismash_obj.filename = local_path - except zipfile.BadZipFile as bzf: - # otherwise delete and redownload - logger.info( - 'Invalid antismash zipfile found ({}). Will download again' - .format(bzf)) - os.unlink(local_path) - antismash_obj.filename = "" - - if not cached: - filename = self._get_antismash_db_page(antismash_obj) - if filename is None: - return False - - self._get_antismash_zip_data(antismash_obj.resolved_id, filename, - local_path) - antismash_obj.filename = local_path - - return True - - def _extract_antismash_zip(self, antismash_obj): - if antismash_obj.filename is None or len(antismash_obj.filename) == 0: - return False - - output_path = os.path.join(self.project_file_cache, 'antismash', - antismash_obj.resolved_id) - exists_already = os.path.exists(output_path) and os.path.exists( - os.path.join(output_path, 'completed')) - - logger.debug( - 'Extracting antismash data to {}, exists_already = {}'.format( - output_path, exists_already)) - if exists_already: - return True - - # create a subfolder for each set of genome data (the zip files used to be - # constructed with path info but that seems to have changed recently) - if not os.path.exists(output_path): - os.makedirs(output_path, exist_ok=True) - - antismash_zip = zipfile.ZipFile(antismash_obj.filename) - kc_prefix1 = f'{antismash_obj.resolved_id}/knownclusterblast' - kc_prefix2 = 'knownclusterblast' - for zip_member in antismash_zip.namelist(): - # TODO other files here? 
- if zip_member.endswith('.gbk') or zip_member.endswith('.json'): - antismash_zip.extract(zip_member, path=output_path) - elif zip_member.startswith(kc_prefix1) or zip_member.startswith( - kc_prefix2): - if zip_member.endswith( - '.txt') and 'mibig_hits' not in zip_member: - antismash_zip.extract(zip_member, path=output_path) - - open(os.path.join(output_path, 'completed'), 'w').close() - - return True - def _parse_genome_labels(self, met_records, gen_records): temp = {} mc, gc = 0, 0 From ec2b048f650feb9f24ebacec3292fcecd2d242ab Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Tue, 28 Feb 2023 16:23:55 +0100 Subject: [PATCH 02/16] fix tests --- src/nplinker/genomics/antismash/__init__.py | 1 + src/nplinker/genomics/antismash/antismash_downloader.py | 6 +++--- src/nplinker/pairedomics/downloader.py | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/nplinker/genomics/antismash/__init__.py b/src/nplinker/genomics/antismash/__init__.py index a512800a..b0cdb0a4 100644 --- a/src/nplinker/genomics/antismash/__init__.py +++ b/src/nplinker/genomics/antismash/__init__.py @@ -1,6 +1,7 @@ import logging from .antismash_loader import AntismashBGCLoader from .antismash_loader import parse_bgc_genbank +from .antismash_downloader import download_antismash_data logging.getLogger(__name__).addHandler(logging.NullHandler()) diff --git a/src/nplinker/genomics/antismash/antismash_downloader.py b/src/nplinker/genomics/antismash/antismash_downloader.py index e5d66577..281fa00a 100644 --- a/src/nplinker/genomics/antismash/antismash_downloader.py +++ b/src/nplinker/genomics/antismash/antismash_downloader.py @@ -322,7 +322,7 @@ def _extract_antismash_zip(antismash_obj, project_file_cache): return True -def download_antismash_data(genome_records, project_download_cache): +def download_antismash_data(genome_records, project_download_cache, project_file_cache): genome_status = {} # this file records genome IDs and local filenames to avoid having to repeat HTTP requests @@ -390,7 +390,7 @@ def download_antismash_data(genome_records, project_download_cache): continue # if we got a refseq ID, now try to download the data from antismash - if _download_antismash_zip(genome_obj): + if _download_antismash_zip(genome_obj, project_download_cache): logger.info( 'Genome data successfully downloaded for {}'.format( best_id)) @@ -404,7 +404,7 @@ def download_antismash_data(genome_records, project_download_cache): with open(genome_status_file, 'a+', newline='\n') as f: f.write(genome_obj.to_csv() + '\n') - _extract_antismash_zip(genome_obj) + _extract_antismash_zip(genome_obj, project_file_cache) missing = len( [x for x in genome_status.values() if len(x.filename) == 0]) diff --git a/src/nplinker/pairedomics/downloader.py b/src/nplinker/pairedomics/downloader.py index addf306b..cfbc76f9 100644 --- a/src/nplinker/pairedomics/downloader.py +++ b/src/nplinker/pairedomics/downloader.py @@ -156,7 +156,7 @@ def get(self, do_bigscape, extra_bigscape_parameters, use_mibig, self._download_metabolomics_zipfile(self.gnps_task_id) - download_antismash_data(self.project_json['genomes']) + download_antismash_data(self.project_json['genomes'], self.project_download_cache, self.project_file_cache) # CG: it extracts strain names and later will be used for strains self._parse_genome_labels(self.project_json['genome_metabolome_links'], From 26e685f7ee9fa14c90973fa599e2028ccfffdc93 Mon Sep 17 00:00:00 2001 From: Giulia Crocioni <55382553+gcroci2@users.noreply.github.com> Date: Fri, 3 Mar 2023 10:33:27 +0100 Subject: [PATCH 03/16] 
Update src/nplinker/genomics/antismash/antismash_downloader.py Co-authored-by: Cunliang Geng --- src/nplinker/genomics/antismash/antismash_downloader.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/nplinker/genomics/antismash/antismash_downloader.py b/src/nplinker/genomics/antismash/antismash_downloader.py index 281fa00a..025d69d8 100644 --- a/src/nplinker/genomics/antismash/antismash_downloader.py +++ b/src/nplinker/genomics/antismash/antismash_downloader.py @@ -15,6 +15,7 @@ ANTISMASH_DB_PAGE_URL = 'https://antismash-db.secondarymetabolites.org/output/{}/' ANTISMASH_DB_DOWNLOAD_URL = 'https://antismash-db.secondarymetabolites.org/output/{}/{}' +# The antiSMASH DBV2 is for the availability of the old version, better to keep it. ANTISMASH_DBV2_PAGE_URL = 'https://antismash-dbv2.secondarymetabolites.org/output/{}/' ANTISMASH_DBV2_DOWNLOAD_URL = 'https://antismash-dbv2.secondarymetabolites.org/output/{}/{}' From da904c609b79fe4398577e0c2a2310ffaa483a6d Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Fri, 3 Mar 2023 17:50:06 +0100 Subject: [PATCH 04/16] add a new function for downloading and extracting antismash data --- .../antismash/antismash_downloader.py | 85 ++++++++++++++++++- 1 file changed, 83 insertions(+), 2 deletions(-) diff --git a/src/nplinker/genomics/antismash/antismash_downloader.py b/src/nplinker/genomics/antismash/antismash_downloader.py index 281fa00a..5f0be7cb 100644 --- a/src/nplinker/genomics/antismash/antismash_downloader.py +++ b/src/nplinker/genomics/antismash/antismash_downloader.py @@ -4,6 +4,7 @@ import time import zipfile import httpx +from deprecated import deprecated from bs4 import BeautifulSoup from progress.bar import Bar from nplinker.logconfig import LogConfig @@ -11,7 +12,7 @@ logger = LogConfig.getLogger(__name__) - +# urls to be given to download antismash data ANTISMASH_DB_PAGE_URL = 'https://antismash-db.secondarymetabolites.org/output/{}/' ANTISMASH_DB_DOWNLOAD_URL = 'https://antismash-db.secondarymetabolites.org/output/{}/{}' @@ -322,7 +323,87 @@ def _extract_antismash_zip(antismash_obj, project_file_cache): return True -def download_antismash_data(genome_records, project_download_cache, project_file_cache): +def download_and_extract_antismash_data(item_id, download_root, extract_path): + genome_status = {} + + # this file records genome IDs and local filenames to avoid having to repeat HTTP requests + # each time the app is loaded (this can take a lot of time if there are dozens of genomes) + genome_status_file = os.path.join(download_root, + 'genome_status.txt') + + # genome lookup status info + if os.path.exists(genome_status_file): + with open(genome_status_file) as f: + for line in csv.reader(f): + asobj = GenomeStatus.from_csv(*line) + genome_status[asobj.original_id] = asobj + + # use this to check if the lookup has already been attempted and if + # so if the file is cached locally + if item_id not in genome_status: + genome_status[item_id] = GenomeStatus(item_id, None) + + genome_obj = genome_status[item_id] + + logger.info( + 'Checking for antismash data for genome ID={}'. 
+ format(item_id)) + # first check if file is cached locally + if os.path.exists(genome_obj.filename): + # file already downloaded + logger.info('Genome ID {} already downloaded to {}'.format( + item_id, genome_obj.filename)) + elif genome_obj.attempted: + # lookup attempted previously but failed + logger.info( + 'Genome ID {} skipped due to previous failure'.format( + item_id)) + else: + # if no existing file and no lookup attempted, can start process of + # trying to retrieve the data + + # lookup the ID + logger.info('Beginning lookup process for genome ID {}'.format( + item_id)) + + genome_obj.resolved_id = item_id # TO CHECK (Cunliang) not sure if this is what we want; in a general case, + # I don't think we have different possible ids (as in podp json file, for genome_ID nested dicts), + # so maybe it makes sense to put genome_obj.resolved_id equal to the item_id and only in podp case do the check + # (through _resolve_genome_id_data, was done here before) outside this function. + # If this is true, then I think we need to modify GenomeStatus class attributes logic for original_id and resolved_id, + # which in this way would be the same thing. Then we should modify also the code below, which assumes original_id + # to be eventually different + genome_obj.attempted = True + + if genome_obj.resolved_id is None: + # give up on this one + logger.warning( + f'Failed lookup for genome ID {item_id}') + with open(genome_status_file, 'a+') as f: + f.write(genome_obj.to_csv() + '\n') + + # if we got a refseq ID, now try to download the data from antismash + if _download_antismash_zip(genome_obj, download_root): + logger.info( + 'Genome data successfully downloaded for {}'.format( + item_id)) + else: + logger.warning( + 'Failed to download antiSMASH data for genome ID {} ({})' + .format(genome_obj.resolved_id, + genome_obj.original_id)) + + with open(genome_status_file, 'a+', newline='\n') as f: + f.write(genome_obj.to_csv() + '\n') + + _extract_antismash_zip(genome_obj, extract_path) + + with open(genome_status_file, 'w', newline='\n') as f: + for obj in genome_status.values(): + f.write(obj.to_csv() + '\n') + +@deprecated(version="1.3.3", reason="Use download_and_extract_antismash_data class instead.") +def download_antismash_data(genome_records, project_download_cache, project_file_cache): genome_status = {} # this file records genome IDs and local filenames to avoid having to repeat HTTP requests From 7b07a7bbbb9829f097199465fd17a1bea3113d2a Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Tue, 7 Mar 2023 14:34:49 +0100 Subject: [PATCH 05/16] create podp_antismash_downloader module --- src/nplinker/pairedomics/__init__.py | 1 + src/nplinker/pairedomics/downloader.py | 2 +- .../pairedomics/podp_antismash_downloader.py | 423 ++++++++++++++++++ 3 files changed, 425 insertions(+), 1 deletion(-) create mode 100644 src/nplinker/pairedomics/podp_antismash_downloader.py diff --git a/src/nplinker/pairedomics/__init__.py b/src/nplinker/pairedomics/__init__.py index e69de29b..778793b4 100644 --- a/src/nplinker/pairedomics/__init__.py +++ b/src/nplinker/pairedomics/__init__.py @@ -0,0 +1 @@ +from .podp_antismash_downloader import download_antismash_data diff --git a/src/nplinker/pairedomics/downloader.py b/src/nplinker/pairedomics/downloader.py index cfbc76f9..610a6cad 100644 --- a/src/nplinker/pairedomics/downloader.py +++ b/src/nplinker/pairedomics/downloader.py @@ -26,7 +26,7 @@ from nplinker.strains import Strain from nplinker.strain_collection import StrainCollection from nplinker.genomics.mibig 
import download_and_extract_mibig_metadata -from nplinker.genomics.antismash import download_antismash_data +from . import download_antismash_data logger = LogConfig.getLogger(__name__) diff --git a/src/nplinker/pairedomics/podp_antismash_downloader.py b/src/nplinker/pairedomics/podp_antismash_downloader.py new file mode 100644 index 00000000..76cd56c8 --- /dev/null +++ b/src/nplinker/pairedomics/podp_antismash_downloader.py @@ -0,0 +1,423 @@ +import csv +import os +import re +import time +import zipfile +import httpx +from deprecated import deprecated +from bs4 import BeautifulSoup +from progress.bar import Bar +from nplinker.logconfig import LogConfig + + +logger = LogConfig.getLogger(__name__) + +# urls to be given to download antismash data +ANTISMASH_DB_PAGE_URL = 'https://antismash-db.secondarymetabolites.org/output/{}/' +ANTISMASH_DB_DOWNLOAD_URL = 'https://antismash-db.secondarymetabolites.org/output/{}/{}' + +# The antiSMASH DBV2 is for the availability of the old version, better to keep it. +ANTISMASH_DBV2_PAGE_URL = 'https://antismash-dbv2.secondarymetabolites.org/output/{}/' +ANTISMASH_DBV2_DOWNLOAD_URL = 'https://antismash-dbv2.secondarymetabolites.org/output/{}/{}' + +NCBI_LOOKUP_URL_NEW = 'https://www.ncbi.nlm.nih.gov/assembly/?term={}' + +JGI_GENOME_LOOKUP_URL = 'https://img.jgi.doe.gov/cgi-bin/m/main.cgi?section=TaxonDetail&page=taxonDetail&taxon_oid={}' + +USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0' + +class GenomeStatus: + + def __init__(self, original_id, resolved_id, attempted=False, filename=""): + self.original_id = ';'.join(original_id.split(',')) + self.resolved_id = None if resolved_id == 'None' else resolved_id + self.attempted = True if attempted == 'True' else False + self.filename = filename + + @classmethod + def from_csv(cls, original_id, resolved_id, attempted, filename): + return cls(original_id, resolved_id, attempted, filename) + + def to_csv(self): + return ','.join([ + str(self.original_id), + str(self.resolved_id), + str(self.attempted), self.filename + ]) + +def _get_best_available_genome_id(genome_id_data): + if 'RefSeq_accession' in genome_id_data: + return genome_id_data['RefSeq_accession'] + elif 'GenBank_accession' in genome_id_data: + return genome_id_data['GenBank_accession'] + elif 'JGI_Genome_ID' in genome_id_data: + return genome_id_data['JGI_Genome_ID'] + + logger.warning('No known genome ID field in genome data: {}'.format( + genome_id_data)) + return None + +def _ncbi_genbank_search(genbank_id, retry_time=5.0): + url = NCBI_LOOKUP_URL_NEW.format(genbank_id) + logger.debug('Looking up GenBank data for {} at {}'.format( + genbank_id, url)) + resp = httpx.get(url, follow_redirects=True) + + if resp.status_code == httpx.codes.OK: + # the page should contain a
<dl> element with class "assembly_summary_new". retrieving
+        # the page seems to fail occasionally in the middle of lengthy sequences of genome
+        # lookups, so there might be some throttling going on. this will automatically retry
+        # the lookup if the expected content isn't found the first time
+        soup = BeautifulSoup(resp.content, 'html.parser')
+        # find the <dl> element with class "assembly_summary_new"
+        dl_element = soup.find('dl', {'class': 'assembly_summary_new'})
+        if dl_element is not None:
+            return dl_element
+
+    logger.debug(
+        'NCBI lookup failed, status code {}. Trying again in {} seconds'.
+        format(resp.status_code, retry_time))
+    time.sleep(retry_time)
+    logger.debug('Looking up GenBank data for {} at {}'.format(
+        genbank_id, url))
+    resp = httpx.get(url, follow_redirects=True)
+    if resp.status_code == httpx.codes.OK:
+        soup = BeautifulSoup(resp.content, 'html.parser')
+        # find the <dl>
element with class "assembly_summary_new" + dl_element = soup.find('dl', {'class': 'assembly_summary_new'}) + if dl_element is not None: + return dl_element + + logger.warning( + 'Failed to resolve NCBI genome ID {} at URL {} (after retrying)'. + format(genbank_id, url)) + return None + +def _resolve_genbank_accession(genbank_id): + logger.info( + 'Attempting to resolve Genbank accession {} to RefSeq accession'. + format(genbank_id)) + # genbank id => genbank seq => refseq + + # The GenBank accession can have several formats: + # 1: BAFR00000000.1 + # 2: NZ_BAGG00000000.1 + # 3: NC_016887.1 + # Case 1 is the default. + if '_' in genbank_id: + # case 2 + if len(genbank_id.split('_')[-1].split('.')[0]) == 12: + genbank_id = genbank_id.split('_')[-1] + # case 3 + else: + genbank_id = genbank_id.lower() + + # get rid of any extraneous whitespace + genbank_id = genbank_id.strip() + logger.debug(f'Parsed GenBank ID to "{genbank_id}"') + + # run a search using the GenBank accession ID + try: + dl_element = _ncbi_genbank_search(genbank_id) + if dl_element is None: + raise Exception('Unknown HTML format') + + refseq_idx = -1 + for field_idx, field in enumerate(dl_element.children): + # this is the element immediately preceding the one with + # the actual RefSeq ID we want + if field.getText().strip() == 'RefSeq assembly accession:': + refseq_idx = field_idx + 1 + + # this should be True when we've reached the right element + if field_idx == refseq_idx: + refseq_id = field.getText() + # if it has any spaces, take everything up to first one (some have annotations afterwards) + if refseq_id.find(' ') != -1: + refseq_id = refseq_id[:refseq_id.find(' ')] + + return refseq_id + + if refseq_idx == -1: + raise Exception('Expected HTML elements not found') + except Exception as e: + logger.warning( + 'Failed resolving GenBank accession {}, error {}'.format( + genbank_id, e)) + + return None + +def _resolve_jgi_accession(jgi_id): + url = JGI_GENOME_LOOKUP_URL.format(jgi_id) + logger.info( + 'Attempting to resolve JGI_Genome_ID {} to GenBank accession via {}' + .format(jgi_id, url)) + # no User-Agent header produces a 403 Forbidden error on this site... 
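+    # sending a common desktop browser User-Agent (USER_AGENT above) avoids that 403,
+    # and the explicit 10 second timeout stops a slow JGI response from stalling the
+    # whole download run; a ReadTimeout is caught below and treated as a failed lookup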
+ try: + resp = httpx.get(url, + headers={'User-Agent': USER_AGENT}, + timeout=10.0, + follow_redirects=True) + except httpx.ReadTimeout: + logger.warning( + 'Timed out waiting for result of JGI_Genome_ID lookup') + return None + + soup = BeautifulSoup(resp.content, 'html.parser') + # find the table entry giving the NCBI assembly accession ID + link = soup.find( + 'a', href=re.compile('https://www.ncbi.nlm.nih.gov/nuccore/.*')) + if link is None: + return None + + return _resolve_genbank_accession(link.text) + +def _resolve_genome_id_data(genome_id_data): + if 'RefSeq_accession' in genome_id_data: + # best case, can use this directly + return genome_id_data['RefSeq_accession'] + elif 'GenBank_accession' in genome_id_data: + # resolve via NCBI + return _resolve_genbank_accession( + genome_id_data['GenBank_accession']) + elif 'JGI_Genome_ID' in genome_id_data: + # resolve via JGI => NCBI + return _resolve_jgi_accession(genome_id_data['JGI_Genome_ID']) + + logger.warning( + f'Unable to resolve genome_ID: {genome_id_data}') + return None + +def _get_antismash_db_page(genome_obj): + # want to try up to 4 different links here, v1 and v2 databases, each + # with and without the .1 suffix on the accesssion ID + + accesssions = [genome_obj.resolved_id, genome_obj.resolved_id + '.1'] + for base_url in [ANTISMASH_DB_PAGE_URL, ANTISMASH_DBV2_PAGE_URL]: + for accession in accesssions: + url = base_url.format(accession) + link = None + + logger.info('antismash DB lookup for {} => {}'.format( + accession, url)) + try: + resp = httpx.get(url, follow_redirects=True) + soup = BeautifulSoup(resp.content, 'html.parser') + # retrieve .zip file download link + link = soup.find( + 'a', {'href': lambda url: url.endswith('.zip')}) + except Exception as e: + logger.debug(f'antiSMASH DB page load failed: {e}') + + if link is not None: + logger.info( + 'antiSMASH lookup succeeded! 
Filename is {}'.format( + link['href'])) + # save with the .1 suffix if that worked + genome_obj.resolved_id = accession + return link['href'] + + return None + +def _get_antismash_zip_data(accession_id, filename, local_path): + for base_url in [ + ANTISMASH_DB_DOWNLOAD_URL, ANTISMASH_DBV2_DOWNLOAD_URL + ]: + zipfile_url = base_url.format(accession_id, filename) + with open(local_path, 'wb') as f: + total_bytes = 0 + try: + with httpx.stream('GET', zipfile_url) as r: + if r.status_code == 404: + logger.debug('antiSMASH download URL was a 404') + continue + + logger.info('Downloading from antiSMASH: {}'.format( + zipfile_url)) + filesize = int(r.headers['content-length']) + bar = Bar(filename, + max=filesize, + suffix='%(percent)d%%') + for data in r.iter_bytes(): + f.write(data) + total_bytes += len(data) + bar.next(len(data)) + bar.finish() + except Exception as e: + logger.warning( + f'antiSMASH zip download failed: {e}') + continue + + return True + + return False + +def _download_antismash_zip(antismash_obj, project_download_cache): + # save zip files to avoid having to repeat above lookup every time + local_path = os.path.join(project_download_cache, + f'{antismash_obj.resolved_id}.zip') + logger.debug( + f'Checking for existing antismash zip at {local_path}') + + cached = False + # if the file exists locally + if os.path.exists(local_path): + logger.info(f'Found cached file at {local_path}') + try: + # check if it's a valid zip file, if so treat it as cached + _ = zipfile.ZipFile(local_path) + cached = True + antismash_obj.filename = local_path + except zipfile.BadZipFile as bzf: + # otherwise delete and redownload + logger.info( + 'Invalid antismash zipfile found ({}). Will download again' + .format(bzf)) + os.unlink(local_path) + antismash_obj.filename = "" + + if not cached: + filename = _get_antismash_db_page(antismash_obj) + if filename is None: + return False + + _get_antismash_zip_data(antismash_obj.resolved_id, filename, + local_path) + antismash_obj.filename = local_path + + return True + +def _extract_antismash_zip(antismash_obj, project_file_cache): + if antismash_obj.filename is None or len(antismash_obj.filename) == 0: + return False + + output_path = os.path.join(project_file_cache, 'antismash', + antismash_obj.resolved_id) + exists_already = os.path.exists(output_path) and os.path.exists( + os.path.join(output_path, 'completed')) + + logger.debug( + 'Extracting antismash data to {}, exists_already = {}'.format( + output_path, exists_already)) + if exists_already: + return True + + # create a subfolder for each set of genome data (the zip files used to be + # constructed with path info but that seems to have changed recently) + if not os.path.exists(output_path): + os.makedirs(output_path, exist_ok=True) + + antismash_zip = zipfile.ZipFile(antismash_obj.filename) + kc_prefix1 = f'{antismash_obj.resolved_id}/knownclusterblast' + kc_prefix2 = 'knownclusterblast' + for zip_member in antismash_zip.namelist(): + # TODO other files here? 
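+        # only BGC GenBank/JSON records and knownclusterblast .txt results
+        # (excluding the mibig_hits files) are extracted; everything else in
+        # the archive is skipped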
+ if zip_member.endswith('.gbk') or zip_member.endswith('.json'): + antismash_zip.extract(zip_member, path=output_path) + elif zip_member.startswith(kc_prefix1) or zip_member.startswith( + kc_prefix2): + if zip_member.endswith( + '.txt') and 'mibig_hits' not in zip_member: + antismash_zip.extract(zip_member, path=output_path) + + open(os.path.join(output_path, 'completed'), 'w').close() + + return True + +@deprecated(version="1.3.3", reason="Use download_and_extract_antismash_data class instead.") +def download_antismash_data(genome_records, project_download_cache, project_file_cache): + genome_status = {} + + # this file records genome IDs and local filenames to avoid having to repeat HTTP requests + # each time the app is loaded (this can take a lot of time if there are dozens of genomes) + genome_status_file = os.path.join(project_download_cache, + 'genome_status.txt') + + # genome lookup status info + if os.path.exists(genome_status_file): + with open(genome_status_file) as f: + for line in csv.reader(f): + asobj = GenomeStatus.from_csv(*line) + genome_status[asobj.original_id] = asobj + + for i, genome_record in enumerate(genome_records): + # get the best available ID from the dict + best_id = _get_best_available_genome_id( + genome_record['genome_ID']) + if best_id is None: + logger.warning( + 'Ignoring genome record "{}" due to missing genome ID field' + .format(genome_record)) + continue + + # use this to check if the lookup has already been attempted and if + # so if the file is cached locally + if best_id not in genome_status: + genome_status[best_id] = GenomeStatus(best_id, None) + + genome_obj = genome_status[best_id] + + logger.info( + 'Checking for antismash data {}/{}, current genome ID={}'. + format(i + 1, len(genome_records), best_id)) + # first check if file is cached locally + if os.path.exists(genome_obj.filename): + # file already downloaded + logger.info('Genome ID {} already downloaded to {}'.format( + best_id, genome_obj.filename)) + genome_record['resolved_id'] = genome_obj.resolved_id + elif genome_obj.attempted: + # lookup attempted previously but failed + logger.info( + 'Genome ID {} skipped due to previous failure'.format( + best_id)) + genome_record['resolved_id'] = genome_obj.resolved_id + else: + # if no existing file and no lookup attempted, can start process of + # trying to retrieve the data + + # lookup the ID + logger.info('Beginning lookup process for genome ID {}'.format( + best_id)) + + genome_obj.resolved_id = _resolve_genome_id_data( + genome_record['genome_ID']) + genome_obj.attempted = True + + if genome_obj.resolved_id is None: + # give up on this one + logger.warning( + f'Failed lookup for genome ID {best_id}') + with open(genome_status_file, 'a+') as f: + f.write(genome_obj.to_csv() + '\n') + continue + + # if we got a refseq ID, now try to download the data from antismash + if _download_antismash_zip(genome_obj, project_download_cache): + logger.info( + 'Genome data successfully downloaded for {}'.format( + best_id)) + genome_record['resolved_id'] = genome_obj.resolved_id + else: + logger.warning( + 'Failed to download antiSMASH data for genome ID {} ({})' + .format(genome_obj.resolved_id, + genome_obj.original_id)) + + with open(genome_status_file, 'a+', newline='\n') as f: + f.write(genome_obj.to_csv() + '\n') + + _extract_antismash_zip(genome_obj, project_file_cache) + + missing = len( + [x for x in genome_status.values() if len(x.filename) == 0]) + logger.info( + 'Dataset has {} missing sets of antiSMASH data (from a total of {})' + 
.format(missing, len(genome_records))) + + with open(genome_status_file, 'w', newline='\n') as f: + for obj in genome_status.values(): + f.write(obj.to_csv() + '\n') + + if missing == len(genome_records): + logger.warning('Failed to successfully retrieve ANY genome data!') From 4879181203eb4eeb52aec434fac491ee257c3e29 Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Tue, 7 Mar 2023 18:23:11 +0100 Subject: [PATCH 06/16] properly define download_and_extract_antismash_metadata function --- src/nplinker/genomics/antismash/__init__.py | 2 +- .../antismash/antismash_downloader.py | 516 ++---------------- 2 files changed, 43 insertions(+), 475 deletions(-) diff --git a/src/nplinker/genomics/antismash/__init__.py b/src/nplinker/genomics/antismash/__init__.py index b0cdb0a4..482c9e6d 100644 --- a/src/nplinker/genomics/antismash/__init__.py +++ b/src/nplinker/genomics/antismash/__init__.py @@ -1,7 +1,7 @@ import logging from .antismash_loader import AntismashBGCLoader from .antismash_loader import parse_bgc_genbank -from .antismash_downloader import download_antismash_data +from .antismash_downloader import download_and_extract_antismash_metadata logging.getLogger(__name__).addHandler(logging.NullHandler()) diff --git a/src/nplinker/genomics/antismash/antismash_downloader.py b/src/nplinker/genomics/antismash/antismash_downloader.py index 6d87cb4b..42477e35 100644 --- a/src/nplinker/genomics/antismash/antismash_downloader.py +++ b/src/nplinker/genomics/antismash/antismash_downloader.py @@ -1,14 +1,7 @@ -import csv import os -import re -import time -import zipfile -import httpx -from deprecated import deprecated -from bs4 import BeautifulSoup -from progress.bar import Bar +import shutil from nplinker.logconfig import LogConfig - +from nplinker.utils import download_and_extract_archive, list_dirs, list_files logger = LogConfig.getLogger(__name__) @@ -20,483 +13,58 @@ ANTISMASH_DBV2_PAGE_URL = 'https://antismash-dbv2.secondarymetabolites.org/output/{}/' ANTISMASH_DBV2_DOWNLOAD_URL = 'https://antismash-dbv2.secondarymetabolites.org/output/{}/{}' -NCBI_LOOKUP_URL_NEW = 'https://www.ncbi.nlm.nih.gov/assembly/?term={}' - -JGI_GENOME_LOOKUP_URL = 'https://img.jgi.doe.gov/cgi-bin/m/main.cgi?section=TaxonDetail&page=taxonDetail&taxon_oid={}' - -USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0' - -class GenomeStatus: - - def __init__(self, original_id, resolved_id, attempted=False, filename=""): - self.original_id = ';'.join(original_id.split(',')) - self.resolved_id = None if resolved_id == 'None' else resolved_id - self.attempted = True if attempted == 'True' else False - self.filename = filename - - @classmethod - def from_csv(cls, original_id, resolved_id, attempted, filename): - return cls(original_id, resolved_id, attempted, filename) - - def to_csv(self): - return ','.join([ - str(self.original_id), - str(self.resolved_id), - str(self.attempted), self.filename - ]) - -def _get_best_available_genome_id(genome_id_data): - if 'RefSeq_accession' in genome_id_data: - return genome_id_data['RefSeq_accession'] - elif 'GenBank_accession' in genome_id_data: - return genome_id_data['GenBank_accession'] - elif 'JGI_Genome_ID' in genome_id_data: - return genome_id_data['JGI_Genome_ID'] - - logger.warning('No known genome ID field in genome data: {}'.format( - genome_id_data)) - return None - -def _ncbi_genbank_search(genbank_id, retry_time=5.0): - url = NCBI_LOOKUP_URL_NEW.format(genbank_id) - logger.debug('Looking up GenBank data for {} at {}'.format( - genbank_id, url)) - 
resp = httpx.get(url, follow_redirects=True)
-
-    if resp.status_code == httpx.codes.OK:
-        # the page should contain a <dl> element with class "assembly_summary_new". retrieving
-        # the page seems to fail occasionally in the middle of lengthy sequences of genome
-        # lookups, so there might be some throttling going on. this will automatically retry
-        # the lookup if the expected content isn't found the first time
-        soup = BeautifulSoup(resp.content, 'html.parser')
-        # find the <dl> element with class "assembly_summary_new"
-        dl_element = soup.find('dl', {'class': 'assembly_summary_new'})
-        if dl_element is not None:
-            return dl_element
-
-        logger.debug(
-            'NCBI lookup failed, status code {}. Trying again in {} seconds'.
-            format(resp.status_code, retry_time))
-        time.sleep(retry_time)
-        logger.debug('Looking up GenBank data for {} at {}'.format(
-            genbank_id, url))
-        resp = httpx.get(url, follow_redirects=True)
-        if resp.status_code == httpx.codes.OK:
-            soup = BeautifulSoup(resp.content, 'html.parser')
-            # find the <dl>
element with class "assembly_summary_new" - dl_element = soup.find('dl', {'class': 'assembly_summary_new'}) - if dl_element is not None: - return dl_element - - logger.warning( - 'Failed to resolve NCBI genome ID {} at URL {} (after retrying)'. - format(genbank_id, url)) - return None - -def _resolve_genbank_accession(genbank_id): - logger.info( - 'Attempting to resolve Genbank accession {} to RefSeq accession'. - format(genbank_id)) - # genbank id => genbank seq => refseq - - # The GenBank accession can have several formats: - # 1: BAFR00000000.1 - # 2: NZ_BAGG00000000.1 - # 3: NC_016887.1 - # Case 1 is the default. - if '_' in genbank_id: - # case 2 - if len(genbank_id.split('_')[-1].split('.')[0]) == 12: - genbank_id = genbank_id.split('_')[-1] - # case 3 - else: - genbank_id = genbank_id.lower() - - # get rid of any extraneous whitespace - genbank_id = genbank_id.strip() - logger.debug(f'Parsed GenBank ID to "{genbank_id}"') - - # run a search using the GenBank accession ID - try: - dl_element = _ncbi_genbank_search(genbank_id) - if dl_element is None: - raise Exception('Unknown HTML format') +# TODO: add unit tests (only public func) +# TODO: add doc string +def download_and_extract_antismash_metadata( + refseq_assembly_id: str, + download_root: str, + extract_path: str): + """_summary_ - refseq_idx = -1 - for field_idx, field in enumerate(dl_element.children): - # this is the element immediately preceding the one with - # the actual RefSeq ID we want - if field.getText().strip() == 'RefSeq assembly accession:': - refseq_idx = field_idx + 1 + Args: + refseq_assembly_id(str): _description_ + download_root(str): _description_ + extract_path(str): _description_ - # this should be True when we've reached the right element - if field_idx == refseq_idx: - refseq_id = field.getText() - # if it has any spaces, take everything up to first one (some have annotations afterwards) - if refseq_id.find(' ') != -1: - refseq_id = refseq_id[:refseq_id.find(' ')] + Raises: + ValueError: _description_ + ValueError: _description_ - return refseq_id + Examples: + >>> + """ - if refseq_idx == -1: - raise Exception('Expected HTML elements not found') - except Exception as e: - logger.warning( - 'Failed resolving GenBank accession {}, error {}'.format( - genbank_id, e)) + if not os.path.exists(extract_path): + os.makedirs(extract_path, exist_ok=True) - return None + if download_root == extract_path: + raise ValueError( + "Identical path of download directory and extract directory") -def _resolve_jgi_accession(jgi_id): - url = JGI_GENOME_LOOKUP_URL.format(jgi_id) - logger.info( - 'Attempting to resolve JGI_Genome_ID {} to GenBank accession via {}' - .format(jgi_id, url)) - # no User-Agent header produces a 403 Forbidden error on this site... 
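
A note on the comment above: the IMG/JGI taxon pages reject anonymous clients, so the helper being removed below (the same lookup also lives in src/nplinker/pairedomics/podp_antismash_downloader.py, reformatted later in this series) sends a browser-style User-Agent and a finite timeout. A minimal sketch of that guarded fetch, reusing httpx as elsewhere in this module; the function name and parameters are illustrative only, not part of the patch:

    import httpx

    def fetch_with_user_agent(url: str, user_agent: str, timeout: float = 10.0):
        """Fetch a page that refuses anonymous requests; return None on failure."""
        try:
            # A browser-style User-Agent avoids the 403 response; the timeout keeps
            # one slow lookup from stalling a long run of genome downloads.
            resp = httpx.get(url,
                             headers={'User-Agent': user_agent},
                             timeout=timeout,
                             follow_redirects=True)
            resp.raise_for_status()
            return resp.content
        except (httpx.ReadTimeout, httpx.HTTPStatusError):
            return None
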
- try: - resp = httpx.get(url, - headers={'User-Agent': USER_AGENT}, - timeout=10.0, - follow_redirects=True) - except httpx.ReadTimeout: - logger.warning( - 'Timed out waiting for result of JGI_Genome_ID lookup') - return None + # check if extract_path is empty + files = [i for i in os.listdir(extract_path)] + if len(files) != 0: + raise ValueError(f'Nonempty directory: "{extract_path}"') - soup = BeautifulSoup(resp.content, 'html.parser') - # find the table entry giving the NCBI assembly accession ID - link = soup.find( - 'a', href=re.compile('https://www.ncbi.nlm.nih.gov/nuccore/.*')) - if link is None: - return None - - return _resolve_genbank_accession(link.text) - -def _resolve_genome_id_data(genome_id_data): - if 'RefSeq_accession' in genome_id_data: - # best case, can use this directly - return genome_id_data['RefSeq_accession'] - elif 'GenBank_accession' in genome_id_data: - # resolve via NCBI - return _resolve_genbank_accession( - genome_id_data['GenBank_accession']) - elif 'JGI_Genome_ID' in genome_id_data: - # resolve via JGI => NCBI - return _resolve_jgi_accession(genome_id_data['JGI_Genome_ID']) - - logger.warning( - f'Unable to resolve genome_ID: {genome_id_data}') - return None - -def _get_antismash_db_page(genome_obj): - # want to try up to 4 different links here, v1 and v2 databases, each - # with and without the .1 suffix on the accesssion ID - - accesssions = [genome_obj.resolved_id, genome_obj.resolved_id + '.1'] - for base_url in [ANTISMASH_DB_PAGE_URL, ANTISMASH_DBV2_PAGE_URL]: - for accession in accesssions: - url = base_url.format(accession) - link = None - - logger.info('antismash DB lookup for {} => {}'.format( - accession, url)) - try: - resp = httpx.get(url, follow_redirects=True) - soup = BeautifulSoup(resp.content, 'html.parser') - # retrieve .zip file download link - link = soup.find( - 'a', {'href': lambda url: url.endswith('.zip')}) - except Exception as e: - logger.debug(f'antiSMASH DB page load failed: {e}') - - if link is not None: - logger.info( - 'antiSMASH lookup succeeded! 
Filename is {}'.format( - link['href'])) - # save with the .1 suffix if that worked - genome_obj.resolved_id = accession - return link['href'] - - return None - -def _get_antismash_zip_data(accession_id, filename, local_path): for base_url in [ ANTISMASH_DB_DOWNLOAD_URL, ANTISMASH_DBV2_DOWNLOAD_URL ]: - zipfile_url = base_url.format(accession_id, filename) - with open(local_path, 'wb') as f: - total_bytes = 0 - try: - with httpx.stream('GET', zipfile_url) as r: - if r.status_code == 404: - logger.debug('antiSMASH download URL was a 404') - continue - - logger.info('Downloading from antiSMASH: {}'.format( - zipfile_url)) - filesize = int(r.headers['content-length']) - bar = Bar(filename, - max=filesize, - suffix='%(percent)d%%') - for data in r.iter_bytes(): - f.write(data) - total_bytes += len(data) - bar.next(len(data)) - bar.finish() - except Exception as e: - logger.warning( - f'antiSMASH zip download failed: {e}') - continue - - return True - - return False - -def _download_antismash_zip(antismash_obj, project_download_cache): - # save zip files to avoid having to repeat above lookup every time - local_path = os.path.join(project_download_cache, - f'{antismash_obj.resolved_id}.zip') - logger.debug( - f'Checking for existing antismash zip at {local_path}') - - cached = False - # if the file exists locally - if os.path.exists(local_path): - logger.info(f'Found cached file at {local_path}') - try: - # check if it's a valid zip file, if so treat it as cached - _ = zipfile.ZipFile(local_path) - cached = True - antismash_obj.filename = local_path - except zipfile.BadZipFile as bzf: - # otherwise delete and redownload - logger.info( - 'Invalid antismash zipfile found ({}). Will download again' - .format(bzf)) - os.unlink(local_path) - antismash_obj.filename = "" - - if not cached: - filename = _get_antismash_db_page(antismash_obj) - if filename is None: - return False - - _get_antismash_zip_data(antismash_obj.resolved_id, filename, - local_path) - antismash_obj.filename = local_path - - return True + url = base_url.format(refseq_assembly_id, refseq_assembly_id + '.zip') -def _extract_antismash_zip(antismash_obj, project_file_cache): - if antismash_obj.filename is None or len(antismash_obj.filename) == 0: - return False - - output_path = os.path.join(project_file_cache, 'antismash', - antismash_obj.resolved_id) - exists_already = os.path.exists(output_path) and os.path.exists( - os.path.join(output_path, 'completed')) - - logger.debug( - 'Extracting antismash data to {}, exists_already = {}'.format( - output_path, exists_already)) - if exists_already: - return True - - # create a subfolder for each set of genome data (the zip files used to be - # constructed with path info but that seems to have changed recently) - if not os.path.exists(output_path): - os.makedirs(output_path, exist_ok=True) - - antismash_zip = zipfile.ZipFile(antismash_obj.filename) - kc_prefix1 = f'{antismash_obj.resolved_id}/knownclusterblast' - kc_prefix2 = 'knownclusterblast' - for zip_member in antismash_zip.namelist(): - # TODO other files here? 
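
A note on the loop below: only a subset of each antiSMASH archive is extracted — the .gbk/.json BGC records plus the knownclusterblast .txt tables (skipping mibig_hits files) — which keeps the per-genome output folders small. A standalone sketch of the same filtering with the standard-library zipfile module; the function and parameter names are illustrative only, not part of the patch:

    import os
    import zipfile

    def extract_needed_members(zip_path: str, output_path: str, accession: str) -> None:
        # Keep only BGC records and knownclusterblast tables from the archive.
        kc_prefixes = (f'{accession}/knownclusterblast', 'knownclusterblast')
        with zipfile.ZipFile(zip_path) as archive:
            for member in archive.namelist():
                if member.endswith(('.gbk', '.json')):
                    archive.extract(member, path=output_path)
                elif member.startswith(kc_prefixes):
                    if member.endswith('.txt') and 'mibig_hits' not in member:
                        archive.extract(member, path=output_path)
        # Marker file so a later run can tell extraction already finished for this genome.
        open(os.path.join(output_path, 'completed'), 'w').close()
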
- if zip_member.endswith('.gbk') or zip_member.endswith('.json'): - antismash_zip.extract(zip_member, path=output_path) - elif zip_member.startswith(kc_prefix1) or zip_member.startswith( - kc_prefix2): - if zip_member.endswith( - '.txt') and 'mibig_hits' not in zip_member: - antismash_zip.extract(zip_member, path=output_path) - - open(os.path.join(output_path, 'completed'), 'w').close() - - return True - -def download_and_extract_antismash_data(item_id, download_root, extract_path): - genome_status = {} - - # this file records genome IDs and local filenames to avoid having to repeat HTTP requests - # each time the app is loaded (this can take a lot of time if there are dozens of genomes) - genome_status_file = os.path.join(download_root, - 'genome_status.txt') - - # genome lookup status info - if os.path.exists(genome_status_file): - with open(genome_status_file) as f: - for line in csv.reader(f): - asobj = GenomeStatus.from_csv(*line) - genome_status[asobj.original_id] = asobj - - # use this to check if the lookup has already been attempted and if - # so if the file is cached locally - if item_id not in genome_status: - genome_status[item_id] = GenomeStatus(item_id, None) - - genome_obj = genome_status[item_id] - - logger.info( - 'Checking for antismash data for genome ID={}'. - format(item_id)) - # first check if file is cached locally - if os.path.exists(genome_obj.filename): - # file already downloaded - logger.info('Genome ID {} already downloaded to {}'.format( - item_id, genome_obj.filename)) - elif genome_obj.attempted: - # lookup attempted previously but failed + download_and_extract_archive(url, download_root, extract_path, refseq_assembly_id + '.zip') logger.info( - 'Genome ID {} skipped due to previous failure'.format( - item_id)) - else: - # if no existing file and no lookup attempted, can start process of - # trying to retrieve the data - - # lookup the ID - logger.info('Beginning lookup process for genome ID {}'.format( - item_id)) - - genome_obj.resolved_id = item_id # TO CHECK (Cunliang) not sure if this is what we want; in a general case, - # I don't think we have different possible ids (as in podp json file, for genome_ID nested dicts), - # so maybe it makes sense to put genome_obj.resolved_id equal to the item_id and only in podp case do the check - # (through _resolve_genome_id_data, was done here before) outside this function. - # If this is true, then I think we need to modify GenomeStatus class attributes logic for original_id and resolved_id, - # which in this way would be the same thing. 
Then we should modify also the code below, which assumes original_id - # to be eventually different - genome_obj.attempted = True - - if genome_obj.resolved_id is None: - # give up on this one - logger.warning( - f'Failed lookup for genome ID {item_id}') - with open(genome_status_file, 'a+') as f: - f.write(genome_obj.to_csv() + '\n') - - # if we got a refseq ID, now try to download the data from antismash - if _download_antismash_zip(genome_obj, download_root): - logger.info( - 'Genome data successfully downloaded for {}'.format( - item_id)) - else: - logger.warning( - 'Failed to download antiSMASH data for genome ID {} ({})' - .format(genome_obj.resolved_id, - genome_obj.original_id)) - - with open(genome_status_file, 'a+', newline='\n') as f: - f.write(genome_obj.to_csv() + '\n') - - _extract_antismash_zip(genome_obj, extract_path) - - with open(genome_status_file, 'w', newline='\n') as f: - for obj in genome_status.values(): - f.write(obj.to_csv() + '\n') - -@deprecated(version="1.3.3", reason="Use download_and_extract_antismash_data class instead.") -def download_antismash_data(genome_records, project_download_cache, project_file_cache): - genome_status = {} - - # this file records genome IDs and local filenames to avoid having to repeat HTTP requests - # each time the app is loaded (this can take a lot of time if there are dozens of genomes) - genome_status_file = os.path.join(project_download_cache, - 'genome_status.txt') - - # genome lookup status info - if os.path.exists(genome_status_file): - with open(genome_status_file) as f: - for line in csv.reader(f): - asobj = GenomeStatus.from_csv(*line) - genome_status[asobj.original_id] = asobj - - for i, genome_record in enumerate(genome_records): - # get the best available ID from the dict - best_id = _get_best_available_genome_id( - genome_record['genome_ID']) - if best_id is None: - logger.warning( - 'Ignoring genome record "{}" due to missing genome ID field' - .format(genome_record)) - continue - - # use this to check if the lookup has already been attempted and if - # so if the file is cached locally - if best_id not in genome_status: - genome_status[best_id] = GenomeStatus(best_id, None) - - genome_obj = genome_status[best_id] - - logger.info( - 'Checking for antismash data {}/{}, current genome ID={}'. 
- format(i + 1, len(genome_records), best_id)) - # first check if file is cached locally - if os.path.exists(genome_obj.filename): - # file already downloaded - logger.info('Genome ID {} already downloaded to {}'.format( - best_id, genome_obj.filename)) - genome_record['resolved_id'] = genome_obj.resolved_id - elif genome_obj.attempted: - # lookup attempted previously but failed - logger.info( - 'Genome ID {} skipped due to previous failure'.format( - best_id)) - genome_record['resolved_id'] = genome_obj.resolved_id - else: - # if no existing file and no lookup attempted, can start process of - # trying to retrieve the data - - # lookup the ID - logger.info('Beginning lookup process for genome ID {}'.format( - best_id)) - - genome_obj.resolved_id = _resolve_genome_id_data( - genome_record['genome_ID']) - genome_obj.attempted = True - - if genome_obj.resolved_id is None: - # give up on this one - logger.warning( - f'Failed lookup for genome ID {best_id}') - with open(genome_status_file, 'a+') as f: - f.write(genome_obj.to_csv() + '\n') - continue - - # if we got a refseq ID, now try to download the data from antismash - if _download_antismash_zip(genome_obj, project_download_cache): - logger.info( - 'Genome data successfully downloaded for {}'.format( - best_id)) - genome_record['resolved_id'] = genome_obj.resolved_id - else: - logger.warning( - 'Failed to download antiSMASH data for genome ID {} ({})' - .format(genome_obj.resolved_id, - genome_obj.original_id)) - - with open(genome_status_file, 'a+', newline='\n') as f: - f.write(genome_obj.to_csv() + '\n') - - _extract_antismash_zip(genome_obj, project_file_cache) + f'Genome data successfully extracted for {refseq_assembly_id}') + break - missing = len( - [x for x in genome_status.values() if len(x.filename) == 0]) - logger.info( - 'Dataset has {} missing sets of antiSMASH data (from a total of {})' - .format(missing, len(genome_records))) + # delete subdirs + logger.info('Deleting unnecessary extracted subdirs and files') + subdirs = list_dirs(extract_path) + for subdir_path in subdirs: + shutil.rmtree(subdir_path) - with open(genome_status_file, 'w', newline='\n') as f: - for obj in genome_status.values(): - f.write(obj.to_csv() + '\n') + files_to_keep = list_files(extract_path, suffix=(".json", ".gbk")) - if missing == len(genome_records): - logger.warning('Failed to successfully retrieve ANY genome data!') + for file in list_files(extract_path): + if file not in files_to_keep: + os.remove(file) + logger.info(f'download_and_extract_antismash_metadata process for {refseq_assembly_id} is over') From 392a73625bb45dec0321ff1a9c6f51242080627d Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Wed, 8 Mar 2023 12:17:16 +0100 Subject: [PATCH 07/16] add internal funcs and doc string --- .../antismash/antismash_downloader.py | 51 +++++++++++-------- 1 file changed, 31 insertions(+), 20 deletions(-) diff --git a/src/nplinker/genomics/antismash/antismash_downloader.py b/src/nplinker/genomics/antismash/antismash_downloader.py index 42477e35..c1db5f26 100644 --- a/src/nplinker/genomics/antismash/antismash_downloader.py +++ b/src/nplinker/genomics/antismash/antismash_downloader.py @@ -13,38 +13,49 @@ ANTISMASH_DBV2_PAGE_URL = 'https://antismash-dbv2.secondarymetabolites.org/output/{}/' ANTISMASH_DBV2_DOWNLOAD_URL = 'https://antismash-dbv2.secondarymetabolites.org/output/{}/{}' +def _check_roots(download_root, extract_root): + if download_root == extract_root: + raise ValueError( + "Identical path of download directory and extract directory") + +def 
_check_extract_path(extract_path): + if os.path.exists(extract_path): + # check if extract_path is empty + files = [i for i in os.listdir(extract_path)] + if len(files) != 0: + raise ValueError(f'Nonempty directory: "{extract_path}"') + else: + os.makedirs(extract_path, exist_ok=True) + # TODO: add unit tests (only public func) -# TODO: add doc string def download_and_extract_antismash_metadata( refseq_assembly_id: str, download_root: str, - extract_path: str): - """_summary_ + extract_root: str): + """Download and extract Antismash files for a specified refseq_assembly_id. Args: - refseq_assembly_id(str): _description_ - download_root(str): _description_ - extract_path(str): _description_ + refseq_assembly_id(str): Assembled genomic RefSeq (reference sequence) id. + If the id is versioned (e.g., "GCF_004339725.1") please be sure to + specify the version as well. + download_root(str): Path to the directory to place downloaded archive in. + extract_root(str): Path to the directory data files will be extracted to. + Note that if it will create an antismash/ directory in the specified extract_root, if + it doesn't already exist. + The files will be extracted to /antismash// dir. Raises: - ValueError: _description_ - ValueError: _description_ + ValueError: if download_root and extract_root dirs are the same. + ValueError: if /antismash/ dir is not empty. Examples: - >>> + >>> download_and_extract_antismash_metadata("GCF_004339725.1", "/data/download", "/data/extracted") """ + + extract_path = extract_root + f'/antismash/{refseq_assembly_id}' - if not os.path.exists(extract_path): - os.makedirs(extract_path, exist_ok=True) - - if download_root == extract_path: - raise ValueError( - "Identical path of download directory and extract directory") - - # check if extract_path is empty - files = [i for i in os.listdir(extract_path)] - if len(files) != 0: - raise ValueError(f'Nonempty directory: "{extract_path}"') + _check_roots(download_root, extract_root) + _check_extract_path(extract_path) for base_url in [ ANTISMASH_DB_DOWNLOAD_URL, ANTISMASH_DBV2_DOWNLOAD_URL From 850b7466ccaa7fd810ada3c304e384a8c06e1ece Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Thu, 9 Mar 2023 11:22:02 +0100 Subject: [PATCH 08/16] add tests and create antismash test folder --- tests/genomics/antismash/__init__.py | 0 tests/genomics/antismash/test_downloader.py | 35 +++++++++++++++++++ .../test_loader.py} | 2 +- 3 files changed, 36 insertions(+), 1 deletion(-) create mode 100644 tests/genomics/antismash/__init__.py create mode 100644 tests/genomics/antismash/test_downloader.py rename tests/genomics/{test_antismash_loader.py => antismash/test_loader.py} (99%) diff --git a/tests/genomics/antismash/__init__.py b/tests/genomics/antismash/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/genomics/antismash/test_downloader.py b/tests/genomics/antismash/test_downloader.py new file mode 100644 index 00000000..059029f6 --- /dev/null +++ b/tests/genomics/antismash/test_downloader.py @@ -0,0 +1,35 @@ +import pytest +from nplinker.genomics.antismash import download_and_extract_antismash_metadata + +class TestDownloadAndExtractAntismashData(): + + def test_default(self, tmp_path): + download_root = tmp_path / "download" + extract_root = tmp_path / "extracted" + download_root.mkdir() + extract_root.mkdir() + refseq_assembly_id = "GCF_004339725.1" + download_and_extract_antismash_metadata(refseq_assembly_id, download_root, extract_root) + archive = download_root / "GCF_004339725.1.zip" + extracted_folder = 
extract_root / "antismash/GCF_004339725.1" + assert archive.exists() + assert archive.is_file() + assert extracted_folder.exists() + + def test_error_same_path(self, tmp_path): + refseq_assembly_id = "GCF_004339725.1" + with pytest.raises(ValueError) as e: + download_and_extract_antismash_metadata(refseq_assembly_id, tmp_path, tmp_path) + assert e.value.args[ + 0] == "Identical path of download directory and extract directory" + + def test_error_nonempty_path(self, tmp_path): + refseq_assembly_id = "GCF_004339725.1" + nonempty_path = tmp_path / f"extracted/antismash/{refseq_assembly_id}/subdir" + nonempty_path.mkdir(parents=True) + with pytest.raises(ValueError) as e: + download_and_extract_antismash_metadata( + refseq_assembly_id, + tmp_path, + tmp_path / "extracted") + assert "Nonempty directory" in e.value.args[0] diff --git a/tests/genomics/test_antismash_loader.py b/tests/genomics/antismash/test_loader.py similarity index 99% rename from tests/genomics/test_antismash_loader.py rename to tests/genomics/antismash/test_loader.py index 019feea6..dfe2a25b 100644 --- a/tests/genomics/test_antismash_loader.py +++ b/tests/genomics/antismash/test_loader.py @@ -3,7 +3,7 @@ from nplinker.genomics import BGCLoaderBase from nplinker.genomics.antismash import AntismashBGCLoader from nplinker.genomics.antismash import parse_bgc_genbank -from .. import DATA_DIR +from ... import DATA_DIR class TestAntismashBGCLoader: From e186ba8f5d0b0564f66b913bd30665615d7e3502 Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Thu, 9 Mar 2023 11:23:00 +0100 Subject: [PATCH 09/16] format properly extract_path --- src/nplinker/genomics/antismash/antismash_downloader.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/nplinker/genomics/antismash/antismash_downloader.py b/src/nplinker/genomics/antismash/antismash_downloader.py index c1db5f26..c4936d75 100644 --- a/src/nplinker/genomics/antismash/antismash_downloader.py +++ b/src/nplinker/genomics/antismash/antismash_downloader.py @@ -51,8 +51,7 @@ def download_and_extract_antismash_metadata( Examples: >>> download_and_extract_antismash_metadata("GCF_004339725.1", "/data/download", "/data/extracted") """ - - extract_path = extract_root + f'/antismash/{refseq_assembly_id}' + extract_path = os.path.join(extract_root, "antismash", refseq_assembly_id) _check_roots(download_root, extract_root) _check_extract_path(extract_path) From 61e17fec836ea7cd6e93d5b73b51d73ff50d09fc Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Thu, 9 Mar 2023 11:40:09 +0100 Subject: [PATCH 10/16] run linting and formatting for modified files using yapf --- .../antismash/antismash_downloader.py | 22 ++-- .../pairedomics/podp_antismash_downloader.py | 116 +++++++++--------- tests/genomics/antismash/test_downloader.py | 18 +-- tests/genomics/antismash/test_loader.py | 8 +- 4 files changed, 84 insertions(+), 80 deletions(-) diff --git a/src/nplinker/genomics/antismash/antismash_downloader.py b/src/nplinker/genomics/antismash/antismash_downloader.py index c4936d75..6171a835 100644 --- a/src/nplinker/genomics/antismash/antismash_downloader.py +++ b/src/nplinker/genomics/antismash/antismash_downloader.py @@ -13,11 +13,13 @@ ANTISMASH_DBV2_PAGE_URL = 'https://antismash-dbv2.secondarymetabolites.org/output/{}/' ANTISMASH_DBV2_DOWNLOAD_URL = 'https://antismash-dbv2.secondarymetabolites.org/output/{}/{}' + def _check_roots(download_root, extract_root): if download_root == extract_root: raise ValueError( "Identical path of download directory and extract directory") + def 
_check_extract_path(extract_path): if os.path.exists(extract_path): # check if extract_path is empty @@ -27,11 +29,10 @@ def _check_extract_path(extract_path): else: os.makedirs(extract_path, exist_ok=True) -# TODO: add unit tests (only public func) -def download_and_extract_antismash_metadata( - refseq_assembly_id: str, - download_root: str, - extract_root: str): + +def download_and_extract_antismash_metadata(refseq_assembly_id: str, + download_root: str, + extract_root: str): """Download and extract Antismash files for a specified refseq_assembly_id. Args: @@ -56,12 +57,11 @@ def download_and_extract_antismash_metadata( _check_roots(download_root, extract_root) _check_extract_path(extract_path) - for base_url in [ - ANTISMASH_DB_DOWNLOAD_URL, ANTISMASH_DBV2_DOWNLOAD_URL - ]: + for base_url in [ANTISMASH_DB_DOWNLOAD_URL, ANTISMASH_DBV2_DOWNLOAD_URL]: url = base_url.format(refseq_assembly_id, refseq_assembly_id + '.zip') - download_and_extract_archive(url, download_root, extract_path, refseq_assembly_id + '.zip') + download_and_extract_archive(url, download_root, extract_path, + refseq_assembly_id + '.zip') logger.info( f'Genome data successfully extracted for {refseq_assembly_id}') break @@ -77,4 +77,6 @@ def download_and_extract_antismash_metadata( for file in list_files(extract_path): if file not in files_to_keep: os.remove(file) - logger.info(f'download_and_extract_antismash_metadata process for {refseq_assembly_id} is over') + logger.info( + f'download_and_extract_antismash_metadata process for {refseq_assembly_id} is over' + ) diff --git a/src/nplinker/pairedomics/podp_antismash_downloader.py b/src/nplinker/pairedomics/podp_antismash_downloader.py index 76cd56c8..30f8ef58 100644 --- a/src/nplinker/pairedomics/podp_antismash_downloader.py +++ b/src/nplinker/pairedomics/podp_antismash_downloader.py @@ -9,7 +9,6 @@ from progress.bar import Bar from nplinker.logconfig import LogConfig - logger = LogConfig.getLogger(__name__) # urls to be given to download antismash data @@ -26,6 +25,7 @@ USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0' + class GenomeStatus: def __init__(self, original_id, resolved_id, attempted=False, filename=""): @@ -45,6 +45,7 @@ def to_csv(self): str(self.attempted), self.filename ]) + def _get_best_available_genome_id(genome_id_data): if 'RefSeq_accession' in genome_id_data: return genome_id_data['RefSeq_accession'] @@ -53,10 +54,11 @@ def _get_best_available_genome_id(genome_id_data): elif 'JGI_Genome_ID' in genome_id_data: return genome_id_data['JGI_Genome_ID'] - logger.warning('No known genome ID field in genome data: {}'.format( - genome_id_data)) + logger.warning( + 'No known genome ID field in genome data: {}'.format(genome_id_data)) return None + def _ncbi_genbank_search(genbank_id, retry_time=5.0): url = NCBI_LOOKUP_URL_NEW.format(genbank_id) logger.debug('Looking up GenBank data for {} at {}'.format( @@ -93,6 +95,7 @@ def _ncbi_genbank_search(genbank_id, retry_time=5.0): format(genbank_id, url)) return None + def _resolve_genbank_accession(genbank_id): logger.info( 'Attempting to resolve Genbank accession {} to RefSeq accession'. @@ -147,20 +150,20 @@ def _resolve_genbank_accession(genbank_id): return None + def _resolve_jgi_accession(jgi_id): url = JGI_GENOME_LOOKUP_URL.format(jgi_id) logger.info( - 'Attempting to resolve JGI_Genome_ID {} to GenBank accession via {}' - .format(jgi_id, url)) + 'Attempting to resolve JGI_Genome_ID {} to GenBank accession via {}'. 
+ format(jgi_id, url)) # no User-Agent header produces a 403 Forbidden error on this site... try: resp = httpx.get(url, - headers={'User-Agent': USER_AGENT}, - timeout=10.0, - follow_redirects=True) + headers={'User-Agent': USER_AGENT}, + timeout=10.0, + follow_redirects=True) except httpx.ReadTimeout: - logger.warning( - 'Timed out waiting for result of JGI_Genome_ID lookup') + logger.warning('Timed out waiting for result of JGI_Genome_ID lookup') return None soup = BeautifulSoup(resp.content, 'html.parser') @@ -172,22 +175,22 @@ def _resolve_jgi_accession(jgi_id): return _resolve_genbank_accession(link.text) + def _resolve_genome_id_data(genome_id_data): if 'RefSeq_accession' in genome_id_data: # best case, can use this directly return genome_id_data['RefSeq_accession'] elif 'GenBank_accession' in genome_id_data: # resolve via NCBI - return _resolve_genbank_accession( - genome_id_data['GenBank_accession']) + return _resolve_genbank_accession(genome_id_data['GenBank_accession']) elif 'JGI_Genome_ID' in genome_id_data: # resolve via JGI => NCBI return _resolve_jgi_accession(genome_id_data['JGI_Genome_ID']) - logger.warning( - f'Unable to resolve genome_ID: {genome_id_data}') + logger.warning(f'Unable to resolve genome_ID: {genome_id_data}') return None + def _get_antismash_db_page(genome_obj): # want to try up to 4 different links here, v1 and v2 databases, each # with and without the .1 suffix on the accesssion ID @@ -204,8 +207,8 @@ def _get_antismash_db_page(genome_obj): resp = httpx.get(url, follow_redirects=True) soup = BeautifulSoup(resp.content, 'html.parser') # retrieve .zip file download link - link = soup.find( - 'a', {'href': lambda url: url.endswith('.zip')}) + link = soup.find('a', + {'href': lambda url: url.endswith('.zip')}) except Exception as e: logger.debug(f'antiSMASH DB page load failed: {e}') @@ -219,10 +222,9 @@ def _get_antismash_db_page(genome_obj): return None + def _get_antismash_zip_data(accession_id, filename, local_path): - for base_url in [ - ANTISMASH_DB_DOWNLOAD_URL, ANTISMASH_DBV2_DOWNLOAD_URL - ]: + for base_url in [ANTISMASH_DB_DOWNLOAD_URL, ANTISMASH_DBV2_DOWNLOAD_URL]: zipfile_url = base_url.format(accession_id, filename) with open(local_path, 'wb') as f: total_bytes = 0 @@ -232,32 +234,29 @@ def _get_antismash_zip_data(accession_id, filename, local_path): logger.debug('antiSMASH download URL was a 404') continue - logger.info('Downloading from antiSMASH: {}'.format( - zipfile_url)) + logger.info( + 'Downloading from antiSMASH: {}'.format(zipfile_url)) filesize = int(r.headers['content-length']) - bar = Bar(filename, - max=filesize, - suffix='%(percent)d%%') + bar = Bar(filename, max=filesize, suffix='%(percent)d%%') for data in r.iter_bytes(): f.write(data) total_bytes += len(data) bar.next(len(data)) bar.finish() except Exception as e: - logger.warning( - f'antiSMASH zip download failed: {e}') + logger.warning(f'antiSMASH zip download failed: {e}') continue return True return False + def _download_antismash_zip(antismash_obj, project_download_cache): # save zip files to avoid having to repeat above lookup every time local_path = os.path.join(project_download_cache, - f'{antismash_obj.resolved_id}.zip') - logger.debug( - f'Checking for existing antismash zip at {local_path}') + f'{antismash_obj.resolved_id}.zip') + logger.debug(f'Checking for existing antismash zip at {local_path}') cached = False # if the file exists locally @@ -271,8 +270,8 @@ def _download_antismash_zip(antismash_obj, project_download_cache): except zipfile.BadZipFile as bzf: # 
otherwise delete and redownload logger.info( - 'Invalid antismash zipfile found ({}). Will download again' - .format(bzf)) + 'Invalid antismash zipfile found ({}). Will download again'. + format(bzf)) os.unlink(local_path) antismash_obj.filename = "" @@ -282,23 +281,23 @@ def _download_antismash_zip(antismash_obj, project_download_cache): return False _get_antismash_zip_data(antismash_obj.resolved_id, filename, - local_path) + local_path) antismash_obj.filename = local_path return True + def _extract_antismash_zip(antismash_obj, project_file_cache): if antismash_obj.filename is None or len(antismash_obj.filename) == 0: return False output_path = os.path.join(project_file_cache, 'antismash', - antismash_obj.resolved_id) + antismash_obj.resolved_id) exists_already = os.path.exists(output_path) and os.path.exists( os.path.join(output_path, 'completed')) - logger.debug( - 'Extracting antismash data to {}, exists_already = {}'.format( - output_path, exists_already)) + logger.debug('Extracting antismash data to {}, exists_already = {}'.format( + output_path, exists_already)) if exists_already: return True @@ -316,22 +315,24 @@ def _extract_antismash_zip(antismash_obj, project_file_cache): antismash_zip.extract(zip_member, path=output_path) elif zip_member.startswith(kc_prefix1) or zip_member.startswith( kc_prefix2): - if zip_member.endswith( - '.txt') and 'mibig_hits' not in zip_member: + if zip_member.endswith('.txt') and 'mibig_hits' not in zip_member: antismash_zip.extract(zip_member, path=output_path) open(os.path.join(output_path, 'completed'), 'w').close() return True -@deprecated(version="1.3.3", reason="Use download_and_extract_antismash_data class instead.") -def download_antismash_data(genome_records, project_download_cache, project_file_cache): + +@deprecated(version="1.3.3", + reason="Use download_and_extract_antismash_data class instead.") +def download_antismash_data(genome_records, project_download_cache, + project_file_cache): genome_status = {} # this file records genome IDs and local filenames to avoid having to repeat HTTP requests # each time the app is loaded (this can take a lot of time if there are dozens of genomes) genome_status_file = os.path.join(project_download_cache, - 'genome_status.txt') + 'genome_status.txt') # genome lookup status info if os.path.exists(genome_status_file): @@ -342,12 +343,11 @@ def download_antismash_data(genome_records, project_download_cache, project_file for i, genome_record in enumerate(genome_records): # get the best available ID from the dict - best_id = _get_best_available_genome_id( - genome_record['genome_ID']) + best_id = _get_best_available_genome_id(genome_record['genome_ID']) if best_id is None: logger.warning( - 'Ignoring genome record "{}" due to missing genome ID field' - .format(genome_record)) + 'Ignoring genome record "{}" due to missing genome ID field'. + format(genome_record)) continue # use this to check if the lookup has already been attempted and if @@ -358,8 +358,8 @@ def download_antismash_data(genome_records, project_download_cache, project_file genome_obj = genome_status[best_id] logger.info( - 'Checking for antismash data {}/{}, current genome ID={}'. 
- format(i + 1, len(genome_records), best_id)) + 'Checking for antismash data {}/{}, current genome ID={}'.format( + i + 1, len(genome_records), best_id)) # first check if file is cached locally if os.path.exists(genome_obj.filename): # file already downloaded @@ -369,16 +369,15 @@ def download_antismash_data(genome_records, project_download_cache, project_file elif genome_obj.attempted: # lookup attempted previously but failed logger.info( - 'Genome ID {} skipped due to previous failure'.format( - best_id)) + 'Genome ID {} skipped due to previous failure'.format(best_id)) genome_record['resolved_id'] = genome_obj.resolved_id else: # if no existing file and no lookup attempted, can start process of # trying to retrieve the data # lookup the ID - logger.info('Beginning lookup process for genome ID {}'.format( - best_id)) + logger.info( + 'Beginning lookup process for genome ID {}'.format(best_id)) genome_obj.resolved_id = _resolve_genome_id_data( genome_record['genome_ID']) @@ -386,8 +385,7 @@ def download_antismash_data(genome_records, project_download_cache, project_file if genome_obj.resolved_id is None: # give up on this one - logger.warning( - f'Failed lookup for genome ID {best_id}') + logger.warning(f'Failed lookup for genome ID {best_id}') with open(genome_status_file, 'a+') as f: f.write(genome_obj.to_csv() + '\n') continue @@ -400,20 +398,18 @@ def download_antismash_data(genome_records, project_download_cache, project_file genome_record['resolved_id'] = genome_obj.resolved_id else: logger.warning( - 'Failed to download antiSMASH data for genome ID {} ({})' - .format(genome_obj.resolved_id, - genome_obj.original_id)) + 'Failed to download antiSMASH data for genome ID {} ({})'. + format(genome_obj.resolved_id, genome_obj.original_id)) with open(genome_status_file, 'a+', newline='\n') as f: f.write(genome_obj.to_csv() + '\n') _extract_antismash_zip(genome_obj, project_file_cache) - missing = len( - [x for x in genome_status.values() if len(x.filename) == 0]) + missing = len([x for x in genome_status.values() if len(x.filename) == 0]) logger.info( - 'Dataset has {} missing sets of antiSMASH data (from a total of {})' - .format(missing, len(genome_records))) + 'Dataset has {} missing sets of antiSMASH data (from a total of {})'. 
+ format(missing, len(genome_records))) with open(genome_status_file, 'w', newline='\n') as f: for obj in genome_status.values(): diff --git a/tests/genomics/antismash/test_downloader.py b/tests/genomics/antismash/test_downloader.py index 059029f6..631d4d0c 100644 --- a/tests/genomics/antismash/test_downloader.py +++ b/tests/genomics/antismash/test_downloader.py @@ -1,15 +1,17 @@ import pytest from nplinker.genomics.antismash import download_and_extract_antismash_metadata + class TestDownloadAndExtractAntismashData(): def test_default(self, tmp_path): + refseq_assembly_id = "GCF_004339725.1" download_root = tmp_path / "download" - extract_root = tmp_path / "extracted" download_root.mkdir() + extract_root = tmp_path / "extracted" extract_root.mkdir() - refseq_assembly_id = "GCF_004339725.1" - download_and_extract_antismash_metadata(refseq_assembly_id, download_root, extract_root) + download_and_extract_antismash_metadata(refseq_assembly_id, + download_root, extract_root) archive = download_root / "GCF_004339725.1.zip" extracted_folder = extract_root / "antismash/GCF_004339725.1" assert archive.exists() @@ -19,7 +21,8 @@ def test_default(self, tmp_path): def test_error_same_path(self, tmp_path): refseq_assembly_id = "GCF_004339725.1" with pytest.raises(ValueError) as e: - download_and_extract_antismash_metadata(refseq_assembly_id, tmp_path, tmp_path) + download_and_extract_antismash_metadata(refseq_assembly_id, + tmp_path, tmp_path) assert e.value.args[ 0] == "Identical path of download directory and extract directory" @@ -28,8 +31,7 @@ def test_error_nonempty_path(self, tmp_path): nonempty_path = tmp_path / f"extracted/antismash/{refseq_assembly_id}/subdir" nonempty_path.mkdir(parents=True) with pytest.raises(ValueError) as e: - download_and_extract_antismash_metadata( - refseq_assembly_id, - tmp_path, - tmp_path / "extracted") + download_and_extract_antismash_metadata(refseq_assembly_id, + tmp_path, + tmp_path / "extracted") assert "Nonempty directory" in e.value.args[0] diff --git a/tests/genomics/antismash/test_loader.py b/tests/genomics/antismash/test_loader.py index dfe2a25b..ca356f58 100644 --- a/tests/genomics/antismash/test_loader.py +++ b/tests/genomics/antismash/test_loader.py @@ -63,10 +63,14 @@ def test_parse_bgc_genbank(): assert bgc.antismash_id == "NZ_AZWB01000005" assert bgc.antismash_file == gbk_file assert bgc.antismash_region == "1" - assert bgc.smiles == ["NC([*])C(=O)NC([*])C(=O)NC(CO)C(=O)NC(Cc1ccccc1)C(=O)NCC(=O)O"] + assert bgc.smiles == [ + "NC([*])C(=O)NC([*])C(=O)NC(CO)C(=O)NC(Cc1ccccc1)C(=O)NCC(=O)O" + ] + def test_parse_bgc_genbank_error(): gbk_file = str(DATA_DIR / "fake_antismash.region001.gbk") with pytest.raises(ValueError) as e: parse_bgc_genbank(gbk_file) - assert "Not found product prediction in antiSMASH Genbank file" in e.value.args[0] + assert "Not found product prediction in antiSMASH Genbank file" in e.value.args[ + 0] From 601957d81f1a4ce5e2fdc0a87d39e0524a565536 Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Thu, 9 Mar 2023 13:26:00 +0100 Subject: [PATCH 11/16] fix prospector errors --- src/nplinker/genomics/antismash/__init__.py | 2 +- src/nplinker/genomics/antismash/antismash_downloader.py | 6 +++--- src/nplinker/genomics/antismash/antismash_loader.py | 3 +-- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/src/nplinker/genomics/antismash/__init__.py b/src/nplinker/genomics/antismash/__init__.py index 482c9e6d..b7b7714f 100644 --- a/src/nplinker/genomics/antismash/__init__.py +++ b/src/nplinker/genomics/antismash/__init__.py @@ -5,4 
+5,4 @@ logging.getLogger(__name__).addHandler(logging.NullHandler()) -__all__ = ["AntismashBGCLoader", "parse_bgc_genbank"] +__all__ = ["AntismashBGCLoader", "parse_bgc_genbank", "download_and_extract_antismash_metadata"] diff --git a/src/nplinker/genomics/antismash/antismash_downloader.py b/src/nplinker/genomics/antismash/antismash_downloader.py index 6171a835..414825ec 100644 --- a/src/nplinker/genomics/antismash/antismash_downloader.py +++ b/src/nplinker/genomics/antismash/antismash_downloader.py @@ -23,7 +23,7 @@ def _check_roots(download_root, extract_root): def _check_extract_path(extract_path): if os.path.exists(extract_path): # check if extract_path is empty - files = [i for i in os.listdir(extract_path)] + files = list(os.listdir(extract_path)) if len(files) != 0: raise ValueError(f'Nonempty directory: "{extract_path}"') else: @@ -63,7 +63,7 @@ def download_and_extract_antismash_metadata(refseq_assembly_id: str, download_and_extract_archive(url, download_root, extract_path, refseq_assembly_id + '.zip') logger.info( - f'Genome data successfully extracted for {refseq_assembly_id}') + 'Genome data successfully extracted for %s', refseq_assembly_id) break # delete subdirs @@ -78,5 +78,5 @@ def download_and_extract_antismash_metadata(refseq_assembly_id: str, if file not in files_to_keep: os.remove(file) logger.info( - f'download_and_extract_antismash_metadata process for {refseq_assembly_id} is over' + 'download_and_extract_antismash_metadata process for %s is over', refseq_assembly_id ) diff --git a/src/nplinker/genomics/antismash/antismash_loader.py b/src/nplinker/genomics/antismash/antismash_loader.py index b1d3e7d4..156812a1 100644 --- a/src/nplinker/genomics/antismash/antismash_loader.py +++ b/src/nplinker/genomics/antismash/antismash_loader.py @@ -126,8 +126,7 @@ def parse_bgc_genbank(file: str) -> BGC: product_prediction = features.get("product") if product_prediction is None: raise ValueError( - "Not found product prediction in antiSMASH Genbank file {}".format( - file)) + f"Not found product prediction in antiSMASH Genbank file {file}") # init BGC bgc = BGC(bgc_id=fname, product_prediction=product_prediction) From e9fbe999f0179ac7f035a712090b483cc64e5984 Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Thu, 9 Mar 2023 13:33:35 +0100 Subject: [PATCH 12/16] make refseq_assembly_id class variable for tests --- tests/genomics/antismash/test_downloader.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/tests/genomics/antismash/test_downloader.py b/tests/genomics/antismash/test_downloader.py index 631d4d0c..d646d42f 100644 --- a/tests/genomics/antismash/test_downloader.py +++ b/tests/genomics/antismash/test_downloader.py @@ -4,13 +4,14 @@ class TestDownloadAndExtractAntismashData(): + refseq_assembly_id = "GCF_004339725.1" + def test_default(self, tmp_path): - refseq_assembly_id = "GCF_004339725.1" download_root = tmp_path / "download" download_root.mkdir() extract_root = tmp_path / "extracted" extract_root.mkdir() - download_and_extract_antismash_metadata(refseq_assembly_id, + download_and_extract_antismash_metadata(self.refseq_assembly_id, download_root, extract_root) archive = download_root / "GCF_004339725.1.zip" extracted_folder = extract_root / "antismash/GCF_004339725.1" @@ -19,19 +20,17 @@ def test_default(self, tmp_path): assert extracted_folder.exists() def test_error_same_path(self, tmp_path): - refseq_assembly_id = "GCF_004339725.1" with pytest.raises(ValueError) as e: - download_and_extract_antismash_metadata(refseq_assembly_id, + 
download_and_extract_antismash_metadata(self.refseq_assembly_id, tmp_path, tmp_path) assert e.value.args[ 0] == "Identical path of download directory and extract directory" def test_error_nonempty_path(self, tmp_path): - refseq_assembly_id = "GCF_004339725.1" - nonempty_path = tmp_path / f"extracted/antismash/{refseq_assembly_id}/subdir" + nonempty_path = tmp_path / f"extracted/antismash/{self.refseq_assembly_id}/subdir" nonempty_path.mkdir(parents=True) with pytest.raises(ValueError) as e: - download_and_extract_antismash_metadata(refseq_assembly_id, + download_and_extract_antismash_metadata(self.refseq_assembly_id, tmp_path, tmp_path / "extracted") assert "Nonempty directory" in e.value.args[0] From 9a22c06f4448718f51f5f6b5cfbafa0bb322560b Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Thu, 9 Mar 2023 13:35:37 +0100 Subject: [PATCH 13/16] reorder imports --- src/nplinker/genomics/antismash/__init__.py | 3 ++- src/nplinker/genomics/antismash/antismash_downloader.py | 5 ++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/nplinker/genomics/antismash/__init__.py b/src/nplinker/genomics/antismash/__init__.py index b7b7714f..411a58ca 100644 --- a/src/nplinker/genomics/antismash/__init__.py +++ b/src/nplinker/genomics/antismash/__init__.py @@ -1,7 +1,8 @@ import logging +from .antismash_downloader import download_and_extract_antismash_metadata from .antismash_loader import AntismashBGCLoader from .antismash_loader import parse_bgc_genbank -from .antismash_downloader import download_and_extract_antismash_metadata + logging.getLogger(__name__).addHandler(logging.NullHandler()) diff --git a/src/nplinker/genomics/antismash/antismash_downloader.py b/src/nplinker/genomics/antismash/antismash_downloader.py index 414825ec..99a22205 100644 --- a/src/nplinker/genomics/antismash/antismash_downloader.py +++ b/src/nplinker/genomics/antismash/antismash_downloader.py @@ -1,7 +1,10 @@ import os import shutil from nplinker.logconfig import LogConfig -from nplinker.utils import download_and_extract_archive, list_dirs, list_files +from nplinker.utils import download_and_extract_archive +from nplinker.utils import list_dirs +from nplinker.utils import list_files + logger = LogConfig.getLogger(__name__) From decf286a8fccc119f4f178db59add40dae174a8a Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Thu, 9 Mar 2023 13:39:01 +0100 Subject: [PATCH 14/16] add minor static typing --- src/nplinker/genomics/antismash/antismash_downloader.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/nplinker/genomics/antismash/antismash_downloader.py b/src/nplinker/genomics/antismash/antismash_downloader.py index 99a22205..39141b0a 100644 --- a/src/nplinker/genomics/antismash/antismash_downloader.py +++ b/src/nplinker/genomics/antismash/antismash_downloader.py @@ -17,13 +17,13 @@ ANTISMASH_DBV2_DOWNLOAD_URL = 'https://antismash-dbv2.secondarymetabolites.org/output/{}/{}' -def _check_roots(download_root, extract_root): +def _check_roots(download_root: str, extract_root: str): if download_root == extract_root: raise ValueError( "Identical path of download directory and extract directory") -def _check_extract_path(extract_path): +def _check_extract_path(extract_path: str): if os.path.exists(extract_path): # check if extract_path is empty files = list(os.listdir(extract_path)) From 0c5b6bd6c8505b1f041fbbf56b63eae4f7d413ed Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Wed, 15 Mar 2023 12:47:09 +0100 Subject: [PATCH 15/16] Revert "Merge branch 'dev' into 98_add_antismash_downloader_gcroci2" This reverts 
commit f64a17c41716c00867c39e0655059a34bdcdf5a9, reversing changes made to 4879181203eb4eeb52aec434fac491ee257c3e29. revert merge --- .vscode/settings.json | 6 +- src/nplinker/loader.py | 33 ++++----- .../metabolomics/gnps/gnps_extractor.py | 7 +- src/nplinker/metabolomics/gnps/gnps_loader.py | 3 - src/nplinker/metabolomics/load_gnps.py | 12 ++-- src/nplinker/metabolomics/metabolomics.py | 23 +++--- src/nplinker/scoring/linking/data_linking.py | 4 +- src/nplinker/strains.py | 3 - tests/test_gnps_loader.py | 6 -- tests/test_loader.py | 71 ++----------------- 10 files changed, 43 insertions(+), 125 deletions(-) delete mode 100644 src/nplinker/metabolomics/gnps/gnps_loader.py delete mode 100644 tests/test_gnps_loader.py diff --git a/.vscode/settings.json b/.vscode/settings.json index 30cb3f3f..7411789c 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -20,8 +20,10 @@ // python docstring "autoDocstring.docstringFormat": "google", "autoDocstring.customTemplatePath": ".vscode/vscode_docstring_google_adapted.mustache", + + // Jupyter + "jupyter.sendSelectionToInteractiveWindow": true, "jupyter.askForKernelRestart": false, "notebook.lineNumbers": "on", - "notebook.diff.ignoreMetadata": true, - "jupyter.interactiveWindow.textEditor.executeSelection": true + "notebook.diff.ignoreMetadata": true } diff --git a/src/nplinker/loader.py b/src/nplinker/loader.py index cd3a5fb7..225c0ca8 100644 --- a/src/nplinker/loader.py +++ b/src/nplinker/loader.py @@ -28,8 +28,7 @@ from nplinker.pairedomics.downloader import Downloader from nplinker.pairedomics.runbigscape import run_bigscape from nplinker.strain_collection import StrainCollection -from nplinker.genomics.antismash import AntismashBGCLoader -from pathlib import Path + try: from importlib.resources import files @@ -245,23 +244,19 @@ def _init_metabolomics_paths(self): optional=True) # 8. MET: /DB_result/*.tsv (new) or /result_specnets_DB/*.tsv (old) / annotations_dir= - if Path.is_file(Path(self._root) / "annotations.tsv"): - self.annotations_dir = str(self._root) - self.annotations_config_file = os.path.join(self._root, "annotations.tsv") - else: - self.annotations_dir = self._overrides.get( - self.OR_ANNO) or find_via_glob_alts_dir( - [ - os.path.join(self._root, 'DB_result'), - os.path.join(self._root, 'result_specnets_DB'), - ], - self.OR_ANNO, - optional=False - ) - if self.annotations_dir is not None: - self.annotations_config_file = self._overrides.get( - self.OR_ANNO_CONFIG) or os.path.join(self.annotations_dir, - 'annotations.tsv') + self.annotations_dir = self._overrides.get( + self.OR_ANNO) or find_via_glob_alts_dir( + [ + os.path.join(self._root, 'DB_result'), + os.path.join(self._root, 'result_specnets_DB'), + ], + self.OR_ANNO, + optional=False + ) + if self.annotations_dir is not None: + self.annotations_config_file = self._overrides.get( + self.OR_ANNO_CONFIG) or os.path.join(self.annotations_dir, + 'annotations.tsv') def _init_genomics_paths(self): # 9. 
GEN: /antismash / antismash_dir= diff --git a/src/nplinker/metabolomics/gnps/gnps_extractor.py b/src/nplinker/metabolomics/gnps/gnps_extractor.py index 916c888e..5789d61c 100644 --- a/src/nplinker/metabolomics/gnps/gnps_extractor.py +++ b/src/nplinker/metabolomics/gnps/gnps_extractor.py @@ -1,7 +1,6 @@ from os import PathLike import os from pathlib import Path -import shutil import zipfile from nplinker import utils @@ -56,7 +55,7 @@ def _extract_spectra(self): def _extract_molecular_families(self): """Helper function to extract the molecular families file from the archive.""" prefix = "networkedges_selfloop" - suffix = ".selfloop" if self._is_fbmn else ".pairsinfo" + suffix = "..selfloop" if self._is_fbmn else ".pairsinfo" utils.extract_file_matching_pattern( self.get_data(), prefix, @@ -89,5 +88,5 @@ def _extract_annotations(self): self._extract_path, "annotations.tsv" ) - - shutil.rmtree(self._extract_path / prefix) + if self._is_fbmn: + os.rmdir(self._extract_path / prefix) \ No newline at end of file diff --git a/src/nplinker/metabolomics/gnps/gnps_loader.py b/src/nplinker/metabolomics/gnps/gnps_loader.py deleted file mode 100644 index 2ef0d7a9..00000000 --- a/src/nplinker/metabolomics/gnps/gnps_loader.py +++ /dev/null @@ -1,3 +0,0 @@ -class GNPSLoader: - def __init__(self): - pass \ No newline at end of file diff --git a/src/nplinker/metabolomics/load_gnps.py b/src/nplinker/metabolomics/load_gnps.py index 0266df56..d3855868 100644 --- a/src/nplinker/metabolomics/load_gnps.py +++ b/src/nplinker/metabolomics/load_gnps.py @@ -229,13 +229,13 @@ def _load_clusterinfo_old(gnps_format: str, strains: StrainCollection, file: str unknown_strains[mzxml] += 1 # else: # print('{} ===> {}'.format(mzxml, strain)) - if clu_index in spec_dict: - if strain is not None: - # TODO this need fixed somehow (missing growth medium info) - spec_dict[clu_index].add_strain(strain, None, 1) - # update metadata on Spectrum object - spec_dict[clu_index].metadata.update(metadata) + if strain is not None: + # TODO this need fixed somehow (missing growth medium info) + spec_dict[clu_index].add_strain(strain, None, 1) + + # update metadata on Spectrum object + spec_dict[clu_index].metadata.update(metadata) if len(unknown_strains) > 0: logger.warning( diff --git a/src/nplinker/metabolomics/metabolomics.py b/src/nplinker/metabolomics/metabolomics.py index 1c4cde5d..15257ef2 100644 --- a/src/nplinker/metabolomics/metabolomics.py +++ b/src/nplinker/metabolomics/metabolomics.py @@ -84,18 +84,17 @@ def load_edges(edges_file: str, spec_dict: dict[int, Spectrum]): cosine = float(line[cos_index]) family = int(line[fam_index]) - if spec1_id in spec_dict and spec2_id in spec_dict: - spec1 = spec_dict[spec1_id] - spec2 = spec_dict[spec2_id] - - if family != -1: # singletons - spec1.family_id = family - spec2.family_id = family - - spec1.edges.append((spec2.id, spec2.spectrum_id, cosine)) - spec2.edges.append((spec1.id, spec1.spectrum_id, cosine)) - else: - spec1.family_id = family + spec1 = spec_dict[spec1_id] + spec2 = spec_dict[spec2_id] + + if family != -1: # singletons + spec1.family_id = family + spec2.family_id = family + + spec1.edges.append((spec2.id, spec2.spectrum_id, cosine)) + spec2.edges.append((spec1.id, spec1.spectrum_id, cosine)) + else: + spec1.family_id = family @deprecated(version="1.3.3", reason="Use the GNPSLoader class instead.") diff --git a/src/nplinker/scoring/linking/data_linking.py b/src/nplinker/scoring/linking/data_linking.py index a6f181f0..a8c3f3c9 100644 --- 
a/src/nplinker/scoring/linking/data_linking.py +++ b/src/nplinker/scoring/linking/data_linking.py @@ -341,9 +341,7 @@ def common_strains(self, objects_a, objects_b, filter_no_shared=False): objects_b = [objects_b] # retrieve object IDs - # TODO: issue #103 stop using gcf.id, but note that the ids_b should be - # a list of int - ids_b = [gcf.id for gcf in objects_b] + ids_b = [gcf.gcf_id for gcf in objects_b] # these might be MolFams or Spectra, either way they'll have a .id attribute ids_a = [obj.id for obj in objects_a] diff --git a/src/nplinker/strains.py b/src/nplinker/strains.py index 8887fa6f..0fb55880 100644 --- a/src/nplinker/strains.py +++ b/src/nplinker/strains.py @@ -45,6 +45,3 @@ def __eq__(self, other): and self.id == other.id and self.aliases == other.aliases ) - - def __hash__(self) -> int: - return hash(self.id) diff --git a/tests/test_gnps_loader.py b/tests/test_gnps_loader.py deleted file mode 100644 index 4cf5ae50..00000000 --- a/tests/test_gnps_loader.py +++ /dev/null @@ -1,6 +0,0 @@ -from nplinker.metabolomics.gnps.gnps_loader import GNPSLoader - - -def test_default(): - sut = GNPSLoader() - assert sut is not None \ No newline at end of file diff --git a/tests/test_loader.py b/tests/test_loader.py index d89c329c..f46e0dc2 100644 --- a/tests/test_loader.py +++ b/tests/test_loader.py @@ -1,10 +1,6 @@ from pathlib import Path -import shutil import pytest from nplinker.loader import DatasetLoader -from nplinker.metabolomics.gnps.gnps_extractor import GNPSExtractor -from nplinker.metabolomics.gnps.gnps_spectrum_loader import GNPSSpectrumLoader -from nplinker.strain_collection import StrainCollection from . import DATA_DIR @pytest.fixture @@ -12,76 +8,17 @@ def config(): return { "dataset" : { "root": DATA_DIR / "ProteoSAFe-METABOLOMICS-SNETS-c22f44b1-download_clustered_spectra", - "platform_id": "", - "overrides": { - "strain_mappings_file": str(DATA_DIR / "strain_mappings.csv") - } + "platform_id": "" } } -@pytest.fixture -def config_with_new_gnps_extractor(): - GNPSExtractor(DATA_DIR / "ProteoSAFe-METABOLOMICS-SNETS-c22f44b1-download_clustered_spectra.zip", DATA_DIR / "extracted").extract() - yield { - "dataset" : { - "root": DATA_DIR / "extracted", - "platform_id": "", - "overrides": { - "strain_mappings_file": str(DATA_DIR / "strain_mappings.csv") - } - } - } - shutil.rmtree(DATA_DIR / "extracted") - def test_default(config): sut = DatasetLoader(config) assert sut._platform_id == config["dataset"]["platform_id"] -def test_has_metabolomics_paths(config): +def test_has_spectra_path(config): sut = DatasetLoader(config) sut._init_metabolomics_paths() - assert sut.mgf_file == str(config["dataset"]["root"] / "METABOLOMICS-SNETS-c22f44b1-download_clustered_spectra-main.mgf") - assert sut.edges_file == str(config["dataset"]["root"] / "networkedges_selfloop" / "6da5be36f5b14e878860167fa07004d6.pairsinfo") - assert sut.nodes_file == str(config["dataset"]["root"] / "clusterinfosummarygroup_attributes_withIDs_withcomponentID" / "d69356c8e5044c2a9fef3dd2a2f991e1.tsv") - assert sut.annotations_dir == str(config["dataset"]["root"] / "result_specnets_DB") - - -def test_has_metabolomics_paths_new_gnps(config_with_new_gnps_extractor): - sut = DatasetLoader(config_with_new_gnps_extractor) - sut._init_metabolomics_paths() - assert sut.mgf_file == str(config_with_new_gnps_extractor["dataset"]["root"] / "spectra.mgf") - assert sut.edges_file == str(config_with_new_gnps_extractor["dataset"]["root"] / "molecular_families.pairsinfo") - assert sut.nodes_file == 
-    assert sut.annotations_dir == str(config_with_new_gnps_extractor["dataset"]["root"])
-    assert sut.annotations_config_file == str(config_with_new_gnps_extractor["dataset"]["root"] / "annotations.tsv")
-
-
-def test_load_metabolomics(config):
-    sut = DatasetLoader(config)
-    sut._init_paths()
-    sut._load_strain_mappings()
-    sut._load_metabolomics()
-
-    expected_spectra = GNPSSpectrumLoader(sut.mgf_file).spectra()
-
-    #HH TODO: switch to different comparison as soon as strains are implemented
-    assert len(sut.spectra) == len(expected_spectra)
-    assert len(sut.molfams) == 429
-
-def test_has_strain_mappings(config):
-    sut = DatasetLoader(config)
-    sut._init_paths()
-    assert sut.strain_mappings_file == str(DATA_DIR / "strain_mappings.csv")
-
-
-def test_load_strain_mappings(config):
-    sut = DatasetLoader(config)
-    sut._init_paths()
-    sut._load_strain_mappings()
-
-    actual = sut.strains
-    expected = StrainCollection()
-    expected.add_from_file(sut.strain_mappings_file)
-
-    assert actual == expected
+    actual = sut.mgf_file
+    assert actual == str(config["dataset"]["root"] / "METABOLOMICS-SNETS-c22f44b1-download_clustered_spectra-main.mgf")

From c66a10a2074084464fea7e95e72ab7aeb11bad13 Mon Sep 17 00:00:00 2001
From: gcroci2
Date: Wed, 15 Mar 2023 12:57:49 +0100
Subject: [PATCH 16/16] Revert "Revert "Merge branch 'dev' into 98_add_antismash_downloader_gcroci2""

This reverts commit 0c5b6bd6c8505b1f041fbbf56b63eae4f7d413ed.

revert last merge
---
 .vscode/settings.json                         |  6 +-
 src/nplinker/loader.py                        | 33 +++++----
 .../metabolomics/gnps/gnps_extractor.py       |  7 +-
 src/nplinker/metabolomics/gnps/gnps_loader.py |  3 +
 src/nplinker/metabolomics/load_gnps.py        | 12 ++--
 src/nplinker/metabolomics/metabolomics.py     | 23 +++---
 src/nplinker/scoring/linking/data_linking.py  |  4 +-
 src/nplinker/strains.py                       |  3 +
 tests/test_gnps_loader.py                     |  6 ++
 tests/test_loader.py                          | 71 +++++++++++++++++--
 10 files changed, 125 insertions(+), 43 deletions(-)
 create mode 100644 src/nplinker/metabolomics/gnps/gnps_loader.py
 create mode 100644 tests/test_gnps_loader.py

diff --git a/.vscode/settings.json b/.vscode/settings.json
index 7411789c..30cb3f3f 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -20,10 +20,8 @@
     // python docstring
     "autoDocstring.docstringFormat": "google",
     "autoDocstring.customTemplatePath": ".vscode/vscode_docstring_google_adapted.mustache",
-
-    // Jupyter
-    "jupyter.sendSelectionToInteractiveWindow": true,
     "jupyter.askForKernelRestart": false,
     "notebook.lineNumbers": "on",
-    "notebook.diff.ignoreMetadata": true
+    "notebook.diff.ignoreMetadata": true,
+    "jupyter.interactiveWindow.textEditor.executeSelection": true
 }
diff --git a/src/nplinker/loader.py b/src/nplinker/loader.py
index 225c0ca8..cd3a5fb7 100644
--- a/src/nplinker/loader.py
+++ b/src/nplinker/loader.py
@@ -28,7 +28,8 @@
 from nplinker.pairedomics.downloader import Downloader
 from nplinker.pairedomics.runbigscape import run_bigscape
 from nplinker.strain_collection import StrainCollection
-
+from nplinker.genomics.antismash import AntismashBGCLoader
+from pathlib import Path
 try:
     from importlib.resources import files
 
@@ -244,19 +245,23 @@ def _init_metabolomics_paths(self):
                                                  optional=True)
 
         # 8. MET: /DB_result/*.tsv (new) or /result_specnets_DB/*.tsv (old) / annotations_dir=
-        self.annotations_dir = self._overrides.get(
-            self.OR_ANNO) or find_via_glob_alts_dir(
-                [
-                    os.path.join(self._root, 'DB_result'),
-                    os.path.join(self._root, 'result_specnets_DB'),
-                ],
-                self.OR_ANNO,
-                optional=False
-            )
-        if self.annotations_dir is not None:
-            self.annotations_config_file = self._overrides.get(
-                self.OR_ANNO_CONFIG) or os.path.join(self.annotations_dir,
-                                                     'annotations.tsv')
+        if Path.is_file(Path(self._root) / "annotations.tsv"):
+            self.annotations_dir = str(self._root)
+            self.annotations_config_file = os.path.join(self._root, "annotations.tsv")
+        else:
+            self.annotations_dir = self._overrides.get(
+                self.OR_ANNO) or find_via_glob_alts_dir(
+                    [
+                        os.path.join(self._root, 'DB_result'),
+                        os.path.join(self._root, 'result_specnets_DB'),
+                    ],
+                    self.OR_ANNO,
+                    optional=False
+                )
+            if self.annotations_dir is not None:
+                self.annotations_config_file = self._overrides.get(
+                    self.OR_ANNO_CONFIG) or os.path.join(self.annotations_dir,
+                                                         'annotations.tsv')
 
     def _init_genomics_paths(self):
         # 9. GEN: /antismash / antismash_dir=
diff --git a/src/nplinker/metabolomics/gnps/gnps_extractor.py b/src/nplinker/metabolomics/gnps/gnps_extractor.py
index 5789d61c..916c888e 100644
--- a/src/nplinker/metabolomics/gnps/gnps_extractor.py
+++ b/src/nplinker/metabolomics/gnps/gnps_extractor.py
@@ -1,6 +1,7 @@
 from os import PathLike
 import os
 from pathlib import Path
+import shutil
 import zipfile
 from nplinker import utils
 
@@ -55,7 +56,7 @@ def _extract_spectra(self):
     def _extract_molecular_families(self):
         """Helper function to extract the molecular families file from the archive."""
         prefix = "networkedges_selfloop"
-        suffix = "..selfloop" if self._is_fbmn else ".pairsinfo"
+        suffix = ".selfloop" if self._is_fbmn else ".pairsinfo"
         utils.extract_file_matching_pattern(
             self.get_data(),
             prefix,
@@ -88,5 +89,5 @@ def _extract_annotations(self):
             self._extract_path,
             "annotations.tsv"
         )
-        if self._is_fbmn:
-            os.rmdir(self._extract_path / prefix)
\ No newline at end of file
+
+        shutil.rmtree(self._extract_path / prefix)
diff --git a/src/nplinker/metabolomics/gnps/gnps_loader.py b/src/nplinker/metabolomics/gnps/gnps_loader.py
new file mode 100644
index 00000000..2ef0d7a9
--- /dev/null
+++ b/src/nplinker/metabolomics/gnps/gnps_loader.py
@@ -0,0 +1,3 @@
+class GNPSLoader:
+    def __init__(self):
+        pass
\ No newline at end of file
diff --git a/src/nplinker/metabolomics/load_gnps.py b/src/nplinker/metabolomics/load_gnps.py
index d3855868..0266df56 100644
--- a/src/nplinker/metabolomics/load_gnps.py
+++ b/src/nplinker/metabolomics/load_gnps.py
@@ -229,13 +229,13 @@ def _load_clusterinfo_old(gnps_format: str, strains: StrainCollection, file: str
             unknown_strains[mzxml] += 1
         # else:
         #     print('{} ===> {}'.format(mzxml, strain))
 
+        if clu_index in spec_dict:
+            if strain is not None:
+                # TODO this need fixed somehow (missing growth medium info)
+                spec_dict[clu_index].add_strain(strain, None, 1)
-        if strain is not None:
-            # TODO this need fixed somehow (missing growth medium info)
-            spec_dict[clu_index].add_strain(strain, None, 1)
-
-        # update metadata on Spectrum object
-        spec_dict[clu_index].metadata.update(metadata)
+            # update metadata on Spectrum object
+            spec_dict[clu_index].metadata.update(metadata)
 
     if len(unknown_strains) > 0:
         logger.warning(
diff --git a/src/nplinker/metabolomics/metabolomics.py b/src/nplinker/metabolomics/metabolomics.py
index 15257ef2..1c4cde5d 100644
--- a/src/nplinker/metabolomics/metabolomics.py
+++ b/src/nplinker/metabolomics/metabolomics.py
@@ -84,17 +84,18 @@ def load_edges(edges_file: str, spec_dict: dict[int, Spectrum]):
             cosine = float(line[cos_index])
             family = int(line[fam_index])
 
-            spec1 = spec_dict[spec1_id]
-            spec2 = spec_dict[spec2_id]
-
-            if family != -1: # singletons
-                spec1.family_id = family
-                spec2.family_id = family
-
-                spec1.edges.append((spec2.id, spec2.spectrum_id, cosine))
-                spec2.edges.append((spec1.id, spec1.spectrum_id, cosine))
-            else:
-                spec1.family_id = family
+            if spec1_id in spec_dict and spec2_id in spec_dict:
+                spec1 = spec_dict[spec1_id]
+                spec2 = spec_dict[spec2_id]
+
+                if family != -1: # singletons
+                    spec1.family_id = family
+                    spec2.family_id = family
+
+                    spec1.edges.append((spec2.id, spec2.spectrum_id, cosine))
+                    spec2.edges.append((spec1.id, spec1.spectrum_id, cosine))
+                else:
+                    spec1.family_id = family
 
 
 @deprecated(version="1.3.3", reason="Use the GNPSLoader class instead.")
diff --git a/src/nplinker/scoring/linking/data_linking.py b/src/nplinker/scoring/linking/data_linking.py
index a8c3f3c9..a6f181f0 100644
--- a/src/nplinker/scoring/linking/data_linking.py
+++ b/src/nplinker/scoring/linking/data_linking.py
@@ -341,7 +341,9 @@ def common_strains(self, objects_a, objects_b, filter_no_shared=False):
             objects_b = [objects_b]
 
         # retrieve object IDs
-        ids_b = [gcf.gcf_id for gcf in objects_b]
+        # TODO: issue #103 stop using gcf.id, but note that the ids_b should be
+        # a list of int
+        ids_b = [gcf.id for gcf in objects_b]
 
         # these might be MolFams or Spectra, either way they'll have a .id attribute
         ids_a = [obj.id for obj in objects_a]
diff --git a/src/nplinker/strains.py b/src/nplinker/strains.py
index 0fb55880..8887fa6f 100644
--- a/src/nplinker/strains.py
+++ b/src/nplinker/strains.py
@@ -45,3 +45,6 @@ def __eq__(self, other):
             and self.id == other.id
             and self.aliases == other.aliases
         )
+
+    def __hash__(self) -> int:
+        return hash(self.id)
diff --git a/tests/test_gnps_loader.py b/tests/test_gnps_loader.py
new file mode 100644
index 00000000..4cf5ae50
--- /dev/null
+++ b/tests/test_gnps_loader.py
@@ -0,0 +1,6 @@
+from nplinker.metabolomics.gnps.gnps_loader import GNPSLoader
+
+
+def test_default():
+    sut = GNPSLoader()
+    assert sut is not None
\ No newline at end of file
diff --git a/tests/test_loader.py b/tests/test_loader.py
index f46e0dc2..d89c329c 100644
--- a/tests/test_loader.py
+++ b/tests/test_loader.py
@@ -1,6 +1,10 @@
 from pathlib import Path
+import shutil
 import pytest
 from nplinker.loader import DatasetLoader
+from nplinker.metabolomics.gnps.gnps_extractor import GNPSExtractor
+from nplinker.metabolomics.gnps.gnps_spectrum_loader import GNPSSpectrumLoader
+from nplinker.strain_collection import StrainCollection
 from . import DATA_DIR
 
 @pytest.fixture
@@ -8,17 +12,76 @@ def config():
     return {
         "dataset" : {
             "root": DATA_DIR / "ProteoSAFe-METABOLOMICS-SNETS-c22f44b1-download_clustered_spectra",
-            "platform_id": ""
+            "platform_id": "",
+            "overrides": {
+                "strain_mappings_file": str(DATA_DIR / "strain_mappings.csv")
+            }
         }
     }
 
+@pytest.fixture
+def config_with_new_gnps_extractor():
+    GNPSExtractor(DATA_DIR / "ProteoSAFe-METABOLOMICS-SNETS-c22f44b1-download_clustered_spectra.zip", DATA_DIR / "extracted").extract()
+    yield {
+        "dataset" : {
+            "root": DATA_DIR / "extracted",
+            "platform_id": "",
+            "overrides": {
+                "strain_mappings_file": str(DATA_DIR / "strain_mappings.csv")
+            }
+        }
+    }
+    shutil.rmtree(DATA_DIR / "extracted")
+
 def test_default(config):
     sut = DatasetLoader(config)
     assert sut._platform_id == config["dataset"]["platform_id"]
 
 
-def test_has_spectra_path(config):
+def test_has_metabolomics_paths(config):
     sut = DatasetLoader(config)
     sut._init_metabolomics_paths()
-    actual = sut.mgf_file
-    assert actual == str(config["dataset"]["root"] / "METABOLOMICS-SNETS-c22f44b1-download_clustered_spectra-main.mgf")
+    assert sut.mgf_file == str(config["dataset"]["root"] / "METABOLOMICS-SNETS-c22f44b1-download_clustered_spectra-main.mgf")
+    assert sut.edges_file == str(config["dataset"]["root"] / "networkedges_selfloop" / "6da5be36f5b14e878860167fa07004d6.pairsinfo")
+    assert sut.nodes_file == str(config["dataset"]["root"] / "clusterinfosummarygroup_attributes_withIDs_withcomponentID" / "d69356c8e5044c2a9fef3dd2a2f991e1.tsv")
+    assert sut.annotations_dir == str(config["dataset"]["root"] / "result_specnets_DB")
+
+
+def test_has_metabolomics_paths_new_gnps(config_with_new_gnps_extractor):
+    sut = DatasetLoader(config_with_new_gnps_extractor)
+    sut._init_metabolomics_paths()
+    assert sut.mgf_file == str(config_with_new_gnps_extractor["dataset"]["root"] / "spectra.mgf")
+    assert sut.edges_file == str(config_with_new_gnps_extractor["dataset"]["root"] / "molecular_families.pairsinfo")
+    assert sut.nodes_file == str(config_with_new_gnps_extractor["dataset"]["root"] / "file_mappings.tsv")
+    assert sut.annotations_dir == str(config_with_new_gnps_extractor["dataset"]["root"])
+    assert sut.annotations_config_file == str(config_with_new_gnps_extractor["dataset"]["root"] / "annotations.tsv")
+
+
+def test_load_metabolomics(config):
+    sut = DatasetLoader(config)
+    sut._init_paths()
+    sut._load_strain_mappings()
+    sut._load_metabolomics()
+
+    expected_spectra = GNPSSpectrumLoader(sut.mgf_file).spectra()
+
+    #HH TODO: switch to different comparison as soon as strains are implemented
+    assert len(sut.spectra) == len(expected_spectra)
+    assert len(sut.molfams) == 429
+
+def test_has_strain_mappings(config):
+    sut = DatasetLoader(config)
+    sut._init_paths()
+    assert sut.strain_mappings_file == str(DATA_DIR / "strain_mappings.csv")
+
+
+def test_load_strain_mappings(config):
+    sut = DatasetLoader(config)
+    sut._init_paths()
+    sut._load_strain_mappings()
+
+    actual = sut.strains
+    expected = StrainCollection()
+    expected.add_from_file(sut.strain_mappings_file)
+
+    assert actual == expected