From 8f6499878b73af99f5fb8e6e4b53ab0d8699c469 Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Tue, 28 Feb 2023 15:56:05 +0100 Subject: [PATCH 01/16] creat antismash downloader module --- .../antismash/antismash_downloader.py | 420 +++++++++++++++++ src/nplinker/pairedomics/downloader.py | 421 +----------------- 2 files changed, 422 insertions(+), 419 deletions(-) create mode 100644 src/nplinker/genomics/antismash/antismash_downloader.py diff --git a/src/nplinker/genomics/antismash/antismash_downloader.py b/src/nplinker/genomics/antismash/antismash_downloader.py new file mode 100644 index 00000000..e5d66577 --- /dev/null +++ b/src/nplinker/genomics/antismash/antismash_downloader.py @@ -0,0 +1,420 @@ +import csv +import os +import re +import time +import zipfile +import httpx +from bs4 import BeautifulSoup +from progress.bar import Bar +from nplinker.logconfig import LogConfig + + +logger = LogConfig.getLogger(__name__) + + +ANTISMASH_DB_PAGE_URL = 'https://antismash-db.secondarymetabolites.org/output/{}/' +ANTISMASH_DB_DOWNLOAD_URL = 'https://antismash-db.secondarymetabolites.org/output/{}/{}' + +ANTISMASH_DBV2_PAGE_URL = 'https://antismash-dbv2.secondarymetabolites.org/output/{}/' +ANTISMASH_DBV2_DOWNLOAD_URL = 'https://antismash-dbv2.secondarymetabolites.org/output/{}/{}' + +NCBI_LOOKUP_URL_NEW = 'https://www.ncbi.nlm.nih.gov/assembly/?term={}' + +JGI_GENOME_LOOKUP_URL = 'https://img.jgi.doe.gov/cgi-bin/m/main.cgi?section=TaxonDetail&page=taxonDetail&taxon_oid={}' + +USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0' + +class GenomeStatus: + + def __init__(self, original_id, resolved_id, attempted=False, filename=""): + self.original_id = ';'.join(original_id.split(',')) + self.resolved_id = None if resolved_id == 'None' else resolved_id + self.attempted = True if attempted == 'True' else False + self.filename = filename + + @classmethod + def from_csv(cls, original_id, resolved_id, attempted, filename): + return cls(original_id, resolved_id, attempted, filename) + + def to_csv(self): + return ','.join([ + str(self.original_id), + str(self.resolved_id), + str(self.attempted), self.filename + ]) + +def _get_best_available_genome_id(genome_id_data): + if 'RefSeq_accession' in genome_id_data: + return genome_id_data['RefSeq_accession'] + elif 'GenBank_accession' in genome_id_data: + return genome_id_data['GenBank_accession'] + elif 'JGI_Genome_ID' in genome_id_data: + return genome_id_data['JGI_Genome_ID'] + + logger.warning('No known genome ID field in genome data: {}'.format( + genome_id_data)) + return None + +def _ncbi_genbank_search(genbank_id, retry_time=5.0): + url = NCBI_LOOKUP_URL_NEW.format(genbank_id) + logger.debug('Looking up GenBank data for {} at {}'.format( + genbank_id, url)) + resp = httpx.get(url, follow_redirects=True) + + if resp.status_code == httpx.codes.OK: + # the page should contain a
<dl> element with class "assembly_summary_new". retrieving
+        # the page seems to fail occasionally in the middle of lengthy sequences of genome
+        # lookups, so there might be some throttling going on. this will automatically retry
+        # the lookup if the expected content isn't found the first time
+        soup = BeautifulSoup(resp.content, 'html.parser')
+        # find the <dl> element with class "assembly_summary_new"
+        dl_element = soup.find('dl', {'class': 'assembly_summary_new'})
+        if dl_element is not None:
+            return dl_element
+
+    logger.debug(
+        'NCBI lookup failed, status code {}. Trying again in {} seconds'.
+        format(resp.status_code, retry_time))
+    time.sleep(retry_time)
+    logger.debug('Looking up GenBank data for {} at {}'.format(
+        genbank_id, url))
+    resp = httpx.get(url, follow_redirects=True)
+    if resp.status_code == httpx.codes.OK:
+        soup = BeautifulSoup(resp.content, 'html.parser')
+        # find the <dl>
element with class "assembly_summary_new" + dl_element = soup.find('dl', {'class': 'assembly_summary_new'}) + if dl_element is not None: + return dl_element + + logger.warning( + 'Failed to resolve NCBI genome ID {} at URL {} (after retrying)'. + format(genbank_id, url)) + return None + +def _resolve_genbank_accession(genbank_id): + logger.info( + 'Attempting to resolve Genbank accession {} to RefSeq accession'. + format(genbank_id)) + # genbank id => genbank seq => refseq + + # The GenBank accession can have several formats: + # 1: BAFR00000000.1 + # 2: NZ_BAGG00000000.1 + # 3: NC_016887.1 + # Case 1 is the default. + if '_' in genbank_id: + # case 2 + if len(genbank_id.split('_')[-1].split('.')[0]) == 12: + genbank_id = genbank_id.split('_')[-1] + # case 3 + else: + genbank_id = genbank_id.lower() + + # get rid of any extraneous whitespace + genbank_id = genbank_id.strip() + logger.debug(f'Parsed GenBank ID to "{genbank_id}"') + + # run a search using the GenBank accession ID + try: + dl_element = _ncbi_genbank_search(genbank_id) + if dl_element is None: + raise Exception('Unknown HTML format') + + refseq_idx = -1 + for field_idx, field in enumerate(dl_element.children): + # this is the element immediately preceding the one with + # the actual RefSeq ID we want + if field.getText().strip() == 'RefSeq assembly accession:': + refseq_idx = field_idx + 1 + + # this should be True when we've reached the right element + if field_idx == refseq_idx: + refseq_id = field.getText() + # if it has any spaces, take everything up to first one (some have annotations afterwards) + if refseq_id.find(' ') != -1: + refseq_id = refseq_id[:refseq_id.find(' ')] + + return refseq_id + + if refseq_idx == -1: + raise Exception('Expected HTML elements not found') + except Exception as e: + logger.warning( + 'Failed resolving GenBank accession {}, error {}'.format( + genbank_id, e)) + + return None + +def _resolve_jgi_accession(jgi_id): + url = JGI_GENOME_LOOKUP_URL.format(jgi_id) + logger.info( + 'Attempting to resolve JGI_Genome_ID {} to GenBank accession via {}' + .format(jgi_id, url)) + # no User-Agent header produces a 403 Forbidden error on this site... 
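+    # sending a common desktop browser User-Agent (USER_AGENT above) avoids that 403,
+    # and the explicit 10 second timeout stops a slow JGI response from stalling the
+    # whole download run; a ReadTimeout is caught below and treated as a failed lookup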
+ try: + resp = httpx.get(url, + headers={'User-Agent': USER_AGENT}, + timeout=10.0, + follow_redirects=True) + except httpx.ReadTimeout: + logger.warning( + 'Timed out waiting for result of JGI_Genome_ID lookup') + return None + + soup = BeautifulSoup(resp.content, 'html.parser') + # find the table entry giving the NCBI assembly accession ID + link = soup.find( + 'a', href=re.compile('https://www.ncbi.nlm.nih.gov/nuccore/.*')) + if link is None: + return None + + return _resolve_genbank_accession(link.text) + +def _resolve_genome_id_data(genome_id_data): + if 'RefSeq_accession' in genome_id_data: + # best case, can use this directly + return genome_id_data['RefSeq_accession'] + elif 'GenBank_accession' in genome_id_data: + # resolve via NCBI + return _resolve_genbank_accession( + genome_id_data['GenBank_accession']) + elif 'JGI_Genome_ID' in genome_id_data: + # resolve via JGI => NCBI + return _resolve_jgi_accession(genome_id_data['JGI_Genome_ID']) + + logger.warning( + f'Unable to resolve genome_ID: {genome_id_data}') + return None + +def _get_antismash_db_page(genome_obj): + # want to try up to 4 different links here, v1 and v2 databases, each + # with and without the .1 suffix on the accesssion ID + + accesssions = [genome_obj.resolved_id, genome_obj.resolved_id + '.1'] + for base_url in [ANTISMASH_DB_PAGE_URL, ANTISMASH_DBV2_PAGE_URL]: + for accession in accesssions: + url = base_url.format(accession) + link = None + + logger.info('antismash DB lookup for {} => {}'.format( + accession, url)) + try: + resp = httpx.get(url, follow_redirects=True) + soup = BeautifulSoup(resp.content, 'html.parser') + # retrieve .zip file download link + link = soup.find( + 'a', {'href': lambda url: url.endswith('.zip')}) + except Exception as e: + logger.debug(f'antiSMASH DB page load failed: {e}') + + if link is not None: + logger.info( + 'antiSMASH lookup succeeded! 
Filename is {}'.format( + link['href'])) + # save with the .1 suffix if that worked + genome_obj.resolved_id = accession + return link['href'] + + return None + +def _get_antismash_zip_data(accession_id, filename, local_path): + for base_url in [ + ANTISMASH_DB_DOWNLOAD_URL, ANTISMASH_DBV2_DOWNLOAD_URL + ]: + zipfile_url = base_url.format(accession_id, filename) + with open(local_path, 'wb') as f: + total_bytes = 0 + try: + with httpx.stream('GET', zipfile_url) as r: + if r.status_code == 404: + logger.debug('antiSMASH download URL was a 404') + continue + + logger.info('Downloading from antiSMASH: {}'.format( + zipfile_url)) + filesize = int(r.headers['content-length']) + bar = Bar(filename, + max=filesize, + suffix='%(percent)d%%') + for data in r.iter_bytes(): + f.write(data) + total_bytes += len(data) + bar.next(len(data)) + bar.finish() + except Exception as e: + logger.warning( + f'antiSMASH zip download failed: {e}') + continue + + return True + + return False + +def _download_antismash_zip(antismash_obj, project_download_cache): + # save zip files to avoid having to repeat above lookup every time + local_path = os.path.join(project_download_cache, + f'{antismash_obj.resolved_id}.zip') + logger.debug( + f'Checking for existing antismash zip at {local_path}') + + cached = False + # if the file exists locally + if os.path.exists(local_path): + logger.info(f'Found cached file at {local_path}') + try: + # check if it's a valid zip file, if so treat it as cached + _ = zipfile.ZipFile(local_path) + cached = True + antismash_obj.filename = local_path + except zipfile.BadZipFile as bzf: + # otherwise delete and redownload + logger.info( + 'Invalid antismash zipfile found ({}). Will download again' + .format(bzf)) + os.unlink(local_path) + antismash_obj.filename = "" + + if not cached: + filename = _get_antismash_db_page(antismash_obj) + if filename is None: + return False + + _get_antismash_zip_data(antismash_obj.resolved_id, filename, + local_path) + antismash_obj.filename = local_path + + return True + +def _extract_antismash_zip(antismash_obj, project_file_cache): + if antismash_obj.filename is None or len(antismash_obj.filename) == 0: + return False + + output_path = os.path.join(project_file_cache, 'antismash', + antismash_obj.resolved_id) + exists_already = os.path.exists(output_path) and os.path.exists( + os.path.join(output_path, 'completed')) + + logger.debug( + 'Extracting antismash data to {}, exists_already = {}'.format( + output_path, exists_already)) + if exists_already: + return True + + # create a subfolder for each set of genome data (the zip files used to be + # constructed with path info but that seems to have changed recently) + if not os.path.exists(output_path): + os.makedirs(output_path, exist_ok=True) + + antismash_zip = zipfile.ZipFile(antismash_obj.filename) + kc_prefix1 = f'{antismash_obj.resolved_id}/knownclusterblast' + kc_prefix2 = 'knownclusterblast' + for zip_member in antismash_zip.namelist(): + # TODO other files here? 
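+        # only BGC GenBank/JSON records and knownclusterblast .txt results
+        # (excluding the mibig_hits files) are extracted; everything else in
+        # the archive is skipped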
+ if zip_member.endswith('.gbk') or zip_member.endswith('.json'): + antismash_zip.extract(zip_member, path=output_path) + elif zip_member.startswith(kc_prefix1) or zip_member.startswith( + kc_prefix2): + if zip_member.endswith( + '.txt') and 'mibig_hits' not in zip_member: + antismash_zip.extract(zip_member, path=output_path) + + open(os.path.join(output_path, 'completed'), 'w').close() + + return True + +def download_antismash_data(genome_records, project_download_cache): + genome_status = {} + + # this file records genome IDs and local filenames to avoid having to repeat HTTP requests + # each time the app is loaded (this can take a lot of time if there are dozens of genomes) + genome_status_file = os.path.join(project_download_cache, + 'genome_status.txt') + + # genome lookup status info + if os.path.exists(genome_status_file): + with open(genome_status_file) as f: + for line in csv.reader(f): + asobj = GenomeStatus.from_csv(*line) + genome_status[asobj.original_id] = asobj + + for i, genome_record in enumerate(genome_records): + # get the best available ID from the dict + best_id = _get_best_available_genome_id( + genome_record['genome_ID']) + if best_id is None: + logger.warning( + 'Ignoring genome record "{}" due to missing genome ID field' + .format(genome_record)) + continue + + # use this to check if the lookup has already been attempted and if + # so if the file is cached locally + if best_id not in genome_status: + genome_status[best_id] = GenomeStatus(best_id, None) + + genome_obj = genome_status[best_id] + + logger.info( + 'Checking for antismash data {}/{}, current genome ID={}'. + format(i + 1, len(genome_records), best_id)) + # first check if file is cached locally + if os.path.exists(genome_obj.filename): + # file already downloaded + logger.info('Genome ID {} already downloaded to {}'.format( + best_id, genome_obj.filename)) + genome_record['resolved_id'] = genome_obj.resolved_id + elif genome_obj.attempted: + # lookup attempted previously but failed + logger.info( + 'Genome ID {} skipped due to previous failure'.format( + best_id)) + genome_record['resolved_id'] = genome_obj.resolved_id + else: + # if no existing file and no lookup attempted, can start process of + # trying to retrieve the data + + # lookup the ID + logger.info('Beginning lookup process for genome ID {}'.format( + best_id)) + + genome_obj.resolved_id = _resolve_genome_id_data( + genome_record['genome_ID']) + genome_obj.attempted = True + + if genome_obj.resolved_id is None: + # give up on this one + logger.warning( + f'Failed lookup for genome ID {best_id}') + with open(genome_status_file, 'a+') as f: + f.write(genome_obj.to_csv() + '\n') + continue + + # if we got a refseq ID, now try to download the data from antismash + if _download_antismash_zip(genome_obj): + logger.info( + 'Genome data successfully downloaded for {}'.format( + best_id)) + genome_record['resolved_id'] = genome_obj.resolved_id + else: + logger.warning( + 'Failed to download antiSMASH data for genome ID {} ({})' + .format(genome_obj.resolved_id, + genome_obj.original_id)) + + with open(genome_status_file, 'a+', newline='\n') as f: + f.write(genome_obj.to_csv() + '\n') + + _extract_antismash_zip(genome_obj) + + missing = len( + [x for x in genome_status.values() if len(x.filename) == 0]) + logger.info( + 'Dataset has {} missing sets of antiSMASH data (from a total of {})' + .format(missing, len(genome_records))) + + with open(genome_status_file, 'w', newline='\n') as f: + for obj in genome_status.values(): + f.write(obj.to_csv() + 
'\n') + + if missing == len(genome_records): + logger.warning('Failed to successfully retrieve ANY genome data!') diff --git a/src/nplinker/pairedomics/downloader.py b/src/nplinker/pairedomics/downloader.py index 04914d94..addf306b 100644 --- a/src/nplinker/pairedomics/downloader.py +++ b/src/nplinker/pairedomics/downloader.py @@ -12,21 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. -import csv -import io import json import os -from pathlib import Path -import re import sys import shutil -import tarfile -import time import zipfile from deprecated import deprecated import httpx -from bs4 import BeautifulSoup -from progress.bar import Bar from progress.spinner import Spinner from nplinker.logconfig import LogConfig from nplinker.metabolomics.gnps.gnps_downloader import GNPSDownloader @@ -34,6 +26,7 @@ from nplinker.strains import Strain from nplinker.strain_collection import StrainCollection from nplinker.genomics.mibig import download_and_extract_mibig_metadata +from nplinker.genomics.antismash import download_antismash_data logger = LogConfig.getLogger(__name__) @@ -45,43 +38,11 @@ PAIREDOMICS_PROJECT_URL = 'https://pairedomicsdata.bioinformatics.nl/api/projects/{}' GNPS_DATA_DOWNLOAD_URL = 'https://gnps.ucsd.edu/ProteoSAFe/DownloadResult?task={}&view=download_clustered_spectra' -ANTISMASH_DB_PAGE_URL = 'https://antismash-db.secondarymetabolites.org/output/{}/' -ANTISMASH_DB_DOWNLOAD_URL = 'https://antismash-db.secondarymetabolites.org/output/{}/{}' - -ANTISMASH_DBV2_PAGE_URL = 'https://antismash-dbv2.secondarymetabolites.org/output/{}/' -ANTISMASH_DBV2_DOWNLOAD_URL = 'https://antismash-dbv2.secondarymetabolites.org/output/{}/{}' - -NCBI_LOOKUP_URL_NEW = 'https://www.ncbi.nlm.nih.gov/assembly/?term={}' - -JGI_GENOME_LOOKUP_URL = 'https://img.jgi.doe.gov/cgi-bin/m/main.cgi?section=TaxonDetail&page=taxonDetail&taxon_oid={}' - MIBIG_METADATA_URL = 'https://dl.secondarymetabolites.org/mibig/mibig_json_{}.tar.gz' MIBIG_BGC_METADATA_URL = 'https://mibig.secondarymetabolites.org/repository/{}/annotations.json' # MIBIG_BGC_GENBANK_URL = 'https://mibig.secondarymetabolites.org/repository/{}/{}.gbk' # MIBIG_BGC_JSON_URL = 'https://mibig.secondarymetabolites.org/repository/{}/{}.json' -USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0' - - -class GenomeStatus: - - def __init__(self, original_id, resolved_id, attempted=False, filename=""): - self.original_id = ';'.join(original_id.split(',')) - self.resolved_id = None if resolved_id == 'None' else resolved_id - self.attempted = True if attempted == 'True' else False - self.filename = filename - - @classmethod - def from_csv(cls, original_id, resolved_id, attempted, filename): - return cls(original_id, resolved_id, attempted, filename) - - def to_csv(self): - return ','.join([ - str(self.original_id), - str(self.resolved_id), - str(self.attempted), self.filename - ]) - class Downloader(): # TODO: move to independent config file ---C.Geng @@ -195,7 +156,7 @@ def get(self, do_bigscape, extra_bigscape_parameters, use_mibig, self._download_metabolomics_zipfile(self.gnps_task_id) - self._download_genomics_data(self.project_json['genomes']) + download_antismash_data(self.project_json['genomes']) # CG: it extracts strain names and later will be used for strains self._parse_genome_labels(self.project_json['genome_metabolome_links'], @@ -232,248 +193,6 @@ def _run_bigscape(self, do_bigscape, extra_bigscape_parameters): 'Failed to run BiG-SCAPE on 
antismash data, error was "{}"'. format(e)) - def _ncbi_genbank_search(self, genbank_id, retry_time=5.0): - url = NCBI_LOOKUP_URL_NEW.format(genbank_id) - logger.debug('Looking up GenBank data for {} at {}'.format( - genbank_id, url)) - resp = httpx.get(url, follow_redirects=True) - - if resp.status_code == httpx.codes.OK: - # the page should contain a
<dl> element with class "assembly_summary_new". retrieving
-            # the page seems to fail occasionally in the middle of lengthy sequences of genome
-            # lookups, so there might be some throttling going on. this will automatically retry
-            # the lookup if the expected content isn't found the first time
-            soup = BeautifulSoup(resp.content, 'html.parser')
-            # find the <dl> element with class "assembly_summary_new"
-            dl_element = soup.find('dl', {'class': 'assembly_summary_new'})
-            if dl_element is not None:
-                return dl_element
-
-        logger.debug(
-            'NCBI lookup failed, status code {}. Trying again in {} seconds'.
-            format(resp.status_code, retry_time))
-        time.sleep(retry_time)
-        logger.debug('Looking up GenBank data for {} at {}'.format(
-            genbank_id, url))
-        resp = httpx.get(url, follow_redirects=True)
-        if resp.status_code == httpx.codes.OK:
-            soup = BeautifulSoup(resp.content, 'html.parser')
-            # find the <dl>
element with class "assembly_summary_new" - dl_element = soup.find('dl', {'class': 'assembly_summary_new'}) - if dl_element is not None: - return dl_element - - logger.warning( - 'Failed to resolve NCBI genome ID {} at URL {} (after retrying)'. - format(genbank_id, url)) - return None - - def _resolve_genbank_accession(self, genbank_id): - logger.info( - 'Attempting to resolve Genbank accession {} to RefSeq accession'. - format(genbank_id)) - # genbank id => genbank seq => refseq - - # The GenBank accession can have several formats: - # 1: BAFR00000000.1 - # 2: NZ_BAGG00000000.1 - # 3: NC_016887.1 - # Case 1 is the default. - if '_' in genbank_id: - # case 2 - if len(genbank_id.split('_')[-1].split('.')[0]) == 12: - genbank_id = genbank_id.split('_')[-1] - # case 3 - else: - genbank_id = genbank_id.lower() - - # get rid of any extraneous whitespace - genbank_id = genbank_id.strip() - logger.debug(f'Parsed GenBank ID to "{genbank_id}"') - - # run a search using the GenBank accession ID - try: - dl_element = self._ncbi_genbank_search(genbank_id) - if dl_element is None: - raise Exception('Unknown HTML format') - - refseq_idx = -1 - for field_idx, field in enumerate(dl_element.children): - # this is the element immediately preceding the one with - # the actual RefSeq ID we want - if field.getText().strip() == 'RefSeq assembly accession:': - refseq_idx = field_idx + 1 - - # this should be True when we've reached the right element - if field_idx == refseq_idx: - refseq_id = field.getText() - # if it has any spaces, take everything up to first one (some have annotations afterwards) - if refseq_id.find(' ') != -1: - refseq_id = refseq_id[:refseq_id.find(' ')] - - return refseq_id - - if refseq_idx == -1: - raise Exception('Expected HTML elements not found') - except Exception as e: - logger.warning( - 'Failed resolving GenBank accession {}, error {}'.format( - genbank_id, e)) - - return None - - def _resolve_jgi_accession(self, jgi_id): - url = JGI_GENOME_LOOKUP_URL.format(jgi_id) - logger.info( - 'Attempting to resolve JGI_Genome_ID {} to GenBank accession via {}' - .format(jgi_id, url)) - # no User-Agent header produces a 403 Forbidden error on this site... 
- try: - resp = httpx.get(url, - headers={'User-Agent': USER_AGENT}, - timeout=10.0, - follow_redirects=True) - except httpx.ReadTimeout: - logger.warning( - 'Timed out waiting for result of JGI_Genome_ID lookup') - return None - - soup = BeautifulSoup(resp.content, 'html.parser') - # find the table entry giving the NCBI assembly accession ID - link = soup.find( - 'a', href=re.compile('https://www.ncbi.nlm.nih.gov/nuccore/.*')) - if link is None: - return None - - return self._resolve_genbank_accession(link.text) - - def _get_best_available_genome_id(self, genome_id_data): - if 'RefSeq_accession' in genome_id_data: - return genome_id_data['RefSeq_accession'] - elif 'GenBank_accession' in genome_id_data: - return genome_id_data['GenBank_accession'] - elif 'JGI_Genome_ID' in genome_id_data: - return genome_id_data['JGI_Genome_ID'] - - logger.warning('No known genome ID field in genome data: {}'.format( - genome_id_data)) - return None - - def _resolve_genome_id_data(self, genome_id_data): - if 'RefSeq_accession' in genome_id_data: - # best case, can use this directly - return genome_id_data['RefSeq_accession'] - elif 'GenBank_accession' in genome_id_data: - # resolve via NCBI - return self._resolve_genbank_accession( - genome_id_data['GenBank_accession']) - elif 'JGI_Genome_ID' in genome_id_data: - # resolve via JGI => NCBI - return self._resolve_jgi_accession(genome_id_data['JGI_Genome_ID']) - - logger.warning( - f'Unable to resolve genome_ID: {genome_id_data}') - return None - - def _download_genomics_data(self, genome_records): - genome_status = {} - - # this file records genome IDs and local filenames to avoid having to repeat HTTP requests - # each time the app is loaded (this can take a lot of time if there are dozens of genomes) - genome_status_file = os.path.join(self.project_download_cache, - 'genome_status.txt') - - # genome lookup status info - if os.path.exists(genome_status_file): - with open(genome_status_file) as f: - for line in csv.reader(f): - asobj = GenomeStatus.from_csv(*line) - genome_status[asobj.original_id] = asobj - - for i, genome_record in enumerate(genome_records): - label = genome_record['genome_label'] - - # get the best available ID from the dict - best_id = self._get_best_available_genome_id( - genome_record['genome_ID']) - if best_id is None: - logger.warning( - 'Ignoring genome record "{}" due to missing genome ID field' - .format(genome_record)) - continue - - # use this to check if the lookup has already been attempted and if - # so if the file is cached locally - if best_id not in genome_status: - genome_status[best_id] = GenomeStatus(best_id, None) - - genome_obj = genome_status[best_id] - - logger.info( - 'Checking for antismash data {}/{}, current genome ID={}'. 
- format(i + 1, len(genome_records), best_id)) - # first check if file is cached locally - if os.path.exists(genome_obj.filename): - # file already downloaded - logger.info('Genome ID {} already downloaded to {}'.format( - best_id, genome_obj.filename)) - genome_record['resolved_id'] = genome_obj.resolved_id - elif genome_obj.attempted: - # lookup attempted previously but failed - logger.info( - 'Genome ID {} skipped due to previous failure'.format( - best_id)) - genome_record['resolved_id'] = genome_obj.resolved_id - else: - # if no existing file and no lookup attempted, can start process of - # trying to retrieve the data - - # lookup the ID - logger.info('Beginning lookup process for genome ID {}'.format( - best_id)) - - genome_obj.resolved_id = self._resolve_genome_id_data( - genome_record['genome_ID']) - genome_obj.attempted = True - - if genome_obj.resolved_id is None: - # give up on this one - logger.warning( - f'Failed lookup for genome ID {best_id}') - with open(genome_status_file, 'a+') as f: - f.write(genome_obj.to_csv() + '\n') - continue - - # if we got a refseq ID, now try to download the data from antismash - if self._download_antismash_zip(genome_obj): - logger.info( - 'Genome data successfully downloaded for {}'.format( - best_id)) - genome_record['resolved_id'] = genome_obj.resolved_id - else: - logger.warning( - 'Failed to download antiSMASH data for genome ID {} ({})' - .format(genome_obj.resolved_id, - genome_obj.original_id)) - - with open(genome_status_file, 'a+', newline='\n') as f: - f.write(genome_obj.to_csv() + '\n') - - self._extract_antismash_zip(genome_obj) - - missing = len( - [x for x in genome_status.values() if len(x.filename) == 0]) - logger.info( - 'Dataset has {} missing sets of antiSMASH data (from a total of {})' - .format(missing, len(genome_records))) - - with open(genome_status_file, 'w', newline='\n') as f: - for obj in genome_status.values(): - f.write(obj.to_csv() + '\n') - - if missing == len(genome_records): - logger.warning('Failed to successfully retrieve ANY genome data!') - def _download_mibig_json(self, version): output_path = os.path.join(self.project_file_cache, 'mibig_json') @@ -490,142 +209,6 @@ def _download_mibig_json(self, version): return True - def _get_antismash_db_page(self, genome_obj): - # want to try up to 4 different links here, v1 and v2 databases, each - # with and without the .1 suffix on the accesssion ID - - accesssions = [genome_obj.resolved_id, genome_obj.resolved_id + '.1'] - for base_url in [ANTISMASH_DB_PAGE_URL, ANTISMASH_DBV2_PAGE_URL]: - for accession in accesssions: - url = base_url.format(accession) - link = None - - logger.info('antismash DB lookup for {} => {}'.format( - accession, url)) - try: - resp = httpx.get(url, follow_redirects=True) - soup = BeautifulSoup(resp.content, 'html.parser') - # retrieve .zip file download link - link = soup.find( - 'a', {'href': lambda url: url.endswith('.zip')}) - except Exception as e: - logger.debug(f'antiSMASH DB page load failed: {e}') - - if link is not None: - logger.info( - 'antiSMASH lookup succeeded! 
Filename is {}'.format( - link['href'])) - # save with the .1 suffix if that worked - genome_obj.resolved_id = accession - return link['href'] - - return None - - def _get_antismash_zip_data(self, accession_id, filename, local_path): - for base_url in [ - ANTISMASH_DB_DOWNLOAD_URL, ANTISMASH_DBV2_DOWNLOAD_URL - ]: - zipfile_url = base_url.format(accession_id, filename) - with open(local_path, 'wb') as f: - total_bytes = 0 - try: - with httpx.stream('GET', zipfile_url) as r: - if r.status_code == 404: - logger.debug('antiSMASH download URL was a 404') - continue - - logger.info('Downloading from antiSMASH: {}'.format( - zipfile_url)) - filesize = int(r.headers['content-length']) - bar = Bar(filename, - max=filesize, - suffix='%(percent)d%%') - for data in r.iter_bytes(): - f.write(data) - total_bytes += len(data) - bar.next(len(data)) - bar.finish() - except Exception as e: - logger.warning( - f'antiSMASH zip download failed: {e}') - continue - - return True - - return False - - def _download_antismash_zip(self, antismash_obj): - # save zip files to avoid having to repeat above lookup every time - local_path = os.path.join(self.project_download_cache, - f'{antismash_obj.resolved_id}.zip') - logger.debug( - f'Checking for existing antismash zip at {local_path}') - - cached = False - # if the file exists locally - if os.path.exists(local_path): - logger.info(f'Found cached file at {local_path}') - try: - # check if it's a valid zip file, if so treat it as cached - _ = zipfile.ZipFile(local_path) - cached = True - antismash_obj.filename = local_path - except zipfile.BadZipFile as bzf: - # otherwise delete and redownload - logger.info( - 'Invalid antismash zipfile found ({}). Will download again' - .format(bzf)) - os.unlink(local_path) - antismash_obj.filename = "" - - if not cached: - filename = self._get_antismash_db_page(antismash_obj) - if filename is None: - return False - - self._get_antismash_zip_data(antismash_obj.resolved_id, filename, - local_path) - antismash_obj.filename = local_path - - return True - - def _extract_antismash_zip(self, antismash_obj): - if antismash_obj.filename is None or len(antismash_obj.filename) == 0: - return False - - output_path = os.path.join(self.project_file_cache, 'antismash', - antismash_obj.resolved_id) - exists_already = os.path.exists(output_path) and os.path.exists( - os.path.join(output_path, 'completed')) - - logger.debug( - 'Extracting antismash data to {}, exists_already = {}'.format( - output_path, exists_already)) - if exists_already: - return True - - # create a subfolder for each set of genome data (the zip files used to be - # constructed with path info but that seems to have changed recently) - if not os.path.exists(output_path): - os.makedirs(output_path, exist_ok=True) - - antismash_zip = zipfile.ZipFile(antismash_obj.filename) - kc_prefix1 = f'{antismash_obj.resolved_id}/knownclusterblast' - kc_prefix2 = 'knownclusterblast' - for zip_member in antismash_zip.namelist(): - # TODO other files here? 
- if zip_member.endswith('.gbk') or zip_member.endswith('.json'): - antismash_zip.extract(zip_member, path=output_path) - elif zip_member.startswith(kc_prefix1) or zip_member.startswith( - kc_prefix2): - if zip_member.endswith( - '.txt') and 'mibig_hits' not in zip_member: - antismash_zip.extract(zip_member, path=output_path) - - open(os.path.join(output_path, 'completed'), 'w').close() - - return True - def _parse_genome_labels(self, met_records, gen_records): temp = {} mc, gc = 0, 0 From ec2b048f650feb9f24ebacec3292fcecd2d242ab Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Tue, 28 Feb 2023 16:23:55 +0100 Subject: [PATCH 02/16] fix tests --- src/nplinker/genomics/antismash/__init__.py | 1 + src/nplinker/genomics/antismash/antismash_downloader.py | 6 +++--- src/nplinker/pairedomics/downloader.py | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/nplinker/genomics/antismash/__init__.py b/src/nplinker/genomics/antismash/__init__.py index a512800a..b0cdb0a4 100644 --- a/src/nplinker/genomics/antismash/__init__.py +++ b/src/nplinker/genomics/antismash/__init__.py @@ -1,6 +1,7 @@ import logging from .antismash_loader import AntismashBGCLoader from .antismash_loader import parse_bgc_genbank +from .antismash_downloader import download_antismash_data logging.getLogger(__name__).addHandler(logging.NullHandler()) diff --git a/src/nplinker/genomics/antismash/antismash_downloader.py b/src/nplinker/genomics/antismash/antismash_downloader.py index e5d66577..281fa00a 100644 --- a/src/nplinker/genomics/antismash/antismash_downloader.py +++ b/src/nplinker/genomics/antismash/antismash_downloader.py @@ -322,7 +322,7 @@ def _extract_antismash_zip(antismash_obj, project_file_cache): return True -def download_antismash_data(genome_records, project_download_cache): +def download_antismash_data(genome_records, project_download_cache, project_file_cache): genome_status = {} # this file records genome IDs and local filenames to avoid having to repeat HTTP requests @@ -390,7 +390,7 @@ def download_antismash_data(genome_records, project_download_cache): continue # if we got a refseq ID, now try to download the data from antismash - if _download_antismash_zip(genome_obj): + if _download_antismash_zip(genome_obj, project_download_cache): logger.info( 'Genome data successfully downloaded for {}'.format( best_id)) @@ -404,7 +404,7 @@ def download_antismash_data(genome_records, project_download_cache): with open(genome_status_file, 'a+', newline='\n') as f: f.write(genome_obj.to_csv() + '\n') - _extract_antismash_zip(genome_obj) + _extract_antismash_zip(genome_obj, project_file_cache) missing = len( [x for x in genome_status.values() if len(x.filename) == 0]) diff --git a/src/nplinker/pairedomics/downloader.py b/src/nplinker/pairedomics/downloader.py index addf306b..cfbc76f9 100644 --- a/src/nplinker/pairedomics/downloader.py +++ b/src/nplinker/pairedomics/downloader.py @@ -156,7 +156,7 @@ def get(self, do_bigscape, extra_bigscape_parameters, use_mibig, self._download_metabolomics_zipfile(self.gnps_task_id) - download_antismash_data(self.project_json['genomes']) + download_antismash_data(self.project_json['genomes'], self.project_download_cache, self.project_file_cache) # CG: it extracts strain names and later will be used for strains self._parse_genome_labels(self.project_json['genome_metabolome_links'], From 26e685f7ee9fa14c90973fa599e2028ccfffdc93 Mon Sep 17 00:00:00 2001 From: Giulia Crocioni <55382553+gcroci2@users.noreply.github.com> Date: Fri, 3 Mar 2023 10:33:27 +0100 Subject: [PATCH 03/16] 
Update src/nplinker/genomics/antismash/antismash_downloader.py Co-authored-by: Cunliang Geng --- src/nplinker/genomics/antismash/antismash_downloader.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/nplinker/genomics/antismash/antismash_downloader.py b/src/nplinker/genomics/antismash/antismash_downloader.py index 281fa00a..025d69d8 100644 --- a/src/nplinker/genomics/antismash/antismash_downloader.py +++ b/src/nplinker/genomics/antismash/antismash_downloader.py @@ -15,6 +15,7 @@ ANTISMASH_DB_PAGE_URL = 'https://antismash-db.secondarymetabolites.org/output/{}/' ANTISMASH_DB_DOWNLOAD_URL = 'https://antismash-db.secondarymetabolites.org/output/{}/{}' +# The antiSMASH DBV2 is for the availability of the old version, better to keep it. ANTISMASH_DBV2_PAGE_URL = 'https://antismash-dbv2.secondarymetabolites.org/output/{}/' ANTISMASH_DBV2_DOWNLOAD_URL = 'https://antismash-dbv2.secondarymetabolites.org/output/{}/{}' From da904c609b79fe4398577e0c2a2310ffaa483a6d Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Fri, 3 Mar 2023 17:50:06 +0100 Subject: [PATCH 04/16] add a new function for downloading and extracting antismash data --- .../antismash/antismash_downloader.py | 85 ++++++++++++++++++- 1 file changed, 83 insertions(+), 2 deletions(-) diff --git a/src/nplinker/genomics/antismash/antismash_downloader.py b/src/nplinker/genomics/antismash/antismash_downloader.py index 281fa00a..5f0be7cb 100644 --- a/src/nplinker/genomics/antismash/antismash_downloader.py +++ b/src/nplinker/genomics/antismash/antismash_downloader.py @@ -4,6 +4,7 @@ import time import zipfile import httpx +from deprecated import deprecated from bs4 import BeautifulSoup from progress.bar import Bar from nplinker.logconfig import LogConfig @@ -11,7 +12,7 @@ logger = LogConfig.getLogger(__name__) - +# urls to be given to download antismash data ANTISMASH_DB_PAGE_URL = 'https://antismash-db.secondarymetabolites.org/output/{}/' ANTISMASH_DB_DOWNLOAD_URL = 'https://antismash-db.secondarymetabolites.org/output/{}/{}' @@ -322,7 +323,87 @@ def _extract_antismash_zip(antismash_obj, project_file_cache): return True -def download_antismash_data(genome_records, project_download_cache, project_file_cache): +def download_and_extract_antismash_data(item_id, download_root, extract_path): + genome_status = {} + + # this file records genome IDs and local filenames to avoid having to repeat HTTP requests + # each time the app is loaded (this can take a lot of time if there are dozens of genomes) + genome_status_file = os.path.join(download_root, + 'genome_status.txt') + + # genome lookup status info + if os.path.exists(genome_status_file): + with open(genome_status_file) as f: + for line in csv.reader(f): + asobj = GenomeStatus.from_csv(*line) + genome_status[asobj.original_id] = asobj + + # use this to check if the lookup has already been attempted and if + # so if the file is cached locally + if item_id not in genome_status: + genome_status[item_id] = GenomeStatus(item_id, None) + + genome_obj = genome_status[item_id] + + logger.info( + 'Checking for antismash data for genome ID={}'. 
+ format(item_id)) + # first check if file is cached locally + if os.path.exists(genome_obj.filename): + # file already downloaded + logger.info('Genome ID {} already downloaded to {}'.format( + item_id, genome_obj.filename)) + elif genome_obj.attempted: + # lookup attempted previously but failed + logger.info( + 'Genome ID {} skipped due to previous failure'.format( + item_id)) + else: + # if no existing file and no lookup attempted, can start process of + # trying to retrieve the data + + # lookup the ID + logger.info('Beginning lookup process for genome ID {}'.format( + item_id)) + + genome_obj.resolved_id = item_id # TO CHECK (Cunliang) not sure if this is what we want; in a general case, + # I don't think we have different possible ids (as in podp json file, for genome_ID nested dicts), + # so maybe it makes sense to put genome_obj.resolved_id equal to the item_id and only in podp case do the check + # (through _resolve_genome_id_data, was done here before) outside this function. + # If this is true, then I think we need to modify GenomeStatus class attributes logic for original_id and resolved_id, + # which in this way would be the same thing. Then we should modify also the code below, which assumes original_id + # to be eventually different + genome_obj.attempted = True + + if genome_obj.resolved_id is None: + # give up on this one + logger.warning( + f'Failed lookup for genome ID {item_id}') + with open(genome_status_file, 'a+') as f: + f.write(genome_obj.to_csv() + '\n') + + # if we got a refseq ID, now try to download the data from antismash + if _download_antismash_zip(genome_obj, download_root): + logger.info( + 'Genome data successfully downloaded for {}'.format( + item_id)) + else: + logger.warning( + 'Failed to download antiSMASH data for genome ID {} ({})' + .format(genome_obj.resolved_id, + genome_obj.original_id)) + + with open(genome_status_file, 'a+', newline='\n') as f: + f.write(genome_obj.to_csv() + '\n') + + _extract_antismash_zip(genome_obj, extract_path) + + with open(genome_status_file, 'w', newline='\n') as f: + for obj in genome_status.values(): + f.write(obj.to_csv() + '\n') + +@deprecated(version="1.3.3", reason="Use download_and_extract_antismash_data class instead.") +def download_antismash_data(genome_records, project_download_cache, project_file_cache): genome_status = {} # this file records genome IDs and local filenames to avoid having to repeat HTTP requests From 7b07a7bbbb9829f097199465fd17a1bea3113d2a Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Tue, 7 Mar 2023 14:34:49 +0100 Subject: [PATCH 05/16] create podp_antismash_downloader module --- src/nplinker/pairedomics/__init__.py | 1 + src/nplinker/pairedomics/downloader.py | 2 +- .../pairedomics/podp_antismash_downloader.py | 423 ++++++++++++++++++ 3 files changed, 425 insertions(+), 1 deletion(-) create mode 100644 src/nplinker/pairedomics/podp_antismash_downloader.py diff --git a/src/nplinker/pairedomics/__init__.py b/src/nplinker/pairedomics/__init__.py index e69de29b..778793b4 100644 --- a/src/nplinker/pairedomics/__init__.py +++ b/src/nplinker/pairedomics/__init__.py @@ -0,0 +1 @@ +from .podp_antismash_downloader import download_antismash_data diff --git a/src/nplinker/pairedomics/downloader.py b/src/nplinker/pairedomics/downloader.py index cfbc76f9..610a6cad 100644 --- a/src/nplinker/pairedomics/downloader.py +++ b/src/nplinker/pairedomics/downloader.py @@ -26,7 +26,7 @@ from nplinker.strains import Strain from nplinker.strain_collection import StrainCollection from nplinker.genomics.mibig 
import download_and_extract_mibig_metadata -from nplinker.genomics.antismash import download_antismash_data +from . import download_antismash_data logger = LogConfig.getLogger(__name__) diff --git a/src/nplinker/pairedomics/podp_antismash_downloader.py b/src/nplinker/pairedomics/podp_antismash_downloader.py new file mode 100644 index 00000000..76cd56c8 --- /dev/null +++ b/src/nplinker/pairedomics/podp_antismash_downloader.py @@ -0,0 +1,423 @@ +import csv +import os +import re +import time +import zipfile +import httpx +from deprecated import deprecated +from bs4 import BeautifulSoup +from progress.bar import Bar +from nplinker.logconfig import LogConfig + + +logger = LogConfig.getLogger(__name__) + +# urls to be given to download antismash data +ANTISMASH_DB_PAGE_URL = 'https://antismash-db.secondarymetabolites.org/output/{}/' +ANTISMASH_DB_DOWNLOAD_URL = 'https://antismash-db.secondarymetabolites.org/output/{}/{}' + +# The antiSMASH DBV2 is for the availability of the old version, better to keep it. +ANTISMASH_DBV2_PAGE_URL = 'https://antismash-dbv2.secondarymetabolites.org/output/{}/' +ANTISMASH_DBV2_DOWNLOAD_URL = 'https://antismash-dbv2.secondarymetabolites.org/output/{}/{}' + +NCBI_LOOKUP_URL_NEW = 'https://www.ncbi.nlm.nih.gov/assembly/?term={}' + +JGI_GENOME_LOOKUP_URL = 'https://img.jgi.doe.gov/cgi-bin/m/main.cgi?section=TaxonDetail&page=taxonDetail&taxon_oid={}' + +USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0' + +class GenomeStatus: + + def __init__(self, original_id, resolved_id, attempted=False, filename=""): + self.original_id = ';'.join(original_id.split(',')) + self.resolved_id = None if resolved_id == 'None' else resolved_id + self.attempted = True if attempted == 'True' else False + self.filename = filename + + @classmethod + def from_csv(cls, original_id, resolved_id, attempted, filename): + return cls(original_id, resolved_id, attempted, filename) + + def to_csv(self): + return ','.join([ + str(self.original_id), + str(self.resolved_id), + str(self.attempted), self.filename + ]) + +def _get_best_available_genome_id(genome_id_data): + if 'RefSeq_accession' in genome_id_data: + return genome_id_data['RefSeq_accession'] + elif 'GenBank_accession' in genome_id_data: + return genome_id_data['GenBank_accession'] + elif 'JGI_Genome_ID' in genome_id_data: + return genome_id_data['JGI_Genome_ID'] + + logger.warning('No known genome ID field in genome data: {}'.format( + genome_id_data)) + return None + +def _ncbi_genbank_search(genbank_id, retry_time=5.0): + url = NCBI_LOOKUP_URL_NEW.format(genbank_id) + logger.debug('Looking up GenBank data for {} at {}'.format( + genbank_id, url)) + resp = httpx.get(url, follow_redirects=True) + + if resp.status_code == httpx.codes.OK: + # the page should contain a
<dl> element with class "assembly_summary_new". retrieving
+        # the page seems to fail occasionally in the middle of lengthy sequences of genome
+        # lookups, so there might be some throttling going on. this will automatically retry
+        # the lookup if the expected content isn't found the first time
+        soup = BeautifulSoup(resp.content, 'html.parser')
+        # find the <dl> element with class "assembly_summary_new"
+        dl_element = soup.find('dl', {'class': 'assembly_summary_new'})
+        if dl_element is not None:
+            return dl_element
+
+    logger.debug(
+        'NCBI lookup failed, status code {}. Trying again in {} seconds'.
+        format(resp.status_code, retry_time))
+    time.sleep(retry_time)
+    logger.debug('Looking up GenBank data for {} at {}'.format(
+        genbank_id, url))
+    resp = httpx.get(url, follow_redirects=True)
+    if resp.status_code == httpx.codes.OK:
+        soup = BeautifulSoup(resp.content, 'html.parser')
+        # find the <dl>
element with class "assembly_summary_new" + dl_element = soup.find('dl', {'class': 'assembly_summary_new'}) + if dl_element is not None: + return dl_element + + logger.warning( + 'Failed to resolve NCBI genome ID {} at URL {} (after retrying)'. + format(genbank_id, url)) + return None + +def _resolve_genbank_accession(genbank_id): + logger.info( + 'Attempting to resolve Genbank accession {} to RefSeq accession'. + format(genbank_id)) + # genbank id => genbank seq => refseq + + # The GenBank accession can have several formats: + # 1: BAFR00000000.1 + # 2: NZ_BAGG00000000.1 + # 3: NC_016887.1 + # Case 1 is the default. + if '_' in genbank_id: + # case 2 + if len(genbank_id.split('_')[-1].split('.')[0]) == 12: + genbank_id = genbank_id.split('_')[-1] + # case 3 + else: + genbank_id = genbank_id.lower() + + # get rid of any extraneous whitespace + genbank_id = genbank_id.strip() + logger.debug(f'Parsed GenBank ID to "{genbank_id}"') + + # run a search using the GenBank accession ID + try: + dl_element = _ncbi_genbank_search(genbank_id) + if dl_element is None: + raise Exception('Unknown HTML format') + + refseq_idx = -1 + for field_idx, field in enumerate(dl_element.children): + # this is the element immediately preceding the one with + # the actual RefSeq ID we want + if field.getText().strip() == 'RefSeq assembly accession:': + refseq_idx = field_idx + 1 + + # this should be True when we've reached the right element + if field_idx == refseq_idx: + refseq_id = field.getText() + # if it has any spaces, take everything up to first one (some have annotations afterwards) + if refseq_id.find(' ') != -1: + refseq_id = refseq_id[:refseq_id.find(' ')] + + return refseq_id + + if refseq_idx == -1: + raise Exception('Expected HTML elements not found') + except Exception as e: + logger.warning( + 'Failed resolving GenBank accession {}, error {}'.format( + genbank_id, e)) + + return None + +def _resolve_jgi_accession(jgi_id): + url = JGI_GENOME_LOOKUP_URL.format(jgi_id) + logger.info( + 'Attempting to resolve JGI_Genome_ID {} to GenBank accession via {}' + .format(jgi_id, url)) + # no User-Agent header produces a 403 Forbidden error on this site... 
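+    # sending a common desktop browser User-Agent (USER_AGENT above) avoids that 403,
+    # and the explicit 10 second timeout stops a slow JGI response from stalling the
+    # whole download run; a ReadTimeout is caught below and treated as a failed lookup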
+ try: + resp = httpx.get(url, + headers={'User-Agent': USER_AGENT}, + timeout=10.0, + follow_redirects=True) + except httpx.ReadTimeout: + logger.warning( + 'Timed out waiting for result of JGI_Genome_ID lookup') + return None + + soup = BeautifulSoup(resp.content, 'html.parser') + # find the table entry giving the NCBI assembly accession ID + link = soup.find( + 'a', href=re.compile('https://www.ncbi.nlm.nih.gov/nuccore/.*')) + if link is None: + return None + + return _resolve_genbank_accession(link.text) + +def _resolve_genome_id_data(genome_id_data): + if 'RefSeq_accession' in genome_id_data: + # best case, can use this directly + return genome_id_data['RefSeq_accession'] + elif 'GenBank_accession' in genome_id_data: + # resolve via NCBI + return _resolve_genbank_accession( + genome_id_data['GenBank_accession']) + elif 'JGI_Genome_ID' in genome_id_data: + # resolve via JGI => NCBI + return _resolve_jgi_accession(genome_id_data['JGI_Genome_ID']) + + logger.warning( + f'Unable to resolve genome_ID: {genome_id_data}') + return None + +def _get_antismash_db_page(genome_obj): + # want to try up to 4 different links here, v1 and v2 databases, each + # with and without the .1 suffix on the accesssion ID + + accesssions = [genome_obj.resolved_id, genome_obj.resolved_id + '.1'] + for base_url in [ANTISMASH_DB_PAGE_URL, ANTISMASH_DBV2_PAGE_URL]: + for accession in accesssions: + url = base_url.format(accession) + link = None + + logger.info('antismash DB lookup for {} => {}'.format( + accession, url)) + try: + resp = httpx.get(url, follow_redirects=True) + soup = BeautifulSoup(resp.content, 'html.parser') + # retrieve .zip file download link + link = soup.find( + 'a', {'href': lambda url: url.endswith('.zip')}) + except Exception as e: + logger.debug(f'antiSMASH DB page load failed: {e}') + + if link is not None: + logger.info( + 'antiSMASH lookup succeeded! 
Filename is {}'.format( + link['href'])) + # save with the .1 suffix if that worked + genome_obj.resolved_id = accession + return link['href'] + + return None + +def _get_antismash_zip_data(accession_id, filename, local_path): + for base_url in [ + ANTISMASH_DB_DOWNLOAD_URL, ANTISMASH_DBV2_DOWNLOAD_URL + ]: + zipfile_url = base_url.format(accession_id, filename) + with open(local_path, 'wb') as f: + total_bytes = 0 + try: + with httpx.stream('GET', zipfile_url) as r: + if r.status_code == 404: + logger.debug('antiSMASH download URL was a 404') + continue + + logger.info('Downloading from antiSMASH: {}'.format( + zipfile_url)) + filesize = int(r.headers['content-length']) + bar = Bar(filename, + max=filesize, + suffix='%(percent)d%%') + for data in r.iter_bytes(): + f.write(data) + total_bytes += len(data) + bar.next(len(data)) + bar.finish() + except Exception as e: + logger.warning( + f'antiSMASH zip download failed: {e}') + continue + + return True + + return False + +def _download_antismash_zip(antismash_obj, project_download_cache): + # save zip files to avoid having to repeat above lookup every time + local_path = os.path.join(project_download_cache, + f'{antismash_obj.resolved_id}.zip') + logger.debug( + f'Checking for existing antismash zip at {local_path}') + + cached = False + # if the file exists locally + if os.path.exists(local_path): + logger.info(f'Found cached file at {local_path}') + try: + # check if it's a valid zip file, if so treat it as cached + _ = zipfile.ZipFile(local_path) + cached = True + antismash_obj.filename = local_path + except zipfile.BadZipFile as bzf: + # otherwise delete and redownload + logger.info( + 'Invalid antismash zipfile found ({}). Will download again' + .format(bzf)) + os.unlink(local_path) + antismash_obj.filename = "" + + if not cached: + filename = _get_antismash_db_page(antismash_obj) + if filename is None: + return False + + _get_antismash_zip_data(antismash_obj.resolved_id, filename, + local_path) + antismash_obj.filename = local_path + + return True + +def _extract_antismash_zip(antismash_obj, project_file_cache): + if antismash_obj.filename is None or len(antismash_obj.filename) == 0: + return False + + output_path = os.path.join(project_file_cache, 'antismash', + antismash_obj.resolved_id) + exists_already = os.path.exists(output_path) and os.path.exists( + os.path.join(output_path, 'completed')) + + logger.debug( + 'Extracting antismash data to {}, exists_already = {}'.format( + output_path, exists_already)) + if exists_already: + return True + + # create a subfolder for each set of genome data (the zip files used to be + # constructed with path info but that seems to have changed recently) + if not os.path.exists(output_path): + os.makedirs(output_path, exist_ok=True) + + antismash_zip = zipfile.ZipFile(antismash_obj.filename) + kc_prefix1 = f'{antismash_obj.resolved_id}/knownclusterblast' + kc_prefix2 = 'knownclusterblast' + for zip_member in antismash_zip.namelist(): + # TODO other files here? 
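+        # only BGC GenBank/JSON records and knownclusterblast .txt results
+        # (excluding the mibig_hits files) are extracted; everything else in
+        # the archive is skipped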
+ if zip_member.endswith('.gbk') or zip_member.endswith('.json'): + antismash_zip.extract(zip_member, path=output_path) + elif zip_member.startswith(kc_prefix1) or zip_member.startswith( + kc_prefix2): + if zip_member.endswith( + '.txt') and 'mibig_hits' not in zip_member: + antismash_zip.extract(zip_member, path=output_path) + + open(os.path.join(output_path, 'completed'), 'w').close() + + return True + +@deprecated(version="1.3.3", reason="Use download_and_extract_antismash_data class instead.") +def download_antismash_data(genome_records, project_download_cache, project_file_cache): + genome_status = {} + + # this file records genome IDs and local filenames to avoid having to repeat HTTP requests + # each time the app is loaded (this can take a lot of time if there are dozens of genomes) + genome_status_file = os.path.join(project_download_cache, + 'genome_status.txt') + + # genome lookup status info + if os.path.exists(genome_status_file): + with open(genome_status_file) as f: + for line in csv.reader(f): + asobj = GenomeStatus.from_csv(*line) + genome_status[asobj.original_id] = asobj + + for i, genome_record in enumerate(genome_records): + # get the best available ID from the dict + best_id = _get_best_available_genome_id( + genome_record['genome_ID']) + if best_id is None: + logger.warning( + 'Ignoring genome record "{}" due to missing genome ID field' + .format(genome_record)) + continue + + # use this to check if the lookup has already been attempted and if + # so if the file is cached locally + if best_id not in genome_status: + genome_status[best_id] = GenomeStatus(best_id, None) + + genome_obj = genome_status[best_id] + + logger.info( + 'Checking for antismash data {}/{}, current genome ID={}'. + format(i + 1, len(genome_records), best_id)) + # first check if file is cached locally + if os.path.exists(genome_obj.filename): + # file already downloaded + logger.info('Genome ID {} already downloaded to {}'.format( + best_id, genome_obj.filename)) + genome_record['resolved_id'] = genome_obj.resolved_id + elif genome_obj.attempted: + # lookup attempted previously but failed + logger.info( + 'Genome ID {} skipped due to previous failure'.format( + best_id)) + genome_record['resolved_id'] = genome_obj.resolved_id + else: + # if no existing file and no lookup attempted, can start process of + # trying to retrieve the data + + # lookup the ID + logger.info('Beginning lookup process for genome ID {}'.format( + best_id)) + + genome_obj.resolved_id = _resolve_genome_id_data( + genome_record['genome_ID']) + genome_obj.attempted = True + + if genome_obj.resolved_id is None: + # give up on this one + logger.warning( + f'Failed lookup for genome ID {best_id}') + with open(genome_status_file, 'a+') as f: + f.write(genome_obj.to_csv() + '\n') + continue + + # if we got a refseq ID, now try to download the data from antismash + if _download_antismash_zip(genome_obj, project_download_cache): + logger.info( + 'Genome data successfully downloaded for {}'.format( + best_id)) + genome_record['resolved_id'] = genome_obj.resolved_id + else: + logger.warning( + 'Failed to download antiSMASH data for genome ID {} ({})' + .format(genome_obj.resolved_id, + genome_obj.original_id)) + + with open(genome_status_file, 'a+', newline='\n') as f: + f.write(genome_obj.to_csv() + '\n') + + _extract_antismash_zip(genome_obj, project_file_cache) + + missing = len( + [x for x in genome_status.values() if len(x.filename) == 0]) + logger.info( + 'Dataset has {} missing sets of antiSMASH data (from a total of {})' + 
.format(missing, len(genome_records))) + + with open(genome_status_file, 'w', newline='\n') as f: + for obj in genome_status.values(): + f.write(obj.to_csv() + '\n') + + if missing == len(genome_records): + logger.warning('Failed to successfully retrieve ANY genome data!') From 4879181203eb4eeb52aec434fac491ee257c3e29 Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Tue, 7 Mar 2023 18:23:11 +0100 Subject: [PATCH 06/16] properly define download_and_extract_antismash_metadata function --- src/nplinker/genomics/antismash/__init__.py | 2 +- .../antismash/antismash_downloader.py | 516 ++---------------- 2 files changed, 43 insertions(+), 475 deletions(-) diff --git a/src/nplinker/genomics/antismash/__init__.py b/src/nplinker/genomics/antismash/__init__.py index b0cdb0a4..482c9e6d 100644 --- a/src/nplinker/genomics/antismash/__init__.py +++ b/src/nplinker/genomics/antismash/__init__.py @@ -1,7 +1,7 @@ import logging from .antismash_loader import AntismashBGCLoader from .antismash_loader import parse_bgc_genbank -from .antismash_downloader import download_antismash_data +from .antismash_downloader import download_and_extract_antismash_metadata logging.getLogger(__name__).addHandler(logging.NullHandler()) diff --git a/src/nplinker/genomics/antismash/antismash_downloader.py b/src/nplinker/genomics/antismash/antismash_downloader.py index 6d87cb4b..42477e35 100644 --- a/src/nplinker/genomics/antismash/antismash_downloader.py +++ b/src/nplinker/genomics/antismash/antismash_downloader.py @@ -1,14 +1,7 @@ -import csv import os -import re -import time -import zipfile -import httpx -from deprecated import deprecated -from bs4 import BeautifulSoup -from progress.bar import Bar +import shutil from nplinker.logconfig import LogConfig - +from nplinker.utils import download_and_extract_archive, list_dirs, list_files logger = LogConfig.getLogger(__name__) @@ -20,483 +13,58 @@ ANTISMASH_DBV2_PAGE_URL = 'https://antismash-dbv2.secondarymetabolites.org/output/{}/' ANTISMASH_DBV2_DOWNLOAD_URL = 'https://antismash-dbv2.secondarymetabolites.org/output/{}/{}' -NCBI_LOOKUP_URL_NEW = 'https://www.ncbi.nlm.nih.gov/assembly/?term={}' - -JGI_GENOME_LOOKUP_URL = 'https://img.jgi.doe.gov/cgi-bin/m/main.cgi?section=TaxonDetail&page=taxonDetail&taxon_oid={}' - -USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0' - -class GenomeStatus: - - def __init__(self, original_id, resolved_id, attempted=False, filename=""): - self.original_id = ';'.join(original_id.split(',')) - self.resolved_id = None if resolved_id == 'None' else resolved_id - self.attempted = True if attempted == 'True' else False - self.filename = filename - - @classmethod - def from_csv(cls, original_id, resolved_id, attempted, filename): - return cls(original_id, resolved_id, attempted, filename) - - def to_csv(self): - return ','.join([ - str(self.original_id), - str(self.resolved_id), - str(self.attempted), self.filename - ]) - -def _get_best_available_genome_id(genome_id_data): - if 'RefSeq_accession' in genome_id_data: - return genome_id_data['RefSeq_accession'] - elif 'GenBank_accession' in genome_id_data: - return genome_id_data['GenBank_accession'] - elif 'JGI_Genome_ID' in genome_id_data: - return genome_id_data['JGI_Genome_ID'] - - logger.warning('No known genome ID field in genome data: {}'.format( - genome_id_data)) - return None - -def _ncbi_genbank_search(genbank_id, retry_time=5.0): - url = NCBI_LOOKUP_URL_NEW.format(genbank_id) - logger.debug('Looking up GenBank data for {} at {}'.format( - genbank_id, url)) - 
resp = httpx.get(url, follow_redirects=True)
-
-    if resp.status_code == httpx.codes.OK:
-        # the page should contain a <dl> element with class "assembly_summary_new". retrieving
-        # the page seems to fail occasionally in the middle of lengthy sequences of genome
-        # lookups, so there might be some throttling going on. this will automatically retry
-        # the lookup if the expected content isn't found the first time
-        soup = BeautifulSoup(resp.content, 'html.parser')
-        # find the <dl> element with class "assembly_summary_new"
-        dl_element = soup.find('dl', {'class': 'assembly_summary_new'})
-        if dl_element is not None:
-            return dl_element
-
-        logger.debug(
-            'NCBI lookup failed, status code {}. Trying again in {} seconds'.
-            format(resp.status_code, retry_time))
-        time.sleep(retry_time)
-        logger.debug('Looking up GenBank data for {} at {}'.format(
-            genbank_id, url))
-        resp = httpx.get(url, follow_redirects=True)
-        if resp.status_code == httpx.codes.OK:
-            soup = BeautifulSoup(resp.content, 'html.parser')
-            # find the <dl>
element with class "assembly_summary_new" - dl_element = soup.find('dl', {'class': 'assembly_summary_new'}) - if dl_element is not None: - return dl_element - - logger.warning( - 'Failed to resolve NCBI genome ID {} at URL {} (after retrying)'. - format(genbank_id, url)) - return None - -def _resolve_genbank_accession(genbank_id): - logger.info( - 'Attempting to resolve Genbank accession {} to RefSeq accession'. - format(genbank_id)) - # genbank id => genbank seq => refseq - - # The GenBank accession can have several formats: - # 1: BAFR00000000.1 - # 2: NZ_BAGG00000000.1 - # 3: NC_016887.1 - # Case 1 is the default. - if '_' in genbank_id: - # case 2 - if len(genbank_id.split('_')[-1].split('.')[0]) == 12: - genbank_id = genbank_id.split('_')[-1] - # case 3 - else: - genbank_id = genbank_id.lower() - - # get rid of any extraneous whitespace - genbank_id = genbank_id.strip() - logger.debug(f'Parsed GenBank ID to "{genbank_id}"') - - # run a search using the GenBank accession ID - try: - dl_element = _ncbi_genbank_search(genbank_id) - if dl_element is None: - raise Exception('Unknown HTML format') +# TODO: add unit tests (only public func) +# TODO: add doc string +def download_and_extract_antismash_metadata( + refseq_assembly_id: str, + download_root: str, + extract_path: str): + """_summary_ - refseq_idx = -1 - for field_idx, field in enumerate(dl_element.children): - # this is the element immediately preceding the one with - # the actual RefSeq ID we want - if field.getText().strip() == 'RefSeq assembly accession:': - refseq_idx = field_idx + 1 + Args: + refseq_assembly_id(str): _description_ + download_root(str): _description_ + extract_path(str): _description_ - # this should be True when we've reached the right element - if field_idx == refseq_idx: - refseq_id = field.getText() - # if it has any spaces, take everything up to first one (some have annotations afterwards) - if refseq_id.find(' ') != -1: - refseq_id = refseq_id[:refseq_id.find(' ')] + Raises: + ValueError: _description_ + ValueError: _description_ - return refseq_id + Examples: + >>> + """ - if refseq_idx == -1: - raise Exception('Expected HTML elements not found') - except Exception as e: - logger.warning( - 'Failed resolving GenBank accession {}, error {}'.format( - genbank_id, e)) + if not os.path.exists(extract_path): + os.makedirs(extract_path, exist_ok=True) - return None + if download_root == extract_path: + raise ValueError( + "Identical path of download directory and extract directory") -def _resolve_jgi_accession(jgi_id): - url = JGI_GENOME_LOOKUP_URL.format(jgi_id) - logger.info( - 'Attempting to resolve JGI_Genome_ID {} to GenBank accession via {}' - .format(jgi_id, url)) - # no User-Agent header produces a 403 Forbidden error on this site... 
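
A note on the comment above: the IMG/JGI taxon pages reject anonymous clients, so the helper being removed below (the same lookup also lives in src/nplinker/pairedomics/podp_antismash_downloader.py, reformatted later in this series) sends a browser-style User-Agent and a finite timeout. A minimal sketch of that guarded fetch, reusing httpx as elsewhere in this module; the function name and parameters are illustrative only, not part of the patch:

    import httpx

    def fetch_with_user_agent(url: str, user_agent: str, timeout: float = 10.0):
        """Fetch a page that refuses anonymous requests; return None on failure."""
        try:
            # A browser-style User-Agent avoids the 403 response; the timeout keeps
            # one slow lookup from stalling a long run of genome downloads.
            resp = httpx.get(url,
                             headers={'User-Agent': user_agent},
                             timeout=timeout,
                             follow_redirects=True)
            resp.raise_for_status()
            return resp.content
        except (httpx.ReadTimeout, httpx.HTTPStatusError):
            return None
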
- try: - resp = httpx.get(url, - headers={'User-Agent': USER_AGENT}, - timeout=10.0, - follow_redirects=True) - except httpx.ReadTimeout: - logger.warning( - 'Timed out waiting for result of JGI_Genome_ID lookup') - return None + # check if extract_path is empty + files = [i for i in os.listdir(extract_path)] + if len(files) != 0: + raise ValueError(f'Nonempty directory: "{extract_path}"') - soup = BeautifulSoup(resp.content, 'html.parser') - # find the table entry giving the NCBI assembly accession ID - link = soup.find( - 'a', href=re.compile('https://www.ncbi.nlm.nih.gov/nuccore/.*')) - if link is None: - return None - - return _resolve_genbank_accession(link.text) - -def _resolve_genome_id_data(genome_id_data): - if 'RefSeq_accession' in genome_id_data: - # best case, can use this directly - return genome_id_data['RefSeq_accession'] - elif 'GenBank_accession' in genome_id_data: - # resolve via NCBI - return _resolve_genbank_accession( - genome_id_data['GenBank_accession']) - elif 'JGI_Genome_ID' in genome_id_data: - # resolve via JGI => NCBI - return _resolve_jgi_accession(genome_id_data['JGI_Genome_ID']) - - logger.warning( - f'Unable to resolve genome_ID: {genome_id_data}') - return None - -def _get_antismash_db_page(genome_obj): - # want to try up to 4 different links here, v1 and v2 databases, each - # with and without the .1 suffix on the accesssion ID - - accesssions = [genome_obj.resolved_id, genome_obj.resolved_id + '.1'] - for base_url in [ANTISMASH_DB_PAGE_URL, ANTISMASH_DBV2_PAGE_URL]: - for accession in accesssions: - url = base_url.format(accession) - link = None - - logger.info('antismash DB lookup for {} => {}'.format( - accession, url)) - try: - resp = httpx.get(url, follow_redirects=True) - soup = BeautifulSoup(resp.content, 'html.parser') - # retrieve .zip file download link - link = soup.find( - 'a', {'href': lambda url: url.endswith('.zip')}) - except Exception as e: - logger.debug(f'antiSMASH DB page load failed: {e}') - - if link is not None: - logger.info( - 'antiSMASH lookup succeeded! 
Filename is {}'.format( - link['href'])) - # save with the .1 suffix if that worked - genome_obj.resolved_id = accession - return link['href'] - - return None - -def _get_antismash_zip_data(accession_id, filename, local_path): for base_url in [ ANTISMASH_DB_DOWNLOAD_URL, ANTISMASH_DBV2_DOWNLOAD_URL ]: - zipfile_url = base_url.format(accession_id, filename) - with open(local_path, 'wb') as f: - total_bytes = 0 - try: - with httpx.stream('GET', zipfile_url) as r: - if r.status_code == 404: - logger.debug('antiSMASH download URL was a 404') - continue - - logger.info('Downloading from antiSMASH: {}'.format( - zipfile_url)) - filesize = int(r.headers['content-length']) - bar = Bar(filename, - max=filesize, - suffix='%(percent)d%%') - for data in r.iter_bytes(): - f.write(data) - total_bytes += len(data) - bar.next(len(data)) - bar.finish() - except Exception as e: - logger.warning( - f'antiSMASH zip download failed: {e}') - continue - - return True - - return False - -def _download_antismash_zip(antismash_obj, project_download_cache): - # save zip files to avoid having to repeat above lookup every time - local_path = os.path.join(project_download_cache, - f'{antismash_obj.resolved_id}.zip') - logger.debug( - f'Checking for existing antismash zip at {local_path}') - - cached = False - # if the file exists locally - if os.path.exists(local_path): - logger.info(f'Found cached file at {local_path}') - try: - # check if it's a valid zip file, if so treat it as cached - _ = zipfile.ZipFile(local_path) - cached = True - antismash_obj.filename = local_path - except zipfile.BadZipFile as bzf: - # otherwise delete and redownload - logger.info( - 'Invalid antismash zipfile found ({}). Will download again' - .format(bzf)) - os.unlink(local_path) - antismash_obj.filename = "" - - if not cached: - filename = _get_antismash_db_page(antismash_obj) - if filename is None: - return False - - _get_antismash_zip_data(antismash_obj.resolved_id, filename, - local_path) - antismash_obj.filename = local_path - - return True + url = base_url.format(refseq_assembly_id, refseq_assembly_id + '.zip') -def _extract_antismash_zip(antismash_obj, project_file_cache): - if antismash_obj.filename is None or len(antismash_obj.filename) == 0: - return False - - output_path = os.path.join(project_file_cache, 'antismash', - antismash_obj.resolved_id) - exists_already = os.path.exists(output_path) and os.path.exists( - os.path.join(output_path, 'completed')) - - logger.debug( - 'Extracting antismash data to {}, exists_already = {}'.format( - output_path, exists_already)) - if exists_already: - return True - - # create a subfolder for each set of genome data (the zip files used to be - # constructed with path info but that seems to have changed recently) - if not os.path.exists(output_path): - os.makedirs(output_path, exist_ok=True) - - antismash_zip = zipfile.ZipFile(antismash_obj.filename) - kc_prefix1 = f'{antismash_obj.resolved_id}/knownclusterblast' - kc_prefix2 = 'knownclusterblast' - for zip_member in antismash_zip.namelist(): - # TODO other files here? 
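
A note on the loop below: only a subset of each antiSMASH archive is extracted — the .gbk/.json BGC records plus the knownclusterblast .txt tables (skipping mibig_hits files) — which keeps the per-genome output folders small. A standalone sketch of the same filtering with the standard-library zipfile module; the function and parameter names are illustrative only, not part of the patch:

    import os
    import zipfile

    def extract_needed_members(zip_path: str, output_path: str, accession: str) -> None:
        # Keep only BGC records and knownclusterblast tables from the archive.
        kc_prefixes = (f'{accession}/knownclusterblast', 'knownclusterblast')
        with zipfile.ZipFile(zip_path) as archive:
            for member in archive.namelist():
                if member.endswith(('.gbk', '.json')):
                    archive.extract(member, path=output_path)
                elif member.startswith(kc_prefixes):
                    if member.endswith('.txt') and 'mibig_hits' not in member:
                        archive.extract(member, path=output_path)
        # Marker file so a later run can tell extraction already finished for this genome.
        open(os.path.join(output_path, 'completed'), 'w').close()
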
- if zip_member.endswith('.gbk') or zip_member.endswith('.json'): - antismash_zip.extract(zip_member, path=output_path) - elif zip_member.startswith(kc_prefix1) or zip_member.startswith( - kc_prefix2): - if zip_member.endswith( - '.txt') and 'mibig_hits' not in zip_member: - antismash_zip.extract(zip_member, path=output_path) - - open(os.path.join(output_path, 'completed'), 'w').close() - - return True - -def download_and_extract_antismash_data(item_id, download_root, extract_path): - genome_status = {} - - # this file records genome IDs and local filenames to avoid having to repeat HTTP requests - # each time the app is loaded (this can take a lot of time if there are dozens of genomes) - genome_status_file = os.path.join(download_root, - 'genome_status.txt') - - # genome lookup status info - if os.path.exists(genome_status_file): - with open(genome_status_file) as f: - for line in csv.reader(f): - asobj = GenomeStatus.from_csv(*line) - genome_status[asobj.original_id] = asobj - - # use this to check if the lookup has already been attempted and if - # so if the file is cached locally - if item_id not in genome_status: - genome_status[item_id] = GenomeStatus(item_id, None) - - genome_obj = genome_status[item_id] - - logger.info( - 'Checking for antismash data for genome ID={}'. - format(item_id)) - # first check if file is cached locally - if os.path.exists(genome_obj.filename): - # file already downloaded - logger.info('Genome ID {} already downloaded to {}'.format( - item_id, genome_obj.filename)) - elif genome_obj.attempted: - # lookup attempted previously but failed + download_and_extract_archive(url, download_root, extract_path, refseq_assembly_id + '.zip') logger.info( - 'Genome ID {} skipped due to previous failure'.format( - item_id)) - else: - # if no existing file and no lookup attempted, can start process of - # trying to retrieve the data - - # lookup the ID - logger.info('Beginning lookup process for genome ID {}'.format( - item_id)) - - genome_obj.resolved_id = item_id # TO CHECK (Cunliang) not sure if this is what we want; in a general case, - # I don't think we have different possible ids (as in podp json file, for genome_ID nested dicts), - # so maybe it makes sense to put genome_obj.resolved_id equal to the item_id and only in podp case do the check - # (through _resolve_genome_id_data, was done here before) outside this function. - # If this is true, then I think we need to modify GenomeStatus class attributes logic for original_id and resolved_id, - # which in this way would be the same thing. 
Then we should modify also the code below, which assumes original_id - # to be eventually different - genome_obj.attempted = True - - if genome_obj.resolved_id is None: - # give up on this one - logger.warning( - f'Failed lookup for genome ID {item_id}') - with open(genome_status_file, 'a+') as f: - f.write(genome_obj.to_csv() + '\n') - - # if we got a refseq ID, now try to download the data from antismash - if _download_antismash_zip(genome_obj, download_root): - logger.info( - 'Genome data successfully downloaded for {}'.format( - item_id)) - else: - logger.warning( - 'Failed to download antiSMASH data for genome ID {} ({})' - .format(genome_obj.resolved_id, - genome_obj.original_id)) - - with open(genome_status_file, 'a+', newline='\n') as f: - f.write(genome_obj.to_csv() + '\n') - - _extract_antismash_zip(genome_obj, extract_path) - - with open(genome_status_file, 'w', newline='\n') as f: - for obj in genome_status.values(): - f.write(obj.to_csv() + '\n') - -@deprecated(version="1.3.3", reason="Use download_and_extract_antismash_data class instead.") -def download_antismash_data(genome_records, project_download_cache, project_file_cache): - genome_status = {} - - # this file records genome IDs and local filenames to avoid having to repeat HTTP requests - # each time the app is loaded (this can take a lot of time if there are dozens of genomes) - genome_status_file = os.path.join(project_download_cache, - 'genome_status.txt') - - # genome lookup status info - if os.path.exists(genome_status_file): - with open(genome_status_file) as f: - for line in csv.reader(f): - asobj = GenomeStatus.from_csv(*line) - genome_status[asobj.original_id] = asobj - - for i, genome_record in enumerate(genome_records): - # get the best available ID from the dict - best_id = _get_best_available_genome_id( - genome_record['genome_ID']) - if best_id is None: - logger.warning( - 'Ignoring genome record "{}" due to missing genome ID field' - .format(genome_record)) - continue - - # use this to check if the lookup has already been attempted and if - # so if the file is cached locally - if best_id not in genome_status: - genome_status[best_id] = GenomeStatus(best_id, None) - - genome_obj = genome_status[best_id] - - logger.info( - 'Checking for antismash data {}/{}, current genome ID={}'. 
- format(i + 1, len(genome_records), best_id)) - # first check if file is cached locally - if os.path.exists(genome_obj.filename): - # file already downloaded - logger.info('Genome ID {} already downloaded to {}'.format( - best_id, genome_obj.filename)) - genome_record['resolved_id'] = genome_obj.resolved_id - elif genome_obj.attempted: - # lookup attempted previously but failed - logger.info( - 'Genome ID {} skipped due to previous failure'.format( - best_id)) - genome_record['resolved_id'] = genome_obj.resolved_id - else: - # if no existing file and no lookup attempted, can start process of - # trying to retrieve the data - - # lookup the ID - logger.info('Beginning lookup process for genome ID {}'.format( - best_id)) - - genome_obj.resolved_id = _resolve_genome_id_data( - genome_record['genome_ID']) - genome_obj.attempted = True - - if genome_obj.resolved_id is None: - # give up on this one - logger.warning( - f'Failed lookup for genome ID {best_id}') - with open(genome_status_file, 'a+') as f: - f.write(genome_obj.to_csv() + '\n') - continue - - # if we got a refseq ID, now try to download the data from antismash - if _download_antismash_zip(genome_obj, project_download_cache): - logger.info( - 'Genome data successfully downloaded for {}'.format( - best_id)) - genome_record['resolved_id'] = genome_obj.resolved_id - else: - logger.warning( - 'Failed to download antiSMASH data for genome ID {} ({})' - .format(genome_obj.resolved_id, - genome_obj.original_id)) - - with open(genome_status_file, 'a+', newline='\n') as f: - f.write(genome_obj.to_csv() + '\n') - - _extract_antismash_zip(genome_obj, project_file_cache) + f'Genome data successfully extracted for {refseq_assembly_id}') + break - missing = len( - [x for x in genome_status.values() if len(x.filename) == 0]) - logger.info( - 'Dataset has {} missing sets of antiSMASH data (from a total of {})' - .format(missing, len(genome_records))) + # delete subdirs + logger.info('Deleting unnecessary extracted subdirs and files') + subdirs = list_dirs(extract_path) + for subdir_path in subdirs: + shutil.rmtree(subdir_path) - with open(genome_status_file, 'w', newline='\n') as f: - for obj in genome_status.values(): - f.write(obj.to_csv() + '\n') + files_to_keep = list_files(extract_path, suffix=(".json", ".gbk")) - if missing == len(genome_records): - logger.warning('Failed to successfully retrieve ANY genome data!') + for file in list_files(extract_path): + if file not in files_to_keep: + os.remove(file) + logger.info(f'download_and_extract_antismash_metadata process for {refseq_assembly_id} is over') From 392a73625bb45dec0321ff1a9c6f51242080627d Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Wed, 8 Mar 2023 12:17:16 +0100 Subject: [PATCH 07/16] add internal funcs and doc string --- .../antismash/antismash_downloader.py | 51 +++++++++++-------- 1 file changed, 31 insertions(+), 20 deletions(-) diff --git a/src/nplinker/genomics/antismash/antismash_downloader.py b/src/nplinker/genomics/antismash/antismash_downloader.py index 42477e35..c1db5f26 100644 --- a/src/nplinker/genomics/antismash/antismash_downloader.py +++ b/src/nplinker/genomics/antismash/antismash_downloader.py @@ -13,38 +13,49 @@ ANTISMASH_DBV2_PAGE_URL = 'https://antismash-dbv2.secondarymetabolites.org/output/{}/' ANTISMASH_DBV2_DOWNLOAD_URL = 'https://antismash-dbv2.secondarymetabolites.org/output/{}/{}' +def _check_roots(download_root, extract_root): + if download_root == extract_root: + raise ValueError( + "Identical path of download directory and extract directory") + +def 
_check_extract_path(extract_path): + if os.path.exists(extract_path): + # check if extract_path is empty + files = [i for i in os.listdir(extract_path)] + if len(files) != 0: + raise ValueError(f'Nonempty directory: "{extract_path}"') + else: + os.makedirs(extract_path, exist_ok=True) + # TODO: add unit tests (only public func) -# TODO: add doc string def download_and_extract_antismash_metadata( refseq_assembly_id: str, download_root: str, - extract_path: str): - """_summary_ + extract_root: str): + """Download and extract Antismash files for a specified refseq_assembly_id. Args: - refseq_assembly_id(str): _description_ - download_root(str): _description_ - extract_path(str): _description_ + refseq_assembly_id(str): Assembled genomic RefSeq (reference sequence) id. + If the id is versioned (e.g., "GCF_004339725.1") please be sure to + specify the version as well. + download_root(str): Path to the directory to place downloaded archive in. + extract_root(str): Path to the directory data files will be extracted to. + Note that if it will create an antismash/ directory in the specified extract_root, if + it doesn't already exist. + The files will be extracted to /antismash// dir. Raises: - ValueError: _description_ - ValueError: _description_ + ValueError: if download_root and extract_root dirs are the same. + ValueError: if /antismash/ dir is not empty. Examples: - >>> + >>> download_and_extract_antismash_metadata("GCF_004339725.1", "/data/download", "/data/extracted") """ + + extract_path = extract_root + f'/antismash/{refseq_assembly_id}' - if not os.path.exists(extract_path): - os.makedirs(extract_path, exist_ok=True) - - if download_root == extract_path: - raise ValueError( - "Identical path of download directory and extract directory") - - # check if extract_path is empty - files = [i for i in os.listdir(extract_path)] - if len(files) != 0: - raise ValueError(f'Nonempty directory: "{extract_path}"') + _check_roots(download_root, extract_root) + _check_extract_path(extract_path) for base_url in [ ANTISMASH_DB_DOWNLOAD_URL, ANTISMASH_DBV2_DOWNLOAD_URL From 850b7466ccaa7fd810ada3c304e384a8c06e1ece Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Thu, 9 Mar 2023 11:22:02 +0100 Subject: [PATCH 08/16] add tests and create antismash test folder --- tests/genomics/antismash/__init__.py | 0 tests/genomics/antismash/test_downloader.py | 35 +++++++++++++++++++ .../test_loader.py} | 2 +- 3 files changed, 36 insertions(+), 1 deletion(-) create mode 100644 tests/genomics/antismash/__init__.py create mode 100644 tests/genomics/antismash/test_downloader.py rename tests/genomics/{test_antismash_loader.py => antismash/test_loader.py} (99%) diff --git a/tests/genomics/antismash/__init__.py b/tests/genomics/antismash/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/genomics/antismash/test_downloader.py b/tests/genomics/antismash/test_downloader.py new file mode 100644 index 00000000..059029f6 --- /dev/null +++ b/tests/genomics/antismash/test_downloader.py @@ -0,0 +1,35 @@ +import pytest +from nplinker.genomics.antismash import download_and_extract_antismash_metadata + +class TestDownloadAndExtractAntismashData(): + + def test_default(self, tmp_path): + download_root = tmp_path / "download" + extract_root = tmp_path / "extracted" + download_root.mkdir() + extract_root.mkdir() + refseq_assembly_id = "GCF_004339725.1" + download_and_extract_antismash_metadata(refseq_assembly_id, download_root, extract_root) + archive = download_root / "GCF_004339725.1.zip" + extracted_folder = 
extract_root / "antismash/GCF_004339725.1" + assert archive.exists() + assert archive.is_file() + assert extracted_folder.exists() + + def test_error_same_path(self, tmp_path): + refseq_assembly_id = "GCF_004339725.1" + with pytest.raises(ValueError) as e: + download_and_extract_antismash_metadata(refseq_assembly_id, tmp_path, tmp_path) + assert e.value.args[ + 0] == "Identical path of download directory and extract directory" + + def test_error_nonempty_path(self, tmp_path): + refseq_assembly_id = "GCF_004339725.1" + nonempty_path = tmp_path / f"extracted/antismash/{refseq_assembly_id}/subdir" + nonempty_path.mkdir(parents=True) + with pytest.raises(ValueError) as e: + download_and_extract_antismash_metadata( + refseq_assembly_id, + tmp_path, + tmp_path / "extracted") + assert "Nonempty directory" in e.value.args[0] diff --git a/tests/genomics/test_antismash_loader.py b/tests/genomics/antismash/test_loader.py similarity index 99% rename from tests/genomics/test_antismash_loader.py rename to tests/genomics/antismash/test_loader.py index 019feea6..dfe2a25b 100644 --- a/tests/genomics/test_antismash_loader.py +++ b/tests/genomics/antismash/test_loader.py @@ -3,7 +3,7 @@ from nplinker.genomics import BGCLoaderBase from nplinker.genomics.antismash import AntismashBGCLoader from nplinker.genomics.antismash import parse_bgc_genbank -from .. import DATA_DIR +from ... import DATA_DIR class TestAntismashBGCLoader: From e186ba8f5d0b0564f66b913bd30665615d7e3502 Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Thu, 9 Mar 2023 11:23:00 +0100 Subject: [PATCH 09/16] format properly extract_path --- src/nplinker/genomics/antismash/antismash_downloader.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/nplinker/genomics/antismash/antismash_downloader.py b/src/nplinker/genomics/antismash/antismash_downloader.py index c1db5f26..c4936d75 100644 --- a/src/nplinker/genomics/antismash/antismash_downloader.py +++ b/src/nplinker/genomics/antismash/antismash_downloader.py @@ -51,8 +51,7 @@ def download_and_extract_antismash_metadata( Examples: >>> download_and_extract_antismash_metadata("GCF_004339725.1", "/data/download", "/data/extracted") """ - - extract_path = extract_root + f'/antismash/{refseq_assembly_id}' + extract_path = os.path.join(extract_root, "antismash", refseq_assembly_id) _check_roots(download_root, extract_root) _check_extract_path(extract_path) From 61e17fec836ea7cd6e93d5b73b51d73ff50d09fc Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Thu, 9 Mar 2023 11:40:09 +0100 Subject: [PATCH 10/16] run linting and formatting for modified files using yapf --- .../antismash/antismash_downloader.py | 22 ++-- .../pairedomics/podp_antismash_downloader.py | 116 +++++++++--------- tests/genomics/antismash/test_downloader.py | 18 +-- tests/genomics/antismash/test_loader.py | 8 +- 4 files changed, 84 insertions(+), 80 deletions(-) diff --git a/src/nplinker/genomics/antismash/antismash_downloader.py b/src/nplinker/genomics/antismash/antismash_downloader.py index c4936d75..6171a835 100644 --- a/src/nplinker/genomics/antismash/antismash_downloader.py +++ b/src/nplinker/genomics/antismash/antismash_downloader.py @@ -13,11 +13,13 @@ ANTISMASH_DBV2_PAGE_URL = 'https://antismash-dbv2.secondarymetabolites.org/output/{}/' ANTISMASH_DBV2_DOWNLOAD_URL = 'https://antismash-dbv2.secondarymetabolites.org/output/{}/{}' + def _check_roots(download_root, extract_root): if download_root == extract_root: raise ValueError( "Identical path of download directory and extract directory") + def 
_check_extract_path(extract_path): if os.path.exists(extract_path): # check if extract_path is empty @@ -27,11 +29,10 @@ def _check_extract_path(extract_path): else: os.makedirs(extract_path, exist_ok=True) -# TODO: add unit tests (only public func) -def download_and_extract_antismash_metadata( - refseq_assembly_id: str, - download_root: str, - extract_root: str): + +def download_and_extract_antismash_metadata(refseq_assembly_id: str, + download_root: str, + extract_root: str): """Download and extract Antismash files for a specified refseq_assembly_id. Args: @@ -56,12 +57,11 @@ def download_and_extract_antismash_metadata( _check_roots(download_root, extract_root) _check_extract_path(extract_path) - for base_url in [ - ANTISMASH_DB_DOWNLOAD_URL, ANTISMASH_DBV2_DOWNLOAD_URL - ]: + for base_url in [ANTISMASH_DB_DOWNLOAD_URL, ANTISMASH_DBV2_DOWNLOAD_URL]: url = base_url.format(refseq_assembly_id, refseq_assembly_id + '.zip') - download_and_extract_archive(url, download_root, extract_path, refseq_assembly_id + '.zip') + download_and_extract_archive(url, download_root, extract_path, + refseq_assembly_id + '.zip') logger.info( f'Genome data successfully extracted for {refseq_assembly_id}') break @@ -77,4 +77,6 @@ def download_and_extract_antismash_metadata( for file in list_files(extract_path): if file not in files_to_keep: os.remove(file) - logger.info(f'download_and_extract_antismash_metadata process for {refseq_assembly_id} is over') + logger.info( + f'download_and_extract_antismash_metadata process for {refseq_assembly_id} is over' + ) diff --git a/src/nplinker/pairedomics/podp_antismash_downloader.py b/src/nplinker/pairedomics/podp_antismash_downloader.py index 76cd56c8..30f8ef58 100644 --- a/src/nplinker/pairedomics/podp_antismash_downloader.py +++ b/src/nplinker/pairedomics/podp_antismash_downloader.py @@ -9,7 +9,6 @@ from progress.bar import Bar from nplinker.logconfig import LogConfig - logger = LogConfig.getLogger(__name__) # urls to be given to download antismash data @@ -26,6 +25,7 @@ USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0' + class GenomeStatus: def __init__(self, original_id, resolved_id, attempted=False, filename=""): @@ -45,6 +45,7 @@ def to_csv(self): str(self.attempted), self.filename ]) + def _get_best_available_genome_id(genome_id_data): if 'RefSeq_accession' in genome_id_data: return genome_id_data['RefSeq_accession'] @@ -53,10 +54,11 @@ def _get_best_available_genome_id(genome_id_data): elif 'JGI_Genome_ID' in genome_id_data: return genome_id_data['JGI_Genome_ID'] - logger.warning('No known genome ID field in genome data: {}'.format( - genome_id_data)) + logger.warning( + 'No known genome ID field in genome data: {}'.format(genome_id_data)) return None + def _ncbi_genbank_search(genbank_id, retry_time=5.0): url = NCBI_LOOKUP_URL_NEW.format(genbank_id) logger.debug('Looking up GenBank data for {} at {}'.format( @@ -93,6 +95,7 @@ def _ncbi_genbank_search(genbank_id, retry_time=5.0): format(genbank_id, url)) return None + def _resolve_genbank_accession(genbank_id): logger.info( 'Attempting to resolve Genbank accession {} to RefSeq accession'. @@ -147,20 +150,20 @@ def _resolve_genbank_accession(genbank_id): return None + def _resolve_jgi_accession(jgi_id): url = JGI_GENOME_LOOKUP_URL.format(jgi_id) logger.info( - 'Attempting to resolve JGI_Genome_ID {} to GenBank accession via {}' - .format(jgi_id, url)) + 'Attempting to resolve JGI_Genome_ID {} to GenBank accession via {}'. 
+ format(jgi_id, url)) # no User-Agent header produces a 403 Forbidden error on this site... try: resp = httpx.get(url, - headers={'User-Agent': USER_AGENT}, - timeout=10.0, - follow_redirects=True) + headers={'User-Agent': USER_AGENT}, + timeout=10.0, + follow_redirects=True) except httpx.ReadTimeout: - logger.warning( - 'Timed out waiting for result of JGI_Genome_ID lookup') + logger.warning('Timed out waiting for result of JGI_Genome_ID lookup') return None soup = BeautifulSoup(resp.content, 'html.parser') @@ -172,22 +175,22 @@ def _resolve_jgi_accession(jgi_id): return _resolve_genbank_accession(link.text) + def _resolve_genome_id_data(genome_id_data): if 'RefSeq_accession' in genome_id_data: # best case, can use this directly return genome_id_data['RefSeq_accession'] elif 'GenBank_accession' in genome_id_data: # resolve via NCBI - return _resolve_genbank_accession( - genome_id_data['GenBank_accession']) + return _resolve_genbank_accession(genome_id_data['GenBank_accession']) elif 'JGI_Genome_ID' in genome_id_data: # resolve via JGI => NCBI return _resolve_jgi_accession(genome_id_data['JGI_Genome_ID']) - logger.warning( - f'Unable to resolve genome_ID: {genome_id_data}') + logger.warning(f'Unable to resolve genome_ID: {genome_id_data}') return None + def _get_antismash_db_page(genome_obj): # want to try up to 4 different links here, v1 and v2 databases, each # with and without the .1 suffix on the accesssion ID @@ -204,8 +207,8 @@ def _get_antismash_db_page(genome_obj): resp = httpx.get(url, follow_redirects=True) soup = BeautifulSoup(resp.content, 'html.parser') # retrieve .zip file download link - link = soup.find( - 'a', {'href': lambda url: url.endswith('.zip')}) + link = soup.find('a', + {'href': lambda url: url.endswith('.zip')}) except Exception as e: logger.debug(f'antiSMASH DB page load failed: {e}') @@ -219,10 +222,9 @@ def _get_antismash_db_page(genome_obj): return None + def _get_antismash_zip_data(accession_id, filename, local_path): - for base_url in [ - ANTISMASH_DB_DOWNLOAD_URL, ANTISMASH_DBV2_DOWNLOAD_URL - ]: + for base_url in [ANTISMASH_DB_DOWNLOAD_URL, ANTISMASH_DBV2_DOWNLOAD_URL]: zipfile_url = base_url.format(accession_id, filename) with open(local_path, 'wb') as f: total_bytes = 0 @@ -232,32 +234,29 @@ def _get_antismash_zip_data(accession_id, filename, local_path): logger.debug('antiSMASH download URL was a 404') continue - logger.info('Downloading from antiSMASH: {}'.format( - zipfile_url)) + logger.info( + 'Downloading from antiSMASH: {}'.format(zipfile_url)) filesize = int(r.headers['content-length']) - bar = Bar(filename, - max=filesize, - suffix='%(percent)d%%') + bar = Bar(filename, max=filesize, suffix='%(percent)d%%') for data in r.iter_bytes(): f.write(data) total_bytes += len(data) bar.next(len(data)) bar.finish() except Exception as e: - logger.warning( - f'antiSMASH zip download failed: {e}') + logger.warning(f'antiSMASH zip download failed: {e}') continue return True return False + def _download_antismash_zip(antismash_obj, project_download_cache): # save zip files to avoid having to repeat above lookup every time local_path = os.path.join(project_download_cache, - f'{antismash_obj.resolved_id}.zip') - logger.debug( - f'Checking for existing antismash zip at {local_path}') + f'{antismash_obj.resolved_id}.zip') + logger.debug(f'Checking for existing antismash zip at {local_path}') cached = False # if the file exists locally @@ -271,8 +270,8 @@ def _download_antismash_zip(antismash_obj, project_download_cache): except zipfile.BadZipFile as bzf: # 
otherwise delete and redownload logger.info( - 'Invalid antismash zipfile found ({}). Will download again' - .format(bzf)) + 'Invalid antismash zipfile found ({}). Will download again'. + format(bzf)) os.unlink(local_path) antismash_obj.filename = "" @@ -282,23 +281,23 @@ def _download_antismash_zip(antismash_obj, project_download_cache): return False _get_antismash_zip_data(antismash_obj.resolved_id, filename, - local_path) + local_path) antismash_obj.filename = local_path return True + def _extract_antismash_zip(antismash_obj, project_file_cache): if antismash_obj.filename is None or len(antismash_obj.filename) == 0: return False output_path = os.path.join(project_file_cache, 'antismash', - antismash_obj.resolved_id) + antismash_obj.resolved_id) exists_already = os.path.exists(output_path) and os.path.exists( os.path.join(output_path, 'completed')) - logger.debug( - 'Extracting antismash data to {}, exists_already = {}'.format( - output_path, exists_already)) + logger.debug('Extracting antismash data to {}, exists_already = {}'.format( + output_path, exists_already)) if exists_already: return True @@ -316,22 +315,24 @@ def _extract_antismash_zip(antismash_obj, project_file_cache): antismash_zip.extract(zip_member, path=output_path) elif zip_member.startswith(kc_prefix1) or zip_member.startswith( kc_prefix2): - if zip_member.endswith( - '.txt') and 'mibig_hits' not in zip_member: + if zip_member.endswith('.txt') and 'mibig_hits' not in zip_member: antismash_zip.extract(zip_member, path=output_path) open(os.path.join(output_path, 'completed'), 'w').close() return True -@deprecated(version="1.3.3", reason="Use download_and_extract_antismash_data class instead.") -def download_antismash_data(genome_records, project_download_cache, project_file_cache): + +@deprecated(version="1.3.3", + reason="Use download_and_extract_antismash_data class instead.") +def download_antismash_data(genome_records, project_download_cache, + project_file_cache): genome_status = {} # this file records genome IDs and local filenames to avoid having to repeat HTTP requests # each time the app is loaded (this can take a lot of time if there are dozens of genomes) genome_status_file = os.path.join(project_download_cache, - 'genome_status.txt') + 'genome_status.txt') # genome lookup status info if os.path.exists(genome_status_file): @@ -342,12 +343,11 @@ def download_antismash_data(genome_records, project_download_cache, project_file for i, genome_record in enumerate(genome_records): # get the best available ID from the dict - best_id = _get_best_available_genome_id( - genome_record['genome_ID']) + best_id = _get_best_available_genome_id(genome_record['genome_ID']) if best_id is None: logger.warning( - 'Ignoring genome record "{}" due to missing genome ID field' - .format(genome_record)) + 'Ignoring genome record "{}" due to missing genome ID field'. + format(genome_record)) continue # use this to check if the lookup has already been attempted and if @@ -358,8 +358,8 @@ def download_antismash_data(genome_records, project_download_cache, project_file genome_obj = genome_status[best_id] logger.info( - 'Checking for antismash data {}/{}, current genome ID={}'. 
- format(i + 1, len(genome_records), best_id)) + 'Checking for antismash data {}/{}, current genome ID={}'.format( + i + 1, len(genome_records), best_id)) # first check if file is cached locally if os.path.exists(genome_obj.filename): # file already downloaded @@ -369,16 +369,15 @@ def download_antismash_data(genome_records, project_download_cache, project_file elif genome_obj.attempted: # lookup attempted previously but failed logger.info( - 'Genome ID {} skipped due to previous failure'.format( - best_id)) + 'Genome ID {} skipped due to previous failure'.format(best_id)) genome_record['resolved_id'] = genome_obj.resolved_id else: # if no existing file and no lookup attempted, can start process of # trying to retrieve the data # lookup the ID - logger.info('Beginning lookup process for genome ID {}'.format( - best_id)) + logger.info( + 'Beginning lookup process for genome ID {}'.format(best_id)) genome_obj.resolved_id = _resolve_genome_id_data( genome_record['genome_ID']) @@ -386,8 +385,7 @@ def download_antismash_data(genome_records, project_download_cache, project_file if genome_obj.resolved_id is None: # give up on this one - logger.warning( - f'Failed lookup for genome ID {best_id}') + logger.warning(f'Failed lookup for genome ID {best_id}') with open(genome_status_file, 'a+') as f: f.write(genome_obj.to_csv() + '\n') continue @@ -400,20 +398,18 @@ def download_antismash_data(genome_records, project_download_cache, project_file genome_record['resolved_id'] = genome_obj.resolved_id else: logger.warning( - 'Failed to download antiSMASH data for genome ID {} ({})' - .format(genome_obj.resolved_id, - genome_obj.original_id)) + 'Failed to download antiSMASH data for genome ID {} ({})'. + format(genome_obj.resolved_id, genome_obj.original_id)) with open(genome_status_file, 'a+', newline='\n') as f: f.write(genome_obj.to_csv() + '\n') _extract_antismash_zip(genome_obj, project_file_cache) - missing = len( - [x for x in genome_status.values() if len(x.filename) == 0]) + missing = len([x for x in genome_status.values() if len(x.filename) == 0]) logger.info( - 'Dataset has {} missing sets of antiSMASH data (from a total of {})' - .format(missing, len(genome_records))) + 'Dataset has {} missing sets of antiSMASH data (from a total of {})'. 
+ format(missing, len(genome_records))) with open(genome_status_file, 'w', newline='\n') as f: for obj in genome_status.values(): diff --git a/tests/genomics/antismash/test_downloader.py b/tests/genomics/antismash/test_downloader.py index 059029f6..631d4d0c 100644 --- a/tests/genomics/antismash/test_downloader.py +++ b/tests/genomics/antismash/test_downloader.py @@ -1,15 +1,17 @@ import pytest from nplinker.genomics.antismash import download_and_extract_antismash_metadata + class TestDownloadAndExtractAntismashData(): def test_default(self, tmp_path): + refseq_assembly_id = "GCF_004339725.1" download_root = tmp_path / "download" - extract_root = tmp_path / "extracted" download_root.mkdir() + extract_root = tmp_path / "extracted" extract_root.mkdir() - refseq_assembly_id = "GCF_004339725.1" - download_and_extract_antismash_metadata(refseq_assembly_id, download_root, extract_root) + download_and_extract_antismash_metadata(refseq_assembly_id, + download_root, extract_root) archive = download_root / "GCF_004339725.1.zip" extracted_folder = extract_root / "antismash/GCF_004339725.1" assert archive.exists() @@ -19,7 +21,8 @@ def test_default(self, tmp_path): def test_error_same_path(self, tmp_path): refseq_assembly_id = "GCF_004339725.1" with pytest.raises(ValueError) as e: - download_and_extract_antismash_metadata(refseq_assembly_id, tmp_path, tmp_path) + download_and_extract_antismash_metadata(refseq_assembly_id, + tmp_path, tmp_path) assert e.value.args[ 0] == "Identical path of download directory and extract directory" @@ -28,8 +31,7 @@ def test_error_nonempty_path(self, tmp_path): nonempty_path = tmp_path / f"extracted/antismash/{refseq_assembly_id}/subdir" nonempty_path.mkdir(parents=True) with pytest.raises(ValueError) as e: - download_and_extract_antismash_metadata( - refseq_assembly_id, - tmp_path, - tmp_path / "extracted") + download_and_extract_antismash_metadata(refseq_assembly_id, + tmp_path, + tmp_path / "extracted") assert "Nonempty directory" in e.value.args[0] diff --git a/tests/genomics/antismash/test_loader.py b/tests/genomics/antismash/test_loader.py index dfe2a25b..ca356f58 100644 --- a/tests/genomics/antismash/test_loader.py +++ b/tests/genomics/antismash/test_loader.py @@ -63,10 +63,14 @@ def test_parse_bgc_genbank(): assert bgc.antismash_id == "NZ_AZWB01000005" assert bgc.antismash_file == gbk_file assert bgc.antismash_region == "1" - assert bgc.smiles == ["NC([*])C(=O)NC([*])C(=O)NC(CO)C(=O)NC(Cc1ccccc1)C(=O)NCC(=O)O"] + assert bgc.smiles == [ + "NC([*])C(=O)NC([*])C(=O)NC(CO)C(=O)NC(Cc1ccccc1)C(=O)NCC(=O)O" + ] + def test_parse_bgc_genbank_error(): gbk_file = str(DATA_DIR / "fake_antismash.region001.gbk") with pytest.raises(ValueError) as e: parse_bgc_genbank(gbk_file) - assert "Not found product prediction in antiSMASH Genbank file" in e.value.args[0] + assert "Not found product prediction in antiSMASH Genbank file" in e.value.args[ + 0] From 601957d81f1a4ce5e2fdc0a87d39e0524a565536 Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Thu, 9 Mar 2023 13:26:00 +0100 Subject: [PATCH 11/16] fix prospector errors --- src/nplinker/genomics/antismash/__init__.py | 2 +- src/nplinker/genomics/antismash/antismash_downloader.py | 6 +++--- src/nplinker/genomics/antismash/antismash_loader.py | 3 +-- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/src/nplinker/genomics/antismash/__init__.py b/src/nplinker/genomics/antismash/__init__.py index 482c9e6d..b7b7714f 100644 --- a/src/nplinker/genomics/antismash/__init__.py +++ b/src/nplinker/genomics/antismash/__init__.py @@ -5,4 
+5,4 @@ logging.getLogger(__name__).addHandler(logging.NullHandler()) -__all__ = ["AntismashBGCLoader", "parse_bgc_genbank"] +__all__ = ["AntismashBGCLoader", "parse_bgc_genbank", "download_and_extract_antismash_metadata"] diff --git a/src/nplinker/genomics/antismash/antismash_downloader.py b/src/nplinker/genomics/antismash/antismash_downloader.py index 6171a835..414825ec 100644 --- a/src/nplinker/genomics/antismash/antismash_downloader.py +++ b/src/nplinker/genomics/antismash/antismash_downloader.py @@ -23,7 +23,7 @@ def _check_roots(download_root, extract_root): def _check_extract_path(extract_path): if os.path.exists(extract_path): # check if extract_path is empty - files = [i for i in os.listdir(extract_path)] + files = list(os.listdir(extract_path)) if len(files) != 0: raise ValueError(f'Nonempty directory: "{extract_path}"') else: @@ -63,7 +63,7 @@ def download_and_extract_antismash_metadata(refseq_assembly_id: str, download_and_extract_archive(url, download_root, extract_path, refseq_assembly_id + '.zip') logger.info( - f'Genome data successfully extracted for {refseq_assembly_id}') + 'Genome data successfully extracted for %s', refseq_assembly_id) break # delete subdirs @@ -78,5 +78,5 @@ def download_and_extract_antismash_metadata(refseq_assembly_id: str, if file not in files_to_keep: os.remove(file) logger.info( - f'download_and_extract_antismash_metadata process for {refseq_assembly_id} is over' + 'download_and_extract_antismash_metadata process for %s is over', refseq_assembly_id ) diff --git a/src/nplinker/genomics/antismash/antismash_loader.py b/src/nplinker/genomics/antismash/antismash_loader.py index b1d3e7d4..156812a1 100644 --- a/src/nplinker/genomics/antismash/antismash_loader.py +++ b/src/nplinker/genomics/antismash/antismash_loader.py @@ -126,8 +126,7 @@ def parse_bgc_genbank(file: str) -> BGC: product_prediction = features.get("product") if product_prediction is None: raise ValueError( - "Not found product prediction in antiSMASH Genbank file {}".format( - file)) + f"Not found product prediction in antiSMASH Genbank file {file}") # init BGC bgc = BGC(bgc_id=fname, product_prediction=product_prediction) From e9fbe999f0179ac7f035a712090b483cc64e5984 Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Thu, 9 Mar 2023 13:33:35 +0100 Subject: [PATCH 12/16] make refseq_assembly_id class variable for tests --- tests/genomics/antismash/test_downloader.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/tests/genomics/antismash/test_downloader.py b/tests/genomics/antismash/test_downloader.py index 631d4d0c..d646d42f 100644 --- a/tests/genomics/antismash/test_downloader.py +++ b/tests/genomics/antismash/test_downloader.py @@ -4,13 +4,14 @@ class TestDownloadAndExtractAntismashData(): + refseq_assembly_id = "GCF_004339725.1" + def test_default(self, tmp_path): - refseq_assembly_id = "GCF_004339725.1" download_root = tmp_path / "download" download_root.mkdir() extract_root = tmp_path / "extracted" extract_root.mkdir() - download_and_extract_antismash_metadata(refseq_assembly_id, + download_and_extract_antismash_metadata(self.refseq_assembly_id, download_root, extract_root) archive = download_root / "GCF_004339725.1.zip" extracted_folder = extract_root / "antismash/GCF_004339725.1" @@ -19,19 +20,17 @@ def test_default(self, tmp_path): assert extracted_folder.exists() def test_error_same_path(self, tmp_path): - refseq_assembly_id = "GCF_004339725.1" with pytest.raises(ValueError) as e: - download_and_extract_antismash_metadata(refseq_assembly_id, + 
download_and_extract_antismash_metadata(self.refseq_assembly_id, tmp_path, tmp_path) assert e.value.args[ 0] == "Identical path of download directory and extract directory" def test_error_nonempty_path(self, tmp_path): - refseq_assembly_id = "GCF_004339725.1" - nonempty_path = tmp_path / f"extracted/antismash/{refseq_assembly_id}/subdir" + nonempty_path = tmp_path / f"extracted/antismash/{self.refseq_assembly_id}/subdir" nonempty_path.mkdir(parents=True) with pytest.raises(ValueError) as e: - download_and_extract_antismash_metadata(refseq_assembly_id, + download_and_extract_antismash_metadata(self.refseq_assembly_id, tmp_path, tmp_path / "extracted") assert "Nonempty directory" in e.value.args[0] From 9a22c06f4448718f51f5f6b5cfbafa0bb322560b Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Thu, 9 Mar 2023 13:35:37 +0100 Subject: [PATCH 13/16] reorder imports --- src/nplinker/genomics/antismash/__init__.py | 3 ++- src/nplinker/genomics/antismash/antismash_downloader.py | 5 ++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/src/nplinker/genomics/antismash/__init__.py b/src/nplinker/genomics/antismash/__init__.py index b7b7714f..411a58ca 100644 --- a/src/nplinker/genomics/antismash/__init__.py +++ b/src/nplinker/genomics/antismash/__init__.py @@ -1,7 +1,8 @@ import logging +from .antismash_downloader import download_and_extract_antismash_metadata from .antismash_loader import AntismashBGCLoader from .antismash_loader import parse_bgc_genbank -from .antismash_downloader import download_and_extract_antismash_metadata + logging.getLogger(__name__).addHandler(logging.NullHandler()) diff --git a/src/nplinker/genomics/antismash/antismash_downloader.py b/src/nplinker/genomics/antismash/antismash_downloader.py index 414825ec..99a22205 100644 --- a/src/nplinker/genomics/antismash/antismash_downloader.py +++ b/src/nplinker/genomics/antismash/antismash_downloader.py @@ -1,7 +1,10 @@ import os import shutil from nplinker.logconfig import LogConfig -from nplinker.utils import download_and_extract_archive, list_dirs, list_files +from nplinker.utils import download_and_extract_archive +from nplinker.utils import list_dirs +from nplinker.utils import list_files + logger = LogConfig.getLogger(__name__) From decf286a8fccc119f4f178db59add40dae174a8a Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Thu, 9 Mar 2023 13:39:01 +0100 Subject: [PATCH 14/16] add minor static typing --- src/nplinker/genomics/antismash/antismash_downloader.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/nplinker/genomics/antismash/antismash_downloader.py b/src/nplinker/genomics/antismash/antismash_downloader.py index 99a22205..39141b0a 100644 --- a/src/nplinker/genomics/antismash/antismash_downloader.py +++ b/src/nplinker/genomics/antismash/antismash_downloader.py @@ -17,13 +17,13 @@ ANTISMASH_DBV2_DOWNLOAD_URL = 'https://antismash-dbv2.secondarymetabolites.org/output/{}/{}' -def _check_roots(download_root, extract_root): +def _check_roots(download_root: str, extract_root: str): if download_root == extract_root: raise ValueError( "Identical path of download directory and extract directory") -def _check_extract_path(extract_path): +def _check_extract_path(extract_path: str): if os.path.exists(extract_path): # check if extract_path is empty files = list(os.listdir(extract_path)) From 0c5b6bd6c8505b1f041fbbf56b63eae4f7d413ed Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Wed, 15 Mar 2023 12:47:09 +0100 Subject: [PATCH 15/16] Revert "Merge branch 'dev' into 98_add_antismash_downloader_gcroci2" This reverts 
commit f64a17c41716c00867c39e0655059a34bdcdf5a9, reversing changes made to 4879181203eb4eeb52aec434fac491ee257c3e29. revert merge --- .vscode/settings.json | 6 +- src/nplinker/loader.py | 33 ++++----- .../metabolomics/gnps/gnps_extractor.py | 7 +- src/nplinker/metabolomics/gnps/gnps_loader.py | 3 - src/nplinker/metabolomics/load_gnps.py | 12 ++-- src/nplinker/metabolomics/metabolomics.py | 23 +++--- src/nplinker/scoring/linking/data_linking.py | 4 +- src/nplinker/strains.py | 3 - tests/test_gnps_loader.py | 6 -- tests/test_loader.py | 71 ++----------------- 10 files changed, 43 insertions(+), 125 deletions(-) delete mode 100644 src/nplinker/metabolomics/gnps/gnps_loader.py delete mode 100644 tests/test_gnps_loader.py diff --git a/.vscode/settings.json b/.vscode/settings.json index 30cb3f3f..7411789c 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -20,8 +20,10 @@ // python docstring "autoDocstring.docstringFormat": "google", "autoDocstring.customTemplatePath": ".vscode/vscode_docstring_google_adapted.mustache", + + // Jupyter + "jupyter.sendSelectionToInteractiveWindow": true, "jupyter.askForKernelRestart": false, "notebook.lineNumbers": "on", - "notebook.diff.ignoreMetadata": true, - "jupyter.interactiveWindow.textEditor.executeSelection": true + "notebook.diff.ignoreMetadata": true } diff --git a/src/nplinker/loader.py b/src/nplinker/loader.py index cd3a5fb7..225c0ca8 100644 --- a/src/nplinker/loader.py +++ b/src/nplinker/loader.py @@ -28,8 +28,7 @@ from nplinker.pairedomics.downloader import Downloader from nplinker.pairedomics.runbigscape import run_bigscape from nplinker.strain_collection import StrainCollection -from nplinker.genomics.antismash import AntismashBGCLoader -from pathlib import Path + try: from importlib.resources import files @@ -245,23 +244,19 @@ def _init_metabolomics_paths(self): optional=True) # 8. MET: /DB_result/*.tsv (new) or /result_specnets_DB/*.tsv (old) / annotations_dir= - if Path.is_file(Path(self._root) / "annotations.tsv"): - self.annotations_dir = str(self._root) - self.annotations_config_file = os.path.join(self._root, "annotations.tsv") - else: - self.annotations_dir = self._overrides.get( - self.OR_ANNO) or find_via_glob_alts_dir( - [ - os.path.join(self._root, 'DB_result'), - os.path.join(self._root, 'result_specnets_DB'), - ], - self.OR_ANNO, - optional=False - ) - if self.annotations_dir is not None: - self.annotations_config_file = self._overrides.get( - self.OR_ANNO_CONFIG) or os.path.join(self.annotations_dir, - 'annotations.tsv') + self.annotations_dir = self._overrides.get( + self.OR_ANNO) or find_via_glob_alts_dir( + [ + os.path.join(self._root, 'DB_result'), + os.path.join(self._root, 'result_specnets_DB'), + ], + self.OR_ANNO, + optional=False + ) + if self.annotations_dir is not None: + self.annotations_config_file = self._overrides.get( + self.OR_ANNO_CONFIG) or os.path.join(self.annotations_dir, + 'annotations.tsv') def _init_genomics_paths(self): # 9. 
GEN: /antismash / antismash_dir= diff --git a/src/nplinker/metabolomics/gnps/gnps_extractor.py b/src/nplinker/metabolomics/gnps/gnps_extractor.py index 916c888e..5789d61c 100644 --- a/src/nplinker/metabolomics/gnps/gnps_extractor.py +++ b/src/nplinker/metabolomics/gnps/gnps_extractor.py @@ -1,7 +1,6 @@ from os import PathLike import os from pathlib import Path -import shutil import zipfile from nplinker import utils @@ -56,7 +55,7 @@ def _extract_spectra(self): def _extract_molecular_families(self): """Helper function to extract the molecular families file from the archive.""" prefix = "networkedges_selfloop" - suffix = ".selfloop" if self._is_fbmn else ".pairsinfo" + suffix = "..selfloop" if self._is_fbmn else ".pairsinfo" utils.extract_file_matching_pattern( self.get_data(), prefix, @@ -89,5 +88,5 @@ def _extract_annotations(self): self._extract_path, "annotations.tsv" ) - - shutil.rmtree(self._extract_path / prefix) + if self._is_fbmn: + os.rmdir(self._extract_path / prefix) \ No newline at end of file diff --git a/src/nplinker/metabolomics/gnps/gnps_loader.py b/src/nplinker/metabolomics/gnps/gnps_loader.py deleted file mode 100644 index 2ef0d7a9..00000000 --- a/src/nplinker/metabolomics/gnps/gnps_loader.py +++ /dev/null @@ -1,3 +0,0 @@ -class GNPSLoader: - def __init__(self): - pass \ No newline at end of file diff --git a/src/nplinker/metabolomics/load_gnps.py b/src/nplinker/metabolomics/load_gnps.py index 0266df56..d3855868 100644 --- a/src/nplinker/metabolomics/load_gnps.py +++ b/src/nplinker/metabolomics/load_gnps.py @@ -229,13 +229,13 @@ def _load_clusterinfo_old(gnps_format: str, strains: StrainCollection, file: str unknown_strains[mzxml] += 1 # else: # print('{} ===> {}'.format(mzxml, strain)) - if clu_index in spec_dict: - if strain is not None: - # TODO this need fixed somehow (missing growth medium info) - spec_dict[clu_index].add_strain(strain, None, 1) - # update metadata on Spectrum object - spec_dict[clu_index].metadata.update(metadata) + if strain is not None: + # TODO this need fixed somehow (missing growth medium info) + spec_dict[clu_index].add_strain(strain, None, 1) + + # update metadata on Spectrum object + spec_dict[clu_index].metadata.update(metadata) if len(unknown_strains) > 0: logger.warning( diff --git a/src/nplinker/metabolomics/metabolomics.py b/src/nplinker/metabolomics/metabolomics.py index 1c4cde5d..15257ef2 100644 --- a/src/nplinker/metabolomics/metabolomics.py +++ b/src/nplinker/metabolomics/metabolomics.py @@ -84,18 +84,17 @@ def load_edges(edges_file: str, spec_dict: dict[int, Spectrum]): cosine = float(line[cos_index]) family = int(line[fam_index]) - if spec1_id in spec_dict and spec2_id in spec_dict: - spec1 = spec_dict[spec1_id] - spec2 = spec_dict[spec2_id] - - if family != -1: # singletons - spec1.family_id = family - spec2.family_id = family - - spec1.edges.append((spec2.id, spec2.spectrum_id, cosine)) - spec2.edges.append((spec1.id, spec1.spectrum_id, cosine)) - else: - spec1.family_id = family + spec1 = spec_dict[spec1_id] + spec2 = spec_dict[spec2_id] + + if family != -1: # singletons + spec1.family_id = family + spec2.family_id = family + + spec1.edges.append((spec2.id, spec2.spectrum_id, cosine)) + spec2.edges.append((spec1.id, spec1.spectrum_id, cosine)) + else: + spec1.family_id = family @deprecated(version="1.3.3", reason="Use the GNPSLoader class instead.") diff --git a/src/nplinker/scoring/linking/data_linking.py b/src/nplinker/scoring/linking/data_linking.py index a6f181f0..a8c3f3c9 100644 --- 
a/src/nplinker/scoring/linking/data_linking.py +++ b/src/nplinker/scoring/linking/data_linking.py @@ -341,9 +341,7 @@ def common_strains(self, objects_a, objects_b, filter_no_shared=False): objects_b = [objects_b] # retrieve object IDs - # TODO: issue #103 stop using gcf.id, but note that the ids_b should be - # a list of int - ids_b = [gcf.id for gcf in objects_b] + ids_b = [gcf.gcf_id for gcf in objects_b] # these might be MolFams or Spectra, either way they'll have a .id attribute ids_a = [obj.id for obj in objects_a] diff --git a/src/nplinker/strains.py b/src/nplinker/strains.py index 8887fa6f..0fb55880 100644 --- a/src/nplinker/strains.py +++ b/src/nplinker/strains.py @@ -45,6 +45,3 @@ def __eq__(self, other): and self.id == other.id and self.aliases == other.aliases ) - - def __hash__(self) -> int: - return hash(self.id) diff --git a/tests/test_gnps_loader.py b/tests/test_gnps_loader.py deleted file mode 100644 index 4cf5ae50..00000000 --- a/tests/test_gnps_loader.py +++ /dev/null @@ -1,6 +0,0 @@ -from nplinker.metabolomics.gnps.gnps_loader import GNPSLoader - - -def test_default(): - sut = GNPSLoader() - assert sut is not None \ No newline at end of file diff --git a/tests/test_loader.py b/tests/test_loader.py index d89c329c..f46e0dc2 100644 --- a/tests/test_loader.py +++ b/tests/test_loader.py @@ -1,10 +1,6 @@ from pathlib import Path -import shutil import pytest from nplinker.loader import DatasetLoader -from nplinker.metabolomics.gnps.gnps_extractor import GNPSExtractor -from nplinker.metabolomics.gnps.gnps_spectrum_loader import GNPSSpectrumLoader -from nplinker.strain_collection import StrainCollection from . import DATA_DIR @pytest.fixture @@ -12,76 +8,17 @@ def config(): return { "dataset" : { "root": DATA_DIR / "ProteoSAFe-METABOLOMICS-SNETS-c22f44b1-download_clustered_spectra", - "platform_id": "", - "overrides": { - "strain_mappings_file": str(DATA_DIR / "strain_mappings.csv") - } + "platform_id": "" } } -@pytest.fixture -def config_with_new_gnps_extractor(): - GNPSExtractor(DATA_DIR / "ProteoSAFe-METABOLOMICS-SNETS-c22f44b1-download_clustered_spectra.zip", DATA_DIR / "extracted").extract() - yield { - "dataset" : { - "root": DATA_DIR / "extracted", - "platform_id": "", - "overrides": { - "strain_mappings_file": str(DATA_DIR / "strain_mappings.csv") - } - } - } - shutil.rmtree(DATA_DIR / "extracted") - def test_default(config): sut = DatasetLoader(config) assert sut._platform_id == config["dataset"]["platform_id"] -def test_has_metabolomics_paths(config): +def test_has_spectra_path(config): sut = DatasetLoader(config) sut._init_metabolomics_paths() - assert sut.mgf_file == str(config["dataset"]["root"] / "METABOLOMICS-SNETS-c22f44b1-download_clustered_spectra-main.mgf") - assert sut.edges_file == str(config["dataset"]["root"] / "networkedges_selfloop" / "6da5be36f5b14e878860167fa07004d6.pairsinfo") - assert sut.nodes_file == str(config["dataset"]["root"] / "clusterinfosummarygroup_attributes_withIDs_withcomponentID" / "d69356c8e5044c2a9fef3dd2a2f991e1.tsv") - assert sut.annotations_dir == str(config["dataset"]["root"] / "result_specnets_DB") - - -def test_has_metabolomics_paths_new_gnps(config_with_new_gnps_extractor): - sut = DatasetLoader(config_with_new_gnps_extractor) - sut._init_metabolomics_paths() - assert sut.mgf_file == str(config_with_new_gnps_extractor["dataset"]["root"] / "spectra.mgf") - assert sut.edges_file == str(config_with_new_gnps_extractor["dataset"]["root"] / "molecular_families.pairsinfo") - assert sut.nodes_file == 
-    assert sut.annotations_dir == str(config_with_new_gnps_extractor["dataset"]["root"])
-    assert sut.annotations_config_file == str(config_with_new_gnps_extractor["dataset"]["root"] / "annotations.tsv")
-
-
-def test_load_metabolomics(config):
-    sut = DatasetLoader(config)
-    sut._init_paths()
-    sut._load_strain_mappings()
-    sut._load_metabolomics()
-
-    expected_spectra = GNPSSpectrumLoader(sut.mgf_file).spectra()
-
-    #HH TODO: switch to different comparison as soon as strains are implemented
-    assert len(sut.spectra) == len(expected_spectra)
-    assert len(sut.molfams) == 429
-
-def test_has_strain_mappings(config):
-    sut = DatasetLoader(config)
-    sut._init_paths()
-    assert sut.strain_mappings_file == str(DATA_DIR / "strain_mappings.csv")
-
-
-def test_load_strain_mappings(config):
-    sut = DatasetLoader(config)
-    sut._init_paths()
-    sut._load_strain_mappings()
-
-    actual = sut.strains
-    expected = StrainCollection()
-    expected.add_from_file(sut.strain_mappings_file)
-
-    assert actual == expected
+    actual = sut.mgf_file
+    assert actual == str(config["dataset"]["root"] / "METABOLOMICS-SNETS-c22f44b1-download_clustered_spectra-main.mgf")

From c66a10a2074084464fea7e95e72ab7aeb11bad13 Mon Sep 17 00:00:00 2001
From: gcroci2
Date: Wed, 15 Mar 2023 12:57:49 +0100
Subject: [PATCH 16/16] Revert "Revert "Merge branch 'dev' into 98_add_antismash_downloader_gcroci2""

This reverts commit 0c5b6bd6c8505b1f041fbbf56b63eae4f7d413ed.

revert last merge
---
 .vscode/settings.json                         |  6 +-
 src/nplinker/loader.py                        | 33 +++++----
 .../metabolomics/gnps/gnps_extractor.py       |  7 +-
 src/nplinker/metabolomics/gnps/gnps_loader.py |  3 +
 src/nplinker/metabolomics/load_gnps.py        | 12 ++--
 src/nplinker/metabolomics/metabolomics.py     | 23 +++---
 src/nplinker/scoring/linking/data_linking.py  |  4 +-
 src/nplinker/strains.py                       |  3 +
 tests/test_gnps_loader.py                     |  6 ++
 tests/test_loader.py                          | 71 +++++++++++++++++--
 10 files changed, 125 insertions(+), 43 deletions(-)
 create mode 100644 src/nplinker/metabolomics/gnps/gnps_loader.py
 create mode 100644 tests/test_gnps_loader.py

diff --git a/.vscode/settings.json b/.vscode/settings.json
index 7411789c..30cb3f3f 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -20,10 +20,8 @@
     // python docstring
     "autoDocstring.docstringFormat": "google",
     "autoDocstring.customTemplatePath": ".vscode/vscode_docstring_google_adapted.mustache",
-
-    // Jupyter
-    "jupyter.sendSelectionToInteractiveWindow": true,
     "jupyter.askForKernelRestart": false,
     "notebook.lineNumbers": "on",
-    "notebook.diff.ignoreMetadata": true
+    "notebook.diff.ignoreMetadata": true,
+    "jupyter.interactiveWindow.textEditor.executeSelection": true
 }
diff --git a/src/nplinker/loader.py b/src/nplinker/loader.py
index 225c0ca8..cd3a5fb7 100644
--- a/src/nplinker/loader.py
+++ b/src/nplinker/loader.py
@@ -28,7 +28,8 @@
 from nplinker.pairedomics.downloader import Downloader
 from nplinker.pairedomics.runbigscape import run_bigscape
 from nplinker.strain_collection import StrainCollection
-
+from nplinker.genomics.antismash import AntismashBGCLoader
+from pathlib import Path
 try:
     from importlib.resources import files
 
@@ -244,19 +245,23 @@ def _init_metabolomics_paths(self):
                                                  optional=True)
 
         # 8. MET: /DB_result/*.tsv (new) or /result_specnets_DB/*.tsv (old) / annotations_dir=
-        self.annotations_dir = self._overrides.get(
-            self.OR_ANNO) or find_via_glob_alts_dir(
-                [
-                    os.path.join(self._root, 'DB_result'),
-                    os.path.join(self._root, 'result_specnets_DB'),
-                ],
-                self.OR_ANNO,
-                optional=False
-            )
-        if self.annotations_dir is not None:
-            self.annotations_config_file = self._overrides.get(
-                self.OR_ANNO_CONFIG) or os.path.join(self.annotations_dir,
-                                                     'annotations.tsv')
+        if Path.is_file(Path(self._root) / "annotations.tsv"):
+            self.annotations_dir = str(self._root)
+            self.annotations_config_file = os.path.join(self._root, "annotations.tsv")
+        else:
+            self.annotations_dir = self._overrides.get(
+                self.OR_ANNO) or find_via_glob_alts_dir(
+                    [
+                        os.path.join(self._root, 'DB_result'),
+                        os.path.join(self._root, 'result_specnets_DB'),
+                    ],
+                    self.OR_ANNO,
+                    optional=False
+                )
+            if self.annotations_dir is not None:
+                self.annotations_config_file = self._overrides.get(
+                    self.OR_ANNO_CONFIG) or os.path.join(self.annotations_dir,
+                                                         'annotations.tsv')
 
     def _init_genomics_paths(self):
         # 9. GEN: /antismash / antismash_dir=
diff --git a/src/nplinker/metabolomics/gnps/gnps_extractor.py b/src/nplinker/metabolomics/gnps/gnps_extractor.py
index 5789d61c..916c888e 100644
--- a/src/nplinker/metabolomics/gnps/gnps_extractor.py
+++ b/src/nplinker/metabolomics/gnps/gnps_extractor.py
@@ -1,6 +1,7 @@
 from os import PathLike
 import os
 from pathlib import Path
+import shutil
 import zipfile
 from nplinker import utils
 
@@ -55,7 +56,7 @@ def _extract_spectra(self):
     def _extract_molecular_families(self):
         """Helper function to extract the molecular families file from the archive."""
         prefix = "networkedges_selfloop"
-        suffix = "..selfloop" if self._is_fbmn else ".pairsinfo"
+        suffix = ".selfloop" if self._is_fbmn else ".pairsinfo"
         utils.extract_file_matching_pattern(
             self.get_data(),
             prefix,
@@ -88,5 +89,5 @@ def _extract_annotations(self):
             self._extract_path,
             "annotations.tsv"
         )
-        if self._is_fbmn:
-            os.rmdir(self._extract_path / prefix)
\ No newline at end of file
+
+        shutil.rmtree(self._extract_path / prefix)
diff --git a/src/nplinker/metabolomics/gnps/gnps_loader.py b/src/nplinker/metabolomics/gnps/gnps_loader.py
new file mode 100644
index 00000000..2ef0d7a9
--- /dev/null
+++ b/src/nplinker/metabolomics/gnps/gnps_loader.py
@@ -0,0 +1,3 @@
+class GNPSLoader:
+    def __init__(self):
+        pass
\ No newline at end of file
diff --git a/src/nplinker/metabolomics/load_gnps.py b/src/nplinker/metabolomics/load_gnps.py
index d3855868..0266df56 100644
--- a/src/nplinker/metabolomics/load_gnps.py
+++ b/src/nplinker/metabolomics/load_gnps.py
@@ -229,13 +229,13 @@ def _load_clusterinfo_old(gnps_format: str, strains: StrainCollection, file: str
             unknown_strains[mzxml] += 1
         # else:
         #     print('{} ===> {}'.format(mzxml, strain))
 
+        if clu_index in spec_dict:
+            if strain is not None:
+                # TODO this need fixed somehow (missing growth medium info)
+                spec_dict[clu_index].add_strain(strain, None, 1)
-        if strain is not None:
-            # TODO this need fixed somehow (missing growth medium info)
-            spec_dict[clu_index].add_strain(strain, None, 1)
-
-        # update metadata on Spectrum object
-        spec_dict[clu_index].metadata.update(metadata)
+            # update metadata on Spectrum object
+            spec_dict[clu_index].metadata.update(metadata)
 
     if len(unknown_strains) > 0:
         logger.warning(
diff --git a/src/nplinker/metabolomics/metabolomics.py b/src/nplinker/metabolomics/metabolomics.py
index 15257ef2..1c4cde5d 100644
--- a/src/nplinker/metabolomics/metabolomics.py
+++ b/src/nplinker/metabolomics/metabolomics.py
@@ -84,17 +84,18 @@ def load_edges(edges_file: str, spec_dict: dict[int, Spectrum]):
             cosine = float(line[cos_index])
             family = int(line[fam_index])
 
-            spec1 = spec_dict[spec1_id]
-            spec2 = spec_dict[spec2_id]
-
-            if family != -1: # singletons
-                spec1.family_id = family
-                spec2.family_id = family
-
-                spec1.edges.append((spec2.id, spec2.spectrum_id, cosine))
-                spec2.edges.append((spec1.id, spec1.spectrum_id, cosine))
-            else:
-                spec1.family_id = family
+            if spec1_id in spec_dict and spec2_id in spec_dict:
+                spec1 = spec_dict[spec1_id]
+                spec2 = spec_dict[spec2_id]
+
+                if family != -1: # singletons
+                    spec1.family_id = family
+                    spec2.family_id = family
+
+                    spec1.edges.append((spec2.id, spec2.spectrum_id, cosine))
+                    spec2.edges.append((spec1.id, spec1.spectrum_id, cosine))
+                else:
+                    spec1.family_id = family
 
 
 @deprecated(version="1.3.3", reason="Use the GNPSLoader class instead.")
diff --git a/src/nplinker/scoring/linking/data_linking.py b/src/nplinker/scoring/linking/data_linking.py
index a8c3f3c9..a6f181f0 100644
--- a/src/nplinker/scoring/linking/data_linking.py
+++ b/src/nplinker/scoring/linking/data_linking.py
@@ -341,7 +341,9 @@ def common_strains(self, objects_a, objects_b, filter_no_shared=False):
             objects_b = [objects_b]
 
         # retrieve object IDs
-        ids_b = [gcf.gcf_id for gcf in objects_b]
+        # TODO: issue #103 stop using gcf.id, but note that the ids_b should be
+        # a list of int
+        ids_b = [gcf.id for gcf in objects_b]
 
         # these might be MolFams or Spectra, either way they'll have a .id attribute
         ids_a = [obj.id for obj in objects_a]
diff --git a/src/nplinker/strains.py b/src/nplinker/strains.py
index 0fb55880..8887fa6f 100644
--- a/src/nplinker/strains.py
+++ b/src/nplinker/strains.py
@@ -45,3 +45,6 @@ def __eq__(self, other):
             and self.id == other.id
             and self.aliases == other.aliases
         )
+
+    def __hash__(self) -> int:
+        return hash(self.id)
diff --git a/tests/test_gnps_loader.py b/tests/test_gnps_loader.py
new file mode 100644
index 00000000..4cf5ae50
--- /dev/null
+++ b/tests/test_gnps_loader.py
@@ -0,0 +1,6 @@
+from nplinker.metabolomics.gnps.gnps_loader import GNPSLoader
+
+
+def test_default():
+    sut = GNPSLoader()
+    assert sut is not None
\ No newline at end of file
diff --git a/tests/test_loader.py b/tests/test_loader.py
index f46e0dc2..d89c329c 100644
--- a/tests/test_loader.py
+++ b/tests/test_loader.py
@@ -1,6 +1,10 @@
 from pathlib import Path
+import shutil
 import pytest
 from nplinker.loader import DatasetLoader
+from nplinker.metabolomics.gnps.gnps_extractor import GNPSExtractor
+from nplinker.metabolomics.gnps.gnps_spectrum_loader import GNPSSpectrumLoader
+from nplinker.strain_collection import StrainCollection
 from . import DATA_DIR
 
 @pytest.fixture
@@ -8,17 +12,76 @@ def config():
     return {
         "dataset" : {
             "root": DATA_DIR / "ProteoSAFe-METABOLOMICS-SNETS-c22f44b1-download_clustered_spectra",
-            "platform_id": ""
+            "platform_id": "",
+            "overrides": {
+                "strain_mappings_file": str(DATA_DIR / "strain_mappings.csv")
+            }
         }
     }
 
+@pytest.fixture
+def config_with_new_gnps_extractor():
+    GNPSExtractor(DATA_DIR / "ProteoSAFe-METABOLOMICS-SNETS-c22f44b1-download_clustered_spectra.zip", DATA_DIR / "extracted").extract()
+    yield {
+        "dataset" : {
+            "root": DATA_DIR / "extracted",
+            "platform_id": "",
+            "overrides": {
+                "strain_mappings_file": str(DATA_DIR / "strain_mappings.csv")
+            }
+        }
+    }
+    shutil.rmtree(DATA_DIR / "extracted")
+
 def test_default(config):
     sut = DatasetLoader(config)
     assert sut._platform_id == config["dataset"]["platform_id"]
 
 
-def test_has_spectra_path(config):
+def test_has_metabolomics_paths(config):
     sut = DatasetLoader(config)
     sut._init_metabolomics_paths()
-    actual = sut.mgf_file
-    assert actual == str(config["dataset"]["root"] / "METABOLOMICS-SNETS-c22f44b1-download_clustered_spectra-main.mgf")
+    assert sut.mgf_file == str(config["dataset"]["root"] / "METABOLOMICS-SNETS-c22f44b1-download_clustered_spectra-main.mgf")
+    assert sut.edges_file == str(config["dataset"]["root"] / "networkedges_selfloop" / "6da5be36f5b14e878860167fa07004d6.pairsinfo")
+    assert sut.nodes_file == str(config["dataset"]["root"] / "clusterinfosummarygroup_attributes_withIDs_withcomponentID" / "d69356c8e5044c2a9fef3dd2a2f991e1.tsv")
+    assert sut.annotations_dir == str(config["dataset"]["root"] / "result_specnets_DB")
+
+
+def test_has_metabolomics_paths_new_gnps(config_with_new_gnps_extractor):
+    sut = DatasetLoader(config_with_new_gnps_extractor)
+    sut._init_metabolomics_paths()
+    assert sut.mgf_file == str(config_with_new_gnps_extractor["dataset"]["root"] / "spectra.mgf")
+    assert sut.edges_file == str(config_with_new_gnps_extractor["dataset"]["root"] / "molecular_families.pairsinfo")
+    assert sut.nodes_file == str(config_with_new_gnps_extractor["dataset"]["root"] / "file_mappings.tsv")
+    assert sut.annotations_dir == str(config_with_new_gnps_extractor["dataset"]["root"])
+    assert sut.annotations_config_file == str(config_with_new_gnps_extractor["dataset"]["root"] / "annotations.tsv")
+
+
+def test_load_metabolomics(config):
+    sut = DatasetLoader(config)
+    sut._init_paths()
+    sut._load_strain_mappings()
+    sut._load_metabolomics()
+
+    expected_spectra = GNPSSpectrumLoader(sut.mgf_file).spectra()
+
+    #HH TODO: switch to different comparison as soon as strains are implemented
+    assert len(sut.spectra) == len(expected_spectra)
+    assert len(sut.molfams) == 429
+
+def test_has_strain_mappings(config):
+    sut = DatasetLoader(config)
+    sut._init_paths()
+    assert sut.strain_mappings_file == str(DATA_DIR / "strain_mappings.csv")
+
+
+def test_load_strain_mappings(config):
+    sut = DatasetLoader(config)
+    sut._init_paths()
+    sut._load_strain_mappings()
+
+    actual = sut.strains
+    expected = StrainCollection()
+    expected.add_from_file(sut.strain_mappings_file)
+
+    assert actual == expected