Merge pull request #172 from NPLinker/fix_download_bug

Fix download bugs
NPLinker · Oct 31, 2023 · 6301b28
2 parents f577e59 + 34488a1
commit 6301b28
Show file tree

Hide file tree

Showing 9 changed files with 77 additions and 85 deletions.
diff --git a/LICENSE b/LICENSE
@@ -1,4 +1,3 @@
-
                                  Apache License
                            Version 2.0, January 2004
                         http://www.apache.org/licenses/
@@ -179,15 +178,15 @@
    APPENDIX: How to apply the Apache License to your work.
 
       To apply the Apache License to your work, attach the following
-      boilerplate notice, with the fields enclosed by brackets "{}"
+      boilerplate notice, with the fields enclosed by brackets "[]"
       replaced with your own identifying information. (Don't include
       the brackets!)  The text should be enclosed in the appropriate
       comment syntax for the file format. We also recommend that a
       file or class name and description of purpose be included on the
       same "printed page" as the copyright notice for easier
       identification within third-party archives.
 
-   Copyright [2022] [Netherlands eScience Center, Wageningen University & Research ]
+   Copyright [yyyy] [name of copyright owner]
 
    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.

diff --git a/NOTICE b/NOTICE
@@ -1,5 +1,5 @@
 NPLinker
-Copyright [2022] The Netherlands eScience Center, Wageningen University & Research
+Copyright 2022-2023 Netherlands eScience Center and Wageningen University & Research.
 
 This product includes software developed at
-The Netherlands eScience Center (https://www.esciencecenter.nl/)
+Netherlands eScience Center (https://www.esciencecenter.nl/)
diff --git a/src/nplinker/pairedomics/podp_antismash_downloader.py b/src/nplinker/pairedomics/podp_antismash_downloader.py
@@ -3,7 +3,6 @@
 from pathlib import Path
 import re
 import time
-from urllib.error import HTTPError
 from bs4 import BeautifulSoup
 from bs4 import NavigableString
 from bs4 import Tag
@@ -203,7 +202,7 @@ def podp_download_and_extract_antismash_data(
             if output_path.exists():
                 Path.touch(output_path / 'completed', exist_ok=True)
 
-        except HTTPError:
+        except Exception:
             gs_obj.bgc_path = ""
 
     missing = len([gs for gs in gs_dict.values() if not gs.bgc_path])
@@ -214,7 +213,7 @@ def podp_download_and_extract_antismash_data(
     GenomeStatus.to_json(gs_dict, gs_file)
 
     if missing == len(genome_records):
-        logger.warning('Failed to successfully retrieve ANY genome data!')
+        raise ValueError("No antiSMASH data found for any genome")
 
 
 def get_best_available_genome_id(genome_id_data: dict[str, str]) -> str | None:

diff --git a/src/nplinker/pairedomics/strain_mappings_generator.py b/src/nplinker/pairedomics/strain_mappings_generator.py
@@ -306,7 +306,7 @@ def extract_mappings_ms_filename_spectrum_id(
         `GNPSFileMappingLoader`: A class to load GNPS file mapping TSV file.
     """
     loader = GNPSFileMappingLoader(tsv_file)
-    return loader.mapping_reversed()
+    return loader.mapping_reversed
 
 
 def get_mappings_strain_id_spectrum_id(

diff --git a/src/nplinker/schemas/podp_adapted_schema.json b/src/nplinker/schemas/podp_adapted_schema.json
@@ -93,17 +93,20 @@
               "GenBank_accession": {
                 "type": "string",
                 "title": "GenBank accession number",
-                "description": "If the publicly available genome got a GenBank accession number assigned, e.g., <a href=\"https://www.ncbi.nlm.nih.gov/nuccore/AL645882\" target=\"_blank\" rel=\"noopener noreferrer\">AL645882</a>, please provide it here. The genome sequence must be submitted to GenBank/ENA/DDBJ (and an accession number must be received) before this form can be filled out. In case of a whole genome sequence, please use master records. At least one identifier must be entered."
+                "description": "If the publicly available genome got a GenBank accession number assigned, e.g., <a href=\"https://www.ncbi.nlm.nih.gov/nuccore/AL645882\" target=\"_blank\" rel=\"noopener noreferrer\">AL645882</a>, please provide it here. The genome sequence must be submitted to GenBank/ENA/DDBJ (and an accession number must be received) before this form can be filled out. In case of a whole genome sequence, please use master records. At least one identifier must be entered.",
+                "minLength": 1
               },
               "RefSeq_accession": {
                 "type": "string",
                 "title": "RefSeq accession number",
-                "description": "For example: <a target=\"_blank\" rel=\"noopener noreferrer\" href=\"https://www.ncbi.nlm.nih.gov/nuccore/NC_003888.3\">NC_003888.3</a>"
+                "description": "For example: <a target=\"_blank\" rel=\"noopener noreferrer\" href=\"https://www.ncbi.nlm.nih.gov/nuccore/NC_003888.3\">NC_003888.3</a>",
+                "minLength": 1
               },
               "JGI_Genome_ID": {
                 "type": "string",
                 "title": "JGI IMG genome ID",
-                "description": "For example: <a target=\"_blank\" rel=\"noopener noreferrer\" href=\"https://img.jgi.doe.gov/cgi-bin/m/main.cgi?section=TaxonDetail&page=taxonDetail&taxon_oid=641228474\">641228474</a>"
+                "description": "For example: <a target=\"_blank\" rel=\"noopener noreferrer\" href=\"https://img.jgi.doe.gov/cgi-bin/m/main.cgi?section=TaxonDetail&page=taxonDetail&taxon_oid=641228474\">641228474</a>",
+                "minLength": 1
               }
             }
           },

diff --git a/src/nplinker/utils.py b/src/nplinker/utils.py
@@ -173,6 +173,7 @@ def download_url(url: str,
                           url,
                           follow_redirects=allow_http_redirect) as response:
             if not response.is_success:
+                fpath.unlink(missing_ok=True)
                 raise RuntimeError(
                     f"Failed to download url {url} with status code {response.status_code}"
                 )
@@ -182,7 +183,7 @@ def download_url(url: str,
                       unit_divisor=1024,
                       unit="B") as progress:
                 num_bytes_downloaded = response.num_bytes_downloaded
-                for chunk in response.iter_raw():
+                for chunk in response.iter_bytes():
                     fh.write(chunk)
                     progress.update(response.num_bytes_downloaded -
                                     num_bytes_downloaded)

diff --git a/tests/pairedomics/test_downloader.py b/tests/pairedomics/test_downloader.py
@@ -35,7 +35,7 @@ def test_download_metabolomics_zipfile(tmp_path):
     try:
         sut._download_metabolomics_zipfile("c22f44b14a3d450eb836d607cb9521bb")
         expected_path = os.path.join(sut.project_downloads_dir,
-                                     'c22f44b14a3d450eb836d607cb9521bb.zip')
+                                     'METABOLOMICS-SNETS-c22f44b14a3d450eb836d607cb9521bb.zip')
 
         assert os.path.exists(expected_path)
         assert (Path(sut.project_results_dir) /