From a33701f4eb050c7e145b8ec20e835c1c02b1501d Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Fri, 23 Feb 2024 17:07:25 +0100 Subject: [PATCH 01/12] Create arranger.py - Add class `DatasetArranger` - Add dataset validation functions `validate_gnps`, `validate_antismash` and `validate_bigscape` --- src/nplinker/arranger.py | 447 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 447 insertions(+) create mode 100644 src/nplinker/arranger.py diff --git a/src/nplinker/arranger.py b/src/nplinker/arranger.py new file mode 100644 index 00000000..3111fad5 --- /dev/null +++ b/src/nplinker/arranger.py @@ -0,0 +1,447 @@ +import fnmatch +import json +import shutil +from glob import glob +from pathlib import Path +from jsonschema import validate +import nplinker.globals as globals +from nplinker.config import config +from nplinker.genomics import generate_mappings_genome_id_bgc_id +from nplinker.genomics.mibig import download_and_extract_mibig_metadata +from nplinker.globals import GENOME_BGC_MAPPINGS_FILENAME +from nplinker.globals import GENOME_STATUS_FILENAME +from nplinker.globals import STRAIN_MAPPINGS_FILENAME +from nplinker.metabolomics.gnps import GNPSDownloader +from nplinker.metabolomics.gnps import GNPSExtractor +from nplinker.pairedomics import podp_download_and_extract_antismash_data +from nplinker.pairedomics.runbigscape import run_bigscape +from nplinker.pairedomics.strain_mappings_generator import podp_generate_strain_mappings +from nplinker.schemas import STRAIN_MAPPINGS_SCHEMA +from nplinker.schemas import USER_STRAINS_SCHEMA +from nplinker.schemas import validate_podp_json +from nplinker.utils import download_url +from nplinker.utils import list_dirs +from nplinker.utils import list_files + + +PODP_PROJECT_URL = "https://pairedomicsdata.bioinformatics.nl/api/projects/{}" + + +class DatasetArranger: + def __init__(self) -> None: + """Arrange the dataset required by NPLinker. 
+ + This class is used to arrange the datasets required by NPLinker according to the + configuration. The datasets include MIBiG, GNPS, antiSMASH, and BiG-SCAPE. + + If `config.mode` is "local", the datasets are validated. + If `config.mode` is "podp", the datasets are downloaded or generated. + + It uses the default downloads directory `globals.DOWNLOADS_DEFAULT_PATH` to store the + downloaded files. Default data paths for MIBiG, GNPS, antiSMASH, and BiG-SCAPE are defined + in `nplinker.globals`. + """ + # Prepare the downloads directory and/or PODP json file which are required for other methods + globals.DOWNLOADS_DEFAULT_PATH.mkdir(exist_ok=True) + self.arrange_podp_project_json() + + def arrange(self) -> None: + """Arrange the datasets according to the configuration. + + The datasets include MIBiG, GNPS, antiSMASH, and BiG-SCAPE. + """ + # The order of arranging the datasets matters, as some datasets depend on others + self.arrange_mibig() + self.arrange_gnps() + self.arrange_antismash() + self.arrange_bigscape() + self.arrange_strain_mappings() + self.arrange_strains_selected() + + def arrange_podp_project_json(self) -> None: + """Arrange the PODP project JSON file. + + If `config.mode` is "podp", download the PODP project JSON file if it doesn't exist. Then + validate the PODP project JSON file if it exists or is downloaded. + + The validation is controlled by the json schema `schemas/podp_adapted_schema.json`. + """ + if config.mode == "podp": + file_name = f"paired_datarecord_{config.podp_id}.json" + podp_file = globals.DOWNLOADS_DEFAULT_PATH / file_name + if not podp_file.exists(): + download_url( + PODP_PROJECT_URL.format(config.podp_id), + globals.DOWNLOADS_DEFAULT_PATH, + file_name, + ) + + with open(podp_file, "r") as f: + json_data = json.load(f) + validate_podp_json(json_data) + + def arrange_mibig(self) -> None: + """Arrange the MIBiG metadata. + + Always download and extract the MIBiG metadata if `config.mibig.to_use` is True. 
+ If the default directory already exists, it will be removed and re-downloaded to ensure + the latest version is used. So it's not allowed to manually put MIBiG metadata in the + default directory. + """ + if config.mibig.to_use: + if globals.MIBIG_DEFAULT_PATH.exists(): + # remove existing mibig data + shutil.rmtree(globals.MIBIG_DEFAULT_PATH) + download_and_extract_mibig_metadata( + globals.DOWNLOADS_DEFAULT_PATH, + globals.MIBIG_DEFAULT_PATH, + version=config.mibig.version, + ) + + def arrange_gnps(self) -> None: + """Arrange the GNPS data. + + If `config.mode` is "local", validate the GNPS data directory. + If `config.mode` is "podp", download the GNPS data if it doesn't exist or remove the + existing GNPS data and re-download it if it is invalid. + + The validation process includes: + - Check if the GNPS data directory exists. + - Check if the required files exist in the GNPS data directory, including: + - file_mappings.tsv or file_mappings.csv + - spectra.mgf + - molecular_families.tsv + - annotations.tsv + """ + if config.mode == "local": + validate_gnps(globals.GNPS_DEFAULT_PATH) + + if config.mode == "podp": + # set range 3 to ensure download can try 2 times and downloaded data is valid + for _ in range(3): + try: + validate_gnps(globals.GNPS_DEFAULT_PATH) + break + except (FileNotFoundError, ValueError): + # Don't need to remove downloaded archive, as it'll be overwritten + shutil.rmtree(globals.GNPS_DEFAULT_PATH, ignore_errors=True) + self._download_and_extract_gnps() + + # get the path to file_mappings file (csv or tsv) + self.gnps_file_mappings_file = self._get_gnps_file_mappings_file() + + def _get_gnps_file_mappings_file(self) -> Path: + """Get the GNPS file mappings file. + + The GNPS file mappings file can be either a TSV file or a CSV file. This method checks if + the TSV file or the CSV file exists in the default GNPS directory. + + Returns: + Path: Path to the GNPS file mappings file. 
+ """ + file_mappings_tsv = globals.GNPS_DEFAULT_PATH / globals.GNPS_FILE_MAPPINGS_TSV + file_mappings_csv = globals.GNPS_DEFAULT_PATH / globals.GNPS_FILE_MAPPINGS_CSV + + gnps_file_mappings_file = ( + file_mappings_tsv if file_mappings_tsv.exists() else file_mappings_csv + ) + + return gnps_file_mappings_file + + def _download_and_extract_gnps(self) -> None: + """Download and extract the GNPS data. + + Get the GNPS task ID from the PODP project JSON file, then download and extract the GNPS + data to the default GNPS directory. + """ + podp_file = globals.DOWNLOADS_DEFAULT_PATH / f"paired_datarecord_{config.podp_id}.json" + with open(podp_file, "r") as f: + podp_json_data = json.load(f) + gnps_task_id = podp_json_data["metabolomics"]["project"].get("molecular_network") + + data_archive = ( + GNPSDownloader(gnps_task_id, globals.DOWNLOADS_DEFAULT_PATH) + .download() + .get_download_file() + ) + GNPSExtractor(data_archive, globals.GNPS_DEFAULT_PATH) + + def arrange_antismash(self) -> None: + """Arrange the antiSMASH data. + + If `config.mode` is "local", validate the antiSMASH data directory. + If `config.mode` is "podp", download the antiSMASH data if it doesn't exist or remove the + existing antiSMASH data and re-download it if it is invalid. + + The validation process includes: + - Check if the antiSMASH data directory exists. + - Check if the antiSMASH data directory contains at least one sub-directory, and each + sub-directory contains at least one BGC file (with the suffix ".region???.gbk" where ??? + is a number). + + AntiSMASH BGC directory must follow the structure below: + antismash + ├── genome_id_1 (one AntiSMASH output, e.g. GCF_000514775.1) + │  ├── GCF_000514775.1.gbk + │  ├── NZ_AZWO01000004.region001.gbk + │  └── ... + ├── genome_id_2 + │  ├── ... + └── ... 
+ """ + if config.mode == "local": + validate_antismash(globals.ANTISMASH_DEFAULT_PATH) + + if config.mode == "podp": + # set range 3 to ensure download can try 2 times and downloaded data is valid + for _ in range(3): + try: + validate_antismash(globals.ANTISMASH_DEFAULT_PATH) + break + except FileNotFoundError: + shutil.rmtree(globals.ANTISMASH_DEFAULT_PATH, ignore_errors=True) + self._download_and_extract_antismash() + + def _download_and_extract_antismash(self) -> None: + """Download and extract the antiSMASH data. + + Get the antiSMASH data from the PODP project JSON file, then download and extract the + antiSMASH data to the default antiSMASH directory. + """ + podp_file = globals.DOWNLOADS_DEFAULT_PATH / f"paired_datarecord_{config.podp_id}.json" + with open(podp_file, "r") as f: + podp_json_data = json.load(f) + podp_download_and_extract_antismash_data( + podp_json_data["genomes"], globals.DOWNLOADS_DEFAULT_PATH, config.root_dir + ) + + def arrange_bigscape(self) -> None: + """Arrange the BiG-SCAPE data. + + If `config.mode` is "local", validate the BiG-SCAPE data directory. + If `config.mode` is "podp", run BiG-SCAPE to generate the clustering file if it doesn't + exist or remove the existing BiG-SCAPE data and re-run BiG-SCAPE if it is invalid. + The running output of BiG-SCAPE will be saved to the directory "bigscape_running_output" + in the default BiG-SCAPE directory, and the clustering file "mix_clustering_c{config.bigscape.cutoff}.tsv" + will be copied to the default BiG-SCAPE directory. + + The validation process includes: + - Check if the default BiG-SCAPE data directory exists. + - Check if the clustering file "mix_clustering_c{config.bigscape.cutoff}.tsv" exists in the + BiG-SCAPE data directory. 
+ """ + if config.mode == "local": + validate_bigscape(globals.BIGSCAPE_DEFAULT_PATH) + + if config.mode == "podp": + for _ in range(3): + try: + validate_bigscape(globals.BIGSCAPE_DEFAULT_PATH) + break + except FileNotFoundError: + shutil.rmtree(globals.BIGSCAPE_DEFAULT_PATH, ignore_errors=True) + self._run_bigscape() + + def _run_bigscape(self) -> None: + """Run BiG-SCAPE to generate the clustering file. + + The running output of BiG-SCAPE will be saved to the `BIGSCAPE_RUNNING_OUTPUT_PATH`. + + The clustering file "mix_clustering_c{config.bigscape.cutoff}.tsv" will be copied to the + default BiG-SCAPE directory. + """ + globals.BIGSCAPE_RUNNING_OUTPUT_PATH.mkdir(exist_ok=True, parents=True) + run_bigscape( + globals.ANTISMASH_DEFAULT_PATH, + globals.BIGSCAPE_RUNNING_OUTPUT_PATH, + config.bigscape.parameters, + ) + for f in glob( + str( + globals.BIGSCAPE_RUNNING_OUTPUT_PATH + / "network_files" + / "*" + / "mix" + / "mix_clustering_c*.tsv" + ) + ): + shutil.copy(f, globals.BIGSCAPE_DEFAULT_PATH) + + def arrange_strain_mappings(self) -> None: + """Arrange the strain mappings file. + + If `config.mode` is "local", validate the strain mappings file. + If `config.mode` is "podp", always generate the strain mappings file and validate it. + + The validation checks if the strain mappings file exists and if it is a valid JSON file + according to the schema defined in `schemas/strain_mappings_schema.json`. + """ + if config.mode == "podp": + self._generate_strain_mappings() + + self._validate_strain_mappings() + + def _validate_strain_mappings(self) -> None: + """Validate the strain mappings file. + + The validation process includes: + - Check if the strain mappings file exists. + - Check if the strain mappings file is a valid JSON file according to the schema defined in + `schemas/strain_mappings_schema.json`. + + Raises: + FileNotFoundError: If the strain mappings file is not found. 
+ ValidationError: If the strain mappings file is not a valid JSON file according to the + schema. + """ + strain_mappings_file = config.root_dir / STRAIN_MAPPINGS_FILENAME + + if not strain_mappings_file.exists(): + raise FileNotFoundError(f"Strain mappings file not found at {strain_mappings_file}") + + with open(strain_mappings_file, "r") as f: + json_data = json.load(f) + # Will raise ValidationError if the JSON file is invalid + validate(instance=json_data, schema=STRAIN_MAPPINGS_SCHEMA) + + def _generate_strain_mappings(self) -> None: + """Generate the strain mappings file for the PODP mode.""" + podp_json_file = globals.DOWNLOADS_DEFAULT_PATH / f"paired_datarecord_{config.podp_id}.json" + genome_status_json_file = globals.DOWNLOADS_DEFAULT_PATH / GENOME_STATUS_FILENAME + genome_bgc_mappings_file = globals.ANTISMASH_DEFAULT_PATH / GENOME_BGC_MAPPINGS_FILENAME + gnps_file_mapping_file = self.gnps_file_mappings_file + strain_mappings_file = config.root_dir / STRAIN_MAPPINGS_FILENAME + + # generate the genome_bgc_mappings_file + generate_mappings_genome_id_bgc_id(globals.ANTISMASH_DEFAULT_PATH) + # generate the strain_mappings_file + podp_generate_strain_mappings( + podp_json_file, + genome_status_json_file, + genome_bgc_mappings_file, + gnps_file_mapping_file, + strain_mappings_file, + ) + + def arrange_strains_selected(self) -> None: + """Arrange the strains selected file. + + Validate the strains selected file if it exists. + The validation checks if the strains selected file is a valid JSON file according to the + schema defined in `schemas/user_strains.json`. + """ + strains_selected_file = config.root_dir / globals.STRAINS_SELECTED_FILENAME + if strains_selected_file.exists(): + with open(strains_selected_file, "r") as f: + json_data = json.load(f) + validate(instance=json_data, schema=USER_STRAINS_SCHEMA) + + +def validate_gnps(gnps_dir: Path) -> None: + """Validate the GNPS data directory and its contents. 
+ + The GNPS data directory must contain the following files: + - file_mappings.tsv or file_mappings.csv + - spectra.mgf + - molecular_families.tsv + - annotations.tsv + + Args: + gnps_dir (Path): Path to the GNPS data directory. + + Raises: + FileNotFoundError: If the GNPS data directory is not found or any of the required files + is not found. + ValueError: If both file_mappings.tsv and file_mappings.csv are found. + """ + if not gnps_dir.exists(): + raise FileNotFoundError(f"GNPS data directory not found at {gnps_dir}") + + file_mappings_tsv = gnps_dir / globals.GNPS_FILE_MAPPINGS_TSV + file_mappings_csv = gnps_dir / globals.GNPS_FILE_MAPPINGS_CSV + if file_mappings_tsv.exists() and file_mappings_csv.exists(): + raise ValueError( + f"Both {file_mappings_tsv.name} and {file_mappings_csv.name} found in GNPS directory " + f"{gnps_dir}, only one is allowed." + ) + elif not file_mappings_tsv.exists() and not file_mappings_csv.exists(): + raise FileNotFoundError( + f"Neither {file_mappings_tsv.name} nor {file_mappings_csv.name} found in GNPS directory" + f" {gnps_dir}" + ) + + required_files = [ + gnps_dir / globals.GNPS_SPECTRA_FILENAME, + gnps_dir / globals.GNPS_MOLECULAR_FAMILY_FILENAME, + gnps_dir / globals.GNPS_ANNOTATIONS_FILENAME, + ] + list_not_found = [f.name for f in required_files if not f.exists()] + if list_not_found: + raise FileNotFoundError( + f"Files not found in GNPS directory {gnps_dir}: {', '.join(list_not_found)}" + ) + + +def validate_antismash(antismash_dir: Path) -> None: + """Validate the antiSMASH data directory and its contents. + + The validation only checks the structure of the antiSMASH data directory and file names. + It does not check + - the content of the BGC files + - the consistency between the antiSMASH data and the PODP project JSON file for the PODP + mode + + The antiSMASH data directory must exist and contain at least one sub-directory. The name of the + sub-directories must not contain any space. 
Each sub-directory must contain at least one BGC + file (with the suffix ".region???.gbk" where ??? is the region number). + + Args: + antismash_dir (Path): Path to the antiSMASH data directory. + + Raises: + FileNotFoundError: If the antiSMASH data directory is not found, or no sub-directories + are found in the antiSMASH data directory, or no BGC files are found in any + sub-directory. + ValueError: If any sub-directory name contains a space. + """ + if not antismash_dir.exists(): + raise FileNotFoundError(f"antiSMASH data directory not found at {antismash_dir}") + + sub_dirs = list_dirs(antismash_dir) + if not sub_dirs: + raise FileNotFoundError( + f"No BGC directories found in antiSMASH data directory {antismash_dir}" + ) + + for sub_dir in sub_dirs: + dir_name = Path(sub_dir).name + if " " in dir_name: + raise ValueError( + f"antiSMASH sub-directory name {dir_name} contains space, which is not allowed" + ) + + gbk_files = list_files(sub_dir, suffix=".gbk", keep_parent=False) + bgc_files = fnmatch.filter(gbk_files, "*.region???.gbk") + if not bgc_files: + raise FileNotFoundError(f"No BGC files found in antiSMASH sub-directory {sub_dir}") + + +def validate_bigscape(bigscape_dir: Path) -> None: + """Validate the BiG-SCAPE data directory and its contents. + + The BiG-SCAPE data directory must exist and contain the clustering file + "mix_clustering_c{config.bigscape.cutoff}.tsv" where {config.bigscape.cutoff} is the + bigscape cutoff value set in the config file. + + Args: + bigscape_dir(Path): Path to the BiG-SCAPE data directory. + + Raises: + FileNotFoundError: If the BiG-SCAPE data directory or the clustering file is not found. 
+ """ + if not bigscape_dir.exists(): + raise FileNotFoundError(f"BiG-SCAPE data directory not found at {bigscape_dir}") + + clustering_file = bigscape_dir / f"mix_clustering_c{config.bigscape.cutoff}.tsv" + if not clustering_file.exists(): + raise FileNotFoundError(f"BiG-SCAPE clustering file not found: {clustering_file}") From 1923a9926a4cac29f7640578a8ec84d916f07617 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Mon, 26 Feb 2024 15:48:59 +0100 Subject: [PATCH 02/12] Update runbigscape.py - remove function `podp_run_bigscape` - updated function `run_bigscape` --- src/nplinker/pairedomics/runbigscape.py | 56 +------------------------ 1 file changed, 2 insertions(+), 54 deletions(-) diff --git a/src/nplinker/pairedomics/runbigscape.py b/src/nplinker/pairedomics/runbigscape.py index 59cc39c0..b0bb0859 100644 --- a/src/nplinker/pairedomics/runbigscape.py +++ b/src/nplinker/pairedomics/runbigscape.py @@ -1,17 +1,3 @@ -# Copyright 2021 The NPLinker Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- import os import subprocess import sys @@ -21,27 +7,19 @@ logger = LogConfig.getLogger(__name__) -# NOTE: for simplicity this is currently written with assumption it will only be -# called in context of nplinker Docker image, where bigscape should be available PFAM_PATH = os.path.join(sys.prefix, "nplinker_lib") def run_bigscape( - bigscape_py_path: str | PathLike, antismash_path: str | PathLike, output_path: str | PathLike, - pfam_path: str | PathLike, extra_params: str, ): + bigscape_py_path = "bigscape.py" logger.info( f'run_bigscape: input="{antismash_path}", output="{output_path}", extra_params={extra_params}"' ) - if os.path.exists(os.path.join(output_path, "completed")): - logger.info("BiG-SCAPE appears to have been run already, skipping!") - logger.info("To force re-run, delete {%s}", os.path.join(output_path, "completed")) - return True - try: subprocess.run([bigscape_py_path, "-h"], capture_output=True, check=True) except Exception as e: @@ -51,7 +29,7 @@ def run_bigscape( raise Exception(f'antismash_path "{antismash_path}" does not exist!') # configure the IO-related parameters, including pfam_dir - args = [bigscape_py_path, "-i", antismash_path, "-o", output_path, "--pfam_dir", pfam_path] + args = [bigscape_py_path, "-i", antismash_path, "-o", output_path, "--pfam_dir", PFAM_PATH] # append the user supplied params, if any if len(extra_params) > 0: @@ -65,34 +43,4 @@ def run_bigscape( # which will indicate to the PODPDownloader module that something went wrong. result.check_returncode() - # use presence of this file as a quick way to check if a previous run - # finished or not - with open(os.path.join(output_path, "completed"), "w") as f: - f.close() - return True - - -def podp_run_bigscape( - project_file_cache: str | PathLike, - PFAM_PATH: str | PathLike, - do_bigscape: bool, - extra_bigscape_parameters, -): - # TODO this currently assumes docker environment, allow customisation? 
- # can check if in container with: https://stackoverflow.com/questions/20010199/how-to-determine-if-a-process-runs-inside-lxc-docker - if not do_bigscape: - logger.info("BiG-SCAPE disabled by configuration, not running it") - return - - logger.info('Running BiG-SCAPE! extra_bigscape_parameters="%s"', extra_bigscape_parameters) - try: - run_bigscape( - "bigscape.py", - os.path.join(project_file_cache, "antismash"), - os.path.join(project_file_cache, "bigscape"), - PFAM_PATH, - extra_bigscape_parameters, - ) - except Exception as e: - logger.warning('Failed to run BiG-SCAPE on antismash data, error was "%s"', e) From 6760ebd10850e40478cfd5eafa7d2284b453a2e1 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Mon, 26 Feb 2024 10:56:06 +0100 Subject: [PATCH 03/12] Delete downloader.py and its tests --- src/nplinker/pairedomics/downloader.py | 180 ------------------------- tests/pairedomics/test_downloader.py | 45 ------- 2 files changed, 225 deletions(-) delete mode 100644 src/nplinker/pairedomics/downloader.py delete mode 100644 tests/pairedomics/test_downloader.py diff --git a/src/nplinker/pairedomics/downloader.py b/src/nplinker/pairedomics/downloader.py deleted file mode 100644 index 2236a699..00000000 --- a/src/nplinker/pairedomics/downloader.py +++ /dev/null @@ -1,180 +0,0 @@ -import json -import os -import shutil -from os import PathLike -from pathlib import Path -from nplinker.genomics.mibig import download_and_extract_mibig_metadata -from nplinker.globals import PFAM_PATH -from nplinker.logconfig import LogConfig -from nplinker.metabolomics.gnps import GNPSDownloader -from nplinker.metabolomics.gnps import GNPSExtractor -from nplinker.utils import download_url -from . 
import podp_download_and_extract_antismash_data -from .runbigscape import podp_run_bigscape - - -logger = LogConfig.getLogger(__name__) - -PAIREDOMICS_PROJECT_DATA_ENDPOINT = "https://pairedomicsdata.bioinformatics.nl/api/projects" -PAIREDOMICS_PROJECT_URL = "https://pairedomicsdata.bioinformatics.nl/api/projects/{}" -GNPS_DATA_DOWNLOAD_URL = ( - "https://gnps.ucsd.edu/ProteoSAFe/DownloadResult?task={}&view=download_clustered_spectra" -) - -MIBIG_METADATA_URL = "https://dl.secondarymetabolites.org/mibig/mibig_json_{}.tar.gz" -MIBIG_BGC_METADATA_URL = "https://mibig.secondarymetabolites.org/repository/{}/annotations.json" - - -class PODPDownloader: - def __init__( - self, - podp_platform_id: str, - force_download: bool = False, - root_dir: str | PathLike | None = None, - ): - """Downloader for PODP pipeline. - - The downloader will download the following data: - - GNPS Molecular Network task results - - AntiSMASH results - - MIBiG metadata - - Args: - podp_platform_id(str): The metabolomics project ID of PODP platform, - e.g. GNPS MassIVE ID. - force_download (bool): Re-download data even if it already exists - locally. Defaults to False. - working_dir (str | PathLike | None): The root directory to use for - the project. Defaults to None, in which case the default location - is used. - - Raises: - ValueError: If the given ID does not have a corresponding PODP ID, - or if the GNPS Molecular Network task URL does not exist for - the given ID. 
- """ - self.gnps_massive_id = podp_platform_id - - if root_dir is None: - root_dir = os.path.join(os.getenv("HOME"), "nplinker_data", "pairedomics") - - # TODO CG: init folder structure should be moved out of PODPDownloader - self._init_folder_structure(root_dir) - - # init project json files - if not os.path.exists(self.project_json_file) or force_download: - logger.info("Downloading new copy of platform project data...") - self.all_projects_json_data = self._download_and_load_json( - PAIREDOMICS_PROJECT_DATA_ENDPOINT, self.all_projects_json_file - ) - else: - logger.info("Using existing copy of platform project data") - with open(self.all_projects_json_file, encoding="utf-8") as f: - self.all_projects_json_data = json.load(f) - - # Verify that the given ID has a corresponding PODP ID - self.podp_id = None - for project in self.all_projects_json_data["data"]: - if self.gnps_massive_id == project["metabolite_id"]: - self.podp_id = project["_id"] - logger.debug( - "Given ID %s matched to PODP ID %s", self.gnps_massive_id, self.podp_id - ) - break - if self.podp_id is None: - raise ValueError(f"Failed to find PODP ID for given ID {self.gnps_massive_id}") - - # now get the project JSON data - logger.info("Found project, retrieving JSON data...") - self.project_json_data = self._download_and_load_json( - PAIREDOMICS_PROJECT_URL.format(self.podp_id), self.project_json_file - ) - - self.gnps_task_id = self.project_json_data["metabolomics"]["project"].get( - "molecular_network" - ) - if self.gnps_task_id is None: - raise ValueError( - f"GNPS Molecular Network task URL not exist for " - f"given ID {self.gnps_massive_id}. Please check and" - f"run GNPS Molecular Network task first." 
- ) - - def _init_folder_structure(self, working_dir): - """Create local cache folders and set up paths for various files.""" - # init local cache root - self.working_dir = working_dir - self.downloads_dir = os.path.join(self.working_dir, "downloads") - self.results_dir = os.path.join(self.working_dir, "extracted") - os.makedirs(self.working_dir, exist_ok=True) - logger.info("PODPDownloader for %s, caching to %s", self.gnps_massive_id, self.working_dir) - - # create local cache folders for this dataset - self.project_downloads_dir = os.path.join(self.downloads_dir, self.gnps_massive_id) - os.makedirs(self.project_downloads_dir, exist_ok=True) - - self.project_results_dir = os.path.join(self.results_dir, self.gnps_massive_id) - os.makedirs(self.project_results_dir, exist_ok=True) - - # placeholder directories - for d in ["antismash", "bigscape"]: - os.makedirs(os.path.join(self.project_results_dir, d), exist_ok=True) - - # init project paths - self.all_projects_json_file = os.path.join(self.working_dir, "all_projects.json") - self.project_json_file = os.path.join(self.working_dir, f"{self.gnps_massive_id}.json") - - # download function - def get(self, do_bigscape, extra_bigscape_parameters, use_mibig, mibig_version): - logger.info("Going to download the metabolomics data file") - - self._download_metabolomics_zipfile(self.gnps_task_id) - - # TODO CG: this function will modify the project_json['genomes'], - # this should be done in a better way - podp_download_and_extract_antismash_data( - self.project_json_data["genomes"], self.project_downloads_dir, self.project_results_dir - ) - - if use_mibig: - self._download_mibig_json(mibig_version) - podp_run_bigscape( - self.project_results_dir, PFAM_PATH, do_bigscape, extra_bigscape_parameters - ) - - def _download_mibig_json(self, version): - output_path = os.path.join(self.project_results_dir, "mibig_json") - - # Override existing mibig json files - if os.path.exists(output_path): - shutil.rmtree(output_path) - - 
os.makedirs(output_path) - - download_and_extract_mibig_metadata(self.project_downloads_dir, output_path, version) - - self._create_completed_file(output_path) - - return True - - @staticmethod - def _create_completed_file(output_path): - with open(os.path.join(output_path, "completed"), "w", encoding="utf-8"): - pass - - def _download_metabolomics_zipfile(self, gnps_task_id): - archive = ( - GNPSDownloader(gnps_task_id, self.project_downloads_dir).download().get_download_file() - ) - GNPSExtractor(archive, self.project_results_dir) - - def _download_and_load_json(self, url: str, output_file: str | PathLike) -> dict: - """Download a JSON file from a URL and return the parsed JSON data.""" - fpath = Path(output_file) - download_url(url, fpath.parent, fpath.name) - logger.debug("Downloaded %s to %s", url, output_file) - - with open(output_file, "r") as f: - data = json.load(f) - - return data diff --git a/tests/pairedomics/test_downloader.py b/tests/pairedomics/test_downloader.py deleted file mode 100644 index 9dc64ebd..00000000 --- a/tests/pairedomics/test_downloader.py +++ /dev/null @@ -1,45 +0,0 @@ -import os -from pathlib import Path -import httpx -import pytest -from pytest_lazyfixture import lazy_fixture -from nplinker.pairedomics.downloader import PODPDownloader - - -@pytest.mark.parametrize( - "expected", [Path(os.getenv("HOME"), "nplinker_data", "pairedomics"), lazy_fixture("tmp_path")] -) -def test_default(expected: Path): - gnps_id = "MSV000079284" - - sut = PODPDownloader(gnps_id, root_dir=str(expected)) - - assert sut.gnps_massive_id == gnps_id - assert sut.working_dir == str(expected) - - assert sut.downloads_dir == str(expected / "downloads") - assert sut.project_downloads_dir == str(expected / "downloads" / gnps_id) - - assert sut.results_dir == str(expected / "extracted") - assert sut.project_results_dir == str(expected / "extracted" / gnps_id) - assert os.path.exists(str(expected / "extracted" / gnps_id / "antismash")) - assert 
os.path.exists(str(expected / "extracted" / gnps_id / "bigscape")) - - assert sut.all_projects_json_file == str(expected / "all_projects.json") - assert sut.project_json_file == str(expected / f"{gnps_id}.json") - - -def test_download_metabolomics_zipfile(tmp_path): - sut = PODPDownloader("MSV000079284", root_dir=tmp_path) - try: - sut._download_metabolomics_zipfile("c22f44b14a3d450eb836d607cb9521bb") - expected_path = os.path.join( - sut.project_downloads_dir, "METABOLOMICS-SNETS-c22f44b14a3d450eb836d607cb9521bb.zip" - ) - - assert os.path.exists(expected_path) - assert (Path(sut.project_results_dir) / "molecular_families.tsv").is_file() - assert (Path(sut.project_results_dir) / "file_mappings.tsv").is_file() - assert (Path(sut.project_results_dir) / "spectra.mgf").is_file() - except httpx.TimeoutException: - pytest.skip("GNPS is down") From faad572f0b0900ec0a4dfa50811f993b7254f95f Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Mon, 26 Feb 2024 15:47:48 +0100 Subject: [PATCH 04/12] Update loader.py --- src/nplinker/loader.py | 368 +++-------------------------------------- 1 file changed, 25 insertions(+), 343 deletions(-) diff --git a/src/nplinker/loader.py b/src/nplinker/loader.py index 25ef8711..cfadc0d7 100644 --- a/src/nplinker/loader.py +++ b/src/nplinker/loader.py @@ -1,23 +1,19 @@ -import glob import os -from pathlib import Path from deprecated import deprecated -from nplinker.class_info.chem_classes import ChemClassPredictions -from nplinker.class_info.class_matches import ClassMatches -from nplinker.class_info.runcanopus import run_canopus +from nplinker import globals from nplinker.config import config from nplinker.genomics import add_bgc_to_gcf from nplinker.genomics import add_strain_to_bgc -from nplinker.genomics import generate_mappings_genome_id_bgc_id from nplinker.genomics import get_mibig_from_gcf from nplinker.genomics.antismash import AntismashBGCLoader from nplinker.genomics.bigscape import BigscapeGCFLoader from nplinker.genomics.mibig 
import MibigLoader -from nplinker.globals import GENOME_BGC_MAPPINGS_FILENAME -from nplinker.globals import GENOME_STATUS_FILENAME -from nplinker.globals import GNPS_FILE_MAPPINGS_FILENAME -from nplinker.globals import PFAM_PATH +from nplinker.globals import GNPS_ANNOTATIONS_FILENAME +from nplinker.globals import GNPS_DEFAULT_PATH +from nplinker.globals import GNPS_MOLECULAR_FAMILY_FILENAME +from nplinker.globals import GNPS_SPECTRA_FILENAME from nplinker.globals import STRAIN_MAPPINGS_FILENAME +from nplinker.globals import STRAINS_SELECTED_FILENAME from nplinker.logconfig import LogConfig from nplinker.metabolomics import add_annotation_to_spectrum from nplinker.metabolomics import add_spectrum_to_mf @@ -25,9 +21,6 @@ from nplinker.metabolomics.gnps import GNPSAnnotationLoader from nplinker.metabolomics.gnps import GNPSMolecularFamilyLoader from nplinker.metabolomics.gnps import GNPSSpectrumLoader -from nplinker.pairedomics.downloader import PODPDownloader -from nplinker.pairedomics.runbigscape import run_bigscape -from nplinker.pairedomics.strain_mappings_generator import podp_generate_strain_mappings from nplinker.strain_collection import StrainCollection from nplinker.strain_loader import load_user_strains @@ -43,138 +36,24 @@ class DatasetLoader: - ANTISMASH_DELIMITERS_DEFAULT = [".", "_", "-"] - ANTISMASH_IGNORE_SPACES_DEFAULT = False - - TABLES_CUTOFF_DEFAULT = 2.0 - - BIGSCAPE_CUTOFF_DEFAULT = 30 - USE_MIBIG_DEFAULT = True - MIBIG_VERSION_DEFAULT = "3.1" - - RUN_BIGSCAPE_DEFAULT = True - - # https://git.wageningenur.nl/medema-group/BiG-SCAPE/-/wikis/parameters#mibig - # BigScape mibig default version is 3.1 - EXTRA_BIGSCAPE_PARAMS_DEFAULT = "--mibig --clans-off --mix --include_singletons" - RUN_CANOPUS_DEFAULT = False EXTRA_CANOPUS_PARAMS_DEFAULT = "--maxmz 600 formula zodiac structure canopus" - # keys for overriding metabolomics data elements - OR_GNPS_NODES = "gnps_nodes_file" - OR_GNPS_EDGES = "gnps_edges_file" - OR_GNPS_MGF = "gnps_mgf_file" - 
OR_GNPS_ANNOTATIONS = "gnps_annotations_file" - # and the same for genomics data - OR_ANTISMASH = "antismash_dir" - OR_BIGSCAPE = "bigscape_dir" - OR_MIBIG_JSON = "mibig_json_dir" - OR_STRAINS = "strain_mappings_file" - # misc files - OR_INCLUDE_STRAINS = "include_strains_file" # class predictions OR_CANOPUS = "canopus_dir" OR_MOLNETENHANCER = "molnetenhancer_dir" - BIGSCAPE_PRODUCT_TYPES = [ - "NRPS", - "Others", - "PKSI", - "PKS-NRP_Hybrids", - "PKSother", - "RiPPs", - "Saccharides", - "Terpene", - ] - def __init__(self): - # load the config data - self._config_docker = config.get("docker", {}) - self._config_webapp = config.get("webapp", {}) - self._config_antismash = config.get("antismash", {}) - self._config_overrides = config.dataset.get("overrides", {}) - # set private attributes - self._antismash_delimiters = self._config_antismash.get( - "antismash_delimiters", self.ANTISMASH_DELIMITERS_DEFAULT - ) - self._antismash_ignore_spaces = self._config_antismash.get( - "ignore_spaces", self.ANTISMASH_IGNORE_SPACES_DEFAULT - ) - self._bigscape_cutoff = config.dataset.get("bigscape_cutoff", self.BIGSCAPE_CUTOFF_DEFAULT) - self._use_mibig = config.dataset.get("use_mibig", self.USE_MIBIG_DEFAULT) - self._mibig_version = config.dataset.get("mibig_version", self.MIBIG_VERSION_DEFAULT) - # TODO: the actual value of self._root is set in _start_downloads() method - self._root = Path(config.dataset["root"]) - self._platform_id = config.dataset["platform_id"] - self._remote_loading = len(self._platform_id) > 0 - # set public attributes - self.dataset_id = ( - os.path.split(self._root)[-1] if not self._remote_loading else self._platform_id - ) self.bgcs, self.gcfs, self.spectra, self.molfams = [], [], [], [] self.mibig_bgcs = [] self.mibig_strains_in_use = StrainCollection() self.product_types = [] self.strains = StrainCollection() - self.webapp_scoring_cutoff = self._config_webapp.get( - "tables_metcalf_threshold", self.TABLES_CUTOFF_DEFAULT - ) + self.class_matches = None 
self.chem_classes = None - logger.debug( - "DatasetLoader({}, {}, {})".format(self._root, self.dataset_id, self._remote_loading) - ) - - def __repr__(self): - return "Root={}\n MGF={}\n EDGES={}\n NODES={}\n BIGSCAPE={}\n ANTISMASH={}\n".format( - self._root, - self.gnps_mgf_file, - self.gnps_edges_file, - self.gnps_nodes_file, - self.bigscape_dir, - self.antismash_dir, - ) - - def validate(self): - """Download data and build paths for local data.""" - # if remote loading mode, need to download the data here - # CG: for PODP workflow, strain_mappings.json is generated in the download step - if self._remote_loading: - self._start_downloads() - - # construct the paths and filenames required to load everything else and check - # they all seem to exist (but don't parse anything yet) - # TODO CG: the logics of _init_paths and _validate_paths are not clear - # 1. after downloading (manual preparation), some files alreay exist, some not - # 2. get the default, constructed or real path for each file/dir (need refactoring) - # - self._config_overrides.get() - # - os.path.join(self._root, 'strain_mappings.json') - # - find_via_glob() --> actually check if the file/dir exists - # 3. 
check if (some) file/dir exists - self._init_paths() - self._validate_paths() - - def generate_strain_mappings(self): - generate_mappings_genome_id_bgc_id(self._root / "antismash") - - podp_project_json_file = self._root.parent.parent / (self._platform_id + ".json") - genome_status_json_file = ( - self._root.parent.parent / "downloads" / self._platform_id / GENOME_STATUS_FILENAME - ) - genome_bgc_mappings_file = self._root / "antismash" / GENOME_BGC_MAPPINGS_FILENAME - gnps_file_mapping_tsv_file = self._root / GNPS_FILE_MAPPINGS_FILENAME - - podp_generate_strain_mappings( - podp_project_json_file, - genome_status_json_file, - genome_bgc_mappings_file, - gnps_file_mapping_tsv_file, - self.strain_mappings_file, - ) - def load(self): if not self._load_strain_mappings(): return False @@ -189,150 +68,23 @@ def load(self): self.strains = self.strains + self.mibig_strains_in_use if len(self.strains) == 0: - raise Exception(f"Failed to find *ANY* strains, missing {STRAIN_MAPPINGS_FILENAME}?") + raise Exception("Failed to find *ANY* strains.") return True - def _start_downloads(self): - downloader = PODPDownloader(self._platform_id) - # TODO CG: this step generates the real path for _root. Should generate - # it before loading process starts. Otherwise, npl.root_dir will get - # wrong value if loading from local data or not using download. - self._root = Path(downloader.project_results_dir) - logger.debug("remote loading mode, configuring root=%s", self._root) - # CG: to download both MET and GEN data - # CG: Continue to understand how strain_mappings.json is generated - downloader.get( - self._config_docker.get("run_bigscape", self.RUN_BIGSCAPE_DEFAULT), - self._config_docker.get( - "extra_bigscape_parameters", self.EXTRA_BIGSCAPE_PARAMS_DEFAULT - ), - self._use_mibig, - self._mibig_version, - ) - - def _init_paths(self): - # 1. 
strain mapping are used for everything else so - self.strain_mappings_file = self._config_overrides.get(self.OR_STRAINS) or os.path.join( - self._root, STRAIN_MAPPINGS_FILENAME - ) - - self._init_metabolomics_paths() - - self._init_genomics_paths() - - # 14. MISC: /include_strains.csv / include_strains_file= - self.include_strains_file = self._config_overrides.get( - self.OR_INCLUDE_STRAINS - ) or os.path.join(self._root, "include_strains.csv") - - # 15. CLASS: /canopus / canopus_dir= - self.canopus_dir = self._config_overrides.get(self.OR_CANOPUS) or os.path.join( - self._root, "canopus" - ) - - # 15. CLASS: /canopus / canopus_dir= - self.molnetenhancer_dir = self._config_overrides.get( - self.OR_MOLNETENHANCER - ) or os.path.join(self._root, "molnetenhancer") - - def _init_metabolomics_paths(self): - """Initializes the paths for metabolomics data.""" - # GNPS nodes_files is `file_mappings.tsv/csv` file - self.gnps_nodes_file = self._config_overrides.get(self.OR_GNPS_NODES) or find_via_glob_alts( - [ - os.path.join(self._root, "file_mappings.csv"), - os.path.join(self._root, "file_mappings.tsv"), - os.path.join(self._root, "clusterinfo*", "*.tsv"), - os.path.join(self._root, "clusterinfo*", "*.clustersummary"), - ], - self.OR_GNPS_NODES, - ) - - # GNPS edges_file is `molecular_families.tsv` file - self.gnps_edges_file = self._config_overrides.get(self.OR_GNPS_EDGES) or find_via_glob_alts( - [ - os.path.join(self._root, "molecular_families.tsv"), - os.path.join(self._root, "*.pairsinfo"), - os.path.join(self._root, "networkedges_selfloop", "*.pairsinfo"), - os.path.join(self._root, "networkedges_selfloop", "*.selfloop"), - ], - self.OR_GNPS_EDGES, - ) - - # GNPS mgf_file is `spectra.mgf` file - self.gnps_mgf_file = self._config_overrides.get(self.OR_GNPS_MGF) or find_via_glob_alts( - [ - os.path.join("spectra.mgf"), - os.path.join(self._root, "*.mgf"), - os.path.join(self._root, "spectra", "*.mgf"), - ], - self.OR_GNPS_MGF, - ) - - # GNPS annotations_file is 
`annotations.tsv` file - self.gnps_annotations_file = self._config_overrides.get( - self.OR_GNPS_ANNOTATIONS - ) or find_via_glob_alts( - [ - os.path.join(self._root, "annotations.tsv"), - os.path.join(self._root, "DB_result", "*.tsv"), - os.path.join(self._root, "result_specnets_DB", "*.tsv"), - ], - self.OR_GNPS_ANNOTATIONS, - ) - - def _init_genomics_paths(self): - # 9. GEN: /antismash / antismash_dir= - self.antismash_dir = self._config_overrides.get(self.OR_ANTISMASH) or os.path.join( - self._root, "antismash" - ) - self.antismash_cache = {} - - # 10. GEN: /bigscape / bigscape_dir= - self.bigscape_dir = self._config_overrides.get(self.OR_BIGSCAPE) or os.path.join( - self._root, "bigscape" - ) - # what we really want here is the subdirectory containing the network/annotation files, - # but in case this is the path to the top level bigscape output, try to figure it out automatically - if not os.path.exists(os.path.join(self.bigscape_dir, "mix")): - new_bigscape_dir = find_bigscape_dir(self.bigscape_dir) - if new_bigscape_dir is not None: - logger.info( - "Updating bigscape_dir to discovered location {}".format(new_bigscape_dir) - ) - self.bigscape_dir = new_bigscape_dir - - # 11. GEN: /mibig_json / mibig_json_dir= - self.mibig_json_dir = self._config_overrides.get(self.OR_MIBIG_JSON) or os.path.join( - self._root, "mibig_json" - ) - - def _validate_paths(self): - """Validates that the required files and directories exist before loading starts.""" - required_paths = [ - self.gnps_nodes_file, - self.gnps_edges_file, - self.gnps_mgf_file, - self.antismash_dir, - ] - - for f in required_paths: - if not os.path.exists(str(f)): - raise FileNotFoundError(f'File/directory "{f}" does not exist.') - def _load_strain_mappings(self): # 1. 
load strain mappings - sc = StrainCollection.read_json(self.strain_mappings_file) + sc = StrainCollection.read_json(config.root_dir / STRAIN_MAPPINGS_FILENAME) for strain in sc: self.strains.add(strain) logger.info("Loaded {} non-MiBIG Strain objects".format(len(self.strains))) # 2. filter user specificied strains (remove all that are not specified by user). # It's not allowed to specify empty list of strains, otherwise validation will fail. - if os.path.exists(self.include_strains_file): - logger.info(f"Loading user specified strains from file {self.include_strains_file}.") - user_strains = load_user_strains(self.include_strains_file) + user_strains_file = config.root_dir / STRAINS_SELECTED_FILENAME + if user_strains_file.exists(): + logger.info(f"Loading user specified strains from file {user_strains_file}.") + user_strains = load_user_strains(user_strains_file) logger.info(f"Loaded {len(user_strains)} user specified strains.") self.strains.filter(user_strains) @@ -353,11 +105,15 @@ def _load_metabolomics(self): logger.debug("\nLoading metabolomics data starts...") # Step 1: load all Spectrum objects - raw_spectra = GNPSSpectrumLoader(self.gnps_mgf_file).spectra + raw_spectra = GNPSSpectrumLoader(GNPS_DEFAULT_PATH / GNPS_SPECTRA_FILENAME).spectra # Step 2: load all GNPS annotations - raw_annotations = GNPSAnnotationLoader(self.gnps_annotations_file).annotations + raw_annotations = GNPSAnnotationLoader( + GNPS_DEFAULT_PATH / GNPS_ANNOTATIONS_FILENAME + ).annotations # Step 3: load all MolecularFamily objects - raw_molfams = GNPSMolecularFamilyLoader(self.gnps_edges_file).get_mfs(keep_singleton=False) + raw_molfams = GNPSMolecularFamilyLoader( + GNPS_DEFAULT_PATH / GNPS_MOLECULAR_FAMILY_FILENAME + ).get_mfs(keep_singleton=False) # Step 4: add GNPS annotations to Spectrum.gnps_annotations add_annotation_to_spectrum(raw_annotations, raw_spectra) @@ -389,20 +145,19 @@ def _load_genomics(self): # Step 1: load antismash BGC objects & add strain info 
logger.debug("Parsing AntiSMASH directory...") - antismash_bgcs = AntismashBGCLoader(self.antismash_dir).get_bgcs() + antismash_bgcs = AntismashBGCLoader(str(globals.ANTISMASH_DEFAULT_PATH)).get_bgcs() antismash_bgcs_with_strain, _ = add_strain_to_bgc(self.strains, antismash_bgcs) # Step 2: load mibig BGC objects (having strain info) - if self._use_mibig: - self.mibig_bgcs = MibigLoader(self.mibig_json_dir).get_bgcs() + if config.mibig.to_use: + self.mibig_bgcs = MibigLoader(str(globals.MIBIG_DEFAULT_PATH)).get_bgcs() # Step 3: get all BGC objects with strain info all_bgcs_with_strain = antismash_bgcs_with_strain + self.mibig_bgcs # Step 4: load all GCF objects - # TODO: create a config for "bigscape_cluster_file" and discard "bigscape_dir" and "bigscape_cutoff"? bigscape_cluster_file = ( - Path(self.bigscape_dir) / "mix" / f"mix_clustering_c0.{self._bigscape_cutoff:02d}.tsv" + globals.BIGSCAPE_DEFAULT_PATH / f"mix_clustering_c{config.bigscape.cutoff}.tsv" ) raw_gcfs = BigscapeGCFLoader(bigscape_cluster_file).get_gcfs() @@ -411,7 +166,7 @@ def _load_genomics(self): # Step 6: get mibig bgcs and strains in use from GCFs mibig_strains_in_use = StrainCollection() - if self._use_mibig: + if config.mibig.to_use: mibig_bgcs_in_use, mibig_strains_in_use = get_mibig_from_gcf(all_gcfs_with_bgc) else: mibig_bgcs_in_use = [] @@ -424,48 +179,6 @@ def _load_genomics(self): logger.debug("Loading genomics data completed\n") return True - # TODO CG: run bigscape before loading and after downloading - def _run_bigscape(self): - # Check for spaces in the folder names under /antismash and - # rename them by replacing spaces with underscores - if not self._antismash_ignore_spaces: - logger.debug("Checking for spaces in antiSMASH folder names...") - for root, dirs, _ in os.walk(self.antismash_dir): - for d in dirs: - if d.find(" ") != -1: - os.rename(os.path.join(root, d), os.path.join(root, d.replace(" ", "_"))) - logger.warn( - 'Renaming antiSMASH folder "{}" to "{}" to remove spaces! 
(suppress with ignore_spaces = true in config file)'.format( - d, d.replace(" ", "_") - ) - ) - - if not os.path.exists(self.bigscape_dir): - should_run_bigscape = self._config_docker.get("run_bigscape", self.RUN_BIGSCAPE_DEFAULT) - extra_bigscape_parameters = self._config_docker.get( - "extra_bigscape_parameters", self.EXTRA_BIGSCAPE_PARAMS_DEFAULT - ) - if should_run_bigscape: - logger.info( - 'Running BiG-SCAPE! extra_bigscape_parameters="{}"'.format( - extra_bigscape_parameters - ) - ) - try: - run_bigscape( - "bigscape.py", - os.path.join(self._root, "antismash"), - os.path.join(self._root, "bigscape"), - PFAM_PATH, - extra_params=extra_bigscape_parameters, - ) - except Exception as e: - logger.warning( - 'Failed to run BiG-SCAPE on antismash data, error was "{}"'.format(e) - ) - - self.bigscape_dir = find_bigscape_dir(self.bigscape_dir) - @deprecated(reason="To be refactored. It was used in the `self.load` method before.") def _load_class_info(self): """Load class match info (based on mibig) and chemical class predictions. 
@@ -522,34 +235,3 @@ def _load_class_info(self): # include them in loader self.chem_classes = chem_classes return True - - -def find_via_glob_alts(paths, file_type, optional=False): - filename = None - for path in paths: - try: - filename = glob.glob(path)[0] - break - except (OSError, IndexError): - continue - - if filename is None and not optional: - raise Exception( - "ERROR: unable to find {} in {} paths: ({})".format(file_type, len(paths), paths) - ) - elif filename is None: - logger.warning( - "WARNING: unable to find {} in {} paths: ({})".format(file_type, len(paths), paths) - ) - - return filename - - -def find_bigscape_dir(broot): - logger.info(f"Trying to discover correct bigscape directory under {broot}") - for root, _, files in os.walk(broot): - if "Network_Annotations_Full.tsv" in files: - logger.info(f"Found network files directory: {root}") - return root - - return None From 74a397d5b949516eb657aff6af8b95587bb9be45 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Tue, 27 Feb 2024 17:05:03 +0100 Subject: [PATCH 05/12] Update nplinker.py --- src/nplinker/nplinker.py | 67 +++++++--------------------------------- 1 file changed, 11 insertions(+), 56 deletions(-) diff --git a/src/nplinker/nplinker.py b/src/nplinker/nplinker.py index d17df87c..8b8217c4 100644 --- a/src/nplinker/nplinker.py +++ b/src/nplinker/nplinker.py @@ -2,6 +2,7 @@ import logging import sys from typing import TYPE_CHECKING +from .arranger import DatasetArranger from .config import config from .genomics import BGC from .genomics import GCF @@ -38,12 +39,12 @@ class NPLinker: def __init__(self): """Initialise an NPLinker instance.""" # configure logging based on the supplied config params - LogConfig.setLogLevelStr(config.loglevel) - logfile = config.get("logfile") + LogConfig.setLogLevelStr(config.log.level) + logfile = config.get("log.file") if logfile: logfile_dest = logging.FileHandler(logfile) # if we want to log to stdout plus logfile, add the new destination - if 
config.get("log_to_stdout"): # default to True + if config.get("log.to_stdout"): # default to True LogConfig.addLogDestination(logfile_dest) else: # otherwise overwrite the default stdout destination @@ -124,22 +125,7 @@ def root_dir(self): Returns: str: the path to the dataset root directory currently in use """ - return self._loader._root - - @property - def dataset_id(self): - """Returns dataset "ID". - - For local datasets this will just be the last component of the directory path, - e.g. /path/to/my_dataset would produce an ID of "my_dataset". - - For datasets loaded from the Paired Omics platform the ID will be the platform - project ID, e.g. "MSV000079284" - - Returns: - str: the dataset ID - """ - return self._loader.dataset_id + return config.root_dir @property def data_dir(self): @@ -149,30 +135,13 @@ def data_dir(self): @property def bigscape_cutoff(self): """Returns the current BiGSCAPE clustering cutoff value.""" - return self._loader._bigscape_cutoff + return config.bigscape.cutoff - def load_data(self, new_bigscape_cutoff=None): - """Loads the basic components of a dataset. - - This method is responsible for loading the various pieces of the supplied dataset into - memory and doing any initial parsing/object manipulation required. After it completes, - applications can access the lists of GCFs, Spectra, MolecularFamilies and strains - using the corresponding properties of the NPLinker class. 
- - Returns: - bool: True if successful, False otherwise - """ - logger.debug("load_data(new_bigscape_cutoff=%s)", new_bigscape_cutoff) - if new_bigscape_cutoff is None: - self._loader.validate() - self._loader.generate_strain_mappings() - if not self._loader.load(): - return False - else: - # CG: only reload genomics data when changing bigscape cutoff - self._loader._bigscape_cutoff = new_bigscape_cutoff - # TODO: only need to reload gcfs using load_gcfs() - self._loader._load_genomics() + def load_data(self): + """Loads the basic components of a dataset.""" + arranger = DatasetArranger() + arranger.arrange() + self._loader.load() self._spectra = self._loader.spectra self._molfams = self._loader.molfams @@ -184,20 +153,6 @@ def load_data(self, new_bigscape_cutoff=None): self._chem_classes = self._loader.chem_classes self._class_matches = self._loader.class_matches - logger.debug("Generating lookup tables: genomics") - self._bgc_lookup = {bgc.bgc_id: bgc for bgc in self._bgcs} - self._gcf_lookup = {gcf.gcf_id: gcf for gcf in self._gcfs} - - # don't need to do these two if cutoff changed (indicating genomics data - # was reloaded but not metabolomics) - if new_bigscape_cutoff is None: - logger.debug("Generating lookup tables: metabolomics") - self._spec_lookup = {spec.spectrum_id: spec for spec in self._spectra} - self._mf_lookup = {mf.family_id: mf for mf in self._molfams} - - logger.debug("load_data: completed") - return True - # TODO CG: refactor this method and update its unit tests def get_links(self, input_objects, scoring_methods, and_mode=True): """Find links for a set of input objects (BGCs/GCFs/Spectra/MolFams). 
From cf3370d71eee840ea325f3c7ede5b7132e1460ba Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Wed, 28 Feb 2024 13:56:57 +0100 Subject: [PATCH 06/12] Update conftest.py remove invalid steps --- tests/scoring/conftest.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/scoring/conftest.py b/tests/scoring/conftest.py index 3d1567c9..1f279c49 100644 --- a/tests/scoring/conftest.py +++ b/tests/scoring/conftest.py @@ -94,9 +94,6 @@ def npl(gcfs, spectra, mfs, strains, tmp_path_factory) -> NPLinker: npl._gcf_lookup = {gcf.gcf_id: gcf for gcf in gcfs} npl._mf_lookup = {mf.family_id: mf for mf in mfs} npl._spec_lookup = {spec.spectrum_id: spec for spec in spectra} - # tmp path to store 'metcalf/metcalf_scores.pckl' file - # Must use `tmp_path_factory` (session scope) instead of `tmp_path` (function scope) - npl._loader._root = tmp_path_factory.mktemp("npl_test") return npl From 8ea1e79cb2be26fdc6253dab47cc3ee22489e544 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Wed, 28 Feb 2024 13:57:15 +0100 Subject: [PATCH 07/12] Update test_nplinker_local.py --- tests/test_nplinker_local.py | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/tests/test_nplinker_local.py b/tests/test_nplinker_local.py index 879f5afe..3d8b4636 100644 --- a/tests/test_nplinker_local.py +++ b/tests/test_nplinker_local.py @@ -1,12 +1,10 @@ import hashlib -import os from pathlib import Path import pytest from nplinker.nplinker import NPLinker -# NOTE: This file only contains tests that run locally and are skipped on CI. -# Basically, only tests related to data loading should be put here. +# Only tests related to data arranging and loading should be put here. # For tests on scoring/links, add them to `scoring/test_nplinker_scoring.py`. 
@@ -27,13 +25,6 @@ def get_file_hash(file_path): def npl() -> NPLinker: npl = NPLinker() npl.load_data() - hash_proj_file = get_file_hash( - os.path.join(npl._loader._root.parent.parent, npl._loader._platform_id + ".json") - ) - if hash_proj_file != "97f31f13f7a4c87c0b7648e2a2bad5ab2f96c38f92c304a5dc17299b44e698c7": - pytest.exit( - "PoDP project file has changed, please clean your local cache folder and rerun the tests." - ) # remove cached score results before running tests root_dir = Path(npl.root_dir) score_cache = root_dir / "metcalf" / "metcalf_scores.pckl" @@ -65,7 +56,6 @@ def npl() -> NPLinker: # --------------------------------------------------------------------------------------------------- -@pytest.mark.skipif(os.environ.get("CI") == "true", reason="Skip when running on CI") def test_load_data(npl: NPLinker): assert len(npl.bgcs) == 390 assert len(npl.gcfs) == 64 From 47d8c12f61b6724aedc2b76cb8c9742380219cb1 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Wed, 28 Feb 2024 13:57:20 +0100 Subject: [PATCH 08/12] Update conftest.py --- tests/conftest.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 7bb93bbf..974265ea 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,9 +1,18 @@ import os import tempfile +import zipfile from . import DATA_DIR -# Specify the config file via environment variable before importing nplinker in any test. -os.environ["NPLINKER_CONFIG_FILE"] = str(DATA_DIR / "nplinker_demo1.toml") +# Prepare dataset for local mode testing +# ⚠️ Multiple temp dirs will be created if using parallel testing. +temp_dir = tempfile.mkdtemp(prefix="nplinker_") +nplinker_root_dir = os.path.join(temp_dir, "local_mode_example") +with zipfile.ZipFile(DATA_DIR / "local_mode_example.zip", "r") as zip_ref: + zip_ref.extractall(temp_dir) + # NPLinker setting `root_dir` must be a path that exists, so setting it to a temporary directory. 
-os.environ["NPLINKER_ROOT_DIR"] = tempfile.mkdtemp(prefix="nplinker_") +os.environ["NPLINKER_ROOT_DIR"] = nplinker_root_dir + +# Specify the config file via environment variable before importing nplinker in any test. +os.environ["NPLINKER_CONFIG_FILE"] = str(DATA_DIR / "nplinker_local_mode.toml") From 62a0d22c9bee17596dd368558b00a92a5b9b325a Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Fri, 1 Mar 2024 14:38:07 +0100 Subject: [PATCH 09/12] Create nplinker_local_mode.toml --- tests/data/nplinker_local_mode.toml | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 tests/data/nplinker_local_mode.toml diff --git a/tests/data/nplinker_local_mode.toml b/tests/data/nplinker_local_mode.toml new file mode 100644 index 00000000..174ee852 --- /dev/null +++ b/tests/data/nplinker_local_mode.toml @@ -0,0 +1,7 @@ +dynaconf_merge = true # merge with the default settings, provided by the Dynaconf library + +root_dir = "@format {env[NPLINKER_ROOT_DIR]}" +mode = "local" + +[log] +level = "DEBUG" From 5400394bc40aab7662aaca2b9f26bc8a960f3760 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Fri, 1 Mar 2024 14:38:20 +0100 Subject: [PATCH 10/12] update config tests --- tests/data/nplinker_demo1.toml | 3 --- tests/test_config.py | 16 +++++++--------- 2 files changed, 7 insertions(+), 12 deletions(-) delete mode 100644 tests/data/nplinker_demo1.toml diff --git a/tests/data/nplinker_demo1.toml b/tests/data/nplinker_demo1.toml deleted file mode 100644 index 0fd30554..00000000 --- a/tests/data/nplinker_demo1.toml +++ /dev/null @@ -1,3 +0,0 @@ -root_dir = "@format {env[NPLINKER_ROOT_DIR]}" -mode = "podp" -podp_id = "4b29ddc3-26d0-40d7-80c5-44fb6631dbf9.4" diff --git a/tests/test_config.py b/tests/test_config.py index 262d7191..45ae92a1 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -1,17 +1,15 @@ from nplinker.config import config -def test_config_demo1(): - """Test loading the default config file (nplinker_demo1.toml).""" - # The file "nplinker_demo1.toml" 
is set in ./conftest.py - - assert config.mode == "podp" - assert config.podp_id == "4b29ddc3-26d0-40d7-80c5-44fb6631dbf9.4" +def test_config(): + """Test loading the default config file.""" + # The default config file is set in "./conftest.py", which is "data/nplinker_local_mode.toml" + assert config.mode == "local" + assert config.log.level == "DEBUG" + assert config["log.level"] == "DEBUG" + assert config.get("log.level") == "DEBUG" # The following are default values from nplinker_default.toml - assert config.log.level == "INFO" - assert config["log.level"] == "INFO" - assert config.get("log.level") == "INFO" assert config.get("log.file") is None assert config.log.to_stdout is True From 1b3c0809c3f8dad33a6be1cdf5c41cb35f3cdfc0 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Tue, 5 Mar 2024 14:50:42 +0100 Subject: [PATCH 11/12] fix arranging logics for gnps, antismash and bigscape --- src/nplinker/arranger.py | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/src/nplinker/arranger.py b/src/nplinker/arranger.py index 3111fad5..cb3091a5 100644 --- a/src/nplinker/arranger.py +++ b/src/nplinker/arranger.py @@ -113,19 +113,22 @@ def arrange_gnps(self) -> None: - molecular_families.tsv - annotations.tsv """ - if config.mode == "local": - validate_gnps(globals.GNPS_DEFAULT_PATH) - + pass_validation = False if config.mode == "podp": - # set range 3 to ensure download can try 2 times and downloaded data is valid + # retry downloading at most 3 times if downloaded data has problems for _ in range(3): try: validate_gnps(globals.GNPS_DEFAULT_PATH) + pass_validation = True + break except (FileNotFoundError, ValueError): # Don't need to remove downloaded archive, as it'll be overwritten shutil.rmtree(globals.GNPS_DEFAULT_PATH, ignore_errors=True) self._download_and_extract_gnps() + if not pass_validation: + validate_gnps(globals.GNPS_DEFAULT_PATH) + # get the path to file_mappings file (csv or tsv) self.gnps_file_mappings_file = 
self._get_gnps_file_mappings_file() @@ -188,19 +191,20 @@ def arrange_antismash(self) -> None: │  ├── ... └── ... """ - if config.mode == "local": - validate_antismash(globals.ANTISMASH_DEFAULT_PATH) - + pass_validation = False if config.mode == "podp": - # set range 3 to ensure download can try 2 times and downloaded data is valid for _ in range(3): try: validate_antismash(globals.ANTISMASH_DEFAULT_PATH) + pass_validation = True break except FileNotFoundError: shutil.rmtree(globals.ANTISMASH_DEFAULT_PATH, ignore_errors=True) self._download_and_extract_antismash() + if not pass_validation: + validate_antismash(globals.ANTISMASH_DEFAULT_PATH) + def _download_and_extract_antismash(self) -> None: """Download and extract the antiSMASH data. @@ -229,18 +233,20 @@ def arrange_bigscape(self) -> None: - Check if the clustering file "mix_clustering_c{config.bigscape.cutoff}.tsv" exists in the BiG-SCAPE data directory. """ - if config.mode == "local": - validate_bigscape(globals.BIGSCAPE_DEFAULT_PATH) - + pass_validation = False if config.mode == "podp": for _ in range(3): try: validate_bigscape(globals.BIGSCAPE_DEFAULT_PATH) + pass_validation = True break except FileNotFoundError: shutil.rmtree(globals.BIGSCAPE_DEFAULT_PATH, ignore_errors=True) self._run_bigscape() + if not pass_validation: + validate_bigscape(globals.BIGSCAPE_DEFAULT_PATH) + def _run_bigscape(self) -> None: """Run BiG-SCAPE to generate the clustering file. 
From 63605d686d0d4265bf51e4acd7448276256022c0 Mon Sep 17 00:00:00 2001 From: Cunliang Geng Date: Tue, 5 Mar 2024 15:07:19 +0100 Subject: [PATCH 12/12] set tests run on one core this is a workaround to solve the issues in `tests/conftest.py`: it copies example data for each process if multi-process test is enabled --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 25b6cf08..28cd90b9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,7 +48,7 @@ namespaces = true # enable data directory to be identified [tool.pytest.ini_options] minversion = "6.0" -addopts = "-ra -n auto" # -ra: show summary info for all test outcomes; -n auto: run tests in parallel +addopts = "-ra -n 1" # -ra: show summary info for all test outcomes; -n 1: run tests in a single worker process testpaths = ["tests"] [tool.coverage.run]